ragnar-cli 0.1.0.pre.2 → 0.1.0.pre.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +187 -36
- data/lib/ragnar/cli.rb +543 -172
- data/lib/ragnar/cli_visualization.rb +184 -0
- data/lib/ragnar/config.rb +226 -0
- data/lib/ragnar/database.rb +94 -8
- data/lib/ragnar/llm_manager.rb +4 -1
- data/lib/ragnar/query_processor.rb +38 -20
- data/lib/ragnar/topic_modeling.rb +13 -10
- data/lib/ragnar/umap_processor.rb +190 -73
- data/lib/ragnar/umap_transform_service.rb +169 -88
- data/lib/ragnar/version.rb +1 -1
- metadata +43 -22
- data/lib/ragnar/topic_modeling/engine.rb +0 -221
- data/lib/ragnar/topic_modeling/labeling_strategies.rb +0 -300
- data/lib/ragnar/topic_modeling/llm_adapter.rb +0 -131
- data/lib/ragnar/topic_modeling/metrics.rb +0 -186
- data/lib/ragnar/topic_modeling/term_extractor.rb +0 -170
- data/lib/ragnar/topic_modeling/topic.rb +0 -117
- data/lib/ragnar/topic_modeling/topic_labeler.rb +0 -61
data/lib/ragnar/cli.rb
CHANGED
@@ -1,27 +1,82 @@
|
|
1
|
+
require_relative "cli_visualization"
|
2
|
+
require_relative "config"
|
3
|
+
require "thor/interactive"
|
4
|
+
require "stringio"
|
5
|
+
require "fileutils"
|
6
|
+
|
1
7
|
module Ragnar
|
2
8
|
class CLI < Thor
|
9
|
+
include CLIVisualization
|
10
|
+
include Thor::Interactive::Command
|
11
|
+
|
12
|
+
# Configure interactive mode
|
13
|
+
configure_interactive(
|
14
|
+
prompt: Config.instance.interactive_prompt,
|
15
|
+
allow_nested: false,
|
16
|
+
history_file: Config.instance.history_file,
|
17
|
+
default_handler: proc do |input, thor_instance|
|
18
|
+
puts "[DEBUG] Default handler called: #{input}" if ENV["DEBUG"]
|
19
|
+
|
20
|
+
begin
|
21
|
+
# IMPORTANT: Use direct method call, NOT invoke(), to avoid Thor's
|
22
|
+
# silent deduplication that prevents repeated calls to the same method
|
23
|
+
result = thor_instance.query(input.strip)
|
24
|
+
puts "[DEBUG] Default handler completed" if ENV["DEBUG"]
|
25
|
+
result
|
26
|
+
rescue => e
|
27
|
+
puts "[DEBUG] Default handler error: #{e.message}" if ENV["DEBUG"]
|
28
|
+
puts "[DEBUG] Backtrace: #{e.backtrace.first(3)}" if ENV["DEBUG"]
|
29
|
+
raise e
|
30
|
+
end
|
31
|
+
end
|
32
|
+
)
|
33
|
+
|
34
|
+
# Class variables for caching expensive resources in interactive mode
|
35
|
+
class_variable_set(:@@cached_database, nil)
|
36
|
+
class_variable_set(:@@cached_embedder, nil)
|
37
|
+
class_variable_set(:@@cached_llm_manager, nil)
|
38
|
+
class_variable_set(:@@cached_query_processor, nil)
|
39
|
+
class_variable_set(:@@cached_db_path, nil)
|
40
|
+
|
3
41
|
desc "index PATH", "Index text files from PATH (file or directory)"
|
4
|
-
option :db_path, type: :string,
|
5
|
-
option :chunk_size, type: :numeric,
|
6
|
-
option :chunk_overlap, type: :numeric,
|
7
|
-
option :model, type: :string,
|
42
|
+
option :db_path, type: :string, desc: "Path to Lance database (default from config)"
|
43
|
+
option :chunk_size, type: :numeric, desc: "Chunk size in tokens (default from config)"
|
44
|
+
option :chunk_overlap, type: :numeric, desc: "Chunk overlap in tokens (default from config)"
|
45
|
+
option :model, type: :string, desc: "Embedding model to use (default from config)"
|
8
46
|
def index(path)
|
9
|
-
|
10
|
-
|
47
|
+
# Expand user paths (handle ~ in user input)
|
48
|
+
expanded_path = File.expand_path(path)
|
49
|
+
|
50
|
+
unless File.exist?(expanded_path)
|
51
|
+
say "Error: Path does not exist: #{expanded_path}", :red
|
11
52
|
exit 1
|
12
53
|
end
|
13
54
|
|
14
55
|
say "Indexing files from: #{path}", :green
|
15
56
|
|
57
|
+
# Debug options in interactive mode
|
58
|
+
puts "Debug - options: #{options.inspect}" if ENV['DEBUG']
|
59
|
+
|
60
|
+
# Get config instance
|
61
|
+
config = Config.instance
|
62
|
+
|
63
|
+
# Clear database cache when indexing new content
|
64
|
+
db_path = options[:db_path] || config.database_path
|
65
|
+
if @@cached_db_path == db_path
|
66
|
+
@@cached_database = nil
|
67
|
+
@@cached_query_processor = nil
|
68
|
+
end
|
69
|
+
|
16
70
|
indexer = Indexer.new(
|
17
|
-
db_path:
|
18
|
-
chunk_size: options[:chunk_size],
|
19
|
-
chunk_overlap: options[:chunk_overlap],
|
20
|
-
embedding_model: options[:model]
|
71
|
+
db_path: db_path,
|
72
|
+
chunk_size: options[:chunk_size] || config.chunk_size,
|
73
|
+
chunk_overlap: options[:chunk_overlap] || config.chunk_overlap,
|
74
|
+
embedding_model: options[:model] || config.embedding_model,
|
75
|
+
show_progress: config.show_progress?
|
21
76
|
)
|
22
77
|
|
23
78
|
begin
|
24
|
-
stats = indexer.index_path(
|
79
|
+
stats = indexer.index_path(expanded_path)
|
25
80
|
say "\nIndexing complete!", :green
|
26
81
|
say "Files processed: #{stats[:files_processed]}"
|
27
82
|
say "Chunks created: #{stats[:chunks_created]}"
|
@@ -33,31 +88,39 @@ module Ragnar
|
|
33
88
|
end
|
34
89
|
|
35
90
|
desc "train-umap", "Train UMAP model on existing embeddings"
|
36
|
-
option :db_path, type: :string,
|
91
|
+
option :db_path, type: :string, desc: "Path to Lance database (default from config)"
|
37
92
|
option :n_components, type: :numeric, default: 50, desc: "Number of dimensions for reduction"
|
38
93
|
option :n_neighbors, type: :numeric, default: 15, desc: "Number of neighbors for UMAP"
|
39
94
|
option :min_dist, type: :numeric, default: 0.1, desc: "Minimum distance for UMAP"
|
40
|
-
option :model_path, type: :string,
|
95
|
+
option :model_path, type: :string, desc: "Path to save UMAP model"
|
41
96
|
def train_umap
|
42
97
|
say "Training UMAP model on embeddings...", :green
|
43
98
|
|
99
|
+
config = Config.instance
|
100
|
+
# Use model_path from options if provided, otherwise use config models_dir
|
101
|
+
model_path = if options[:model_path]
|
102
|
+
options[:model_path]
|
103
|
+
else
|
104
|
+
File.join(config.models_dir, "umap_model.bin")
|
105
|
+
end
|
106
|
+
|
44
107
|
processor = UmapProcessor.new(
|
45
|
-
db_path: options[:db_path],
|
46
|
-
model_path:
|
108
|
+
db_path: options[:db_path] || config.database_path,
|
109
|
+
model_path: model_path
|
47
110
|
)
|
48
111
|
|
49
112
|
begin
|
50
113
|
stats = processor.train(
|
51
|
-
n_components: options[:n_components],
|
52
|
-
n_neighbors: options[:n_neighbors],
|
53
|
-
min_dist: options[:min_dist]
|
114
|
+
n_components: options[:n_components] || 50,
|
115
|
+
n_neighbors: options[:n_neighbors] || 15,
|
116
|
+
min_dist: options[:min_dist] || 0.1
|
54
117
|
)
|
55
118
|
|
56
119
|
say "\nUMAP training complete!", :green
|
57
120
|
say "Embeddings processed: #{stats[:embeddings_count]}"
|
58
121
|
say "Original dimensions: #{stats[:original_dims]}"
|
59
122
|
say "Reduced dimensions: #{stats[:reduced_dims]}"
|
60
|
-
say "Model saved to: #{
|
123
|
+
say "Model saved to: #{processor.model_path}"
|
61
124
|
rescue => e
|
62
125
|
say "Error during UMAP training: #{e.message}", :red
|
63
126
|
exit 1
|
@@ -65,12 +128,19 @@ module Ragnar
|
|
65
128
|
end
|
66
129
|
|
67
130
|
desc "apply-umap", "Apply trained UMAP model to reduce embedding dimensions"
|
68
|
-
option :db_path, type: :string,
|
69
|
-
option :model_path, type: :string,
|
131
|
+
option :db_path, type: :string, desc: "Path to Lance database (default from config)"
|
132
|
+
option :model_path, type: :string, desc: "Path to UMAP model"
|
70
133
|
option :batch_size, type: :numeric, default: 100, desc: "Batch size for processing"
|
71
134
|
def apply_umap
|
72
|
-
|
73
|
-
|
135
|
+
config = Config.instance
|
136
|
+
model_path = if options[:model_path]
|
137
|
+
options[:model_path]
|
138
|
+
else
|
139
|
+
File.join(config.models_dir, "umap_model.bin")
|
140
|
+
end
|
141
|
+
|
142
|
+
unless File.exist?(model_path)
|
143
|
+
say "Error: UMAP model not found at: #{model_path}", :red
|
74
144
|
say "Please run 'train-umap' first to create a model.", :yellow
|
75
145
|
exit 1
|
76
146
|
end
|
@@ -78,12 +148,12 @@ module Ragnar
|
|
78
148
|
say "Applying UMAP model to embeddings...", :green
|
79
149
|
|
80
150
|
processor = UmapProcessor.new(
|
81
|
-
db_path: options[:db_path],
|
82
|
-
model_path:
|
151
|
+
db_path: options[:db_path] || config.database_path,
|
152
|
+
model_path: model_path
|
83
153
|
)
|
84
154
|
|
85
155
|
begin
|
86
|
-
stats = processor.apply(batch_size: options[:batch_size])
|
156
|
+
stats = processor.apply(batch_size: options[:batch_size] || 100)
|
87
157
|
|
88
158
|
say "\nUMAP application complete!", :green
|
89
159
|
say "Embeddings processed: #{stats[:processed]}"
|
@@ -96,18 +166,21 @@ module Ragnar
|
|
96
166
|
end
|
97
167
|
|
98
168
|
desc "topics", "Extract and display topics from indexed documents"
|
99
|
-
option :db_path, type: :string,
|
169
|
+
option :db_path, type: :string, desc: "Path to Lance database (default from config)"
|
100
170
|
option :min_cluster_size, type: :numeric, default: 5, desc: "Minimum documents per topic"
|
101
171
|
option :method, type: :string, default: "hybrid", desc: "Labeling method: fast, quality, or hybrid"
|
102
172
|
option :export, type: :string, desc: "Export topics to file (json or html)"
|
103
173
|
option :verbose, type: :boolean, default: false, aliases: "-v", desc: "Show detailed processing"
|
174
|
+
option :summarize, type: :boolean, default: false, aliases: "-s", desc: "Generate human-readable topic summaries using LLM"
|
175
|
+
option :llm_model, type: :string, default: "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", desc: "LLM model for summarization"
|
176
|
+
option :gguf_file, type: :string, default: "tinyllama-1.1b-chat-v1.0.q4_k_m.gguf", desc: "GGUF file name for LLM model"
|
104
177
|
def topics
|
105
178
|
require_relative 'topic_modeling'
|
106
179
|
|
107
180
|
say "Extracting topics from indexed documents...", :green
|
108
181
|
|
109
|
-
# Load embeddings and documents from database
|
110
|
-
database =
|
182
|
+
# Load embeddings and documents from database - use cache in interactive mode
|
183
|
+
database = get_cached_database(options[:db_path] || Config.instance.database_path)
|
111
184
|
|
112
185
|
begin
|
113
186
|
# Get all documents with embeddings
|
@@ -127,7 +200,22 @@ module Ragnar
|
|
127
200
|
exit 1
|
128
201
|
end
|
129
202
|
|
130
|
-
|
203
|
+
# Check if we have reduced embeddings available
|
204
|
+
first_doc = docs_with_embeddings.first
|
205
|
+
has_reduced = first_doc[:reduced_embedding] && !first_doc[:reduced_embedding].empty?
|
206
|
+
|
207
|
+
if has_reduced
|
208
|
+
embeddings = docs_with_embeddings.map { |d| d[:reduced_embedding] }
|
209
|
+
say "Using reduced embeddings (#{embeddings.first.size} dimensions)", :yellow if options[:verbose]
|
210
|
+
# Already reduced, so don't reduce again in the engine
|
211
|
+
reduce_dims = false
|
212
|
+
else
|
213
|
+
embeddings = docs_with_embeddings.map { |d| d[:embedding] }
|
214
|
+
say "Using original embeddings (#{embeddings.first.size} dimensions)", :yellow if options[:verbose]
|
215
|
+
# Let the engine handle dimensionality reduction if needed
|
216
|
+
reduce_dims = true
|
217
|
+
end
|
218
|
+
|
131
219
|
documents = docs_with_embeddings.map { |d| d[:chunk_text] }
|
132
220
|
metadata = docs_with_embeddings.map { |d| { file_path: d[:file_path], chunk_index: d[:chunk_index] } }
|
133
221
|
|
@@ -137,7 +225,8 @@ module Ragnar
|
|
137
225
|
engine = Ragnar::TopicModeling::Engine.new(
|
138
226
|
min_cluster_size: options[:min_cluster_size],
|
139
227
|
labeling_method: options[:method].to_sym,
|
140
|
-
verbose: options[:verbose]
|
228
|
+
verbose: options[:verbose],
|
229
|
+
reduce_dimensions: reduce_dims
|
141
230
|
)
|
142
231
|
|
143
232
|
# Extract topics
|
@@ -148,12 +237,36 @@ module Ragnar
|
|
148
237
|
metadata: metadata
|
149
238
|
)
|
150
239
|
|
240
|
+
# Generate summaries if requested
|
241
|
+
if options[:summarize] && topics.any?
|
242
|
+
say "Generating topic summaries with LLM...", :yellow
|
243
|
+
begin
|
244
|
+
require 'red-candle'
|
245
|
+
|
246
|
+
# Initialize LLM for summarization once
|
247
|
+
say "Loading model: #{options[:llm_model]}", :cyan if options[:verbose]
|
248
|
+
llm = Candle::LLM.from_pretrained(options[:llm_model], gguf_file: options[:gguf_file])
|
249
|
+
|
250
|
+
# Add summaries to topics
|
251
|
+
topics.each_with_index do |topic, i|
|
252
|
+
say " Summarizing topic #{i+1}/#{topics.length}...", :yellow if options[:verbose]
|
253
|
+
topic.instance_variable_set(:@summary, summarize_topic(topic, llm))
|
254
|
+
end
|
255
|
+
|
256
|
+
say "Topic summaries generated!", :green
|
257
|
+
rescue => e
|
258
|
+
say "Warning: Could not generate topic summaries: #{e.message}", :yellow
|
259
|
+
say "Proceeding without summaries...", :yellow
|
260
|
+
end
|
261
|
+
end
|
262
|
+
|
151
263
|
# Display results
|
152
|
-
display_topics(topics)
|
264
|
+
display_topics(topics, show_summaries: options[:summarize])
|
153
265
|
|
154
266
|
# Export if requested
|
155
267
|
if options[:export]
|
156
|
-
|
268
|
+
# Pass embeddings and cluster IDs for visualization
|
269
|
+
export_topics(topics, options[:export], embeddings: embeddings, cluster_ids: engine.instance_variable_get(:@cluster_ids))
|
157
270
|
end
|
158
271
|
|
159
272
|
rescue => e
|
@@ -168,51 +281,80 @@ module Ragnar
|
|
168
281
|
option :k, type: :numeric, default: 5, desc: "Number of results to return"
|
169
282
|
option :show_scores, type: :boolean, default: false, desc: "Show similarity scores"
|
170
283
|
def search(query_text)
|
171
|
-
database =
|
172
|
-
embedder =
|
173
|
-
|
284
|
+
database = get_cached_database(options[:database] || Config.instance.database_path)
|
285
|
+
embedder = get_cached_embedder()
|
286
|
+
|
174
287
|
# Generate embedding for query
|
175
288
|
query_embedding = embedder.embed_text(query_text)
|
176
|
-
|
289
|
+
|
177
290
|
# Search for similar documents
|
178
291
|
results = database.search_similar(query_embedding, k: options[:k])
|
179
|
-
|
292
|
+
|
180
293
|
if results.empty?
|
181
294
|
say "No results found.", :yellow
|
182
295
|
return
|
183
296
|
end
|
184
|
-
|
297
|
+
|
185
298
|
say "Found #{results.length} results:\n", :green
|
186
|
-
|
299
|
+
|
187
300
|
results.each_with_index do |result, idx|
|
188
301
|
say "#{idx + 1}. File: #{result[:file_path]}", :cyan
|
189
302
|
say " Chunk: #{result[:chunk_index]}"
|
190
|
-
|
303
|
+
|
191
304
|
if options[:show_scores]
|
192
305
|
say " Distance: #{result[:distance].round(4)}"
|
193
306
|
end
|
194
|
-
|
307
|
+
|
195
308
|
# Show preview of content
|
196
309
|
preview = result[:chunk_text][0..200].gsub(/\s+/, ' ')
|
197
310
|
say " Content: #{preview}..."
|
198
311
|
say ""
|
199
312
|
end
|
200
313
|
end
|
201
|
-
|
314
|
+
|
202
315
|
desc "query QUESTION", "Query the RAG system"
|
203
|
-
option :db_path, type: :string,
|
316
|
+
option :db_path, type: :string, desc: "Path to Lance database (default from config)"
|
204
317
|
option :top_k, type: :numeric, default: 3, desc: "Number of top documents to use"
|
205
318
|
option :verbose, type: :boolean, default: false, aliases: "-v", desc: "Show detailed processing steps"
|
206
319
|
option :json, type: :boolean, default: false, desc: "Output as JSON"
|
207
320
|
def query(question)
|
208
|
-
|
321
|
+
puts "Debug - Query called with: #{question.inspect}" if ENV['DEBUG']
|
322
|
+
puts "Debug - Options: #{options.inspect}" if ENV['DEBUG']
|
323
|
+
|
324
|
+
processor = get_cached_query_processor(options[:db_path] || Config.instance.database_path)
|
325
|
+
puts "Debug - Processor: #{processor.class}" if ENV['DEBUG']
|
209
326
|
|
210
327
|
begin
|
211
|
-
|
328
|
+
config = Config.instance
|
329
|
+
result = processor.query(
|
330
|
+
question,
|
331
|
+
top_k: options[:top_k] || config.query_top_k,
|
332
|
+
verbose: options[:verbose] || false,
|
333
|
+
enable_rewriting: config.enable_query_rewriting?
|
334
|
+
)
|
335
|
+
puts "Debug - Result keys: #{result.keys}" if ENV['DEBUG']
|
212
336
|
|
213
337
|
if options[:json]
|
214
338
|
puts JSON.pretty_generate(result)
|
339
|
+
elsif interactive?
|
340
|
+
# Clean output for interactive mode - just answer, confidence, and sources
|
341
|
+
say "" # Add blank line before answer for spacing
|
342
|
+
say result[:answer]
|
343
|
+
|
344
|
+
if result[:confidence]
|
345
|
+
say "\nConfidence: #{result[:confidence]}%", :magenta
|
346
|
+
end
|
347
|
+
|
348
|
+
if result[:sources] && !result[:sources].empty?
|
349
|
+
say "\nSources:", :blue
|
350
|
+
result[:sources].each_with_index do |source, idx|
|
351
|
+
say " #{idx + 1}. #{source[:source_file]}" if source[:source_file]
|
352
|
+
end
|
353
|
+
end
|
354
|
+
|
355
|
+
say "" # Add blank line for spacing
|
215
356
|
else
|
357
|
+
# Full output for CLI mode
|
216
358
|
say "\n" + "="*60, :green
|
217
359
|
say "Query: #{result[:query]}", :cyan
|
218
360
|
|
@@ -234,7 +376,7 @@ module Ragnar
|
|
234
376
|
end
|
235
377
|
end
|
236
378
|
|
237
|
-
if options[:verbose] && result[:sub_queries]
|
379
|
+
if (options[:verbose] || false) && result[:sub_queries]
|
238
380
|
say "\nSub-queries used:", :yellow
|
239
381
|
result[:sub_queries].each { |sq| say " - #{sq}" }
|
240
382
|
end
|
@@ -243,15 +385,15 @@ module Ragnar
|
|
243
385
|
end
|
244
386
|
rescue => e
|
245
387
|
say "Error processing query: #{e.message}", :red
|
246
|
-
|
388
|
+
puts "Debug - Full backtrace: #{e.backtrace.join("\n")}" if ENV['DEBUG']
|
247
389
|
exit 1
|
248
390
|
end
|
249
391
|
end
|
250
392
|
|
251
393
|
desc "stats", "Show database statistics"
|
252
|
-
option :db_path, type: :string,
|
394
|
+
option :db_path, type: :string, desc: "Path to Lance database (default from config)"
|
253
395
|
def stats
|
254
|
-
db =
|
396
|
+
db = get_cached_database(options[:db_path] || Config.instance.database_path)
|
255
397
|
stats = db.get_stats
|
256
398
|
|
257
399
|
say "\nDatabase Statistics", :green
|
@@ -277,8 +419,325 @@ module Ragnar
|
|
277
419
|
say "Ragnar v#{Ragnar::VERSION}"
|
278
420
|
end
|
279
421
|
|
422
|
+
desc "config", "Show current configuration"
|
423
|
+
def config
|
424
|
+
config = Config.instance
|
425
|
+
|
426
|
+
say "\nConfiguration Settings:", :cyan
|
427
|
+
say "-" * 40
|
428
|
+
|
429
|
+
if config.config_exists?
|
430
|
+
say "Config file: #{config.config_file_path}", :green
|
431
|
+
else
|
432
|
+
say "Config file: None (using defaults)", :yellow
|
433
|
+
end
|
434
|
+
|
435
|
+
say "\nPaths:", :cyan
|
436
|
+
say " Database: #{config.database_path}"
|
437
|
+
say " Models: #{config.models_dir}"
|
438
|
+
say " History: #{config.history_file}"
|
439
|
+
|
440
|
+
say "\nEmbeddings:", :cyan
|
441
|
+
say " Model: #{config.embedding_model}"
|
442
|
+
say " Chunk size: #{config.chunk_size}"
|
443
|
+
say " Chunk overlap: #{config.chunk_overlap}"
|
444
|
+
|
445
|
+
say "\nLLM:", :cyan
|
446
|
+
say " Model: #{config.llm_model}"
|
447
|
+
say " GGUF file: #{config.llm_gguf_file}"
|
448
|
+
|
449
|
+
say "\nUMAP:", :cyan
|
450
|
+
say " Reduced dimensions: #{config.get('umap.reduced_dimensions', Ragnar::DEFAULT_REDUCED_DIMENSIONS)}"
|
451
|
+
say " N neighbors: #{config.get('umap.n_neighbors', 15)}"
|
452
|
+
say " Min distance: #{config.get('umap.min_dist', 0.1)}"
|
453
|
+
|
454
|
+
say "\nQuery:", :cyan
|
455
|
+
say " Top K: #{config.query_top_k}"
|
456
|
+
say " Query rewriting: #{config.enable_query_rewriting?}"
|
457
|
+
end
|
458
|
+
|
459
|
+
desc "model", "Show current LLM model information"
|
460
|
+
def model
|
461
|
+
config = Config.instance
|
462
|
+
|
463
|
+
say "\nLLM Model Configuration:", :cyan
|
464
|
+
say "-" * 40
|
465
|
+
|
466
|
+
say "\nModel:", :green
|
467
|
+
say " Repository: #{config.llm_model}"
|
468
|
+
say " GGUF file: #{config.llm_gguf_file}"
|
469
|
+
|
470
|
+
# Check if model files exist
|
471
|
+
model_path = File.join(config.models_dir, config.llm_gguf_file)
|
472
|
+
if File.exist?(model_path)
|
473
|
+
size_mb = (File.size(model_path) / 1024.0 / 1024.0).round(2)
|
474
|
+
say "\nModel file exists: #{model_path} (#{size_mb} MB)", :green
|
475
|
+
else
|
476
|
+
say "\nModel file not found: #{model_path}", :yellow
|
477
|
+
say "Run 'ragnar query' to download automatically", :yellow
|
478
|
+
end
|
479
|
+
end
|
480
|
+
|
481
|
+
desc "clear-cache", "Clear cached instances (useful in interactive mode)"
|
482
|
+
def clear_cache_command
|
483
|
+
clear_cache
|
484
|
+
say "Cache cleared. Next commands will create fresh instances.", :green
|
485
|
+
end
|
486
|
+
|
487
|
+
desc "reset", "Reset Ragnar data (database, models, cache)"
|
488
|
+
option :all, type: :boolean, default: false, aliases: "-a", desc: "Reset everything (database, models, cache)"
|
489
|
+
option :database, type: :boolean, default: false, aliases: "-d", desc: "Reset database only"
|
490
|
+
option :models, type: :boolean, default: false, aliases: "-m", desc: "Reset UMAP models only"
|
491
|
+
option :cache, type: :boolean, default: false, aliases: "-c", desc: "Clear cache only"
|
492
|
+
option :force, type: :boolean, default: false, aliases: "-f", desc: "Skip confirmation prompt"
|
493
|
+
def reset
|
494
|
+
# Determine what to reset
|
495
|
+
reset_all = options[:all]
|
496
|
+
reset_db = options[:database] || reset_all
|
497
|
+
reset_models = options[:models] || reset_all
|
498
|
+
reset_cache = options[:cache] || reset_all
|
499
|
+
|
500
|
+
# If no specific options, default to all
|
501
|
+
if !reset_db && !reset_models && !reset_cache
|
502
|
+
reset_all = true
|
503
|
+
reset_db = reset_models = reset_cache = true
|
504
|
+
end
|
505
|
+
|
506
|
+
# Build confirmation message
|
507
|
+
items_to_reset = []
|
508
|
+
items_to_reset << "database" if reset_db
|
509
|
+
items_to_reset << "UMAP models" if reset_models
|
510
|
+
items_to_reset << "cache" if reset_cache
|
511
|
+
|
512
|
+
# Get paths that will be affected
|
513
|
+
config = Config.instance
|
514
|
+
db_path = options[:db_path] || config.database_path
|
515
|
+
model_path = File.join(config.models_dir, "umap_model.bin")
|
516
|
+
|
517
|
+
# Show what will be deleted
|
518
|
+
say "\nWARNING: This will delete the following:", :red
|
519
|
+
say "-" * 40
|
520
|
+
|
521
|
+
if reset_db
|
522
|
+
say "Database: #{db_path}", :cyan
|
523
|
+
if File.exist?(db_path)
|
524
|
+
stats = Database.new(db_path).get_stats rescue nil
|
525
|
+
if stats
|
526
|
+
say " (#{stats[:total_documents]} documents, #{stats[:total_chunks]} chunks)", :white
|
527
|
+
end
|
528
|
+
else
|
529
|
+
say " (does not exist)", :white
|
530
|
+
end
|
531
|
+
end
|
532
|
+
|
533
|
+
if reset_models
|
534
|
+
say "UMAP models:", :cyan
|
535
|
+
model_files = [
|
536
|
+
model_path,
|
537
|
+
model_path.sub(/\.bin$/, '_metadata.json'),
|
538
|
+
model_path.sub(/\.bin$/, '_embeddings.json') # Old format, if exists
|
539
|
+
]
|
540
|
+
model_files.each do |file|
|
541
|
+
if File.exist?(file)
|
542
|
+
say " #{file} (#{(File.size(file) / 1024.0).round(1)} KB)", :white
|
543
|
+
end
|
544
|
+
end
|
545
|
+
if model_files.none? { |f| File.exist?(f) }
|
546
|
+
say " (no models found)", :white
|
547
|
+
end
|
548
|
+
end
|
549
|
+
|
550
|
+
if reset_cache
|
551
|
+
cache_dir = File.expand_path("~/.cache/ragnar")
|
552
|
+
say "Cache directory: #{cache_dir}", :cyan
|
553
|
+
if Dir.exist?(cache_dir)
|
554
|
+
cache_size = Dir.glob(File.join(cache_dir, "**/*"))
|
555
|
+
.select { |f| File.file?(f) }
|
556
|
+
.sum { |f| File.size(f) } / 1024.0 / 1024.0
|
557
|
+
say " (#{cache_size.round(1)} MB)", :white
|
558
|
+
else
|
559
|
+
say " (does not exist)", :white
|
560
|
+
end
|
561
|
+
end
|
562
|
+
|
563
|
+
say "-" * 40
|
564
|
+
|
565
|
+
# Ask for confirmation unless --force
|
566
|
+
unless options[:force]
|
567
|
+
message = "\nAre you sure you want to reset #{items_to_reset.join(', ')}?"
|
568
|
+
|
569
|
+
# Check if we're in interactive mode
|
570
|
+
if ENV['THOR_INTERACTIVE_SESSION'] == 'true'
|
571
|
+
# In interactive mode, use a simple prompt
|
572
|
+
say message, :yellow
|
573
|
+
response = ask("Type 'yes' to confirm, anything else to cancel:", :yellow)
|
574
|
+
confirmed = response.downcase == 'yes'
|
575
|
+
else
|
576
|
+
# In CLI mode, use Thor's yes? method
|
577
|
+
confirmed = yes?(message + " (y/N)", :yellow)
|
578
|
+
end
|
579
|
+
|
580
|
+
unless confirmed
|
581
|
+
say "\nReset cancelled.", :cyan
|
582
|
+
return
|
583
|
+
end
|
584
|
+
end
|
585
|
+
|
586
|
+
# Perform the reset
|
587
|
+
say "\nResetting...", :green
|
588
|
+
|
589
|
+
if reset_db && File.exist?(db_path)
|
590
|
+
say "Removing database: #{db_path}"
|
591
|
+
FileUtils.rm_rf(db_path)
|
592
|
+
say " ✓ Database removed", :green
|
593
|
+
end
|
594
|
+
|
595
|
+
if reset_models
|
596
|
+
model_files = [
|
597
|
+
model_path,
|
598
|
+
model_path.sub(/\.bin$/, '_metadata.json'),
|
599
|
+
model_path.sub(/\.bin$/, '_embeddings.json')
|
600
|
+
]
|
601
|
+
model_files.each do |file|
|
602
|
+
if File.exist?(file)
|
603
|
+
say "Removing model file: #{file}"
|
604
|
+
FileUtils.rm_f(file)
|
605
|
+
say " ✓ Removed", :green
|
606
|
+
end
|
607
|
+
end
|
608
|
+
end
|
609
|
+
|
610
|
+
if reset_cache
|
611
|
+
# Clear in-memory cache
|
612
|
+
clear_cache
|
613
|
+
|
614
|
+
# Optionally clear cache directory (but preserve history)
|
615
|
+
cache_dir = File.expand_path("~/.cache/ragnar")
|
616
|
+
if Dir.exist?(cache_dir)
|
617
|
+
# Preserve history file
|
618
|
+
history_file = File.join(cache_dir, "history")
|
619
|
+
history_content = File.read(history_file) if File.exist?(history_file)
|
620
|
+
|
621
|
+
# Remove cache directory contents except history
|
622
|
+
Dir.glob(File.join(cache_dir, "*")).each do |item|
|
623
|
+
next if File.basename(item) == "history"
|
624
|
+
if File.directory?(item)
|
625
|
+
FileUtils.rm_rf(item)
|
626
|
+
else
|
627
|
+
FileUtils.rm_f(item)
|
628
|
+
end
|
629
|
+
say "Removed cache item: #{File.basename(item)}", :green
|
630
|
+
end
|
631
|
+
end
|
632
|
+
say " ✓ Cache cleared", :green
|
633
|
+
end
|
634
|
+
|
635
|
+
say "\nReset complete!", :green
|
636
|
+
say "You can now start fresh with 'ragnar index <path>'", :cyan
|
637
|
+
end
|
638
|
+
|
639
|
+
desc "init-config", "Generate a configuration file with current defaults"
|
640
|
+
option :global, type: :boolean, default: false, aliases: "-g", desc: "Create global config in home directory"
|
641
|
+
option :force, type: :boolean, default: false, aliases: "-f", desc: "Overwrite existing config file"
|
642
|
+
def init_config
|
643
|
+
config = Config.instance
|
644
|
+
|
645
|
+
if options[:global]
|
646
|
+
config_path = File.expand_path('~/.ragnar.yml')
|
647
|
+
else
|
648
|
+
config_path = File.join(Dir.pwd, '.ragnar.yml')
|
649
|
+
end
|
650
|
+
|
651
|
+
if File.exist?(config_path) && !options[:force]
|
652
|
+
say "Config file already exists at: #{config_path}", :yellow
|
653
|
+
say "Use --force to overwrite, or choose a different location.", :yellow
|
654
|
+
return
|
655
|
+
end
|
656
|
+
|
657
|
+
generated_path = config.generate_config_file(config_path)
|
658
|
+
say "Config file created at: #{generated_path}", :green
|
659
|
+
say "Edit this file to customize Ragnar's behavior.", :cyan
|
660
|
+
|
661
|
+
if config.config_exists?
|
662
|
+
say "\nNote: Currently using config from: #{config.config_file_path}", :yellow
|
663
|
+
end
|
664
|
+
end
|
665
|
+
|
280
666
|
private
|
281
667
|
|
668
|
+
# Cached instance helpers for interactive mode
|
669
|
+
def get_cached_database(db_path = nil)
|
670
|
+
# Use config default if no path provided
|
671
|
+
db_path ||= Config.instance.database_path
|
672
|
+
|
673
|
+
# Cache database per path - clear cache if path changes
|
674
|
+
if @@cached_db_path != db_path
|
675
|
+
@@cached_database = nil
|
676
|
+
@@cached_db_path = db_path
|
677
|
+
@@cached_query_processor = nil # Also clear dependent caches
|
678
|
+
end
|
679
|
+
|
680
|
+
@@cached_database ||= Database.new(db_path)
|
681
|
+
end
|
682
|
+
|
683
|
+
def get_cached_embedder(model_name = nil)
|
684
|
+
# Use config default if no model specified
|
685
|
+
model_name ||= Config.instance.embedding_model
|
686
|
+
@@cached_embedder ||= Embedder.new(model_name: model_name)
|
687
|
+
end
|
688
|
+
|
689
|
+
def get_cached_llm_manager
|
690
|
+
@@cached_llm_manager ||= LLMManager.instance
|
691
|
+
end
|
692
|
+
|
693
|
+
def get_cached_query_processor(db_path = nil)
|
694
|
+
# Use config default if no path provided
|
695
|
+
db_path ||= Config.instance.database_path
|
696
|
+
|
697
|
+
# Cache query processor per database path
|
698
|
+
if @@cached_db_path != db_path || @@cached_query_processor.nil?
|
699
|
+
@@cached_query_processor = QueryProcessor.new(db_path: db_path)
|
700
|
+
end
|
701
|
+
|
702
|
+
@@cached_query_processor
|
703
|
+
end
|
704
|
+
|
705
|
+
def clear_cache
|
706
|
+
@@cached_database = nil
|
707
|
+
@@cached_embedder = nil
|
708
|
+
@@cached_llm_manager = nil
|
709
|
+
@@cached_query_processor = nil
|
710
|
+
@@cached_db_path = nil
|
711
|
+
end
|
712
|
+
|
713
|
+
|
714
|
+
def summarize_topic(topic, llm)
|
715
|
+
# Get representative documents for context
|
716
|
+
sample_docs = topic.representative_docs(k: 3)
|
717
|
+
|
718
|
+
# Simple, clear prompt for summarization
|
719
|
+
prompt = <<~PROMPT
|
720
|
+
Summarize what connects these documents in 1-2 sentences:
|
721
|
+
|
722
|
+
Key terms: #{topic.terms.first(5).join(', ')}
|
723
|
+
|
724
|
+
Documents:
|
725
|
+
#{sample_docs.map.with_index { |doc, i| "#{i+1}. #{doc}" }.join("\n")}
|
726
|
+
|
727
|
+
Summary:
|
728
|
+
PROMPT
|
729
|
+
|
730
|
+
begin
|
731
|
+
summary = llm.generate(prompt).strip
|
732
|
+
# Clean up common artifacts
|
733
|
+
summary = summary.lines.first&.strip || "Related documents"
|
734
|
+
summary = summary.gsub(/^(Summary:|Topic:|Documents:)/i, '').strip
|
735
|
+
summary.empty? ? "Documents about #{topic.terms.first(2).join(' and ')}" : summary
|
736
|
+
rescue => e
|
737
|
+
"Documents about #{topic.terms.first(2).join(' and ')}"
|
738
|
+
end
|
739
|
+
end
|
740
|
+
|
282
741
|
def fetch_all_documents(database)
|
283
742
|
# Temporary workaround to get all documents
|
284
743
|
# In production, we'd add a proper method to Database class
|
@@ -305,9 +764,12 @@ module Ragnar
|
|
305
764
|
[]
|
306
765
|
end
|
307
766
|
|
308
|
-
def display_topics(topics)
|
767
|
+
def display_topics(topics, show_summaries: false)
|
309
768
|
say "\n" + "="*60, :green
|
310
769
|
say "Topic Analysis Results", :cyan
|
770
|
+
if show_summaries
|
771
|
+
say " (with LLM-generated summaries)", :yellow
|
772
|
+
end
|
311
773
|
say "="*60, :green
|
312
774
|
|
313
775
|
if topics.empty?
|
@@ -326,21 +788,21 @@ module Ragnar
|
|
326
788
|
say "\n" + "─" * 40, :blue
|
327
789
|
say "MAJOR TOPICS (≥20 docs)", :blue
|
328
790
|
say "─" * 40, :blue
|
329
|
-
display_topic_group(large_topics, :cyan)
|
791
|
+
display_topic_group(large_topics, :cyan, show_summaries: show_summaries)
|
330
792
|
end
|
331
793
|
|
332
794
|
if medium_topics.any?
|
333
795
|
say "\n" + "─" * 40, :yellow
|
334
796
|
say "MEDIUM TOPICS (10-19 docs)", :yellow
|
335
797
|
say "─" * 40, :yellow
|
336
|
-
display_topic_group(medium_topics, :yellow)
|
798
|
+
display_topic_group(medium_topics, :yellow, show_summaries: show_summaries)
|
337
799
|
end
|
338
800
|
|
339
801
|
if small_topics.any?
|
340
802
|
say "\n" + "─" * 40, :white
|
341
803
|
say "MINOR TOPICS (<10 docs)", :white
|
342
804
|
say "─" * 40, :white
|
343
|
-
display_topic_group(small_topics, :white)
|
805
|
+
display_topic_group(small_topics, :white, show_summaries: show_summaries)
|
344
806
|
end
|
345
807
|
|
346
808
|
# Summary statistics
|
@@ -364,10 +826,18 @@ module Ragnar
|
|
364
826
|
say " Small (<10): #{small_topics.length} topics, #{small_topics.sum(&:size)} docs"
|
365
827
|
end
|
366
828
|
|
367
|
-
def display_topic_group(topics, color)
|
829
|
+
def display_topic_group(topics, color, show_summaries: false)
|
368
830
|
topics.sort_by { |t| -t.size }.each_with_index do |topic, idx|
|
369
831
|
say "\n#{topic.label || 'Unlabeled'} (#{topic.size} docs)", color
|
370
832
|
|
833
|
+
# Show LLM summary if available
|
834
|
+
if show_summaries
|
835
|
+
summary = topic.instance_variable_get(:@summary)
|
836
|
+
if summary
|
837
|
+
say " Summary: #{summary}", :green
|
838
|
+
end
|
839
|
+
end
|
840
|
+
|
371
841
|
# Show coherence as a bar
|
372
842
|
if topic.coherence > 0
|
373
843
|
coherence_pct = (topic.coherence * 100).round(0)
|
@@ -379,8 +849,8 @@ module Ragnar
|
|
379
849
|
# Compact term display
|
380
850
|
say " Terms: #{topic.terms.first(6).join(' • ')}" if topic.terms.any?
|
381
851
|
|
382
|
-
# Short sample
|
383
|
-
if topic.representative_docs(k: 1).any?
|
852
|
+
# Short sample (unless we showed a summary)
|
853
|
+
if !show_summaries && topic.representative_docs(k: 1).any?
|
384
854
|
preview = topic.representative_docs(k: 1).first
|
385
855
|
preview = preview[0..100] + "..." if preview.length > 100
|
386
856
|
say " \"#{preview}\"", :white
|
@@ -388,25 +858,34 @@ module Ragnar
|
|
388
858
|
end
|
389
859
|
end
|
390
860
|
|
391
|
-
def export_topics(topics, format)
|
861
|
+
def export_topics(topics, format, embeddings: nil, cluster_ids: nil)
|
392
862
|
case format.downcase
|
393
863
|
when 'json'
|
394
864
|
export_topics_json(topics)
|
395
865
|
when 'html'
|
396
|
-
export_topics_html(topics)
|
866
|
+
export_topics_html(topics, embeddings: embeddings, cluster_ids: cluster_ids)
|
397
867
|
else
|
398
868
|
say "Unknown export format: #{format}. Use 'json' or 'html'.", :red
|
399
869
|
end
|
400
870
|
end
|
401
871
|
|
402
872
|
def export_topics_json(topics)
|
873
|
+
topics_data = topics.map do |topic|
|
874
|
+
topic_hash = topic.to_h
|
875
|
+
# Add summary if it exists
|
876
|
+
summary = topic.instance_variable_get(:@summary)
|
877
|
+
topic_hash[:summary] = summary if summary
|
878
|
+
topic_hash
|
879
|
+
end
|
880
|
+
|
403
881
|
data = {
|
404
882
|
generated_at: Time.now.iso8601,
|
405
|
-
topics:
|
883
|
+
topics: topics_data,
|
406
884
|
summary: {
|
407
885
|
total_topics: topics.length,
|
408
886
|
total_documents: topics.sum(&:size),
|
409
|
-
average_size: (topics.sum(&:size).to_f / topics.length).round(1)
|
887
|
+
average_size: (topics.sum(&:size).to_f / topics.length).round(1),
|
888
|
+
has_summaries: topics.any? { |t| t.instance_variable_get(:@summary) }
|
410
889
|
}
|
411
890
|
}
|
412
891
|
|
@@ -415,9 +894,9 @@ module Ragnar
|
|
415
894
|
say "Topics exported to: #{filename}", :green
|
416
895
|
end
|
417
896
|
|
418
|
-
def export_topics_html(topics)
|
897
|
+
def export_topics_html(topics, embeddings: nil, cluster_ids: nil)
|
419
898
|
# Generate self-contained HTML with D3.js visualization
|
420
|
-
html = generate_topic_visualization_html(topics)
|
899
|
+
html = generate_topic_visualization_html(topics, embeddings: embeddings, cluster_ids: cluster_ids)
|
421
900
|
|
422
901
|
filename = "topics_#{Time.now.strftime('%Y%m%d_%H%M%S')}.html"
|
423
902
|
File.write(filename, html)
|
@@ -430,113 +909,5 @@ module Ragnar
|
|
430
909
|
end
|
431
910
|
end
|
432
911
|
|
433
|
-
def generate_topic_visualization_html(topics)
|
434
|
-
# Convert topics to JSON for D3.js
|
435
|
-
topics_json = topics.map do |topic|
|
436
|
-
{
|
437
|
-
id: topic.id,
|
438
|
-
label: topic.label || "Topic #{topic.id}",
|
439
|
-
size: topic.size,
|
440
|
-
terms: topic.terms.first(10),
|
441
|
-
coherence: topic.coherence,
|
442
|
-
samples: topic.representative_docs(k: 2).map { |d| d[0..200] }
|
443
|
-
}
|
444
|
-
end.to_json
|
445
|
-
|
446
|
-
# HTML template with embedded D3.js
|
447
|
-
<<~HTML
|
448
|
-
<!DOCTYPE html>
|
449
|
-
<html>
|
450
|
-
<head>
|
451
|
-
<meta charset="utf-8">
|
452
|
-
<title>Topic Visualization</title>
|
453
|
-
<script src="https://d3js.org/d3.v7.min.js"></script>
|
454
|
-
<style>
|
455
|
-
body { font-family: -apple-system, sans-serif; margin: 20px; }
|
456
|
-
#viz { width: 100%; height: 500px; border: 1px solid #ddd; }
|
457
|
-
.topic { cursor: pointer; }
|
458
|
-
.topic:hover { opacity: 0.8; }
|
459
|
-
#details { margin-top: 20px; padding: 15px; background: #f5f5f5; }
|
460
|
-
.term { display: inline-block; margin: 5px; padding: 5px 10px; background: #e0e0e0; border-radius: 3px; }
|
461
|
-
</style>
|
462
|
-
</head>
|
463
|
-
<body>
|
464
|
-
<h1>Topic Analysis Results</h1>
|
465
|
-
<div id="viz"></div>
|
466
|
-
<div id="details">Click on a topic to see details</div>
|
467
|
-
|
468
|
-
<script>
|
469
|
-
const data = #{topics_json};
|
470
|
-
|
471
|
-
// Create bubble chart
|
472
|
-
const width = document.getElementById('viz').clientWidth;
|
473
|
-
const height = 500;
|
474
|
-
|
475
|
-
const svg = d3.select("#viz")
|
476
|
-
.append("svg")
|
477
|
-
.attr("width", width)
|
478
|
-
.attr("height", height);
|
479
|
-
|
480
|
-
// Create scale for bubble sizes
|
481
|
-
const sizeScale = d3.scaleSqrt()
|
482
|
-
.domain([0, d3.max(data, d => d.size)])
|
483
|
-
.range([10, 50]);
|
484
|
-
|
485
|
-
// Create color scale
|
486
|
-
const colorScale = d3.scaleSequential(d3.interpolateViridis)
|
487
|
-
.domain([0, 1]);
|
488
|
-
|
489
|
-
// Create force simulation
|
490
|
-
const simulation = d3.forceSimulation(data)
|
491
|
-
.force("x", d3.forceX(width / 2).strength(0.05))
|
492
|
-
.force("y", d3.forceY(height / 2).strength(0.05))
|
493
|
-
.force("collide", d3.forceCollide(d => sizeScale(d.size) + 2));
|
494
|
-
|
495
|
-
// Create bubbles
|
496
|
-
const bubbles = svg.selectAll(".topic")
|
497
|
-
.data(data)
|
498
|
-
.enter().append("g")
|
499
|
-
.attr("class", "topic");
|
500
|
-
|
501
|
-
bubbles.append("circle")
|
502
|
-
.attr("r", d => sizeScale(d.size))
|
503
|
-
.attr("fill", d => colorScale(d.coherence))
|
504
|
-
.attr("stroke", "#fff")
|
505
|
-
.attr("stroke-width", 2);
|
506
|
-
|
507
|
-
bubbles.append("text")
|
508
|
-
.text(d => d.label)
|
509
|
-
.attr("text-anchor", "middle")
|
510
|
-
.attr("dy", ".3em")
|
511
|
-
.style("font-size", d => Math.min(sizeScale(d.size) / 3, 14) + "px");
|
512
|
-
|
513
|
-
// Add click handler
|
514
|
-
bubbles.on("click", function(event, d) {
|
515
|
-
showDetails(d);
|
516
|
-
});
|
517
|
-
|
518
|
-
// Update positions
|
519
|
-
simulation.on("tick", () => {
|
520
|
-
bubbles.attr("transform", d => `translate(${d.x},${d.y})`);
|
521
|
-
});
|
522
|
-
|
523
|
-
// Show topic details
|
524
|
-
function showDetails(topic) {
|
525
|
-
const details = document.getElementById('details');
|
526
|
-
details.innerHTML = `
|
527
|
-
<h2>${topic.label}</h2>
|
528
|
-
<p><strong>Documents:</strong> ${topic.size}</p>
|
529
|
-
<p><strong>Coherence:</strong> ${(topic.coherence * 100).toFixed(1)}%</p>
|
530
|
-
<p><strong>Top Terms:</strong></p>
|
531
|
-
<div>${topic.terms.map(t => `<span class="term">${t}</span>`).join('')}</div>
|
532
|
-
<p><strong>Sample Documents:</strong></p>
|
533
|
-
${topic.samples.map(s => `<p style="font-size: 0.9em; color: #666;">"${s}..."</p>`).join('')}
|
534
|
-
`;
|
535
|
-
}
|
536
|
-
</script>
|
537
|
-
</body>
|
538
|
-
</html>
|
539
|
-
HTML
|
540
|
-
end
|
541
912
|
end
|
542
|
-
end
|
913
|
+
end
|