ragnar-cli 0.1.0.pre.3 → 0.1.0.pre.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +187 -36
- data/lib/ragnar/cli.rb +527 -172
- data/lib/ragnar/cli_visualization.rb +184 -0
- data/lib/ragnar/config.rb +226 -0
- data/lib/ragnar/database.rb +94 -8
- data/lib/ragnar/llm_manager.rb +4 -1
- data/lib/ragnar/query_processor.rb +38 -20
- data/lib/ragnar/topic_modeling.rb +13 -10
- data/lib/ragnar/umap_processor.rb +77 -65
- data/lib/ragnar/umap_transform_service.rb +169 -88
- data/lib/ragnar/version.rb +1 -1
- metadata +43 -22
- data/lib/ragnar/topic_modeling/engine.rb +0 -301
- data/lib/ragnar/topic_modeling/labeling_strategies.rb +0 -300
- data/lib/ragnar/topic_modeling/llm_adapter.rb +0 -131
- data/lib/ragnar/topic_modeling/metrics.rb +0 -186
- data/lib/ragnar/topic_modeling/term_extractor.rb +0 -170
- data/lib/ragnar/topic_modeling/topic.rb +0 -117
- data/lib/ragnar/topic_modeling/topic_labeler.rb +0 -61
data/lib/ragnar/cli.rb
CHANGED
@@ -1,27 +1,82 @@
|
|
1
|
+
require_relative "cli_visualization"
|
2
|
+
require_relative "config"
|
3
|
+
require "thor/interactive"
|
4
|
+
require "stringio"
|
5
|
+
require "fileutils"
|
6
|
+
|
1
7
|
module Ragnar
|
2
8
|
class CLI < Thor
|
9
|
+
include CLIVisualization
|
10
|
+
include Thor::Interactive::Command
|
11
|
+
|
12
|
+
# Configure interactive mode
|
13
|
+
configure_interactive(
|
14
|
+
prompt: Config.instance.interactive_prompt,
|
15
|
+
allow_nested: false,
|
16
|
+
history_file: Config.instance.history_file,
|
17
|
+
default_handler: proc do |input, thor_instance|
|
18
|
+
puts "[DEBUG] Default handler called: #{input}" if ENV["DEBUG"]
|
19
|
+
|
20
|
+
begin
|
21
|
+
# IMPORTANT: Use direct method call, NOT invoke(), to avoid Thor's
|
22
|
+
# silent deduplication that prevents repeated calls to the same method
|
23
|
+
result = thor_instance.query(input.strip)
|
24
|
+
puts "[DEBUG] Default handler completed" if ENV["DEBUG"]
|
25
|
+
result
|
26
|
+
rescue => e
|
27
|
+
puts "[DEBUG] Default handler error: #{e.message}" if ENV["DEBUG"]
|
28
|
+
puts "[DEBUG] Backtrace: #{e.backtrace.first(3)}" if ENV["DEBUG"]
|
29
|
+
raise e
|
30
|
+
end
|
31
|
+
end
|
32
|
+
)
|
33
|
+
|
34
|
+
# Class variables for caching expensive resources in interactive mode
|
35
|
+
class_variable_set(:@@cached_database, nil)
|
36
|
+
class_variable_set(:@@cached_embedder, nil)
|
37
|
+
class_variable_set(:@@cached_llm_manager, nil)
|
38
|
+
class_variable_set(:@@cached_query_processor, nil)
|
39
|
+
class_variable_set(:@@cached_db_path, nil)
|
40
|
+
|
3
41
|
desc "index PATH", "Index text files from PATH (file or directory)"
|
4
|
-
option :db_path, type: :string,
|
5
|
-
option :chunk_size, type: :numeric,
|
6
|
-
option :chunk_overlap, type: :numeric,
|
7
|
-
option :model, type: :string,
|
42
|
+
option :db_path, type: :string, desc: "Path to Lance database (default from config)"
|
43
|
+
option :chunk_size, type: :numeric, desc: "Chunk size in tokens (default from config)"
|
44
|
+
option :chunk_overlap, type: :numeric, desc: "Chunk overlap in tokens (default from config)"
|
45
|
+
option :model, type: :string, desc: "Embedding model to use (default from config)"
|
8
46
|
def index(path)
|
9
|
-
|
10
|
-
|
47
|
+
# Expand user paths (handle ~ in user input)
|
48
|
+
expanded_path = File.expand_path(path)
|
49
|
+
|
50
|
+
unless File.exist?(expanded_path)
|
51
|
+
say "Error: Path does not exist: #{expanded_path}", :red
|
11
52
|
exit 1
|
12
53
|
end
|
13
54
|
|
14
55
|
say "Indexing files from: #{path}", :green
|
15
56
|
|
57
|
+
# Debug options in interactive mode
|
58
|
+
puts "Debug - options: #{options.inspect}" if ENV['DEBUG']
|
59
|
+
|
60
|
+
# Get config instance
|
61
|
+
config = Config.instance
|
62
|
+
|
63
|
+
# Clear database cache when indexing new content
|
64
|
+
db_path = options[:db_path] || config.database_path
|
65
|
+
if @@cached_db_path == db_path
|
66
|
+
@@cached_database = nil
|
67
|
+
@@cached_query_processor = nil
|
68
|
+
end
|
69
|
+
|
16
70
|
indexer = Indexer.new(
|
17
|
-
db_path:
|
18
|
-
chunk_size: options[:chunk_size],
|
19
|
-
chunk_overlap: options[:chunk_overlap],
|
20
|
-
embedding_model: options[:model]
|
71
|
+
db_path: db_path,
|
72
|
+
chunk_size: options[:chunk_size] || config.chunk_size,
|
73
|
+
chunk_overlap: options[:chunk_overlap] || config.chunk_overlap,
|
74
|
+
embedding_model: options[:model] || config.embedding_model,
|
75
|
+
show_progress: config.show_progress?
|
21
76
|
)
|
22
77
|
|
23
78
|
begin
|
24
|
-
stats = indexer.index_path(
|
79
|
+
stats = indexer.index_path(expanded_path)
|
25
80
|
say "\nIndexing complete!", :green
|
26
81
|
say "Files processed: #{stats[:files_processed]}"
|
27
82
|
say "Chunks created: #{stats[:chunks_created]}"
|
@@ -33,31 +88,39 @@ module Ragnar
|
|
33
88
|
end
|
34
89
|
|
35
90
|
desc "train-umap", "Train UMAP model on existing embeddings"
|
36
|
-
option :db_path, type: :string,
|
91
|
+
option :db_path, type: :string, desc: "Path to Lance database (default from config)"
|
37
92
|
option :n_components, type: :numeric, default: 50, desc: "Number of dimensions for reduction"
|
38
93
|
option :n_neighbors, type: :numeric, default: 15, desc: "Number of neighbors for UMAP"
|
39
94
|
option :min_dist, type: :numeric, default: 0.1, desc: "Minimum distance for UMAP"
|
40
|
-
option :model_path, type: :string,
|
95
|
+
option :model_path, type: :string, desc: "Path to save UMAP model"
|
41
96
|
def train_umap
|
42
97
|
say "Training UMAP model on embeddings...", :green
|
43
98
|
|
99
|
+
config = Config.instance
|
100
|
+
# Use model_path from options if provided, otherwise use config models_dir
|
101
|
+
model_path = if options[:model_path]
|
102
|
+
options[:model_path]
|
103
|
+
else
|
104
|
+
File.join(config.models_dir, "umap_model.bin")
|
105
|
+
end
|
106
|
+
|
44
107
|
processor = UmapProcessor.new(
|
45
|
-
db_path: options[:db_path],
|
46
|
-
model_path:
|
108
|
+
db_path: options[:db_path] || config.database_path,
|
109
|
+
model_path: model_path
|
47
110
|
)
|
48
111
|
|
49
112
|
begin
|
50
113
|
stats = processor.train(
|
51
|
-
n_components: options[:n_components],
|
52
|
-
n_neighbors: options[:n_neighbors],
|
53
|
-
min_dist: options[:min_dist]
|
114
|
+
n_components: options[:n_components] || 50,
|
115
|
+
n_neighbors: options[:n_neighbors] || 15,
|
116
|
+
min_dist: options[:min_dist] || 0.1
|
54
117
|
)
|
55
118
|
|
56
119
|
say "\nUMAP training complete!", :green
|
57
120
|
say "Embeddings processed: #{stats[:embeddings_count]}"
|
58
121
|
say "Original dimensions: #{stats[:original_dims]}"
|
59
122
|
say "Reduced dimensions: #{stats[:reduced_dims]}"
|
60
|
-
say "Model saved to: #{
|
123
|
+
say "Model saved to: #{processor.model_path}"
|
61
124
|
rescue => e
|
62
125
|
say "Error during UMAP training: #{e.message}", :red
|
63
126
|
exit 1
|
@@ -65,12 +128,19 @@ module Ragnar
|
|
65
128
|
end
|
66
129
|
|
67
130
|
desc "apply-umap", "Apply trained UMAP model to reduce embedding dimensions"
|
68
|
-
option :db_path, type: :string,
|
69
|
-
option :model_path, type: :string,
|
131
|
+
option :db_path, type: :string, desc: "Path to Lance database (default from config)"
|
132
|
+
option :model_path, type: :string, desc: "Path to UMAP model"
|
70
133
|
option :batch_size, type: :numeric, default: 100, desc: "Batch size for processing"
|
71
134
|
def apply_umap
|
72
|
-
|
73
|
-
|
135
|
+
config = Config.instance
|
136
|
+
model_path = if options[:model_path]
|
137
|
+
options[:model_path]
|
138
|
+
else
|
139
|
+
File.join(config.models_dir, "umap_model.bin")
|
140
|
+
end
|
141
|
+
|
142
|
+
unless File.exist?(model_path)
|
143
|
+
say "Error: UMAP model not found at: #{model_path}", :red
|
74
144
|
say "Please run 'train-umap' first to create a model.", :yellow
|
75
145
|
exit 1
|
76
146
|
end
|
@@ -78,12 +148,12 @@ module Ragnar
|
|
78
148
|
say "Applying UMAP model to embeddings...", :green
|
79
149
|
|
80
150
|
processor = UmapProcessor.new(
|
81
|
-
db_path: options[:db_path],
|
82
|
-
model_path:
|
151
|
+
db_path: options[:db_path] || config.database_path,
|
152
|
+
model_path: model_path
|
83
153
|
)
|
84
154
|
|
85
155
|
begin
|
86
|
-
stats = processor.apply(batch_size: options[:batch_size])
|
156
|
+
stats = processor.apply(batch_size: options[:batch_size] || 100)
|
87
157
|
|
88
158
|
say "\nUMAP application complete!", :green
|
89
159
|
say "Embeddings processed: #{stats[:processed]}"
|
@@ -96,18 +166,21 @@ module Ragnar
|
|
96
166
|
end
|
97
167
|
|
98
168
|
desc "topics", "Extract and display topics from indexed documents"
|
99
|
-
option :db_path, type: :string,
|
169
|
+
option :db_path, type: :string, desc: "Path to Lance database (default from config)"
|
100
170
|
option :min_cluster_size, type: :numeric, default: 5, desc: "Minimum documents per topic"
|
101
171
|
option :method, type: :string, default: "hybrid", desc: "Labeling method: fast, quality, or hybrid"
|
102
172
|
option :export, type: :string, desc: "Export topics to file (json or html)"
|
103
173
|
option :verbose, type: :boolean, default: false, aliases: "-v", desc: "Show detailed processing"
|
174
|
+
option :summarize, type: :boolean, default: false, aliases: "-s", desc: "Generate human-readable topic summaries using LLM"
|
175
|
+
option :llm_model, type: :string, default: "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF", desc: "LLM model for summarization"
|
176
|
+
option :gguf_file, type: :string, default: "tinyllama-1.1b-chat-v1.0.q4_k_m.gguf", desc: "GGUF file name for LLM model"
|
104
177
|
def topics
|
105
178
|
require_relative 'topic_modeling'
|
106
179
|
|
107
180
|
say "Extracting topics from indexed documents...", :green
|
108
181
|
|
109
|
-
# Load embeddings and documents from database
|
110
|
-
database =
|
182
|
+
# Load embeddings and documents from database - use cache in interactive mode
|
183
|
+
database = get_cached_database(options[:db_path] || Config.instance.database_path)
|
111
184
|
|
112
185
|
begin
|
113
186
|
# Get all documents with embeddings
|
@@ -130,7 +203,7 @@ module Ragnar
|
|
130
203
|
# Check if we have reduced embeddings available
|
131
204
|
first_doc = docs_with_embeddings.first
|
132
205
|
has_reduced = first_doc[:reduced_embedding] && !first_doc[:reduced_embedding].empty?
|
133
|
-
|
206
|
+
|
134
207
|
if has_reduced
|
135
208
|
embeddings = docs_with_embeddings.map { |d| d[:reduced_embedding] }
|
136
209
|
say "Using reduced embeddings (#{embeddings.first.size} dimensions)", :yellow if options[:verbose]
|
@@ -142,7 +215,7 @@ module Ragnar
|
|
142
215
|
# Let the engine handle dimensionality reduction if needed
|
143
216
|
reduce_dims = true
|
144
217
|
end
|
145
|
-
|
218
|
+
|
146
219
|
documents = docs_with_embeddings.map { |d| d[:chunk_text] }
|
147
220
|
metadata = docs_with_embeddings.map { |d| { file_path: d[:file_path], chunk_index: d[:chunk_index] } }
|
148
221
|
|
@@ -164,12 +237,36 @@ module Ragnar
|
|
164
237
|
metadata: metadata
|
165
238
|
)
|
166
239
|
|
240
|
+
# Generate summaries if requested
|
241
|
+
if options[:summarize] && topics.any?
|
242
|
+
say "Generating topic summaries with LLM...", :yellow
|
243
|
+
begin
|
244
|
+
require 'red-candle'
|
245
|
+
|
246
|
+
# Initialize LLM for summarization once
|
247
|
+
say "Loading model: #{options[:llm_model]}", :cyan if options[:verbose]
|
248
|
+
llm = Candle::LLM.from_pretrained(options[:llm_model], gguf_file: options[:gguf_file])
|
249
|
+
|
250
|
+
# Add summaries to topics
|
251
|
+
topics.each_with_index do |topic, i|
|
252
|
+
say " Summarizing topic #{i+1}/#{topics.length}...", :yellow if options[:verbose]
|
253
|
+
topic.instance_variable_set(:@summary, summarize_topic(topic, llm))
|
254
|
+
end
|
255
|
+
|
256
|
+
say "Topic summaries generated!", :green
|
257
|
+
rescue => e
|
258
|
+
say "Warning: Could not generate topic summaries: #{e.message}", :yellow
|
259
|
+
say "Proceeding without summaries...", :yellow
|
260
|
+
end
|
261
|
+
end
|
262
|
+
|
167
263
|
# Display results
|
168
|
-
display_topics(topics)
|
264
|
+
display_topics(topics, show_summaries: options[:summarize])
|
169
265
|
|
170
266
|
# Export if requested
|
171
267
|
if options[:export]
|
172
|
-
|
268
|
+
# Pass embeddings and cluster IDs for visualization
|
269
|
+
export_topics(topics, options[:export], embeddings: embeddings, cluster_ids: engine.instance_variable_get(:@cluster_ids))
|
173
270
|
end
|
174
271
|
|
175
272
|
rescue => e
|
@@ -184,51 +281,80 @@ module Ragnar
|
|
184
281
|
option :k, type: :numeric, default: 5, desc: "Number of results to return"
|
185
282
|
option :show_scores, type: :boolean, default: false, desc: "Show similarity scores"
|
186
283
|
def search(query_text)
|
187
|
-
database =
|
188
|
-
embedder =
|
189
|
-
|
284
|
+
database = get_cached_database(options[:database] || Config.instance.database_path)
|
285
|
+
embedder = get_cached_embedder()
|
286
|
+
|
190
287
|
# Generate embedding for query
|
191
288
|
query_embedding = embedder.embed_text(query_text)
|
192
|
-
|
289
|
+
|
193
290
|
# Search for similar documents
|
194
291
|
results = database.search_similar(query_embedding, k: options[:k])
|
195
|
-
|
292
|
+
|
196
293
|
if results.empty?
|
197
294
|
say "No results found.", :yellow
|
198
295
|
return
|
199
296
|
end
|
200
|
-
|
297
|
+
|
201
298
|
say "Found #{results.length} results:\n", :green
|
202
|
-
|
299
|
+
|
203
300
|
results.each_with_index do |result, idx|
|
204
301
|
say "#{idx + 1}. File: #{result[:file_path]}", :cyan
|
205
302
|
say " Chunk: #{result[:chunk_index]}"
|
206
|
-
|
303
|
+
|
207
304
|
if options[:show_scores]
|
208
305
|
say " Distance: #{result[:distance].round(4)}"
|
209
306
|
end
|
210
|
-
|
307
|
+
|
211
308
|
# Show preview of content
|
212
309
|
preview = result[:chunk_text][0..200].gsub(/\s+/, ' ')
|
213
310
|
say " Content: #{preview}..."
|
214
311
|
say ""
|
215
312
|
end
|
216
313
|
end
|
217
|
-
|
314
|
+
|
218
315
|
desc "query QUESTION", "Query the RAG system"
|
219
|
-
option :db_path, type: :string,
|
316
|
+
option :db_path, type: :string, desc: "Path to Lance database (default from config)"
|
220
317
|
option :top_k, type: :numeric, default: 3, desc: "Number of top documents to use"
|
221
318
|
option :verbose, type: :boolean, default: false, aliases: "-v", desc: "Show detailed processing steps"
|
222
319
|
option :json, type: :boolean, default: false, desc: "Output as JSON"
|
223
320
|
def query(question)
|
224
|
-
|
321
|
+
puts "Debug - Query called with: #{question.inspect}" if ENV['DEBUG']
|
322
|
+
puts "Debug - Options: #{options.inspect}" if ENV['DEBUG']
|
323
|
+
|
324
|
+
processor = get_cached_query_processor(options[:db_path] || Config.instance.database_path)
|
325
|
+
puts "Debug - Processor: #{processor.class}" if ENV['DEBUG']
|
225
326
|
|
226
327
|
begin
|
227
|
-
|
328
|
+
config = Config.instance
|
329
|
+
result = processor.query(
|
330
|
+
question,
|
331
|
+
top_k: options[:top_k] || config.query_top_k,
|
332
|
+
verbose: options[:verbose] || false,
|
333
|
+
enable_rewriting: config.enable_query_rewriting?
|
334
|
+
)
|
335
|
+
puts "Debug - Result keys: #{result.keys}" if ENV['DEBUG']
|
228
336
|
|
229
337
|
if options[:json]
|
230
338
|
puts JSON.pretty_generate(result)
|
339
|
+
elsif interactive?
|
340
|
+
# Clean output for interactive mode - just answer, confidence, and sources
|
341
|
+
say "" # Add blank line before answer for spacing
|
342
|
+
say result[:answer]
|
343
|
+
|
344
|
+
if result[:confidence]
|
345
|
+
say "\nConfidence: #{result[:confidence]}%", :magenta
|
346
|
+
end
|
347
|
+
|
348
|
+
if result[:sources] && !result[:sources].empty?
|
349
|
+
say "\nSources:", :blue
|
350
|
+
result[:sources].each_with_index do |source, idx|
|
351
|
+
say " #{idx + 1}. #{source[:source_file]}" if source[:source_file]
|
352
|
+
end
|
353
|
+
end
|
354
|
+
|
355
|
+
say "" # Add blank line for spacing
|
231
356
|
else
|
357
|
+
# Full output for CLI mode
|
232
358
|
say "\n" + "="*60, :green
|
233
359
|
say "Query: #{result[:query]}", :cyan
|
234
360
|
|
@@ -250,7 +376,7 @@ module Ragnar
|
|
250
376
|
end
|
251
377
|
end
|
252
378
|
|
253
|
-
if options[:verbose] && result[:sub_queries]
|
379
|
+
if (options[:verbose] || false) && result[:sub_queries]
|
254
380
|
say "\nSub-queries used:", :yellow
|
255
381
|
result[:sub_queries].each { |sq| say " - #{sq}" }
|
256
382
|
end
|
@@ -259,15 +385,15 @@ module Ragnar
|
|
259
385
|
end
|
260
386
|
rescue => e
|
261
387
|
say "Error processing query: #{e.message}", :red
|
262
|
-
|
388
|
+
puts "Debug - Full backtrace: #{e.backtrace.join("\n")}" if ENV['DEBUG']
|
263
389
|
exit 1
|
264
390
|
end
|
265
391
|
end
|
266
392
|
|
267
393
|
desc "stats", "Show database statistics"
|
268
|
-
option :db_path, type: :string,
|
394
|
+
option :db_path, type: :string, desc: "Path to Lance database (default from config)"
|
269
395
|
def stats
|
270
|
-
db =
|
396
|
+
db = get_cached_database(options[:db_path] || Config.instance.database_path)
|
271
397
|
stats = db.get_stats
|
272
398
|
|
273
399
|
say "\nDatabase Statistics", :green
|
@@ -293,8 +419,325 @@ module Ragnar
|
|
293
419
|
say "Ragnar v#{Ragnar::VERSION}"
|
294
420
|
end
|
295
421
|
|
422
|
+
desc "config", "Show current configuration"
|
423
|
+
def config
|
424
|
+
config = Config.instance
|
425
|
+
|
426
|
+
say "\nConfiguration Settings:", :cyan
|
427
|
+
say "-" * 40
|
428
|
+
|
429
|
+
if config.config_exists?
|
430
|
+
say "Config file: #{config.config_file_path}", :green
|
431
|
+
else
|
432
|
+
say "Config file: None (using defaults)", :yellow
|
433
|
+
end
|
434
|
+
|
435
|
+
say "\nPaths:", :cyan
|
436
|
+
say " Database: #{config.database_path}"
|
437
|
+
say " Models: #{config.models_dir}"
|
438
|
+
say " History: #{config.history_file}"
|
439
|
+
|
440
|
+
say "\nEmbeddings:", :cyan
|
441
|
+
say " Model: #{config.embedding_model}"
|
442
|
+
say " Chunk size: #{config.chunk_size}"
|
443
|
+
say " Chunk overlap: #{config.chunk_overlap}"
|
444
|
+
|
445
|
+
say "\nLLM:", :cyan
|
446
|
+
say " Model: #{config.llm_model}"
|
447
|
+
say " GGUF file: #{config.llm_gguf_file}"
|
448
|
+
|
449
|
+
say "\nUMAP:", :cyan
|
450
|
+
say " Reduced dimensions: #{config.get('umap.reduced_dimensions', Ragnar::DEFAULT_REDUCED_DIMENSIONS)}"
|
451
|
+
say " N neighbors: #{config.get('umap.n_neighbors', 15)}"
|
452
|
+
say " Min distance: #{config.get('umap.min_dist', 0.1)}"
|
453
|
+
|
454
|
+
say "\nQuery:", :cyan
|
455
|
+
say " Top K: #{config.query_top_k}"
|
456
|
+
say " Query rewriting: #{config.enable_query_rewriting?}"
|
457
|
+
end
|
458
|
+
|
459
|
+
desc "model", "Show current LLM model information"
|
460
|
+
def model
|
461
|
+
config = Config.instance
|
462
|
+
|
463
|
+
say "\nLLM Model Configuration:", :cyan
|
464
|
+
say "-" * 40
|
465
|
+
|
466
|
+
say "\nModel:", :green
|
467
|
+
say " Repository: #{config.llm_model}"
|
468
|
+
say " GGUF file: #{config.llm_gguf_file}"
|
469
|
+
|
470
|
+
# Check if model files exist
|
471
|
+
model_path = File.join(config.models_dir, config.llm_gguf_file)
|
472
|
+
if File.exist?(model_path)
|
473
|
+
size_mb = (File.size(model_path) / 1024.0 / 1024.0).round(2)
|
474
|
+
say "\nModel file exists: #{model_path} (#{size_mb} MB)", :green
|
475
|
+
else
|
476
|
+
say "\nModel file not found: #{model_path}", :yellow
|
477
|
+
say "Run 'ragnar query' to download automatically", :yellow
|
478
|
+
end
|
479
|
+
end
|
480
|
+
|
481
|
+
desc "clear-cache", "Clear cached instances (useful in interactive mode)"
|
482
|
+
def clear_cache_command
|
483
|
+
clear_cache
|
484
|
+
say "Cache cleared. Next commands will create fresh instances.", :green
|
485
|
+
end
|
486
|
+
|
487
|
+
desc "reset", "Reset Ragnar data (database, models, cache)"
|
488
|
+
option :all, type: :boolean, default: false, aliases: "-a", desc: "Reset everything (database, models, cache)"
|
489
|
+
option :database, type: :boolean, default: false, aliases: "-d", desc: "Reset database only"
|
490
|
+
option :models, type: :boolean, default: false, aliases: "-m", desc: "Reset UMAP models only"
|
491
|
+
option :cache, type: :boolean, default: false, aliases: "-c", desc: "Clear cache only"
|
492
|
+
option :force, type: :boolean, default: false, aliases: "-f", desc: "Skip confirmation prompt"
|
493
|
+
def reset
|
494
|
+
# Determine what to reset
|
495
|
+
reset_all = options[:all]
|
496
|
+
reset_db = options[:database] || reset_all
|
497
|
+
reset_models = options[:models] || reset_all
|
498
|
+
reset_cache = options[:cache] || reset_all
|
499
|
+
|
500
|
+
# If no specific options, default to all
|
501
|
+
if !reset_db && !reset_models && !reset_cache
|
502
|
+
reset_all = true
|
503
|
+
reset_db = reset_models = reset_cache = true
|
504
|
+
end
|
505
|
+
|
506
|
+
# Build confirmation message
|
507
|
+
items_to_reset = []
|
508
|
+
items_to_reset << "database" if reset_db
|
509
|
+
items_to_reset << "UMAP models" if reset_models
|
510
|
+
items_to_reset << "cache" if reset_cache
|
511
|
+
|
512
|
+
# Get paths that will be affected
|
513
|
+
config = Config.instance
|
514
|
+
db_path = options[:db_path] || config.database_path
|
515
|
+
model_path = File.join(config.models_dir, "umap_model.bin")
|
516
|
+
|
517
|
+
# Show what will be deleted
|
518
|
+
say "\nWARNING: This will delete the following:", :red
|
519
|
+
say "-" * 40
|
520
|
+
|
521
|
+
if reset_db
|
522
|
+
say "Database: #{db_path}", :cyan
|
523
|
+
if File.exist?(db_path)
|
524
|
+
stats = Database.new(db_path).get_stats rescue nil
|
525
|
+
if stats
|
526
|
+
say " (#{stats[:total_documents]} documents, #{stats[:total_chunks]} chunks)", :white
|
527
|
+
end
|
528
|
+
else
|
529
|
+
say " (does not exist)", :white
|
530
|
+
end
|
531
|
+
end
|
532
|
+
|
533
|
+
if reset_models
|
534
|
+
say "UMAP models:", :cyan
|
535
|
+
model_files = [
|
536
|
+
model_path,
|
537
|
+
model_path.sub(/\.bin$/, '_metadata.json'),
|
538
|
+
model_path.sub(/\.bin$/, '_embeddings.json') # Old format, if exists
|
539
|
+
]
|
540
|
+
model_files.each do |file|
|
541
|
+
if File.exist?(file)
|
542
|
+
say " #{file} (#{(File.size(file) / 1024.0).round(1)} KB)", :white
|
543
|
+
end
|
544
|
+
end
|
545
|
+
if model_files.none? { |f| File.exist?(f) }
|
546
|
+
say " (no models found)", :white
|
547
|
+
end
|
548
|
+
end
|
549
|
+
|
550
|
+
if reset_cache
|
551
|
+
cache_dir = File.expand_path("~/.cache/ragnar")
|
552
|
+
say "Cache directory: #{cache_dir}", :cyan
|
553
|
+
if Dir.exist?(cache_dir)
|
554
|
+
cache_size = Dir.glob(File.join(cache_dir, "**/*"))
|
555
|
+
.select { |f| File.file?(f) }
|
556
|
+
.sum { |f| File.size(f) } / 1024.0 / 1024.0
|
557
|
+
say " (#{cache_size.round(1)} MB)", :white
|
558
|
+
else
|
559
|
+
say " (does not exist)", :white
|
560
|
+
end
|
561
|
+
end
|
562
|
+
|
563
|
+
say "-" * 40
|
564
|
+
|
565
|
+
# Ask for confirmation unless --force
|
566
|
+
unless options[:force]
|
567
|
+
message = "\nAre you sure you want to reset #{items_to_reset.join(', ')}?"
|
568
|
+
|
569
|
+
# Check if we're in interactive mode
|
570
|
+
if ENV['THOR_INTERACTIVE_SESSION'] == 'true'
|
571
|
+
# In interactive mode, use a simple prompt
|
572
|
+
say message, :yellow
|
573
|
+
response = ask("Type 'yes' to confirm, anything else to cancel:", :yellow)
|
574
|
+
confirmed = response.downcase == 'yes'
|
575
|
+
else
|
576
|
+
# In CLI mode, use Thor's yes? method
|
577
|
+
confirmed = yes?(message + " (y/N)", :yellow)
|
578
|
+
end
|
579
|
+
|
580
|
+
unless confirmed
|
581
|
+
say "\nReset cancelled.", :cyan
|
582
|
+
return
|
583
|
+
end
|
584
|
+
end
|
585
|
+
|
586
|
+
# Perform the reset
|
587
|
+
say "\nResetting...", :green
|
588
|
+
|
589
|
+
if reset_db && File.exist?(db_path)
|
590
|
+
say "Removing database: #{db_path}"
|
591
|
+
FileUtils.rm_rf(db_path)
|
592
|
+
say " ✓ Database removed", :green
|
593
|
+
end
|
594
|
+
|
595
|
+
if reset_models
|
596
|
+
model_files = [
|
597
|
+
model_path,
|
598
|
+
model_path.sub(/\.bin$/, '_metadata.json'),
|
599
|
+
model_path.sub(/\.bin$/, '_embeddings.json')
|
600
|
+
]
|
601
|
+
model_files.each do |file|
|
602
|
+
if File.exist?(file)
|
603
|
+
say "Removing model file: #{file}"
|
604
|
+
FileUtils.rm_f(file)
|
605
|
+
say " ✓ Removed", :green
|
606
|
+
end
|
607
|
+
end
|
608
|
+
end
|
609
|
+
|
610
|
+
if reset_cache
|
611
|
+
# Clear in-memory cache
|
612
|
+
clear_cache
|
613
|
+
|
614
|
+
# Optionally clear cache directory (but preserve history)
|
615
|
+
cache_dir = File.expand_path("~/.cache/ragnar")
|
616
|
+
if Dir.exist?(cache_dir)
|
617
|
+
# Preserve history file
|
618
|
+
history_file = File.join(cache_dir, "history")
|
619
|
+
history_content = File.read(history_file) if File.exist?(history_file)
|
620
|
+
|
621
|
+
# Remove cache directory contents except history
|
622
|
+
Dir.glob(File.join(cache_dir, "*")).each do |item|
|
623
|
+
next if File.basename(item) == "history"
|
624
|
+
if File.directory?(item)
|
625
|
+
FileUtils.rm_rf(item)
|
626
|
+
else
|
627
|
+
FileUtils.rm_f(item)
|
628
|
+
end
|
629
|
+
say "Removed cache item: #{File.basename(item)}", :green
|
630
|
+
end
|
631
|
+
end
|
632
|
+
say " ✓ Cache cleared", :green
|
633
|
+
end
|
634
|
+
|
635
|
+
say "\nReset complete!", :green
|
636
|
+
say "You can now start fresh with 'ragnar index <path>'", :cyan
|
637
|
+
end
|
638
|
+
|
639
|
+
desc "init-config", "Generate a configuration file with current defaults"
|
640
|
+
option :global, type: :boolean, default: false, aliases: "-g", desc: "Create global config in home directory"
|
641
|
+
option :force, type: :boolean, default: false, aliases: "-f", desc: "Overwrite existing config file"
|
642
|
+
def init_config
|
643
|
+
config = Config.instance
|
644
|
+
|
645
|
+
if options[:global]
|
646
|
+
config_path = File.expand_path('~/.ragnar.yml')
|
647
|
+
else
|
648
|
+
config_path = File.join(Dir.pwd, '.ragnar.yml')
|
649
|
+
end
|
650
|
+
|
651
|
+
if File.exist?(config_path) && !options[:force]
|
652
|
+
say "Config file already exists at: #{config_path}", :yellow
|
653
|
+
say "Use --force to overwrite, or choose a different location.", :yellow
|
654
|
+
return
|
655
|
+
end
|
656
|
+
|
657
|
+
generated_path = config.generate_config_file(config_path)
|
658
|
+
say "Config file created at: #{generated_path}", :green
|
659
|
+
say "Edit this file to customize Ragnar's behavior.", :cyan
|
660
|
+
|
661
|
+
if config.config_exists?
|
662
|
+
say "\nNote: Currently using config from: #{config.config_file_path}", :yellow
|
663
|
+
end
|
664
|
+
end
|
665
|
+
|
296
666
|
private
|
297
667
|
|
668
|
+
# Cached instance helpers for interactive mode
|
669
|
+
def get_cached_database(db_path = nil)
|
670
|
+
# Use config default if no path provided
|
671
|
+
db_path ||= Config.instance.database_path
|
672
|
+
|
673
|
+
# Cache database per path - clear cache if path changes
|
674
|
+
if @@cached_db_path != db_path
|
675
|
+
@@cached_database = nil
|
676
|
+
@@cached_db_path = db_path
|
677
|
+
@@cached_query_processor = nil # Also clear dependent caches
|
678
|
+
end
|
679
|
+
|
680
|
+
@@cached_database ||= Database.new(db_path)
|
681
|
+
end
|
682
|
+
|
683
|
+
def get_cached_embedder(model_name = nil)
|
684
|
+
# Use config default if no model specified
|
685
|
+
model_name ||= Config.instance.embedding_model
|
686
|
+
@@cached_embedder ||= Embedder.new(model_name: model_name)
|
687
|
+
end
|
688
|
+
|
689
|
+
def get_cached_llm_manager
|
690
|
+
@@cached_llm_manager ||= LLMManager.instance
|
691
|
+
end
|
692
|
+
|
693
|
+
def get_cached_query_processor(db_path = nil)
|
694
|
+
# Use config default if no path provided
|
695
|
+
db_path ||= Config.instance.database_path
|
696
|
+
|
697
|
+
# Cache query processor per database path
|
698
|
+
if @@cached_db_path != db_path || @@cached_query_processor.nil?
|
699
|
+
@@cached_query_processor = QueryProcessor.new(db_path: db_path)
|
700
|
+
end
|
701
|
+
|
702
|
+
@@cached_query_processor
|
703
|
+
end
|
704
|
+
|
705
|
+
def clear_cache
|
706
|
+
@@cached_database = nil
|
707
|
+
@@cached_embedder = nil
|
708
|
+
@@cached_llm_manager = nil
|
709
|
+
@@cached_query_processor = nil
|
710
|
+
@@cached_db_path = nil
|
711
|
+
end
|
712
|
+
|
713
|
+
|
714
|
+
def summarize_topic(topic, llm)
|
715
|
+
# Get representative documents for context
|
716
|
+
sample_docs = topic.representative_docs(k: 3)
|
717
|
+
|
718
|
+
# Simple, clear prompt for summarization
|
719
|
+
prompt = <<~PROMPT
|
720
|
+
Summarize what connects these documents in 1-2 sentences:
|
721
|
+
|
722
|
+
Key terms: #{topic.terms.first(5).join(', ')}
|
723
|
+
|
724
|
+
Documents:
|
725
|
+
#{sample_docs.map.with_index { |doc, i| "#{i+1}. #{doc}" }.join("\n")}
|
726
|
+
|
727
|
+
Summary:
|
728
|
+
PROMPT
|
729
|
+
|
730
|
+
begin
|
731
|
+
summary = llm.generate(prompt).strip
|
732
|
+
# Clean up common artifacts
|
733
|
+
summary = summary.lines.first&.strip || "Related documents"
|
734
|
+
summary = summary.gsub(/^(Summary:|Topic:|Documents:)/i, '').strip
|
735
|
+
summary.empty? ? "Documents about #{topic.terms.first(2).join(' and ')}" : summary
|
736
|
+
rescue => e
|
737
|
+
"Documents about #{topic.terms.first(2).join(' and ')}"
|
738
|
+
end
|
739
|
+
end
|
740
|
+
|
298
741
|
def fetch_all_documents(database)
|
299
742
|
# Temporary workaround to get all documents
|
300
743
|
# In production, we'd add a proper method to Database class
|
@@ -321,9 +764,12 @@ module Ragnar
|
|
321
764
|
[]
|
322
765
|
end
|
323
766
|
|
324
|
-
def display_topics(topics)
|
767
|
+
def display_topics(topics, show_summaries: false)
|
325
768
|
say "\n" + "="*60, :green
|
326
769
|
say "Topic Analysis Results", :cyan
|
770
|
+
if show_summaries
|
771
|
+
say " (with LLM-generated summaries)", :yellow
|
772
|
+
end
|
327
773
|
say "="*60, :green
|
328
774
|
|
329
775
|
if topics.empty?
|
@@ -342,21 +788,21 @@ module Ragnar
|
|
342
788
|
say "\n" + "─" * 40, :blue
|
343
789
|
say "MAJOR TOPICS (≥20 docs)", :blue
|
344
790
|
say "─" * 40, :blue
|
345
|
-
display_topic_group(large_topics, :cyan)
|
791
|
+
display_topic_group(large_topics, :cyan, show_summaries: show_summaries)
|
346
792
|
end
|
347
793
|
|
348
794
|
if medium_topics.any?
|
349
795
|
say "\n" + "─" * 40, :yellow
|
350
796
|
say "MEDIUM TOPICS (10-19 docs)", :yellow
|
351
797
|
say "─" * 40, :yellow
|
352
|
-
display_topic_group(medium_topics, :yellow)
|
798
|
+
display_topic_group(medium_topics, :yellow, show_summaries: show_summaries)
|
353
799
|
end
|
354
800
|
|
355
801
|
if small_topics.any?
|
356
802
|
say "\n" + "─" * 40, :white
|
357
803
|
say "MINOR TOPICS (<10 docs)", :white
|
358
804
|
say "─" * 40, :white
|
359
|
-
display_topic_group(small_topics, :white)
|
805
|
+
display_topic_group(small_topics, :white, show_summaries: show_summaries)
|
360
806
|
end
|
361
807
|
|
362
808
|
# Summary statistics
|
@@ -380,10 +826,18 @@ module Ragnar
|
|
380
826
|
say " Small (<10): #{small_topics.length} topics, #{small_topics.sum(&:size)} docs"
|
381
827
|
end
|
382
828
|
|
383
|
-
def display_topic_group(topics, color)
|
829
|
+
def display_topic_group(topics, color, show_summaries: false)
|
384
830
|
topics.sort_by { |t| -t.size }.each_with_index do |topic, idx|
|
385
831
|
say "\n#{topic.label || 'Unlabeled'} (#{topic.size} docs)", color
|
386
832
|
|
833
|
+
# Show LLM summary if available
|
834
|
+
if show_summaries
|
835
|
+
summary = topic.instance_variable_get(:@summary)
|
836
|
+
if summary
|
837
|
+
say " Summary: #{summary}", :green
|
838
|
+
end
|
839
|
+
end
|
840
|
+
|
387
841
|
# Show coherence as a bar
|
388
842
|
if topic.coherence > 0
|
389
843
|
coherence_pct = (topic.coherence * 100).round(0)
|
@@ -395,8 +849,8 @@ module Ragnar
|
|
395
849
|
# Compact term display
|
396
850
|
say " Terms: #{topic.terms.first(6).join(' • ')}" if topic.terms.any?
|
397
851
|
|
398
|
-
# Short sample
|
399
|
-
if topic.representative_docs(k: 1).any?
|
852
|
+
# Short sample (unless we showed a summary)
|
853
|
+
if !show_summaries && topic.representative_docs(k: 1).any?
|
400
854
|
preview = topic.representative_docs(k: 1).first
|
401
855
|
preview = preview[0..100] + "..." if preview.length > 100
|
402
856
|
say " \"#{preview}\"", :white
|
@@ -404,25 +858,34 @@ module Ragnar
|
|
404
858
|
end
|
405
859
|
end
|
406
860
|
|
407
|
-
def export_topics(topics, format)
|
861
|
+
def export_topics(topics, format, embeddings: nil, cluster_ids: nil)
|
408
862
|
case format.downcase
|
409
863
|
when 'json'
|
410
864
|
export_topics_json(topics)
|
411
865
|
when 'html'
|
412
|
-
export_topics_html(topics)
|
866
|
+
export_topics_html(topics, embeddings: embeddings, cluster_ids: cluster_ids)
|
413
867
|
else
|
414
868
|
say "Unknown export format: #{format}. Use 'json' or 'html'.", :red
|
415
869
|
end
|
416
870
|
end
|
417
871
|
|
418
872
|
def export_topics_json(topics)
|
873
|
+
topics_data = topics.map do |topic|
|
874
|
+
topic_hash = topic.to_h
|
875
|
+
# Add summary if it exists
|
876
|
+
summary = topic.instance_variable_get(:@summary)
|
877
|
+
topic_hash[:summary] = summary if summary
|
878
|
+
topic_hash
|
879
|
+
end
|
880
|
+
|
419
881
|
data = {
|
420
882
|
generated_at: Time.now.iso8601,
|
421
|
-
topics:
|
883
|
+
topics: topics_data,
|
422
884
|
summary: {
|
423
885
|
total_topics: topics.length,
|
424
886
|
total_documents: topics.sum(&:size),
|
425
|
-
average_size: (topics.sum(&:size).to_f / topics.length).round(1)
|
887
|
+
average_size: (topics.sum(&:size).to_f / topics.length).round(1),
|
888
|
+
has_summaries: topics.any? { |t| t.instance_variable_get(:@summary) }
|
426
889
|
}
|
427
890
|
}
|
428
891
|
|
@@ -431,9 +894,9 @@ module Ragnar
|
|
431
894
|
say "Topics exported to: #{filename}", :green
|
432
895
|
end
|
433
896
|
|
434
|
-
def export_topics_html(topics)
|
897
|
+
def export_topics_html(topics, embeddings: nil, cluster_ids: nil)
|
435
898
|
# Generate self-contained HTML with D3.js visualization
|
436
|
-
html = generate_topic_visualization_html(topics)
|
899
|
+
html = generate_topic_visualization_html(topics, embeddings: embeddings, cluster_ids: cluster_ids)
|
437
900
|
|
438
901
|
filename = "topics_#{Time.now.strftime('%Y%m%d_%H%M%S')}.html"
|
439
902
|
File.write(filename, html)
|
@@ -446,113 +909,5 @@ module Ragnar
|
|
446
909
|
end
|
447
910
|
end
|
448
911
|
|
449
|
-
def generate_topic_visualization_html(topics)
|
450
|
-
# Convert topics to JSON for D3.js
|
451
|
-
topics_json = topics.map do |topic|
|
452
|
-
{
|
453
|
-
id: topic.id,
|
454
|
-
label: topic.label || "Topic #{topic.id}",
|
455
|
-
size: topic.size,
|
456
|
-
terms: topic.terms.first(10),
|
457
|
-
coherence: topic.coherence,
|
458
|
-
samples: topic.representative_docs(k: 2).map { |d| d[0..200] }
|
459
|
-
}
|
460
|
-
end.to_json
|
461
|
-
|
462
|
-
# HTML template with embedded D3.js
|
463
|
-
<<~HTML
|
464
|
-
<!DOCTYPE html>
|
465
|
-
<html>
|
466
|
-
<head>
|
467
|
-
<meta charset="utf-8">
|
468
|
-
<title>Topic Visualization</title>
|
469
|
-
<script src="https://d3js.org/d3.v7.min.js"></script>
|
470
|
-
<style>
|
471
|
-
body { font-family: -apple-system, sans-serif; margin: 20px; }
|
472
|
-
#viz { width: 100%; height: 500px; border: 1px solid #ddd; }
|
473
|
-
.topic { cursor: pointer; }
|
474
|
-
.topic:hover { opacity: 0.8; }
|
475
|
-
#details { margin-top: 20px; padding: 15px; background: #f5f5f5; }
|
476
|
-
.term { display: inline-block; margin: 5px; padding: 5px 10px; background: #e0e0e0; border-radius: 3px; }
|
477
|
-
</style>
|
478
|
-
</head>
|
479
|
-
<body>
|
480
|
-
<h1>Topic Analysis Results</h1>
|
481
|
-
<div id="viz"></div>
|
482
|
-
<div id="details">Click on a topic to see details</div>
|
483
|
-
|
484
|
-
<script>
|
485
|
-
const data = #{topics_json};
|
486
|
-
|
487
|
-
// Create bubble chart
|
488
|
-
const width = document.getElementById('viz').clientWidth;
|
489
|
-
const height = 500;
|
490
|
-
|
491
|
-
const svg = d3.select("#viz")
|
492
|
-
.append("svg")
|
493
|
-
.attr("width", width)
|
494
|
-
.attr("height", height);
|
495
|
-
|
496
|
-
// Create scale for bubble sizes
|
497
|
-
const sizeScale = d3.scaleSqrt()
|
498
|
-
.domain([0, d3.max(data, d => d.size)])
|
499
|
-
.range([10, 50]);
|
500
|
-
|
501
|
-
// Create color scale
|
502
|
-
const colorScale = d3.scaleSequential(d3.interpolateViridis)
|
503
|
-
.domain([0, 1]);
|
504
|
-
|
505
|
-
// Create force simulation
|
506
|
-
const simulation = d3.forceSimulation(data)
|
507
|
-
.force("x", d3.forceX(width / 2).strength(0.05))
|
508
|
-
.force("y", d3.forceY(height / 2).strength(0.05))
|
509
|
-
.force("collide", d3.forceCollide(d => sizeScale(d.size) + 2));
|
510
|
-
|
511
|
-
// Create bubbles
|
512
|
-
const bubbles = svg.selectAll(".topic")
|
513
|
-
.data(data)
|
514
|
-
.enter().append("g")
|
515
|
-
.attr("class", "topic");
|
516
|
-
|
517
|
-
bubbles.append("circle")
|
518
|
-
.attr("r", d => sizeScale(d.size))
|
519
|
-
.attr("fill", d => colorScale(d.coherence))
|
520
|
-
.attr("stroke", "#fff")
|
521
|
-
.attr("stroke-width", 2);
|
522
|
-
|
523
|
-
bubbles.append("text")
|
524
|
-
.text(d => d.label)
|
525
|
-
.attr("text-anchor", "middle")
|
526
|
-
.attr("dy", ".3em")
|
527
|
-
.style("font-size", d => Math.min(sizeScale(d.size) / 3, 14) + "px");
|
528
|
-
|
529
|
-
// Add click handler
|
530
|
-
bubbles.on("click", function(event, d) {
|
531
|
-
showDetails(d);
|
532
|
-
});
|
533
|
-
|
534
|
-
// Update positions
|
535
|
-
simulation.on("tick", () => {
|
536
|
-
bubbles.attr("transform", d => `translate(${d.x},${d.y})`);
|
537
|
-
});
|
538
|
-
|
539
|
-
// Show topic details
|
540
|
-
function showDetails(topic) {
|
541
|
-
const details = document.getElementById('details');
|
542
|
-
details.innerHTML = `
|
543
|
-
<h2>${topic.label}</h2>
|
544
|
-
<p><strong>Documents:</strong> ${topic.size}</p>
|
545
|
-
<p><strong>Coherence:</strong> ${(topic.coherence * 100).toFixed(1)}%</p>
|
546
|
-
<p><strong>Top Terms:</strong></p>
|
547
|
-
<div>${topic.terms.map(t => `<span class="term">${t}</span>`).join('')}</div>
|
548
|
-
<p><strong>Sample Documents:</strong></p>
|
549
|
-
${topic.samples.map(s => `<p style="font-size: 0.9em; color: #666;">"${s}..."</p>`).join('')}
|
550
|
-
`;
|
551
|
-
}
|
552
|
-
</script>
|
553
|
-
</body>
|
554
|
-
</html>
|
555
|
-
HTML
|
556
|
-
end
|
557
912
|
end
|
558
|
-
end
|
913
|
+
end
|