ragnar-cli 0.1.0.pre.3 → 0.1.0.pre.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +249 -41
- data/lib/ragnar/cli.rb +563 -219
- data/lib/ragnar/cli_umap.rb +86 -0
- data/lib/ragnar/cli_visualization.rb +184 -0
- data/lib/ragnar/config.rb +320 -0
- data/lib/ragnar/database.rb +94 -8
- data/lib/ragnar/embedder.rb +1 -1
- data/lib/ragnar/indexer.rb +4 -2
- data/lib/ragnar/llm_manager.rb +31 -27
- data/lib/ragnar/query_processor.rb +123 -70
- data/lib/ragnar/query_rewriter.rb +21 -18
- data/lib/ragnar/topic_modeling.rb +13 -10
- data/lib/ragnar/umap_processor.rb +131 -95
- data/lib/ragnar/umap_transform_service.rb +169 -88
- data/lib/ragnar/version.rb +1 -1
- data/lib/ragnar.rb +3 -1
- metadata +71 -30
- data/lib/ragnar/topic_modeling/engine.rb +0 -301
- data/lib/ragnar/topic_modeling/labeling_strategies.rb +0 -300
- data/lib/ragnar/topic_modeling/llm_adapter.rb +0 -131
- data/lib/ragnar/topic_modeling/metrics.rb +0 -186
- data/lib/ragnar/topic_modeling/term_extractor.rb +0 -170
- data/lib/ragnar/topic_modeling/topic.rb +0 -117
- data/lib/ragnar/topic_modeling/topic_labeler.rb +0 -61
data/lib/ragnar/cli.rb
CHANGED
|
@@ -1,27 +1,89 @@
|
|
|
1
|
+
require_relative "cli_visualization"
|
|
2
|
+
require_relative "cli_umap"
|
|
3
|
+
require_relative "config"
|
|
4
|
+
require "thor/interactive"
|
|
5
|
+
require "stringio"
|
|
6
|
+
require "fileutils"
|
|
7
|
+
|
|
1
8
|
module Ragnar
|
|
2
9
|
class CLI < Thor
|
|
10
|
+
include CLIVisualization
|
|
11
|
+
include Thor::Interactive::Command
|
|
12
|
+
|
|
13
|
+
default_command :interactive
|
|
14
|
+
|
|
15
|
+
class_option :profile, type: :string, aliases: "-p", desc: "LLM profile to use (e.g., red_candle, opus, sonnet)"
|
|
16
|
+
|
|
17
|
+
# Configure interactive mode
|
|
18
|
+
configure_interactive(
|
|
19
|
+
prompt: Config.instance.interactive_prompt,
|
|
20
|
+
allow_nested: false,
|
|
21
|
+
history_file: Config.instance.history_file,
|
|
22
|
+
ui_mode: :tui,
|
|
23
|
+
default_handler: proc do |input, thor_instance|
|
|
24
|
+
puts "[DEBUG] Default handler called: #{input}" if ENV["DEBUG"]
|
|
25
|
+
|
|
26
|
+
begin
|
|
27
|
+
# IMPORTANT: Use direct method call, NOT invoke(), to avoid Thor's
|
|
28
|
+
# silent deduplication that prevents repeated calls to the same method
|
|
29
|
+
result = thor_instance.query(input.strip)
|
|
30
|
+
puts "[DEBUG] Default handler completed" if ENV["DEBUG"]
|
|
31
|
+
result
|
|
32
|
+
rescue => e
|
|
33
|
+
puts "[DEBUG] Default handler error: #{e.message}" if ENV["DEBUG"]
|
|
34
|
+
puts "[DEBUG] Backtrace: #{e.backtrace.first(3)}" if ENV["DEBUG"]
|
|
35
|
+
raise e
|
|
36
|
+
end
|
|
37
|
+
end
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
# Class variables for caching expensive resources in interactive mode
|
|
41
|
+
class_variable_set(:@@cached_database, nil)
|
|
42
|
+
class_variable_set(:@@cached_embedder, nil)
|
|
43
|
+
class_variable_set(:@@cached_llm_manager, nil)
|
|
44
|
+
class_variable_set(:@@cached_query_processor, nil)
|
|
45
|
+
class_variable_set(:@@cached_db_path, nil)
|
|
46
|
+
class_variable_set(:@@verbose_mode, false)
|
|
47
|
+
|
|
3
48
|
desc "index PATH", "Index text files from PATH (file or directory)"
|
|
4
|
-
option :db_path, type: :string,
|
|
5
|
-
option :chunk_size, type: :numeric,
|
|
6
|
-
option :chunk_overlap, type: :numeric,
|
|
7
|
-
option :model, type: :string,
|
|
49
|
+
option :db_path, type: :string, desc: "Path to Lance database (default from config)"
|
|
50
|
+
option :chunk_size, type: :numeric, desc: "Chunk size in tokens (default from config)"
|
|
51
|
+
option :chunk_overlap, type: :numeric, desc: "Chunk overlap in tokens (default from config)"
|
|
52
|
+
option :model, type: :string, desc: "Embedding model to use (default from config)"
|
|
8
53
|
def index(path)
|
|
9
|
-
|
|
10
|
-
|
|
54
|
+
# Expand user paths (handle ~ in user input)
|
|
55
|
+
expanded_path = File.expand_path(path)
|
|
56
|
+
|
|
57
|
+
unless File.exist?(expanded_path)
|
|
58
|
+
say "Error: Path does not exist: #{expanded_path}", :red
|
|
11
59
|
exit 1
|
|
12
60
|
end
|
|
13
61
|
|
|
14
62
|
say "Indexing files from: #{path}", :green
|
|
15
63
|
|
|
64
|
+
# Debug options in interactive mode
|
|
65
|
+
puts "Debug - options: #{options.inspect}" if ENV['DEBUG']
|
|
66
|
+
|
|
67
|
+
# Get config instance
|
|
68
|
+
config = Config.instance
|
|
69
|
+
|
|
70
|
+
# Clear database cache when indexing new content
|
|
71
|
+
db_path = options[:db_path] || config.database_path
|
|
72
|
+
if @@cached_db_path == db_path
|
|
73
|
+
@@cached_database = nil
|
|
74
|
+
@@cached_query_processor = nil
|
|
75
|
+
end
|
|
76
|
+
|
|
16
77
|
indexer = Indexer.new(
|
|
17
|
-
db_path:
|
|
18
|
-
chunk_size: options[:chunk_size],
|
|
19
|
-
chunk_overlap: options[:chunk_overlap],
|
|
20
|
-
embedding_model: options[:model]
|
|
78
|
+
db_path: db_path,
|
|
79
|
+
chunk_size: options[:chunk_size] || config.chunk_size,
|
|
80
|
+
chunk_overlap: options[:chunk_overlap] || config.chunk_overlap,
|
|
81
|
+
embedding_model: options[:model] || config.embedding_model,
|
|
82
|
+
show_progress: config.show_progress?
|
|
21
83
|
)
|
|
22
84
|
|
|
23
85
|
begin
|
|
24
|
-
stats = indexer.index_path(
|
|
86
|
+
stats = indexer.index_path(expanded_path)
|
|
25
87
|
say "\nIndexing complete!", :green
|
|
26
88
|
say "Files processed: #{stats[:files_processed]}"
|
|
27
89
|
say "Chunks created: #{stats[:chunks_created]}"
|
|
@@ -32,82 +94,26 @@ module Ragnar
|
|
|
32
94
|
end
|
|
33
95
|
end
|
|
34
96
|
|
|
35
|
-
desc "
|
|
36
|
-
|
|
37
|
-
option :n_components, type: :numeric, default: 50, desc: "Number of dimensions for reduction"
|
|
38
|
-
option :n_neighbors, type: :numeric, default: 15, desc: "Number of neighbors for UMAP"
|
|
39
|
-
option :min_dist, type: :numeric, default: 0.1, desc: "Minimum distance for UMAP"
|
|
40
|
-
option :model_path, type: :string, default: "umap_model.bin", desc: "Path to save UMAP model"
|
|
41
|
-
def train_umap
|
|
42
|
-
say "Training UMAP model on embeddings...", :green
|
|
43
|
-
|
|
44
|
-
processor = UmapProcessor.new(
|
|
45
|
-
db_path: options[:db_path],
|
|
46
|
-
model_path: options[:model_path]
|
|
47
|
-
)
|
|
48
|
-
|
|
49
|
-
begin
|
|
50
|
-
stats = processor.train(
|
|
51
|
-
n_components: options[:n_components],
|
|
52
|
-
n_neighbors: options[:n_neighbors],
|
|
53
|
-
min_dist: options[:min_dist]
|
|
54
|
-
)
|
|
55
|
-
|
|
56
|
-
say "\nUMAP training complete!", :green
|
|
57
|
-
say "Embeddings processed: #{stats[:embeddings_count]}"
|
|
58
|
-
say "Original dimensions: #{stats[:original_dims]}"
|
|
59
|
-
say "Reduced dimensions: #{stats[:reduced_dims]}"
|
|
60
|
-
say "Model saved to: #{options[:model_path]}"
|
|
61
|
-
rescue => e
|
|
62
|
-
say "Error during UMAP training: #{e.message}", :red
|
|
63
|
-
exit 1
|
|
64
|
-
end
|
|
65
|
-
end
|
|
66
|
-
|
|
67
|
-
desc "apply-umap", "Apply trained UMAP model to reduce embedding dimensions"
|
|
68
|
-
option :db_path, type: :string, default: Ragnar::DEFAULT_DB_PATH, desc: "Path to Lance database"
|
|
69
|
-
option :model_path, type: :string, default: "umap_model.bin", desc: "Path to UMAP model"
|
|
70
|
-
option :batch_size, type: :numeric, default: 100, desc: "Batch size for processing"
|
|
71
|
-
def apply_umap
|
|
72
|
-
unless File.exist?(options[:model_path])
|
|
73
|
-
say "Error: UMAP model not found at: #{options[:model_path]}", :red
|
|
74
|
-
say "Please run 'train-umap' first to create a model.", :yellow
|
|
75
|
-
exit 1
|
|
76
|
-
end
|
|
77
|
-
|
|
78
|
-
say "Applying UMAP model to embeddings...", :green
|
|
79
|
-
|
|
80
|
-
processor = UmapProcessor.new(
|
|
81
|
-
db_path: options[:db_path],
|
|
82
|
-
model_path: options[:model_path]
|
|
83
|
-
)
|
|
84
|
-
|
|
85
|
-
begin
|
|
86
|
-
stats = processor.apply(batch_size: options[:batch_size])
|
|
87
|
-
|
|
88
|
-
say "\nUMAP application complete!", :green
|
|
89
|
-
say "Embeddings processed: #{stats[:processed]}"
|
|
90
|
-
say "Already processed: #{stats[:skipped]}"
|
|
91
|
-
say "Errors: #{stats[:errors]}" if stats[:errors] > 0
|
|
92
|
-
rescue => e
|
|
93
|
-
say "Error applying UMAP: #{e.message}", :red
|
|
94
|
-
exit 1
|
|
95
|
-
end
|
|
96
|
-
end
|
|
97
|
+
desc "umap SUBCOMMAND ...ARGS", "UMAP dimensionality reduction commands"
|
|
98
|
+
subcommand "umap", Umap
|
|
97
99
|
|
|
98
100
|
desc "topics", "Extract and display topics from indexed documents"
|
|
99
|
-
option :db_path, type: :string,
|
|
101
|
+
option :db_path, type: :string, desc: "Path to Lance database (default from config)"
|
|
100
102
|
option :min_cluster_size, type: :numeric, default: 5, desc: "Minimum documents per topic"
|
|
101
103
|
option :method, type: :string, default: "hybrid", desc: "Labeling method: fast, quality, or hybrid"
|
|
102
104
|
option :export, type: :string, desc: "Export topics to file (json or html)"
|
|
103
105
|
option :verbose, type: :boolean, default: false, aliases: "-v", desc: "Show detailed processing"
|
|
106
|
+
option :summarize, type: :boolean, default: false, aliases: "-s", desc: "Generate human-readable topic summaries using LLM"
|
|
107
|
+
option :llm_model, type: :string, default: "MaziyarPanahi/Qwen3-4B-GGUF", desc: "LLM model for summarization"
|
|
108
|
+
option :gguf_file, type: :string, default: "Qwen3-4B.Q4_K_M.gguf", desc: "GGUF file name for LLM model"
|
|
104
109
|
def topics
|
|
110
|
+
apply_profile!
|
|
105
111
|
require_relative 'topic_modeling'
|
|
106
112
|
|
|
107
113
|
say "Extracting topics from indexed documents...", :green
|
|
108
114
|
|
|
109
|
-
# Load embeddings and documents from database
|
|
110
|
-
database =
|
|
115
|
+
# Load embeddings and documents from database - use cache in interactive mode
|
|
116
|
+
database = get_cached_database(options[:db_path] || Config.instance.database_path)
|
|
111
117
|
|
|
112
118
|
begin
|
|
113
119
|
# Get all documents with embeddings
|
|
@@ -130,7 +136,7 @@ module Ragnar
|
|
|
130
136
|
# Check if we have reduced embeddings available
|
|
131
137
|
first_doc = docs_with_embeddings.first
|
|
132
138
|
has_reduced = first_doc[:reduced_embedding] && !first_doc[:reduced_embedding].empty?
|
|
133
|
-
|
|
139
|
+
|
|
134
140
|
if has_reduced
|
|
135
141
|
embeddings = docs_with_embeddings.map { |d| d[:reduced_embedding] }
|
|
136
142
|
say "Using reduced embeddings (#{embeddings.first.size} dimensions)", :yellow if options[:verbose]
|
|
@@ -142,7 +148,7 @@ module Ragnar
|
|
|
142
148
|
# Let the engine handle dimensionality reduction if needed
|
|
143
149
|
reduce_dims = true
|
|
144
150
|
end
|
|
145
|
-
|
|
151
|
+
|
|
146
152
|
documents = docs_with_embeddings.map { |d| d[:chunk_text] }
|
|
147
153
|
metadata = docs_with_embeddings.map { |d| { file_path: d[:file_path], chunk_index: d[:chunk_index] } }
|
|
148
154
|
|
|
@@ -164,12 +170,32 @@ module Ragnar
|
|
|
164
170
|
metadata: metadata
|
|
165
171
|
)
|
|
166
172
|
|
|
173
|
+
# Generate summaries if requested
|
|
174
|
+
if options[:summarize] && topics.any?
|
|
175
|
+
say "Generating topic summaries with LLM...", :yellow
|
|
176
|
+
begin
|
|
177
|
+
chat = LLMManager.instance.default_chat
|
|
178
|
+
|
|
179
|
+
# Add summaries to topics
|
|
180
|
+
topics.each_with_index do |topic, i|
|
|
181
|
+
say " Summarizing topic #{i+1}/#{topics.length}...", :yellow if options[:verbose]
|
|
182
|
+
topic.instance_variable_set(:@summary, summarize_topic(topic, chat))
|
|
183
|
+
end
|
|
184
|
+
|
|
185
|
+
say "Topic summaries generated!", :green
|
|
186
|
+
rescue => e
|
|
187
|
+
say "Warning: Could not generate topic summaries: #{e.message}", :yellow
|
|
188
|
+
say "Proceeding without summaries...", :yellow
|
|
189
|
+
end
|
|
190
|
+
end
|
|
191
|
+
|
|
167
192
|
# Display results
|
|
168
|
-
display_topics(topics)
|
|
193
|
+
display_topics(topics, show_summaries: options[:summarize])
|
|
169
194
|
|
|
170
195
|
# Export if requested
|
|
171
196
|
if options[:export]
|
|
172
|
-
|
|
197
|
+
# Pass embeddings and cluster IDs for visualization
|
|
198
|
+
export_topics(topics, options[:export], embeddings: embeddings, cluster_ids: engine.instance_variable_get(:@cluster_ids))
|
|
173
199
|
end
|
|
174
200
|
|
|
175
201
|
rescue => e
|
|
@@ -184,51 +210,83 @@ module Ragnar
|
|
|
184
210
|
option :k, type: :numeric, default: 5, desc: "Number of results to return"
|
|
185
211
|
option :show_scores, type: :boolean, default: false, desc: "Show similarity scores"
|
|
186
212
|
def search(query_text)
|
|
187
|
-
database =
|
|
188
|
-
embedder =
|
|
189
|
-
|
|
213
|
+
database = get_cached_database(options[:database] || Config.instance.database_path)
|
|
214
|
+
embedder = get_cached_embedder()
|
|
215
|
+
|
|
190
216
|
# Generate embedding for query
|
|
191
217
|
query_embedding = embedder.embed_text(query_text)
|
|
192
|
-
|
|
218
|
+
|
|
193
219
|
# Search for similar documents
|
|
194
220
|
results = database.search_similar(query_embedding, k: options[:k])
|
|
195
|
-
|
|
221
|
+
|
|
196
222
|
if results.empty?
|
|
197
223
|
say "No results found.", :yellow
|
|
198
224
|
return
|
|
199
225
|
end
|
|
200
|
-
|
|
226
|
+
|
|
201
227
|
say "Found #{results.length} results:\n", :green
|
|
202
|
-
|
|
228
|
+
|
|
203
229
|
results.each_with_index do |result, idx|
|
|
204
230
|
say "#{idx + 1}. File: #{result[:file_path]}", :cyan
|
|
205
231
|
say " Chunk: #{result[:chunk_index]}"
|
|
206
|
-
|
|
232
|
+
|
|
207
233
|
if options[:show_scores]
|
|
208
234
|
say " Distance: #{result[:distance].round(4)}"
|
|
209
235
|
end
|
|
210
|
-
|
|
236
|
+
|
|
211
237
|
# Show preview of content
|
|
212
238
|
preview = result[:chunk_text][0..200].gsub(/\s+/, ' ')
|
|
213
239
|
say " Content: #{preview}..."
|
|
214
240
|
say ""
|
|
215
241
|
end
|
|
216
242
|
end
|
|
217
|
-
|
|
243
|
+
|
|
218
244
|
desc "query QUESTION", "Query the RAG system"
|
|
219
|
-
option :db_path, type: :string,
|
|
245
|
+
option :db_path, type: :string, desc: "Path to Lance database (default from config)"
|
|
220
246
|
option :top_k, type: :numeric, default: 3, desc: "Number of top documents to use"
|
|
221
247
|
option :verbose, type: :boolean, default: false, aliases: "-v", desc: "Show detailed processing steps"
|
|
248
|
+
option :rerank, type: :boolean, default: nil, desc: "Enable cross-encoder reranking (default from config)"
|
|
222
249
|
option :json, type: :boolean, default: false, desc: "Output as JSON"
|
|
223
250
|
def query(question)
|
|
224
|
-
|
|
251
|
+
apply_profile!
|
|
252
|
+
puts "Debug - Query called with: #{question.inspect}" if ENV['DEBUG']
|
|
253
|
+
puts "Debug - Options: #{options.inspect}" if ENV['DEBUG']
|
|
254
|
+
|
|
255
|
+
processor = get_cached_query_processor(options[:db_path] || Config.instance.database_path)
|
|
256
|
+
puts "Debug - Processor: #{processor.class}" if ENV['DEBUG']
|
|
225
257
|
|
|
226
258
|
begin
|
|
227
|
-
|
|
259
|
+
config = Config.instance
|
|
260
|
+
result = processor.query(
|
|
261
|
+
question,
|
|
262
|
+
top_k: options[:top_k] || config.query_top_k,
|
|
263
|
+
verbose: options[:verbose] || @@verbose_mode,
|
|
264
|
+
enable_rewriting: config.enable_query_rewriting?,
|
|
265
|
+
enable_reranking: options[:rerank].nil? ? config.enable_reranking? : options[:rerank]
|
|
266
|
+
)
|
|
267
|
+
puts "Debug - Result keys: #{result.keys}" if ENV['DEBUG']
|
|
228
268
|
|
|
229
269
|
if options[:json]
|
|
230
270
|
puts JSON.pretty_generate(result)
|
|
271
|
+
elsif interactive?
|
|
272
|
+
# Clean output for interactive mode - just answer, confidence, and sources
|
|
273
|
+
say "" # Add blank line before answer for spacing
|
|
274
|
+
say result[:answer]
|
|
275
|
+
|
|
276
|
+
if result[:confidence]
|
|
277
|
+
say "\nConfidence: #{result[:confidence]}%", :magenta
|
|
278
|
+
end
|
|
279
|
+
|
|
280
|
+
if result[:sources] && !result[:sources].empty?
|
|
281
|
+
say "\nSources:", :blue
|
|
282
|
+
result[:sources].each_with_index do |source, idx|
|
|
283
|
+
say " #{idx + 1}. #{source[:source_file]}" if source[:source_file]
|
|
284
|
+
end
|
|
285
|
+
end
|
|
286
|
+
|
|
287
|
+
say "" # Add blank line for spacing
|
|
231
288
|
else
|
|
289
|
+
# Full output for CLI mode
|
|
232
290
|
say "\n" + "="*60, :green
|
|
233
291
|
say "Query: #{result[:query]}", :cyan
|
|
234
292
|
|
|
@@ -250,7 +308,7 @@ module Ragnar
|
|
|
250
308
|
end
|
|
251
309
|
end
|
|
252
310
|
|
|
253
|
-
if options[:verbose] && result[:sub_queries]
|
|
311
|
+
if (options[:verbose] || false) && result[:sub_queries]
|
|
254
312
|
say "\nSub-queries used:", :yellow
|
|
255
313
|
result[:sub_queries].each { |sq| say " - #{sq}" }
|
|
256
314
|
end
|
|
@@ -259,15 +317,15 @@ module Ragnar
|
|
|
259
317
|
end
|
|
260
318
|
rescue => e
|
|
261
319
|
say "Error processing query: #{e.message}", :red
|
|
262
|
-
|
|
320
|
+
puts "Debug - Full backtrace: #{e.backtrace.join("\n")}" if ENV['DEBUG']
|
|
263
321
|
exit 1
|
|
264
322
|
end
|
|
265
323
|
end
|
|
266
324
|
|
|
267
325
|
desc "stats", "Show database statistics"
|
|
268
|
-
option :db_path, type: :string,
|
|
326
|
+
option :db_path, type: :string, desc: "Path to Lance database (default from config)"
|
|
269
327
|
def stats
|
|
270
|
-
db =
|
|
328
|
+
db = get_cached_database(options[:db_path] || Config.instance.database_path)
|
|
271
329
|
stats = db.get_stats
|
|
272
330
|
|
|
273
331
|
say "\nDatabase Statistics", :green
|
|
@@ -293,8 +351,382 @@ module Ragnar
|
|
|
293
351
|
say "Ragnar v#{Ragnar::VERSION}"
|
|
294
352
|
end
|
|
295
353
|
|
|
354
|
+
desc "config", "Show current configuration"
|
|
355
|
+
def config
|
|
356
|
+
config = Config.instance
|
|
357
|
+
|
|
358
|
+
say "\nConfiguration Settings:", :cyan
|
|
359
|
+
say "-" * 40
|
|
360
|
+
|
|
361
|
+
if config.config_exists?
|
|
362
|
+
say "Config file: #{config.config_file_path}", :green
|
|
363
|
+
else
|
|
364
|
+
say "Config file: None (using defaults)", :yellow
|
|
365
|
+
end
|
|
366
|
+
|
|
367
|
+
say "\nPaths:", :cyan
|
|
368
|
+
say " Database: #{config.database_path}"
|
|
369
|
+
say " Models: #{config.models_dir}"
|
|
370
|
+
say " History: #{config.history_file}"
|
|
371
|
+
|
|
372
|
+
say "\nEmbeddings:", :cyan
|
|
373
|
+
say " Model: #{config.embedding_model}"
|
|
374
|
+
say " Chunk size: #{config.chunk_size}"
|
|
375
|
+
say " Chunk overlap: #{config.chunk_overlap}"
|
|
376
|
+
|
|
377
|
+
say "\nLLM:", :cyan
|
|
378
|
+
say " Active profile: #{config.llm_profile_name}", :green
|
|
379
|
+
say " Provider: #{config.llm_provider}"
|
|
380
|
+
say " Model: #{config.llm_model}"
|
|
381
|
+
if config.available_profiles.size > 1
|
|
382
|
+
say " Available profiles: #{config.available_profiles.join(', ')}"
|
|
383
|
+
end
|
|
384
|
+
|
|
385
|
+
say "\nUMAP:", :cyan
|
|
386
|
+
say " Reduced dimensions: #{config.get('umap.reduced_dimensions', Ragnar::DEFAULT_REDUCED_DIMENSIONS)}"
|
|
387
|
+
say " N neighbors: #{config.get('umap.n_neighbors', 15)}"
|
|
388
|
+
say " Min distance: #{config.get('umap.min_dist', 0.1)}"
|
|
389
|
+
|
|
390
|
+
say "\nQuery:", :cyan
|
|
391
|
+
say " Top K: #{config.query_top_k}"
|
|
392
|
+
say " Query rewriting: #{config.enable_query_rewriting?}"
|
|
393
|
+
say " Reranking: #{config.enable_reranking?}"
|
|
394
|
+
say " Reranker model: #{config.reranker_model}" if config.enable_reranking?
|
|
395
|
+
end
|
|
396
|
+
|
|
397
|
+
desc "model", "Show current LLM model information"
|
|
398
|
+
def model
|
|
399
|
+
config = Config.instance
|
|
400
|
+
|
|
401
|
+
say "\nLLM Model Configuration:", :cyan
|
|
402
|
+
say "-" * 40
|
|
403
|
+
|
|
404
|
+
say "\nProfile: #{config.llm_profile_name}", :green
|
|
405
|
+
say " Provider: #{config.llm_provider}"
|
|
406
|
+
say " Model: #{config.llm_model}"
|
|
407
|
+
|
|
408
|
+
# Only show GGUF/local file info for local providers
|
|
409
|
+
if config.llm_provider == 'red_candle'
|
|
410
|
+
say "\nEmbedding Model: #{config.embedding_model}"
|
|
411
|
+
|
|
412
|
+
# Check if model files exist in HuggingFace cache
|
|
413
|
+
hf_cache = File.expand_path("~/.cache/huggingface/hub")
|
|
414
|
+
model_dir = config.llm_model.gsub("/", "--")
|
|
415
|
+
model_cache = File.join(hf_cache, "models--#{model_dir}")
|
|
416
|
+
if Dir.exist?(model_cache)
|
|
417
|
+
say "\nModel cached: #{model_cache}", :green
|
|
418
|
+
else
|
|
419
|
+
say "\nModel not yet downloaded (will download on first use)", :yellow
|
|
420
|
+
end
|
|
421
|
+
else
|
|
422
|
+
api_key = config.llm_api_key
|
|
423
|
+
env_key = case config.llm_provider
|
|
424
|
+
when 'anthropic' then ENV['ANTHROPIC_API_KEY']
|
|
425
|
+
when 'openai' then ENV['OPENAI_API_KEY']
|
|
426
|
+
end
|
|
427
|
+
has_key = api_key || env_key
|
|
428
|
+
say "\nAPI key: #{has_key ? 'configured' : 'not set'}", has_key ? :green : :red
|
|
429
|
+
end
|
|
430
|
+
end
|
|
431
|
+
|
|
432
|
+
desc "profile [NAME]", "Show or switch LLM profile"
|
|
433
|
+
def profile(name = nil)
|
|
434
|
+
config = Config.instance
|
|
435
|
+
|
|
436
|
+
if name
|
|
437
|
+
begin
|
|
438
|
+
config.set_active_profile(name)
|
|
439
|
+
LLMManager.instance.clear_cache
|
|
440
|
+
say "Switched to profile: #{name}", :green
|
|
441
|
+
say " Provider: #{config.llm_provider}"
|
|
442
|
+
say " Model: #{config.llm_model}"
|
|
443
|
+
rescue ArgumentError => e
|
|
444
|
+
say e.message, :red
|
|
445
|
+
end
|
|
446
|
+
else
|
|
447
|
+
say "\nLLM Profiles:", :cyan
|
|
448
|
+
say "-" * 40
|
|
449
|
+
config.llm_profiles.each do |pname, pconfig|
|
|
450
|
+
active = pname == config.llm_profile_name ? " (active)" : ""
|
|
451
|
+
say " #{pname}#{active}", active.empty? ? :white : :green
|
|
452
|
+
say " Provider: #{pconfig['provider']}"
|
|
453
|
+
say " Model: #{pconfig['model']}"
|
|
454
|
+
end
|
|
455
|
+
end
|
|
456
|
+
end
|
|
457
|
+
|
|
458
|
+
desc "verbose", "Toggle verbose mode on/off"
|
|
459
|
+
def verbose
|
|
460
|
+
@@verbose_mode = !@@verbose_mode
|
|
461
|
+
say "Verbose mode: #{@@verbose_mode ? 'on' : 'off'}", @@verbose_mode ? :green : :yellow
|
|
462
|
+
end
|
|
463
|
+
|
|
464
|
+
desc "clear-cache", "Clear cached instances (useful in interactive mode)"
|
|
465
|
+
def clear_cache_command
|
|
466
|
+
clear_cache
|
|
467
|
+
say "Cache cleared. Next commands will create fresh instances.", :green
|
|
468
|
+
end
|
|
469
|
+
|
|
470
|
+
desc "reset", "Reset Ragnar data (database, models, cache)"
|
|
471
|
+
option :all, type: :boolean, default: false, aliases: "-a", desc: "Reset everything (database, models, cache)"
|
|
472
|
+
option :database, type: :boolean, default: false, aliases: "-d", desc: "Reset database only"
|
|
473
|
+
option :models, type: :boolean, default: false, aliases: "-m", desc: "Reset UMAP models only"
|
|
474
|
+
option :cache, type: :boolean, default: false, aliases: "-c", desc: "Clear cache only"
|
|
475
|
+
option :force, type: :boolean, default: false, aliases: "-f", desc: "Skip confirmation prompt"
|
|
476
|
+
def reset
|
|
477
|
+
# Determine what to reset
|
|
478
|
+
reset_all = options[:all]
|
|
479
|
+
reset_db = options[:database] || reset_all
|
|
480
|
+
reset_models = options[:models] || reset_all
|
|
481
|
+
reset_cache = options[:cache] || reset_all
|
|
482
|
+
|
|
483
|
+
# If no specific options, default to all
|
|
484
|
+
if !reset_db && !reset_models && !reset_cache
|
|
485
|
+
reset_all = true
|
|
486
|
+
reset_db = reset_models = reset_cache = true
|
|
487
|
+
end
|
|
488
|
+
|
|
489
|
+
# Build confirmation message
|
|
490
|
+
items_to_reset = []
|
|
491
|
+
items_to_reset << "database" if reset_db
|
|
492
|
+
items_to_reset << "UMAP models" if reset_models
|
|
493
|
+
items_to_reset << "cache" if reset_cache
|
|
494
|
+
|
|
495
|
+
# Get paths that will be affected
|
|
496
|
+
config = Config.instance
|
|
497
|
+
db_path = options[:db_path] || config.database_path
|
|
498
|
+
model_path = File.join(config.models_dir, "umap_model.bin")
|
|
499
|
+
|
|
500
|
+
# Show what will be deleted
|
|
501
|
+
say "\nWARNING: This will delete the following:", :red
|
|
502
|
+
say "-" * 40
|
|
503
|
+
|
|
504
|
+
if reset_db
|
|
505
|
+
say "Database: #{db_path}", :cyan
|
|
506
|
+
if File.exist?(db_path)
|
|
507
|
+
stats = Database.new(db_path).get_stats rescue nil
|
|
508
|
+
if stats
|
|
509
|
+
say " (#{stats[:total_documents]} documents, #{stats[:total_chunks]} chunks)", :white
|
|
510
|
+
end
|
|
511
|
+
else
|
|
512
|
+
say " (does not exist)", :white
|
|
513
|
+
end
|
|
514
|
+
end
|
|
515
|
+
|
|
516
|
+
if reset_models
|
|
517
|
+
say "UMAP models:", :cyan
|
|
518
|
+
model_files = [
|
|
519
|
+
model_path,
|
|
520
|
+
model_path.sub(/\.bin$/, '_metadata.json'),
|
|
521
|
+
model_path.sub(/\.bin$/, '_embeddings.json') # Old format, if exists
|
|
522
|
+
]
|
|
523
|
+
model_files.each do |file|
|
|
524
|
+
if File.exist?(file)
|
|
525
|
+
say " #{file} (#{(File.size(file) / 1024.0).round(1)} KB)", :white
|
|
526
|
+
end
|
|
527
|
+
end
|
|
528
|
+
if model_files.none? { |f| File.exist?(f) }
|
|
529
|
+
say " (no models found)", :white
|
|
530
|
+
end
|
|
531
|
+
end
|
|
532
|
+
|
|
533
|
+
if reset_cache
|
|
534
|
+
cache_dir = File.expand_path("~/.cache/ragnar")
|
|
535
|
+
say "Cache directory: #{cache_dir}", :cyan
|
|
536
|
+
if Dir.exist?(cache_dir)
|
|
537
|
+
cache_size = Dir.glob(File.join(cache_dir, "**/*"))
|
|
538
|
+
.select { |f| File.file?(f) }
|
|
539
|
+
.sum { |f| File.size(f) } / 1024.0 / 1024.0
|
|
540
|
+
say " (#{cache_size.round(1)} MB)", :white
|
|
541
|
+
else
|
|
542
|
+
say " (does not exist)", :white
|
|
543
|
+
end
|
|
544
|
+
end
|
|
545
|
+
|
|
546
|
+
say "-" * 40
|
|
547
|
+
|
|
548
|
+
# Ask for confirmation unless --force
|
|
549
|
+
unless options[:force]
|
|
550
|
+
message = "\nAre you sure you want to reset #{items_to_reset.join(', ')}?"
|
|
551
|
+
|
|
552
|
+
# Check if we're in interactive mode
|
|
553
|
+
if ENV['THOR_INTERACTIVE_SESSION'] == 'true'
|
|
554
|
+
# In interactive mode, use a simple prompt
|
|
555
|
+
say message, :yellow
|
|
556
|
+
response = ask("Type 'yes' to confirm, anything else to cancel:", :yellow)
|
|
557
|
+
confirmed = response.downcase == 'yes'
|
|
558
|
+
else
|
|
559
|
+
# In CLI mode, use Thor's yes? method
|
|
560
|
+
confirmed = yes?(message + " (y/N)", :yellow)
|
|
561
|
+
end
|
|
562
|
+
|
|
563
|
+
unless confirmed
|
|
564
|
+
say "\nReset cancelled.", :cyan
|
|
565
|
+
return
|
|
566
|
+
end
|
|
567
|
+
end
|
|
568
|
+
|
|
569
|
+
# Perform the reset
|
|
570
|
+
say "\nResetting...", :green
|
|
571
|
+
|
|
572
|
+
if reset_db && File.exist?(db_path)
|
|
573
|
+
say "Removing database: #{db_path}"
|
|
574
|
+
FileUtils.rm_rf(db_path)
|
|
575
|
+
say " ✓ Database removed", :green
|
|
576
|
+
end
|
|
577
|
+
|
|
578
|
+
if reset_models
|
|
579
|
+
model_files = [
|
|
580
|
+
model_path,
|
|
581
|
+
model_path.sub(/\.bin$/, '_metadata.json'),
|
|
582
|
+
model_path.sub(/\.bin$/, '_embeddings.json')
|
|
583
|
+
]
|
|
584
|
+
model_files.each do |file|
|
|
585
|
+
if File.exist?(file)
|
|
586
|
+
say "Removing model file: #{file}"
|
|
587
|
+
FileUtils.rm_f(file)
|
|
588
|
+
say " ✓ Removed", :green
|
|
589
|
+
end
|
|
590
|
+
end
|
|
591
|
+
end
|
|
592
|
+
|
|
593
|
+
if reset_cache
|
|
594
|
+
# Clear in-memory cache
|
|
595
|
+
clear_cache
|
|
596
|
+
|
|
597
|
+
# Optionally clear cache directory (but preserve history)
|
|
598
|
+
cache_dir = File.expand_path("~/.cache/ragnar")
|
|
599
|
+
if Dir.exist?(cache_dir)
|
|
600
|
+
# Preserve history file
|
|
601
|
+
history_file = File.join(cache_dir, "history")
|
|
602
|
+
history_content = File.read(history_file) if File.exist?(history_file)
|
|
603
|
+
|
|
604
|
+
# Remove cache directory contents except history
|
|
605
|
+
Dir.glob(File.join(cache_dir, "*")).each do |item|
|
|
606
|
+
next if File.basename(item) == "history"
|
|
607
|
+
if File.directory?(item)
|
|
608
|
+
FileUtils.rm_rf(item)
|
|
609
|
+
else
|
|
610
|
+
FileUtils.rm_f(item)
|
|
611
|
+
end
|
|
612
|
+
say "Removed cache item: #{File.basename(item)}", :green
|
|
613
|
+
end
|
|
614
|
+
end
|
|
615
|
+
say " ✓ Cache cleared", :green
|
|
616
|
+
end
|
|
617
|
+
|
|
618
|
+
say "\nReset complete!", :green
|
|
619
|
+
say "You can now start fresh with 'ragnar index <path>'", :cyan
|
|
620
|
+
end
|
|
621
|
+
|
|
622
|
+
desc "init-config", "Generate a configuration file with current defaults"
|
|
623
|
+
option :global, type: :boolean, default: false, aliases: "-g", desc: "Create global config in home directory"
|
|
624
|
+
option :force, type: :boolean, default: false, aliases: "-f", desc: "Overwrite existing config file"
|
|
625
|
+
def init_config
|
|
626
|
+
config = Config.instance
|
|
627
|
+
|
|
628
|
+
if options[:global]
|
|
629
|
+
config_path = File.expand_path('~/.ragnar.yml')
|
|
630
|
+
else
|
|
631
|
+
config_path = File.join(Dir.pwd, '.ragnar.yml')
|
|
632
|
+
end
|
|
633
|
+
|
|
634
|
+
if File.exist?(config_path) && !options[:force]
|
|
635
|
+
say "Config file already exists at: #{config_path}", :yellow
|
|
636
|
+
say "Use --force to overwrite, or choose a different location.", :yellow
|
|
637
|
+
return
|
|
638
|
+
end
|
|
639
|
+
|
|
640
|
+
generated_path = config.generate_config_file(config_path)
|
|
641
|
+
say "Config file created at: #{generated_path}", :green
|
|
642
|
+
say "Edit this file to customize Ragnar's behavior.", :cyan
|
|
643
|
+
|
|
644
|
+
if config.config_exists?
|
|
645
|
+
say "\nNote: Currently using config from: #{config.config_file_path}", :yellow
|
|
646
|
+
end
|
|
647
|
+
end
|
|
648
|
+
|
|
296
649
|
private
|
|
297
650
|
|
|
651
|
+
def apply_profile!
|
|
652
|
+
return unless options[:profile]
|
|
653
|
+
Config.instance.set_active_profile(options[:profile])
|
|
654
|
+
LLMManager.instance.clear_cache
|
|
655
|
+
end
|
|
656
|
+
|
|
657
|
+
# Cached instance helpers for interactive mode
|
|
658
|
+
def get_cached_database(db_path = nil)
|
|
659
|
+
# Use config default if no path provided
|
|
660
|
+
db_path ||= Config.instance.database_path
|
|
661
|
+
|
|
662
|
+
# Cache database per path - clear cache if path changes
|
|
663
|
+
if @@cached_db_path != db_path
|
|
664
|
+
@@cached_database = nil
|
|
665
|
+
@@cached_db_path = db_path
|
|
666
|
+
@@cached_query_processor = nil # Also clear dependent caches
|
|
667
|
+
end
|
|
668
|
+
|
|
669
|
+
@@cached_database ||= Database.new(db_path)
|
|
670
|
+
end
|
|
671
|
+
|
|
672
|
+
def get_cached_embedder(model_name = nil)
|
|
673
|
+
# Use config default if no model specified
|
|
674
|
+
model_name ||= Config.instance.embedding_model
|
|
675
|
+
@@cached_embedder ||= Embedder.new(model_name: model_name)
|
|
676
|
+
end
|
|
677
|
+
|
|
678
|
+
def get_cached_llm_manager
|
|
679
|
+
@@cached_llm_manager ||= LLMManager.instance
|
|
680
|
+
end
|
|
681
|
+
|
|
682
|
+
def get_cached_query_processor(db_path = nil)
|
|
683
|
+
# Use config default if no path provided
|
|
684
|
+
db_path ||= Config.instance.database_path
|
|
685
|
+
|
|
686
|
+
# Cache query processor per database path
|
|
687
|
+
if @@cached_db_path != db_path || @@cached_query_processor.nil?
|
|
688
|
+
@@cached_query_processor = QueryProcessor.new(db_path: db_path)
|
|
689
|
+
end
|
|
690
|
+
|
|
691
|
+
@@cached_query_processor
|
|
692
|
+
end
|
|
693
|
+
|
|
694
|
+
def clear_cache
|
|
695
|
+
@@cached_database = nil
|
|
696
|
+
@@cached_embedder = nil
|
|
697
|
+
@@cached_llm_manager = nil
|
|
698
|
+
@@cached_query_processor = nil
|
|
699
|
+
@@cached_db_path = nil
|
|
700
|
+
end
|
|
701
|
+
|
|
702
|
+
|
|
703
|
+
# Asks the chat model for a one-line summary of a topic.
#
# @param topic [#terms, #representative_docs] topic to describe; +terms+ must
#   return an Array of strings and +representative_docs+ must accept +k:+
# @param chat [#ask] chat client whose +ask(prompt)+ result responds to
#   +content+
# @return [String] the first meaningful line of the model's answer, or a
#   generic description built from the topic's terms when the model fails or
#   returns nothing usable
def summarize_topic(topic, chat)
  key_terms = topic.terms.first(5)

  # Avoid a dangling "Documents about " when the topic has no terms.
  fallback =
    if key_terms.empty?
      "Related documents"
    else
      "Documents about #{key_terms.first(2).join(' and ')}"
    end

  # Get representative documents for context
  sample_docs = topic.representative_docs(k: 3)
  numbered_docs = sample_docs.each_with_index.map { |doc, i| "#{i + 1}. #{doc}" }.join("\n")

  # Simple, clear prompt for summarization
  prompt = <<~PROMPT
    Summarize what connects these documents in 1-2 sentences:

    Key terms: #{key_terms.join(', ')}

    Documents:
    #{numbered_docs}

    Summary:
  PROMPT

  begin
    response = chat.ask(prompt).content.strip
    return "Related documents" if response.empty?

    # Strip echoed labels line by line and keep the first line that still has
    # text, so an answer like "Summary:\n<text>" is not thrown away.
    summary = response.each_line
                      .map { |line| line.strip.sub(/\A(Summary:|Topic:|Documents:)/i, '').strip }
                      .find { |line| !line.empty? }

    summary || fallback
  rescue StandardError
    # Best effort: any failure talking to the model degrades to the fallback.
    fallback
  end
end
|
|
729
|
+
|
|
298
730
|
def fetch_all_documents(database)
|
|
299
731
|
# Temporary workaround to get all documents
|
|
300
732
|
# In production, we'd add a proper method to Database class
|
|
@@ -321,9 +753,12 @@ module Ragnar
|
|
|
321
753
|
[]
|
|
322
754
|
end
|
|
323
755
|
|
|
324
|
-
def display_topics(topics)
|
|
756
|
+
def display_topics(topics, show_summaries: false)
|
|
325
757
|
say "\n" + "="*60, :green
|
|
326
758
|
say "Topic Analysis Results", :cyan
|
|
759
|
+
if show_summaries
|
|
760
|
+
say " (with LLM-generated summaries)", :yellow
|
|
761
|
+
end
|
|
327
762
|
say "="*60, :green
|
|
328
763
|
|
|
329
764
|
if topics.empty?
|
|
@@ -342,21 +777,21 @@ module Ragnar
|
|
|
342
777
|
say "\n" + "─" * 40, :blue
|
|
343
778
|
say "MAJOR TOPICS (≥20 docs)", :blue
|
|
344
779
|
say "─" * 40, :blue
|
|
345
|
-
display_topic_group(large_topics, :cyan)
|
|
780
|
+
display_topic_group(large_topics, :cyan, show_summaries: show_summaries)
|
|
346
781
|
end
|
|
347
782
|
|
|
348
783
|
if medium_topics.any?
|
|
349
784
|
say "\n" + "─" * 40, :yellow
|
|
350
785
|
say "MEDIUM TOPICS (10-19 docs)", :yellow
|
|
351
786
|
say "─" * 40, :yellow
|
|
352
|
-
display_topic_group(medium_topics, :yellow)
|
|
787
|
+
display_topic_group(medium_topics, :yellow, show_summaries: show_summaries)
|
|
353
788
|
end
|
|
354
789
|
|
|
355
790
|
if small_topics.any?
|
|
356
791
|
say "\n" + "─" * 40, :white
|
|
357
792
|
say "MINOR TOPICS (<10 docs)", :white
|
|
358
793
|
say "─" * 40, :white
|
|
359
|
-
display_topic_group(small_topics, :white)
|
|
794
|
+
display_topic_group(small_topics, :white, show_summaries: show_summaries)
|
|
360
795
|
end
|
|
361
796
|
|
|
362
797
|
# Summary statistics
|
|
@@ -380,10 +815,18 @@ module Ragnar
|
|
|
380
815
|
say " Small (<10): #{small_topics.length} topics, #{small_topics.sum(&:size)} docs"
|
|
381
816
|
end
|
|
382
817
|
|
|
383
|
-
def display_topic_group(topics, color)
|
|
818
|
+
def display_topic_group(topics, color, show_summaries: false)
|
|
384
819
|
topics.sort_by { |t| -t.size }.each_with_index do |topic, idx|
|
|
385
820
|
say "\n#{topic.label || 'Unlabeled'} (#{topic.size} docs)", color
|
|
386
821
|
|
|
822
|
+
# Show LLM summary if available
|
|
823
|
+
if show_summaries
|
|
824
|
+
summary = topic.instance_variable_get(:@summary)
|
|
825
|
+
if summary
|
|
826
|
+
say " Summary: #{summary}", :green
|
|
827
|
+
end
|
|
828
|
+
end
|
|
829
|
+
|
|
387
830
|
# Show coherence as a bar
|
|
388
831
|
if topic.coherence > 0
|
|
389
832
|
coherence_pct = (topic.coherence * 100).round(0)
|
|
@@ -395,8 +838,8 @@ module Ragnar
|
|
|
395
838
|
# Compact term display
|
|
396
839
|
say " Terms: #{topic.terms.first(6).join(' • ')}" if topic.terms.any?
|
|
397
840
|
|
|
398
|
-
# Short sample
|
|
399
|
-
if topic.representative_docs(k: 1).any?
|
|
841
|
+
# Short sample (unless we showed a summary)
|
|
842
|
+
if !show_summaries && topic.representative_docs(k: 1).any?
|
|
400
843
|
preview = topic.representative_docs(k: 1).first
|
|
401
844
|
preview = preview[0..100] + "..." if preview.length > 100
|
|
402
845
|
say " \"#{preview}\"", :white
|
|
@@ -404,25 +847,34 @@ module Ragnar
|
|
|
404
847
|
end
|
|
405
848
|
end
|
|
406
849
|
|
|
407
|
-
def export_topics(topics, format)
|
|
850
|
+
# Dispatches a topic export to the writer for the requested format.
#
# The format is matched case-insensitively after converting to a String and
# trimming whitespace, so Symbols, padded strings and nil are handled without
# raising; anything other than json/html prints an error via +say+.
#
# @param topics [Array] topics to export
# @param format [String, Symbol, nil] "json" or "html"
# @param embeddings [Object, nil] forwarded to the HTML exporter only
# @param cluster_ids [Object, nil] forwarded to the HTML exporter only
def export_topics(topics, format, embeddings: nil, cluster_ids: nil)
  case format.to_s.strip.downcase
  when 'json'
    export_topics_json(topics)
  when 'html'
    export_topics_html(topics, embeddings: embeddings, cluster_ids: cluster_ids)
  else
    say "Unknown export format: #{format}. Use 'json' or 'html'.", :red
  end
end
|
|
417
860
|
|
|
418
861
|
def export_topics_json(topics)
|
|
862
|
+
topics_data = topics.map do |topic|
|
|
863
|
+
topic_hash = topic.to_h
|
|
864
|
+
# Add summary if it exists
|
|
865
|
+
summary = topic.instance_variable_get(:@summary)
|
|
866
|
+
topic_hash[:summary] = summary if summary
|
|
867
|
+
topic_hash
|
|
868
|
+
end
|
|
869
|
+
|
|
419
870
|
data = {
|
|
420
871
|
generated_at: Time.now.iso8601,
|
|
421
|
-
topics:
|
|
872
|
+
topics: topics_data,
|
|
422
873
|
summary: {
|
|
423
874
|
total_topics: topics.length,
|
|
424
875
|
total_documents: topics.sum(&:size),
|
|
425
|
-
average_size: (topics.sum(&:size).to_f / topics.length).round(1)
|
|
876
|
+
average_size: (topics.sum(&:size).to_f / topics.length).round(1),
|
|
877
|
+
has_summaries: topics.any? { |t| t.instance_variable_get(:@summary) }
|
|
426
878
|
}
|
|
427
879
|
}
|
|
428
880
|
|
|
@@ -431,9 +883,9 @@ module Ragnar
|
|
|
431
883
|
say "Topics exported to: #{filename}", :green
|
|
432
884
|
end
|
|
433
885
|
|
|
434
|
-
def export_topics_html(topics)
|
|
886
|
+
def export_topics_html(topics, embeddings: nil, cluster_ids: nil)
|
|
435
887
|
# Generate self-contained HTML with D3.js visualization
|
|
436
|
-
html = generate_topic_visualization_html(topics)
|
|
888
|
+
html = generate_topic_visualization_html(topics, embeddings: embeddings, cluster_ids: cluster_ids)
|
|
437
889
|
|
|
438
890
|
filename = "topics_#{Time.now.strftime('%Y%m%d_%H%M%S')}.html"
|
|
439
891
|
File.write(filename, html)
|
|
@@ -446,113 +898,5 @@ module Ragnar
|
|
|
446
898
|
end
|
|
447
899
|
end
|
|
448
900
|
|
|
449
|
-
# Builds a self-contained HTML page that renders the topics as a D3 bubble
# chart with a click-to-inspect details panel.
#
# Topic labels, terms and sample text come from indexed documents, so they
# are treated as untrusted:
# * the inlined JSON has every "</" rewritten to "<\/" (an equivalent JSON
#   escape) so document text cannot terminate the <script> element;
# * the details panel escapes every value before it is written to innerHTML.
#
# @param topics [Array] objects responding to id, label, size, terms,
#   coherence and representative_docs(k:)
# @return [String] the complete HTML document
def generate_topic_visualization_html(topics)
  # Convert topics to JSON for D3.js
  topics_json = topics.map do |topic|
    {
      id: topic.id,
      label: topic.label || "Topic #{topic.id}",
      size: topic.size,
      terms: topic.terms.first(10),
      coherence: topic.coherence,
      samples: topic.representative_docs(k: 2).map { |d| d[0..200] }
    }
  end.to_json.gsub('</') { '<\/' } # block form: the replacement is inserted literally

  # HTML template with embedded D3.js
  <<~HTML
    <!DOCTYPE html>
    <html>
    <head>
      <meta charset="utf-8">
      <title>Topic Visualization</title>
      <script src="https://d3js.org/d3.v7.min.js"></script>
      <style>
        body { font-family: -apple-system, sans-serif; margin: 20px; }
        #viz { width: 100%; height: 500px; border: 1px solid #ddd; }
        .topic { cursor: pointer; }
        .topic:hover { opacity: 0.8; }
        #details { margin-top: 20px; padding: 15px; background: #f5f5f5; }
        .term { display: inline-block; margin: 5px; padding: 5px 10px; background: #e0e0e0; border-radius: 3px; }
      </style>
    </head>
    <body>
      <h1>Topic Analysis Results</h1>
      <div id="viz"></div>
      <div id="details">Click on a topic to see details</div>

      <script>
        const data = #{topics_json};

        // Escape text before it is inserted as HTML
        function escapeHtml(value) {
          return String(value).replace(/[&<>"']/g, ch => ({
            "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#39;"
          })[ch]);
        }

        // Create bubble chart
        const width = document.getElementById('viz').clientWidth;
        const height = 500;

        const svg = d3.select("#viz")
          .append("svg")
          .attr("width", width)
          .attr("height", height);

        // Create scale for bubble sizes
        const sizeScale = d3.scaleSqrt()
          .domain([0, d3.max(data, d => d.size)])
          .range([10, 50]);

        // Create color scale
        const colorScale = d3.scaleSequential(d3.interpolateViridis)
          .domain([0, 1]);

        // Create force simulation
        const simulation = d3.forceSimulation(data)
          .force("x", d3.forceX(width / 2).strength(0.05))
          .force("y", d3.forceY(height / 2).strength(0.05))
          .force("collide", d3.forceCollide(d => sizeScale(d.size) + 2));

        // Create bubbles
        const bubbles = svg.selectAll(".topic")
          .data(data)
          .enter().append("g")
          .attr("class", "topic");

        bubbles.append("circle")
          .attr("r", d => sizeScale(d.size))
          .attr("fill", d => colorScale(d.coherence))
          .attr("stroke", "#fff")
          .attr("stroke-width", 2);

        bubbles.append("text")
          .text(d => d.label)
          .attr("text-anchor", "middle")
          .attr("dy", ".3em")
          .style("font-size", d => Math.min(sizeScale(d.size) / 3, 14) + "px");

        // Add click handler
        bubbles.on("click", function(event, d) {
          showDetails(d);
        });

        // Update positions
        simulation.on("tick", () => {
          bubbles.attr("transform", d => `translate(${d.x},${d.y})`);
        });

        // Show topic details
        function showDetails(topic) {
          const details = document.getElementById('details');
          details.innerHTML = `
            <h2>${escapeHtml(topic.label)}</h2>
            <p><strong>Documents:</strong> ${escapeHtml(topic.size)}</p>
            <p><strong>Coherence:</strong> ${(topic.coherence * 100).toFixed(1)}%</p>
            <p><strong>Top Terms:</strong></p>
            <div>${topic.terms.map(t => `<span class="term">${escapeHtml(t)}</span>`).join('')}</div>
            <p><strong>Sample Documents:</strong></p>
            ${topic.samples.map(s => `<p style="font-size: 0.9em; color: #666;">"${escapeHtml(s)}..."</p>`).join('')}
          `;
        }
      </script>
    </body>
    </html>
  HTML
end
|
|
557
901
|
end
|
|
558
|
-
end
|
|
902
|
+
end
|