ragnar-cli 0.1.0.pre.2 → 0.1.0.pre.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +187 -36
- data/lib/ragnar/cli.rb +543 -172
- data/lib/ragnar/cli_visualization.rb +184 -0
- data/lib/ragnar/config.rb +226 -0
- data/lib/ragnar/database.rb +94 -8
- data/lib/ragnar/llm_manager.rb +4 -1
- data/lib/ragnar/query_processor.rb +38 -20
- data/lib/ragnar/topic_modeling.rb +13 -10
- data/lib/ragnar/umap_processor.rb +190 -73
- data/lib/ragnar/umap_transform_service.rb +169 -88
- data/lib/ragnar/version.rb +1 -1
- metadata +43 -22
- data/lib/ragnar/topic_modeling/engine.rb +0 -221
- data/lib/ragnar/topic_modeling/labeling_strategies.rb +0 -300
- data/lib/ragnar/topic_modeling/llm_adapter.rb +0 -131
- data/lib/ragnar/topic_modeling/metrics.rb +0 -186
- data/lib/ragnar/topic_modeling/term_extractor.rb +0 -170
- data/lib/ragnar/topic_modeling/topic.rb +0 -117
- data/lib/ragnar/topic_modeling/topic_labeler.rb +0 -61
@@ -0,0 +1,184 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Ragnar
  # Mixin that renders topic-modeling results as a standalone HTML page
  # (D3.js bubble chart plus a details panel).
  module CLIVisualization
    # Build the HTML document for a list of topics.
    #
    # Each topic must respond to +id+, +label+, +size+, +terms+, +coherence+
    # and +representative_docs(k:)+. An optional +@summary+ instance variable
    # on the topic is included when set.
    #
    # Topic text originates from indexed documents, so it is treated as
    # untrusted: the JSON payload is made safe for inline <script> embedding,
    # and the page escapes every value before inserting it as HTML.
    #
    # @param topics [Array<#id, #label, #size, #terms, #coherence>] topics to render
    # @param embeddings [Object, nil] accepted for interface compatibility; not used here
    # @param cluster_ids [Object, nil] accepted for interface compatibility; not used here
    # @return [String] a complete HTML document
    def generate_topic_visualization_html(topics, embeddings: nil, cluster_ids: nil)
      # Convert topics to plain hashes for D3.js
      payload = topics.map do |topic|
        entry = {
          id: topic.id,
          label: topic.label || "Topic #{topic.id}",
          size: topic.size,
          terms: topic.terms.first(10),
          coherence: topic.coherence,
          samples: topic.representative_docs(k: 2).map { |doc| doc[0..200] }
        }

        # Add summary if it exists
        summary = topic.instance_variable_get(:@summary)
        entry[:summary] = summary if summary

        entry
      end

      # Inside an inline <script>, a raw "</script>" (or "<!--") in any string
      # would end the script element early. Emit those characters as \uXXXX
      # escapes; the JavaScript parser decodes them back to the same text.
      topics_json = payload.to_json.gsub(/[<>&\u2028\u2029]/) { |char| format('\u%04x', char.ord) }

      # HTML template with enhanced visualization
      <<~HTML
        <!DOCTYPE html>
        <html>
        <head>
          <meta charset="utf-8">
          <title>Topic Visualization</title>
          <script src="https://d3js.org/d3.v7.min.js"></script>
          <style>
            body {
              font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
              margin: 20px;
              background: #f8f9fa;
            }
            .container {
              max-width: 1400px;
              margin: 0 auto;
            }
            h1 {
              color: #2c3e50;
              margin-bottom: 10px;
            }
            .viz-panel {
              background: white;
              border-radius: 8px;
              box-shadow: 0 2px 4px rgba(0,0,0,0.1);
              padding: 15px;
            }
            #bubble-viz {
              height: 500px;
            }
            .topic { cursor: pointer; }
            .topic:hover { opacity: 0.8; }
            #details {
              background: white;
              padding: 20px;
              border-radius: 8px;
              box-shadow: 0 2px 4px rgba(0,0,0,0.1);
              margin-top: 20px;
            }
            .term {
              display: inline-block;
              margin: 5px;
              padding: 5px 10px;
              background: #e3f2fd;
              border-radius: 3px;
              color: #1976d2;
              font-size: 14px;
            }
          </style>
        </head>
        <body>
          <div class="container">
            <h1>Topic Analysis Results</h1>

            <div class="viz-panel">
              <h3>Topic Overview</h3>
              <div id="bubble-viz"></div>
            </div>

            <div id="details">Click on a topic to see details</div>
          </div>

          <script>
            const topicsData = #{topics_json};

            // Escape text before it is inserted through innerHTML
            const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
            function escapeHtml(value) {
              return String(value).replace(/[&<>"']/g, ch => HTML_ESCAPES[ch]);
            }

            // Create bubble chart
            function createBubbleChart() {
              const width = document.getElementById('bubble-viz').clientWidth - 30;
              const height = 470;

              const svg = d3.select("#bubble-viz")
                .append("svg")
                .attr("width", width)
                .attr("height", height);

              // Create scale for bubble sizes
              const sizeScale = d3.scaleSqrt()
                .domain([0, d3.max(topicsData, d => d.size)])
                .range([20, 60]);

              // Create color scale
              const colorScale = d3.scaleSequential(d3.interpolateTurbo)
                .domain([0, 1]);

              // Create force simulation
              const simulation = d3.forceSimulation(topicsData)
                .force("x", d3.forceX(width / 2).strength(0.05))
                .force("y", d3.forceY(height / 2).strength(0.05))
                .force("collide", d3.forceCollide(d => sizeScale(d.size) + 3));

              // Create bubbles
              const bubbles = svg.selectAll(".topic")
                .data(topicsData)
                .enter().append("g")
                .attr("class", "topic");

              bubbles.append("circle")
                .attr("r", d => sizeScale(d.size))
                .attr("fill", d => colorScale(d.coherence))
                .attr("stroke", "#fff")
                .attr("stroke-width", 2)
                .style("filter", "drop-shadow(0px 2px 3px rgba(0,0,0,0.2))");

              bubbles.append("text")
                .text(d => d.label)
                .attr("text-anchor", "middle")
                .attr("dy", ".3em")
                .style("font-size", d => Math.min(sizeScale(d.size) / 3, 14) + "px")
                .style("fill", "white")
                .style("font-weight", "500");

              // Add click handler
              bubbles.on("click", function(event, d) {
                showDetails(d);
              });

              // Update positions
              simulation.on("tick", () => {
                bubbles.attr("transform", d => `translate(${d.x},${d.y})`);
              });
            }

            // Show topic details (all topic text is escaped before insertion)
            function showDetails(topic) {
              const details = document.getElementById('details');
              let summaryHtml = '';
              if (topic.summary) {
                summaryHtml = `
                  <p><strong>Summary:</strong></p>
                  <p style="font-size: 1.1em; color: #2c5234; padding: 15px; background: #e8f5e8; border-radius: 6px; border-left: 4px solid #4caf50; margin: 15px 0; line-height: 1.5;">${escapeHtml(topic.summary)}</p>
                `;
              }

              details.innerHTML = `
                <h2>${escapeHtml(topic.label)}</h2>
                <p><strong>Documents:</strong> ${escapeHtml(topic.size)}</p>
                <p><strong>Coherence:</strong> ${(topic.coherence * 100).toFixed(1)}%</p>
                ${summaryHtml}
                <p><strong>Top Terms:</strong></p>
                <div>${topic.terms.map(t => `<span class="term">${escapeHtml(t)}</span>`).join('')}</div>
                <p><strong>Sample Documents:</strong></p>
                ${topic.samples.map(s => `<p style="font-size: 0.9em; color: #666; padding: 10px; background: #f5f5f5; border-radius: 4px; margin: 10px 0;">"${escapeHtml(s)}..."</p>`).join('')}
              `;
            }

            // Initialize visualizations
            createBubbleChart();

            // Show first topic details by default
            if (topicsData.length > 0) {
              showDetails(topicsData[0]);
            }
          </script>
        </body>
        </html>
      HTML
    end
  end
end
|
@@ -0,0 +1,226 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'yaml'
|
4
|
+
require 'pathname'
|
5
|
+
require 'singleton'
|
6
|
+
require 'fileutils'
|
7
|
+
|
8
|
+
module Ragnar
  # Process-wide configuration loaded from a YAML file.
  #
  # The first file named in CONFIG_FILENAMES found in the current directory,
  # then the home directory, is used. Missing keys fall back to built-in
  # defaults. Creating the singleton also creates the storage directories.
  class Config
    include Singleton

    CONFIG_FILENAMES = ['.ragnar.yml', '.ragnarrc.yml', 'ragnar.yml'].freeze

    def initialize
      @config = load_config
      ensure_directories_exist
    end

    # Look up a value by dotted key path (e.g. "storage.database_path").
    #
    # @param key_path [String] dot-separated keys
    # @param default [Object] returned when the key is absent or nil
    #   (an explicit +false+ in the file is preserved)
    # @return [Object] the configured value or +default+; string values
    #   starting with "~" are home-expanded when they form a valid path
    def get(key_path, default = nil)
      # A scalar in the middle of the path (e.g. `storage: oops`) means the
      # key is absent; it must not raise.
      value = key_path.split('.').reduce(@config) do |node, key|
        node.is_a?(Hash) ? node[key] : nil
      end

      # Use default only if value is nil (not false)
      expand_home(value.nil? ? default : value)
    end

    # Common config accessors
    def database_path
      get('storage.database_path', default_database_path)
    end

    def history_file
      get('storage.history_file', default_history_file)
    end

    def models_dir
      get('storage.models_dir', default_models_dir)
    end

    def embedding_model
      get('embeddings.model', Ragnar::DEFAULT_EMBEDDING_MODEL)
    end

    def chunk_size
      get('embeddings.chunk_size', Ragnar::DEFAULT_CHUNK_SIZE)
    end

    def chunk_overlap
      get('embeddings.chunk_overlap', Ragnar::DEFAULT_CHUNK_OVERLAP)
    end

    def llm_model
      get('llm.default_model', "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF")
    end

    def llm_gguf_file
      get('llm.default_gguf_file', "tinyllama-1.1b-chat-v1.0.q4_k_m.gguf")
    end

    def interactive_prompt
      get('interactive.prompt', "ragnar> ")
    end

    def quiet_mode?
      get('interactive.quiet_mode', true)
    end

    def show_progress?
      get('output.show_progress', true)
    end

    def query_top_k
      get('query.top_k', 3)
    end

    def enable_query_rewriting?
      get('query.enable_query_rewriting', true)
    end

    # Config file management

    # @return [String, nil] path of the config file in use, or nil when none was found
    def config_file_path
      @config_file_path
    end

    def config_exists?
      !@config_file_path.nil?
    end

    # Re-read the config file and re-create the storage directories.
    def reload!
      @config = load_config
      ensure_directories_exist
    end

    # Generate a config file with current/default settings
    #
    # @param path [String, nil] destination; defaults to ~/.ragnar.yml
    # @return [String] the path that was written
    def generate_config_file(path = nil)
      path ||= File.expand_path('~/.ragnar.yml')

      config_content = {
        'storage' => {
          'database_path' => '~/.cache/ragnar/database',
          'models_dir' => '~/.cache/ragnar/models',
          'history_file' => '~/.cache/ragnar/history'
        },
        'embeddings' => {
          'model' => Ragnar::DEFAULT_EMBEDDING_MODEL,
          'chunk_size' => Ragnar::DEFAULT_CHUNK_SIZE,
          'chunk_overlap' => Ragnar::DEFAULT_CHUNK_OVERLAP
        },
        'umap' => {
          'reduced_dimensions' => Ragnar::DEFAULT_REDUCED_DIMENSIONS,
          'n_neighbors' => 15,
          'min_dist' => 0.1,
          'model_filename' => 'umap_model.bin'
        },
        'llm' => {
          'default_model' => 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF',
          'default_gguf_file' => 'tinyllama-1.1b-chat-v1.0.q4_k_m.gguf'
        },
        'query' => {
          'top_k' => 3,
          'enable_query_rewriting' => true
        },
        'interactive' => {
          'prompt' => 'ragnar> ',
          'quiet_mode' => true
        },
        'output' => {
          'show_progress' => true
        }
      }

      # Ensure parent directory exists
      FileUtils.mkdir_p(File.dirname(path))

      # Write config file with comments
      File.write(path, generate_yaml_with_comments(config_content))
      path
    end

    private

    # Load the first config file found. Any problem (unreadable file, YAML
    # syntax error, non-mapping root) produces a warning and an empty config
    # so the built-in defaults apply.
    def load_config
      @config_file_path = find_config_file
      return {} unless @config_file_path && File.exist?(@config_file_path)

      loaded = YAML.load_file(@config_file_path)
      return {} unless loaded # empty file
      return loaded if loaded.is_a?(Hash)

      # A list or scalar at the top level cannot be queried by key path.
      warn "Warning: Config file #{@config_file_path} does not contain a YAML mapping; using defaults"
      {}
    rescue => e
      warn "Warning: Error loading config file #{@config_file_path}: #{e.message}"
      {}
    end

    def find_config_file
      # Search order: current directory → home directory
      search_paths = [
        Dir.pwd,
        File.expand_path('~')
      ]

      search_paths.each do |dir|
        CONFIG_FILENAMES.each do |filename|
          path = File.join(dir, filename)
          return path if File.exist?(path)
        end
      end

      nil
    end

    def ensure_directories_exist
      [database_path, models_dir, File.dirname(history_file)].compact.each do |dir|
        FileUtils.mkdir_p(dir)
      end
    end

    # Expand a leading "~" in string values. Strings that start with "~" but
    # are not valid home references (e.g. a prompt such as "~> ") make
    # File.expand_path raise ArgumentError; those are returned unchanged.
    def expand_home(value)
      return value unless value.is_a?(String) && value.start_with?('~')

      File.expand_path(value)
    rescue ArgumentError
      value
    end

    # Base cache directory. An unset or empty XDG_CACHE_HOME falls back to
    # ~/.cache.
    def cache_home
      configured = ENV['XDG_CACHE_HOME']
      return File.expand_path('~/.cache') if configured.nil? || configured.empty?

      configured
    end

    def default_database_path
      File.join(cache_home, 'ragnar', 'database')
    end

    def default_history_file
      File.join(cache_home, 'ragnar', 'history')
    end

    def default_models_dir
      File.join(cache_home, 'ragnar', 'models')
    end

    # Serialize the config hash to YAML and prepend an explanatory header.
    def generate_yaml_with_comments(config_hash)
      yaml_content = YAML.dump(config_hash)

      # Add header comment
      commented = <<~HEADER
        # Ragnar Configuration File
        #
        # This file configures default settings for Ragnar RAG system.
        # Save as .ragnar.yml in your project directory or ~/.ragnar.yml for global settings.
        #
        # Search order: ./.ragnar.yml → ~/.ragnar.yml → built-in defaults
        #
        # All paths support ~ expansion (e.g., ~/.cache/ragnar/database)

      HEADER

      commented + yaml_content
    end
  end
end
|
data/lib/ragnar/database.rb
CHANGED
@@ -5,6 +5,7 @@ module Ragnar
|
|
5
5
|
def initialize(db_path, table_name: "documents")
|
6
6
|
@db_path = db_path
|
7
7
|
@table_name = table_name
|
8
|
+
@dataset_cache = nil # Cache to prevent file descriptor leaks
|
8
9
|
ensure_database_exists
|
9
10
|
end
|
10
11
|
|
@@ -34,16 +35,23 @@ module Ragnar
|
|
34
35
|
metadata: :string
|
35
36
|
}
|
36
37
|
|
38
|
+
# Clear cache before modifying dataset
|
39
|
+
clear_dataset_cache
|
40
|
+
|
37
41
|
# Use the new open_or_create method from Lancelot
|
38
42
|
# This automatically handles both creating new and opening existing datasets
|
39
43
|
dataset = Lancelot::Dataset.open_or_create(@db_path, schema: schema)
|
40
44
|
dataset.add_documents(data)
|
45
|
+
|
46
|
+
# Clear cache after modification to ensure fresh data on next read
|
47
|
+
clear_dataset_cache
|
41
48
|
end
|
42
49
|
|
43
50
|
def get_embeddings(limit: nil, offset: 0)
|
44
51
|
return [] unless dataset_exists?
|
45
52
|
|
46
|
-
dataset =
|
53
|
+
dataset = cached_dataset
|
54
|
+
return [] unless dataset
|
47
55
|
|
48
56
|
# Get all documents or a subset
|
49
57
|
docs = if limit && offset > 0
|
@@ -67,7 +75,8 @@ module Ragnar
|
|
67
75
|
def update_reduced_embeddings(updates)
|
68
76
|
return if updates.empty?
|
69
77
|
|
70
|
-
dataset =
|
78
|
+
dataset = cached_dataset
|
79
|
+
return unless dataset
|
71
80
|
|
72
81
|
# Get all existing documents and safely extract their data
|
73
82
|
all_docs = dataset.to_a.map do |doc|
|
@@ -113,17 +122,24 @@ module Ragnar
|
|
113
122
|
metadata: :string
|
114
123
|
}
|
115
124
|
|
125
|
+
# Clear cache before recreating dataset
|
126
|
+
clear_dataset_cache
|
127
|
+
|
116
128
|
# Remove old dataset and create new one with updated data
|
117
129
|
FileUtils.rm_rf(@db_path)
|
118
130
|
# Use open_or_create which will create since we just deleted the path
|
119
131
|
dataset = Lancelot::Dataset.open_or_create(@db_path, schema: schema)
|
120
132
|
dataset.add_documents(updated_docs)
|
133
|
+
|
134
|
+
# Clear cache after modification
|
135
|
+
clear_dataset_cache
|
121
136
|
end
|
122
137
|
|
123
138
|
def search_similar(embedding, k: 10, use_reduced: false)
|
124
139
|
return [] unless dataset_exists?
|
125
140
|
|
126
|
-
dataset =
|
141
|
+
dataset = cached_dataset
|
142
|
+
return [] unless dataset
|
127
143
|
|
128
144
|
embedding_field = use_reduced ? :reduced_embedding : :embedding
|
129
145
|
|
@@ -149,7 +165,9 @@ module Ragnar
|
|
149
165
|
def count
|
150
166
|
return 0 unless dataset_exists?
|
151
167
|
|
152
|
-
dataset =
|
168
|
+
dataset = cached_dataset
|
169
|
+
return 0 unless dataset
|
170
|
+
|
153
171
|
dataset.to_a.size
|
154
172
|
end
|
155
173
|
|
@@ -166,7 +184,18 @@ module Ragnar
|
|
166
184
|
}
|
167
185
|
end
|
168
186
|
|
169
|
-
dataset =
|
187
|
+
dataset = cached_dataset
|
188
|
+
unless dataset
|
189
|
+
return {
|
190
|
+
document_count: 0,
|
191
|
+
total_documents: 0,
|
192
|
+
unique_files: 0,
|
193
|
+
total_chunks: 0,
|
194
|
+
with_embeddings: 0,
|
195
|
+
with_reduced_embeddings: 0,
|
196
|
+
total_size_mb: 0.0
|
197
|
+
}
|
198
|
+
end
|
170
199
|
|
171
200
|
# Get all documents
|
172
201
|
all_docs = dataset.to_a
|
@@ -214,7 +243,9 @@ module Ragnar
|
|
214
243
|
def get_all_documents_with_embeddings(limit: nil)
|
215
244
|
return [] unless dataset_exists?
|
216
245
|
|
217
|
-
dataset =
|
246
|
+
dataset = cached_dataset
|
247
|
+
return [] unless dataset
|
248
|
+
|
218
249
|
all_docs = limit ? dataset.first(limit) : dataset.to_a
|
219
250
|
|
220
251
|
all_docs.select { |doc| doc[:embedding] && !doc[:embedding].empty? }
|
@@ -223,7 +254,8 @@ module Ragnar
|
|
223
254
|
def full_text_search(query, limit: 10)
|
224
255
|
return [] unless dataset_exists?
|
225
256
|
|
226
|
-
dataset =
|
257
|
+
dataset = cached_dataset
|
258
|
+
return [] unless dataset
|
227
259
|
|
228
260
|
# Use Lancelot's full-text search
|
229
261
|
results = dataset.full_text_search(
|
@@ -243,11 +275,49 @@ module Ragnar
|
|
243
275
|
end
|
244
276
|
end
|
245
277
|
|
278
|
+
# Get the total number of documents in the database
|
279
|
+
def document_count
|
280
|
+
count
|
281
|
+
end
|
282
|
+
|
283
|
+
# Get documents by their IDs
#
# Scans the whole dataset and keeps the rows whose :id is in +ids+, in
# dataset order.
#
# @param ids [Array<String>, nil] Document IDs to fetch; nil is treated as empty
# @return [Array<Hash>] Documents with their embeddings and parsed metadata
def get_documents_by_ids(ids)
  wanted_ids = Array(ids)
  return [] if wanted_ids.empty? || !dataset_exists?

  dataset = cached_dataset
  return [] unless dataset

  # Hash lookup gives constant-time membership tests without relying on
  # `set` having been required elsewhere.
  wanted = wanted_ids.to_h { |id| [id, true] }

  dataset.to_a.select { |doc| wanted.key?(doc[:id]) }.map do |doc|
    raw_metadata = doc[:metadata]
    # Missing or blank metadata is treated as an empty JSON object.
    raw_metadata = "{}" if raw_metadata.nil? || raw_metadata.empty?

    {
      id: doc[:id],
      chunk_text: doc[:chunk_text],
      file_path: doc[:file_path],
      chunk_index: doc[:chunk_index],
      embedding: doc[:embedding],
      reduced_embedding: doc[:reduced_embedding],
      metadata: JSON.parse(raw_metadata)
    }
  end
end
|
308
|
+
|
246
309
|
def dataset_exists?
|
247
310
|
return false unless File.exist?(@db_path)
|
248
311
|
|
312
|
+
# Try to use cached dataset if available
|
313
|
+
if @dataset_cache
|
314
|
+
return true
|
315
|
+
end
|
316
|
+
|
317
|
+
# Otherwise check if we can open it
|
249
318
|
begin
|
250
|
-
|
319
|
+
# Don't cache here, just check existence
|
320
|
+
dataset = Lancelot::Dataset.open(@db_path)
|
251
321
|
true
|
252
322
|
rescue
|
253
323
|
false
|
@@ -263,5 +333,21 @@ module Ragnar
|
|
263
333
|
def table_exists?
|
264
334
|
dataset_exists?
|
265
335
|
end
|
336
|
+
|
337
|
+
# Cached dataset accessor to prevent file descriptor leaks
#
# Opens the dataset at @db_path on first use and reuses the same handle on
# later calls. Returns nil when the path does not exist or the dataset
# cannot be opened; a failed open is not cached, so the next call retries.
#
# @return [Object, nil] the handle returned by Lancelot::Dataset.open, or nil
def cached_dataset
  unless File.exist?(@db_path)
    # The dataset is gone: drop any handle that still points at it so it is
    # not held (or handed out again) once the path reappears.
    @dataset_cache = nil
    return nil
  end

  @dataset_cache ||= begin
    Lancelot::Dataset.open(@db_path)
  rescue StandardError
    nil
  end
end
|
347
|
+
|
348
|
+
# Clear the cached dataset (e.g., after modifications)
|
349
|
+
def clear_dataset_cache
|
350
|
+
@dataset_cache = nil
|
351
|
+
end
|
266
352
|
end
|
267
353
|
end
|
data/lib/ragnar/llm_manager.rb
CHANGED
@@ -18,7 +18,10 @@ module Ragnar
|
|
18
18
|
|
19
19
|
@mutex.synchronize do
|
20
20
|
@llms[cache_key] ||= begin
|
21
|
-
|
21
|
+
# Only show loading message if not in interactive mode or if verbose
|
22
|
+
show_loading = ENV['DEBUG'] # Only show in debug mode for now
|
23
|
+
puts "Loading LLM: #{model_id}..." if show_loading && !@llms.key?(cache_key)
|
24
|
+
|
22
25
|
if gguf_file
|
23
26
|
Candle::LLM.from_pretrained(model_id, gguf_file: gguf_file)
|
24
27
|
else
|
@@ -16,29 +16,47 @@ module Ragnar
|
|
16
16
|
@reranker = nil # Will initialize when needed
|
17
17
|
end
|
18
18
|
|
19
|
-
def query(user_query, top_k: 3, verbose: false)
|
19
|
+
def query(user_query, top_k: 3, verbose: false, enable_rewriting: true)
|
20
20
|
puts "Processing query: #{user_query}" if verbose
|
21
21
|
|
22
|
-
# Step 1: Rewrite and analyze the query
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
22
|
+
# Step 1: Rewrite and analyze the query (if enabled)
|
23
|
+
if enable_rewriting
|
24
|
+
puts "\n#{'-'*60}" if verbose
|
25
|
+
puts "STEP 1: Query Analysis & Rewriting" if verbose
|
26
|
+
puts "-"*60 if verbose
|
27
|
+
|
28
|
+
rewritten = @rewriter.rewrite(user_query)
|
29
|
+
|
30
|
+
if verbose
|
31
|
+
puts "\nOriginal Query: #{user_query}"
|
32
|
+
puts "\nRewritten Query Analysis:"
|
33
|
+
puts " Clarified Intent: #{rewritten['clarified_intent']}"
|
34
|
+
puts " Query Type: #{rewritten['query_type']}"
|
35
|
+
puts " Context Needed: #{rewritten['context_needed']}"
|
36
|
+
puts "\nGenerated Sub-queries (#{rewritten['sub_queries'].length}):"
|
37
|
+
rewritten['sub_queries'].each_with_index do |sq, idx|
|
38
|
+
puts " #{idx + 1}. #{sq}"
|
39
|
+
end
|
40
|
+
if rewritten['key_terms'] && !rewritten['key_terms'].empty?
|
41
|
+
puts "\nKey Terms Identified:"
|
42
|
+
puts " #{rewritten['key_terms'].join(', ')}"
|
43
|
+
end
|
38
44
|
end
|
39
|
-
|
40
|
-
|
41
|
-
|
45
|
+
else
|
46
|
+
# Skip rewriting - use original query directly
|
47
|
+
rewritten = {
|
48
|
+
'clarified_intent' => user_query,
|
49
|
+
'query_type' => 'direct',
|
50
|
+
'context_needed' => 'general',
|
51
|
+
'sub_queries' => [user_query],
|
52
|
+
'key_terms' => []
|
53
|
+
}
|
54
|
+
|
55
|
+
if verbose
|
56
|
+
puts "\n#{'-'*60}"
|
57
|
+
puts "STEP 1: Query Analysis (Rewriting Disabled)"
|
58
|
+
puts "-"*60
|
59
|
+
puts "\nUsing original query directly"
|
42
60
|
end
|
43
61
|
end
|
44
62
|
|