simple-rag-zc 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: d8b0204bb64f55c075ecb1287b983fda160b1ffcf7e552e346372ab7f95bb3b8
4
+ data.tar.gz: 330043e72800a113dcc4df223dbfb978c26449ca2c2d477b13db82c7e5c2e743
5
+ SHA512:
6
+ metadata.gz: 074af0f36149c2e9d5c0b7cd0dacf369d1c21bb4f812bd3c63f4b41772b5a6cb05f3ff8f75f9ed3a5d28f5c9846adb45e11ec0d1b18b7ada77f3a845ca5a989f
7
+ data.tar.gz: 6c34c79345703bc0cfb83bff5373b9d04a03bbba1c5549a96749b26986b2e49ba56d2e7dc63ba1bf749a0df01a9aec0ac66cb96f5ec03ed1c822a3c73a4379fc
data/README.md ADDED
@@ -0,0 +1,28 @@
1
+ # simple-rag
2
+
3
+ RAG on Markdown Files
4
+
5
+ - Use **Search** for standard retrieval
6
+ - Use **Search+** for agentic query expansion and fast text match
7
+ - Use **Synthesize** to combine retrieved notes
8
+
9
+ ## Setup
10
+
11
+ - Setup Config JSON
12
+ - Run `run-index config.json`
13
+ - Run `run-server config.json` and open `http://localhost:4567/q.html`
14
+
15
+ ## Publishing
16
+
17
+ To release a new version to [RubyGems](https://rubygems.org), run:
18
+
19
+ ```bash
20
+ gem build simple-rag.gemspec
21
+ gem push simple-rag-$(ruby -Ilib -e 'require "simple_rag/version"; puts SimpleRag::VERSION').gem
22
+ ```
23
+
24
+ Install the gem directly:
25
+
26
+ ```bash
27
+ gem install simple-rag
28
+ ```
@@ -0,0 +1,21 @@
1
+ {
2
+ "chat": {
3
+ "provider": "openai",
4
+ "url": "",
5
+ "model": "gpt-3.5-turbo-16k"
6
+ },
7
+ "embedding": {
8
+ "provider": "openai",
9
+ "url": "",
10
+ "model": "text-embedding-3-small"
11
+ },
12
+ "paths": [
13
+ {
14
+ "name": "temp",
15
+ "reader": "text",
16
+ "threshold": 0.3,
17
+ "dir": "D:\\Studies\\tmp\\learning",
18
+ "out": "D:\\Studies\\tmp\\learning-gpt1.dt"
19
+ }
20
+ ]
21
+ }
data/exe/public/q.html ADDED
@@ -0,0 +1,381 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Search Page</title>
7
+ <style>
8
+ body {
9
+ display: flex;
10
+ margin: 0;
11
+ padding: 0;
12
+ font-family: Arial, sans-serif;
13
+ }
14
+ #main-content {
15
+ flex-grow: 1;
16
+ padding: 20px;
17
+ }
18
+ #paths-container {
19
+ margin-bottom: 20px;
20
+ }
21
+ #paths-list {
22
+ list-style-type: none;
23
+ padding: 0;
24
+ display: flex;
25
+ flex-wrap: wrap;
26
+ gap: 10px;
27
+ }
28
+ #paths-list li {
29
+ display: flex;
30
+ align-items: center;
31
+ }
32
+ #paths-list label {
33
+ margin-left: 5px;
34
+ }
35
+ #search-container {
36
+ display: flex;
37
+ margin-bottom: 20px;
38
+ }
39
+ #search-input {
40
+ flex-grow: 1;
41
+ height: 60px;
42
+ font-size: 16px;
43
+ padding: 0 10px;
44
+ }
45
+ #search-button {
46
+ width: 100px;
47
+ height: 66px;
48
+ font-size: 16px;
49
+ margin-left: 10px;
50
+ }
51
+ #search-plus-button {
52
+ width: 100px;
53
+ height: 66px;
54
+ font-size: 16px;
55
+ margin-left: 10px;
56
+ }
57
+ #synthesize-button {
58
+ width: 120px;
59
+ height: 66px;
60
+ font-size: 16px;
61
+ margin-left: 10px;
62
+ }
63
+ #response-container {
64
+ display: flex;
65
+ flex-wrap: wrap;
66
+ gap: 10px;
67
+ }
68
+ .response-item {
69
+ border: 1px solid #ccc;
70
+ padding: 10px;
71
+ border-radius: 5px;
72
+ width: calc(50% - 5px);
73
+ box-sizing: border-box;
74
+ }
75
+ .discuss-button {
76
+ margin-top: 10px;
77
+ }
78
+ </style>
79
+ <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
80
+ </head>
81
+ <body>
82
+ <div id="main-content">
83
+ <div id="search-container">
84
+ <input type="text" id="search-input" placeholder="Enter your search query">
85
+ <button id="search-button">Search</button>
86
+ <button id="search-plus-button">Search+</button>
87
+ <button id="synthesize-button">Synthesize</button>
88
+ </div>
89
+ <div id="paths-container">
90
+ <ul id="paths-list"></ul>
91
+ </div>
92
+ <div id="response-container"></div>
93
+ </div>
94
+
95
+ <script>
96
+ document.addEventListener('DOMContentLoaded', function() {
97
+ const pathsList = document.getElementById('paths-list');
98
+ const searchInput = document.getElementById('search-input');
99
+ const searchButton = document.getElementById('search-button');
100
+ const searchPlusButton = document.getElementById('search-plus-button');
101
+ const synthesizeButton = document.getElementById('synthesize-button');
102
+ const responseContainer = document.getElementById('response-container');
103
+ let retrievedNotes = [];
104
+
105
+ // Fetch paths and render sidebar
106
+ fetch('http://localhost:4567/paths')
107
+ .then(response => response.json())
108
+ .then(data => {
109
+ data.forEach(item => {
110
+ const li = document.createElement('li');
111
+ li.style.backgroundColor = textToLightColor(item.name);
112
+
113
+ const checkbox = document.createElement('input');
114
+ checkbox.type = 'checkbox';
115
+ checkbox.id = item.name;
116
+ checkbox.name = item.name;
117
+ checkbox.checked = true;
118
+
119
+ const label = document.createElement('label');
120
+ label.htmlFor = item.name;
121
+ label.appendChild(document.createTextNode(item.name));
122
+
123
+ li.appendChild(checkbox);
124
+ li.appendChild(label);
125
+ pathsList.appendChild(li);
126
+ });
127
+ })
128
+ .catch(error => console.error('Error fetching paths:', error));
129
+
130
+ // Search function
131
+ function performSearch() {
132
+ const query = searchInput.value;
133
+ const checkedPaths = Array.from(pathsList.querySelectorAll('input[type="checkbox"]:checked'))
134
+ .map(checkbox => checkbox.name);
135
+
136
+ fetch('http://localhost:4567/q', {
137
+ method: 'POST',
138
+ headers: {
139
+ 'Content-Type': 'application/json',
140
+ },
141
+ body: JSON.stringify({
142
+ q: query,
143
+ paths: checkedPaths,
144
+ })
145
+ })
146
+ .then(response => response.json())
147
+ .then(resp => {
148
+ responseContainer.innerHTML = '';
149
+ retrievedNotes = [];
150
+
151
+ resp.data.forEach(item => {
152
+ const div = document.createElement('div');
153
+ div.className = 'response-item';
154
+ div.style.backgroundColor = textToLightColor(item.lookup);
155
+ div.dataset.note = item.text;
156
+ div.innerHTML = `
157
+ <div><strong>Path:</strong> <a href="${item.url}">${item.id}</a></div>
158
+ <div><strong>Score:</strong> ${item.score}</div>
159
+ <div class="markdown-content">${marked.parse(item.text)}</div>
160
+ `;
161
+ const btn = document.createElement('button');
162
+ btn.className = 'discuss-button';
163
+ btn.textContent = 'Discuss';
164
+ btn.addEventListener('click', () => discussCard(div));
165
+ div.appendChild(btn);
166
+ responseContainer.appendChild(div);
167
+ retrievedNotes.push(item.text);
168
+ });
169
+ })
170
+ .catch(error => console.error('Error performing search:', error));
171
+ }
172
+
173
+ function performAgentSearch() {
174
+ const query = searchInput.value;
175
+ const checkedPaths = Array.from(pathsList.querySelectorAll('input[type="checkbox"]:checked'))
176
+ .map(checkbox => checkbox.name);
177
+
178
+ fetch('http://localhost:4567/q_plus', {
179
+ method: 'POST',
180
+ headers: {
181
+ 'Content-Type': 'application/json',
182
+ },
183
+ body: JSON.stringify({
184
+ q: query,
185
+ paths: checkedPaths,
186
+ })
187
+ })
188
+ .then(response => response.json())
189
+ .then(resp => {
190
+ responseContainer.innerHTML = '';
191
+ retrievedNotes = [];
192
+
193
+ if (!!resp.expanded) {
194
+ const div = document.createElement('div');
195
+ div.className = 'response-item';
196
+ div.style.backgroundColor = textToLightColor('expanded');
197
+ div.innerHTML = `<div><strong>Expanded Query:</strong> ${resp.expanded}</div>`;
198
+ responseContainer.appendChild(div);
199
+ }
200
+
201
+ if (resp.variants && resp.variants.length > 0) {
202
+ const div = document.createElement('div');
203
+ div.className = 'response-item';
204
+ div.style.backgroundColor = textToLightColor('variants');
205
+ div.innerHTML = `<div><strong>Variants:</strong> ${resp.variants.join(', ')}</div>`;
206
+ responseContainer.appendChild(div);
207
+ }
208
+
209
+ resp.data.forEach(item => {
210
+ const div = document.createElement('div');
211
+ div.className = 'response-item';
212
+ div.style.backgroundColor = textToLightColor(item.lookup);
213
+ div.dataset.note = item.text;
214
+ div.innerHTML = `
215
+ <div><strong>Path:</strong> <a href="${item.url}">${item.id}</a></div>
216
+ <div><strong>Score:</strong> ${item.score}</div>
217
+ <div class="markdown-content">${marked.parse(item.text)}</div>
218
+ `;
219
+ const btn = document.createElement('button');
220
+ btn.className = 'discuss-button';
221
+ btn.textContent = 'Discuss';
222
+ btn.addEventListener('click', () => discussCard(div));
223
+ div.appendChild(btn);
224
+ responseContainer.appendChild(div);
225
+ retrievedNotes.push(item.text);
226
+ });
227
+ })
228
+ .catch(error => console.error('Error performing agent search:', error));
229
+ }
230
+
231
+ function performAgentSearch() {
232
+ const query = searchInput.value;
233
+ const configExperiment = configExperimentCheckbox.checked
234
+ const checkedPaths = Array.from(pathsList.querySelectorAll('input[type="checkbox"]:checked'))
235
+ .map(checkbox => checkbox.name);
236
+
237
+ fetch('http://localhost:4567/q_plus', {
238
+ method: 'POST',
239
+ headers: {
240
+ 'Content-Type': 'application/json',
241
+ },
242
+ body: JSON.stringify({
243
+ q: query,
244
+ paths: checkedPaths,
245
+ experiment: configExperiment,
246
+ })
247
+ })
248
+ .then(response => response.json())
249
+ .then(resp => {
250
+ responseContainer.innerHTML = '';
251
+
252
+ if (!!resp.expanded) {
253
+ const div = document.createElement('div');
254
+ div.className = 'response-item';
255
+ div.style.backgroundColor = textToLightColor("expanded");
256
+ div.innerHTML = `<div><strong>Expanded Query:</strong> ${resp.expanded}</div>`;
257
+ responseContainer.appendChild(div);
258
+ }
259
+
260
+ if (resp.variants && resp.variants.length > 0) {
261
+ const div = document.createElement('div');
262
+ div.className = 'response-item';
263
+ div.style.backgroundColor = textToLightColor("variants");
264
+ div.innerHTML = `
265
+ <div><strong>Variants:</strong> ${resp.variants.join(', ')}</div>
266
+ `;
267
+ responseContainer.appendChild(div);
268
+ }
269
+
270
+ if (!!resp.eval) {
271
+ const div = document.createElement('div');
272
+ div.className = 'response-item';
273
+ div.style.backgroundColor = textToLightColor("experiment");
274
+ div.innerHTML = `
275
+ <div class="markdown-content">${marked.parse(resp.eval)}</div>
276
+ `;
277
+ responseContainer.appendChild(div);
278
+ }
279
+
280
+ resp.data.forEach(item => {
281
+ const div = document.createElement('div');
282
+ div.className = 'response-item';
283
+ div.style.backgroundColor = textToLightColor(item.lookup);
284
+ div.dataset.note = item.text;
285
+ div.innerHTML = `
286
+ <div><strong>Path:</strong> <a href="${item.url}">${item.id}</a></div>
287
+ <div><strong>Score:</strong> ${item.score}</div>
288
+ <div class="markdown-content">${marked.parse(item.text)}</div>
289
+ `;
290
+ const btn = document.createElement('button');
291
+ btn.className = 'discuss-button';
292
+ btn.textContent = 'Discuss';
293
+ btn.addEventListener('click', () => discussCard(div));
294
+ div.appendChild(btn);
295
+ responseContainer.appendChild(div);
296
+ });
297
+ })
298
+ .catch(error => console.error('Error performing agent search:', error));
299
+ }
300
+
301
+ function textToLightColor(text) {
302
+ // Generate a hash from the text
303
+ let hash = 0;
304
+ for (let i = 0; i < text.length; i++) {
305
+ hash = text.charCodeAt(i) + ((hash << 5) - hash);
306
+ }
307
+
308
+ // Use the hash to generate RGB values
309
+ let r = (hash & 0xFF) % 64 + 192; // 192-255
310
+ let g = ((hash >> 8) & 0xFF) % 64 + 192; // 192-255
311
+ let b = ((hash >> 16) & 0xFF) % 64 + 192; // 192-255
312
+
313
+ // Convert to hex and return
314
+ return `#${r.toString(16).padStart(2, '0')}${g.toString(16).padStart(2, '0')}${b.toString(16).padStart(2, '0')}`;
315
+ }
316
+
317
+ function synthesizeNotes() {
318
+ if (retrievedNotes.length === 0) {
319
+ return;
320
+ }
321
+
322
+ fetch('http://localhost:4567/synthesize', {
323
+ method: 'POST',
324
+ headers: {
325
+ 'Content-Type': 'application/json',
326
+ },
327
+ body: JSON.stringify({
328
+ notes: retrievedNotes,
329
+ })
330
+ })
331
+ .then(response => response.json())
332
+ .then(resp => {
333
+ const div = document.createElement('div');
334
+ div.className = 'response-item';
335
+ div.style.backgroundColor = textToLightColor('synthesize');
336
+ div.innerHTML = `
337
+ <div class="markdown-content">${marked.parse(resp.note)}</div>
338
+ `;
339
+ responseContainer.prepend(div);
340
+ })
341
+ .catch(error => console.error('Error synthesizing notes:', error));
342
+ }
343
+
344
+ function discussCard(div) {
345
+ const note = div.dataset.note;
346
+ if (!note) {
347
+ return;
348
+ }
349
+
350
+ fetch('http://localhost:4567/discuss', {
351
+ method: 'POST',
352
+ headers: {
353
+ 'Content-Type': 'application/json',
354
+ },
355
+ body: JSON.stringify({
356
+ note: note,
357
+ })
358
+ })
359
+ .then(response => response.json())
360
+ .then(resp => {
361
+ const mdDiv = div.querySelector('.markdown-content');
362
+ mdDiv.innerHTML += marked.parse(resp.discussion);
363
+ })
364
+ .catch(error => console.error('Error discussing note:', error));
365
+ }
366
+
367
+ // Event listeners
368
+ searchButton.addEventListener('click', performSearch);
369
+ searchPlusButton.addEventListener('click', performAgentSearch);
370
+ synthesizeButton.addEventListener('click', synthesizeNotes);
371
+
372
+ searchInput.addEventListener('keypress', function(e) {
373
+ if (e.key === 'Enter') {
374
+ performSearch();
375
+ }
376
+ });
377
+
378
+ });
379
+ </script>
380
+ </body>
381
+ </html>
data/exe/run-index ADDED
@@ -0,0 +1,95 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ # Index all markdown files in a directory
5
+ #
6
+ # Usage: run-index config.json
7
+ #
8
+ # Requires OpenAI API Key stored in DOT_OPENAI_KEY
9
+
10
+ require "json"
11
+ require "ostruct"
12
+ require "digest"
13
+
14
+ require_relative "../llm/openai"
15
+ require_relative "../llm/embedding"
16
+ require_relative "../readers/reader"
17
+
18
+ if ARGV.length != 1
19
+ STDOUT << "Invalid arguments received, need a config file\n"
20
+ exit 1
21
+ end
22
+
23
+ config = JSON.parse(File.read(ARGV[0]))
24
+ CONFIG = OpenStruct.new(config)
25
+ CONFIG.paths = CONFIG.paths.map { |p| OpenStruct.new(p) }
26
+
27
+ OPENAI_KEY = ENV["DOT_OPENAI_KEY"] || ""
28
+ if OPENAI_KEY.empty?
29
+ STDOUT << "Remember to set env DOT_OPENAI_KEY\n"
30
+ exit 9
31
+ end
32
+
33
+ CONFIG.paths.each do |path|
34
+ STDOUT << "Read path name: #{path.name}, reader: #{path.reader}\n"
35
+
36
+ # Read existing index
37
+ STDOUT << "Read existing index: #{path.out}, time: @#{Time.now}\n"
38
+ index_db = {}
39
+ index_file = File.expand_path(path.out)
40
+
41
+ File.foreach(index_file) do |line|
42
+ item = JSON.parse(line)
43
+ index_db[item["hash"]] = item
44
+ end if File.exist?(index_file)
45
+ STDOUT << "Found index: #{index_db.length}\n"
46
+
47
+ # Scan directory
48
+ name_match = path.nameMatch || "*.{md,markdown}"
49
+ dir_blob = File.join(File.expand_path(path.dir), "**", name_match)
50
+ files = Dir[dir_blob]
51
+ STDOUT << "Scan dir: #{dir_blob}, Found: #{files.length}\n"
52
+
53
+ # Get reader class
54
+ reader_class = get_reader(path.reader)
55
+ if reader_class.nil?
56
+ STDOUT << "Reader undefined: #{path.reader}\n"
57
+ exit 9
58
+ end
59
+
60
+ # Build index
61
+ STDOUT << "Building index @#{Time.now}\n["
62
+ skipped = 0
63
+ created = 0
64
+ File.open(index_file, "w") do |index_newdb|
65
+ files.each_with_index do |file, file_idx|
66
+ chunks = reader_class.new(file).load.chunks
67
+
68
+ chunks.each_with_index do |chunk, chunk_idx|
69
+ hash = Digest::SHA256.hexdigest(chunk)
70
+
71
+ if index_db[hash] # found in old DB
72
+ index_newdb.puts(index_db[hash].to_json)
73
+
74
+ skipped += 1
75
+ next
76
+ end
77
+
78
+ created += 1
79
+ embedding = embedding(chunk)
80
+
81
+ line = { path: file, hash: hash, chunk: chunk_idx, embedding: embedding }
82
+ index_newdb.puts(line.to_json)
83
+ end
84
+
85
+ if file_idx % 50 == 0 # flush the file writes
86
+ index_newdb.flush
87
+ STDOUT << file_idx
88
+ else
89
+ STDOUT << "."
90
+ end
91
+ end
92
+ end
93
+
94
+ STDOUT << "]\nDone @#{Time.now}, Created: #{created}, Skipped: #{skipped}\n"
95
+ end
data/exe/run-server ADDED
@@ -0,0 +1,167 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ # Query and answer questions based on an index file
5
+ #
6
+ # Usage: run-server config.json
7
+ #
8
+ # Requires OpenAI API Key stored in DOT_OPENAI_KEY
9
+
10
+ require "json"
11
+ require "ostruct"
12
+ require "sinatra"
13
+
14
+ require_relative "../server/retriever"
15
+ require_relative "../server/synthesizer"
16
+ require_relative "../server/discuss"
17
+
18
+ if ARGV.length != 1
19
+ STDOUT << "Invalid arguments received, need a config file\n"
20
+ exit 1
21
+ end
22
+
23
+ config = JSON.parse(File.read(ARGV[0]))
24
+ CONFIG = OpenStruct.new(config)
25
+ CONFIG.paths = CONFIG.paths.map { |p| OpenStruct.new(p) }
26
+ CONFIG.path_map = {}
27
+ CONFIG.paths.each { |p| CONFIG.path_map[p.name] = p }
28
+
29
+ OPENAI_KEY = ENV["DOT_OPENAI_KEY"] || ""
30
+ if OPENAI_KEY.empty?
31
+ STDOUT << "Remember to set env DOT_OPENAI_KEY\n"
32
+ exit 9
33
+ end
34
+
35
+ # list all the paths that can be searched
36
+ get '/paths' do
37
+ content_type :json
38
+
39
+ resp = []
40
+ CONFIG.paths.each do |p|
41
+ resp << { "name": p.name }
42
+ end
43
+ resp.to_json
44
+ end
45
+
46
+ # query within the paths
47
+ post '/q' do
48
+ content_type :json
49
+
50
+ data = JSON.parse(request.body.read)
51
+
52
+ lookup_paths = (data["paths"] || CONFIG.paths_map.keys).map do |name|
53
+ CONFIG.path_map[name]
54
+ end
55
+
56
+ topN = (data["topN"] || 20).to_i
57
+
58
+ q = data["q"]
59
+ entries = retrieve_by_embedding(lookup_paths, q)
60
+ if q.to_s.strip.length < 5 && q.to_s.split(/\s+/).length < 5
61
+ entries.concat(retrieve_by_text(lookup_paths, q))
62
+
63
+ unique = {}
64
+ entries.each do |e|
65
+ key = [e["path"], e["chunk"]]
66
+ if unique[key]
67
+ unique[key]["score"] = (unique[key]["score"] || 0) + (e["score"] || 0)
68
+ else
69
+ unique[key] = e
70
+ end
71
+ end
72
+
73
+ entries = unique.values
74
+ end
75
+ entries = entries.sort_by { |item| -item["score"] }.take(topN)
76
+
77
+ resp = {
78
+ data: [],
79
+ }
80
+
81
+ entries.each do |item|
82
+ resp[:data] << {
83
+ path: item["path"],
84
+ lookup: item["lookup"],
85
+ id: item["id"],
86
+ url: item["url"],
87
+ text: item["reader"].load.get_chunk(item["chunk"]),
88
+ score: item["score"],
89
+ }
90
+ end
91
+
92
+ resp.to_json
93
+ end
94
+
95
+ # agentic query - expand the query using LLM before searching
96
+ post '/q_plus' do
97
+ content_type :json
98
+
99
+ data = JSON.parse(request.body.read)
100
+
101
+ lookup_paths = (data["paths"] || CONFIG.paths_map.keys).map do |name|
102
+ CONFIG.path_map[name]
103
+ end
104
+
105
+ topN = (data["topN"] || 20).to_i
106
+
107
+ expanded_q = expand_query(data["q"])
108
+ variants = expand_variants(data["q"])
109
+
110
+ entries = []
111
+ entries.concat(retrieve_by_embedding(lookup_paths, data["q"]))
112
+ entries.concat(retrieve_by_embedding(lookup_paths, expanded_q))
113
+ variants.each { |v| entries.concat(retrieve_by_text(lookup_paths, v)) }
114
+
115
+ unique = {}
116
+ entries.each do |e|
117
+ key = [e["path"], e["chunk"]]
118
+ if unique[key]
119
+ unique[key]["score"] = (unique[key]["score"] || 0) + (e["score"] || 0)
120
+ else
121
+ unique[key] = e
122
+ end
123
+ end
124
+
125
+ ordered = unique.values.sort_by { |item| -item["score"] }.take(topN)
126
+
127
+ resp = {
128
+ data: [],
129
+ expanded: expanded_q,
130
+ variants: variants,
131
+ }
132
+
133
+ ordered.each do |item|
134
+ resp[:data] << {
135
+ path: item["path"],
136
+ lookup: item["lookup"],
137
+ id: item["id"],
138
+ url: item["url"],
139
+ text: item["reader"].load.get_chunk(item["chunk"]),
140
+ score: item["score"],
141
+ }
142
+ end
143
+
144
+ resp.to_json
145
+ end
146
+
147
+ # synthesize notes into a summary
148
+ post '/synthesize' do
149
+ content_type :json
150
+
151
+ data = JSON.parse(request.body.read)
152
+
153
+ summary = synthesize_notes(data["notes"])
154
+
155
+ { note: summary }.to_json
156
+ end
157
+
158
+ # generate discussion for a single note
159
+ post '/discuss' do
160
+ content_type :json
161
+
162
+ data = JSON.parse(request.body.read)
163
+
164
+ discussion = discuss_note(data["note"])
165
+
166
+ { discussion: discussion }.to_json
167
+ end
@@ -0,0 +1,3 @@
1
+ module SimpleRag
2
+ VERSION = "0.1.0"
3
+ end
data/lib/simple_rag.rb ADDED
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "simple_rag/version"
4
+
5
+ # Adjust load path so require_relative works from gem
6
+ $LOAD_PATH.unshift File.expand_path("..", __dir__)
7
+
8
+ module SimpleRag
9
+ end
10
+
11
+ require "llm/openai"
12
+ require "llm/embedding"
13
+ require "readers/reader"
14
+ require "server/retriever"
15
+ require "server/synthesizer"
16
+ require "server/discuss"
17
+ require "storage/mem"
data/llm/embedding.rb ADDED
@@ -0,0 +1,19 @@
1
+
2
+ def cosine_similarity(array1, array2)
3
+ dot_product = 0.0
4
+ norm_a = 0.0
5
+ norm_b = 0.0
6
+
7
+ array1.each_with_index do |value1, index|
8
+ value2 = array2[index]
9
+
10
+ dot_product += value1 * value2
11
+ norm_a += value1 * value1
12
+ norm_b += value2 * value2
13
+ end
14
+
15
+ norm_a = Math.sqrt(norm_a)
16
+ norm_b = Math.sqrt(norm_b)
17
+
18
+ cosine_similarity = dot_product / (norm_a * norm_b)
19
+ end
data/llm/http.rb ADDED
@@ -0,0 +1,18 @@
1
+ require "net/http"
2
+ require "json"
3
+
4
+ def http_post(uri, auth, reqData)
5
+ url = URI(uri)
6
+
7
+ http = Net::HTTP.new(url.host, url.port)
8
+ http.use_ssl = true unless auth.nil?
9
+ http.read_timeout = 600 # Time in seconds
10
+
11
+ headers = { "Content-Type" => "application/json" }
12
+ headers["Authorization"] = "Bearer #{auth}" unless auth.nil?
13
+
14
+ request = Net::HTTP::Post.new(url, headers)
15
+ request.body = reqData.to_json
16
+
17
+ return http.request(request)
18
+ end
data/llm/ollama.rb ADDED
@@ -0,0 +1,19 @@
1
+ require_relative "http"
2
+
3
+ def embedding_ollama(txts, opts = {})
4
+ data = {
5
+ "model" => "nomic-embed-text",
6
+ "prompt" => txts
7
+ }.merge(opts)
8
+
9
+ uri = "http://localhost:11434/api/embeddings"
10
+ response = http_post(uri, nil, data)
11
+
12
+ if response.code != "200"
13
+ STDOUT << "Embedding error: #{response}\n"
14
+ exit 1
15
+ end
16
+
17
+ result = JSON.parse(response.body)
18
+ result["embedding"]
19
+ end
data/llm/openai.rb ADDED
@@ -0,0 +1,44 @@
1
+ require_relative "http"
2
+
3
+ ROLE_SYSTEM = "system"
4
+ ROLE_USER = "user"
5
+ ROLE_ASSISTANT = "assistant"
6
+ NEXT_ROLE = ->(role) { role != ROLE_USER ? ROLE_USER : ROLE_ASSISTANT }
7
+
8
+ def chat(messages, opts = {})
9
+ data = {
10
+ "model" => "gpt-4o-mini",
11
+ "messages" => messages
12
+ }.merge(opts)
13
+
14
+ uri = "https://api.openai.com/v1/chat/completions"
15
+ response = http_post(uri, OPENAI_KEY, data)
16
+
17
+ if response.code != "200"
18
+ STDOUT << "Chat error: #{response}\n"
19
+ exit 1
20
+ end
21
+
22
+ result = JSON.parse(response.body)
23
+ STDOUT << "Chat usage: #{result["usage"]}, model: #{data["model"]}\n"
24
+
25
+ result["choices"][0]["message"]["content"]
26
+ end
27
+
28
+ def embedding(txts, opts = {})
29
+ data = {
30
+ "model" => "text-embedding-3-small",
31
+ "input" => txts
32
+ }.merge(opts)
33
+
34
+ uri = "https://api.openai.com/v1/embeddings"
35
+ response = http_post(uri, OPENAI_KEY, data)
36
+
37
+ if response.code != "200"
38
+ STDOUT << "Embedding error: #{response.body}\n"
39
+ exit 1
40
+ end
41
+
42
+ result = JSON.parse(response.body)
43
+ result["data"][0]["embedding"]
44
+ end
@@ -0,0 +1,18 @@
1
+ require_relative "reader"
2
+
3
+ # check-reader reader filepath
4
+
5
+ reader = get_reader(ARGV[0])
6
+ if reader.nil?
7
+ STDOUT << "Reader #{ARGV[0]} not found\n"
8
+ exit 1
9
+ end
10
+
11
+ file = reader.new(ARGV[1])
12
+ file.load
13
+
14
+ STDOUT << "Print chunks #{ARGV[1]} [#{file.chunks.length}]:\n"
15
+
16
+ file.chunks.each do |chunk|
17
+ STDOUT << chunk << "\n---\n"
18
+ end
data/readers/note.rb ADDED
@@ -0,0 +1,74 @@
1
+
2
+ class NoteReader
3
+ HEADER_CONF = /^## (.+?) \[(.+?)\]$/
4
+ LINK = /^- \[([ xX])\] /
5
+
6
+ Note = Struct.new(:lineno, :body, :title, :done)
7
+
8
+ attr_accessor :file, :chunks, :notes
9
+
10
+ def initialize(file)
11
+ @file = file
12
+ @loaded = false
13
+ @chunks = []
14
+ @notes = []
15
+ end
16
+
17
+ def load
18
+ return self if @loaded
19
+
20
+ File.open(@file) do |file|
21
+ parse_conf(file)
22
+ end
23
+
24
+ @notes.each do |note|
25
+ next unless note.done
26
+ chunks << note.body.join("\n")
27
+ end
28
+
29
+ @loaded = true
30
+ self
31
+ end
32
+
33
+ # ## Title [Author - Conf]
34
+ #
35
+ # - [x] http://link
36
+ #
37
+ # **Summary:**
38
+ def parse_conf(file)
39
+ note = nil
40
+
41
+ file.each_line do |line|
42
+ line = line.chomp # remove crlf chars
43
+
44
+ if line =~ HEADER_CONF
45
+ # close the previous note
46
+ if !note.nil?
47
+ @notes << note
48
+ note = nil
49
+ end
50
+
51
+ note = Note.new
52
+ note.lineno = file.lineno
53
+ note.title = $1
54
+ note.body = [line]
55
+ elsif !note.nil?
56
+ if line =~ LINK # skip links in body
57
+ note.done = ($1 != ' ')
58
+ else
59
+ note.body << line unless line.strip.empty?
60
+ end
61
+ end
62
+ end
63
+
64
+ # append the last parsed note if the file does not end with another header
65
+ if !note.nil?
66
+ @notes << note
67
+ note = nil
68
+ end
69
+ end
70
+
71
+ def get_chunk(idx)
72
+ @chunks[idx || 0]
73
+ end
74
+ end
data/readers/reader.rb ADDED
@@ -0,0 +1,12 @@
1
+ def get_reader(name)
2
+ case name.downcase
3
+ when "text"
4
+ require_relative "text"
5
+ return TextReader
6
+ when "note"
7
+ require_relative "note"
8
+ return NoteReader
9
+ else
10
+ return nil
11
+ end
12
+ end
data/readers/text.rb ADDED
@@ -0,0 +1,34 @@
1
+
2
+ class TextReader
3
+ attr_accessor :file, :chunks
4
+
5
+ def initialize(file)
6
+ @file = file
7
+ @loaded = false
8
+ @chunks = []
9
+ end
10
+
11
+ def load
12
+ return self if @loaded
13
+
14
+ chunk = ""
15
+ File.foreach(@file) do |line|
16
+ if line.start_with?(/- .+:/) || line.start_with?(' - [[') # yaml like
17
+ next
18
+ elsif line.start_with?('<') # html like
19
+ next
20
+ else
21
+ chunk << line unless line.strip.empty?
22
+ end
23
+ end
24
+
25
+ @chunks << chunk
26
+ @loaded = true
27
+
28
+ self
29
+ end
30
+
31
+ def get_chunk(idx)
32
+ @chunks[idx || 0]
33
+ end
34
+ end
data/server/cache.rb ADDED
@@ -0,0 +1,30 @@
1
+ class MemCache
2
+ attr_accessor :data
3
+
4
+ def initialize
5
+ @data = {}
6
+ end
7
+
8
+ def set(data, val)
9
+ hash = Digest::SHA256.hexdigest(data)
10
+ @data[hash] = val
11
+ end
12
+
13
+ def get(data)
14
+ hash = Digest::SHA256.hexdigest(data)
15
+ @data[hash]
16
+ end
17
+
18
+ def get_or_set(data, fn)
19
+ hash = Digest::SHA256.hexdigest(data)
20
+ return @data[hash] if @data[hash]
21
+
22
+ STDOUT << "Set then get cache #{hash}\n"
23
+
24
+ val = fn.call(data)
25
+ @data[hash] = val
26
+ return val
27
+ end
28
+ end
29
+
30
+ CACHE = MemCache.new
data/server/discuss.rb ADDED
@@ -0,0 +1,19 @@
1
+ DISCUSS_PROMPT = <<~PROMPT
2
+ You provide a short discussion of a note from multiple perspectives.
3
+ Focus on explaining key concepts succinctly.
4
+ PROMPT
5
+
6
+ require_relative "../llm/openai"
7
+
8
+ # note: string
9
+ # Returns discussion text
10
+ def discuss_note(note)
11
+ return "" if note.nil? || note.strip.empty?
12
+
13
+ msgs = [
14
+ { role: ROLE_SYSTEM, content: DISCUSS_PROMPT },
15
+ { role: ROLE_USER, content: note },
16
+ ]
17
+
18
+ chat(msgs)
19
+ end
@@ -0,0 +1,124 @@
1
+ require "pathname"
2
+
3
+ require_relative "cache"
4
+
5
+ require_relative "../llm/openai"
6
+ require_relative "../llm/embedding"
7
+
8
+ require_relative "../readers/reader"
9
+
10
+ AGENT_PROMPT = <<~PROMPT
11
+ You expand a short search query so it is easier to retrieve related markdown
12
+ documents. Return only the expanded query in a single line.
13
+ PROMPT
14
+
15
+ def expand_query(q)
16
+ msgs = [
17
+ { role: ROLE_SYSTEM, content: AGENT_PROMPT },
18
+ { role: ROLE_USER, content: q },
19
+ ]
20
+ chat(msgs).strip
21
+ end
22
+
23
+ def retrieve_by_embedding(lookup_paths, q)
24
+ qe = CACHE.get_or_set(q, method(:embedding).to_proc)
25
+
26
+ entries = []
27
+ lookup_paths.each do |p|
28
+ STDOUT << "Reading index: #{p.name}\n"
29
+
30
+ index_file = File.expand_path(p.out)
31
+ unless File.exist?(index_file)
32
+ STDOUT << "Path not exists! path: #{index_file}\n"
33
+ next
34
+ end
35
+
36
+ reader = get_reader(p.reader)
37
+ if reader.nil?
38
+ STDOUT << "Reader undefinied! reader: #{path.reader}\n"
39
+ next
40
+ end
41
+
42
+ File.foreach(index_file) do |line|
43
+ item = JSON.parse(line)
44
+
45
+ score = cosine_similarity(qe, item["embedding"])
46
+ next if score < p.threshold
47
+
48
+ item["score"] = score
49
+ item["lookup"] = p.name
50
+ item["id"] = extract_id(item["path"])
51
+ item["url"] = extract_url(item["path"], p.url)
52
+ item["reader"] = reader.new(item["path"])
53
+
54
+ entries << item
55
+ end
56
+
57
+ STDOUT << "Matched num: #{entries.length}\n"
58
+ end
59
+
60
+ entries
61
+ end
62
+
63
+ def extract_id(file_path)
64
+ path = Pathname.new(file_path)
65
+ File.join(path.each_filename.to_a[-2..-1])
66
+ end
67
+
68
+ def extract_url(file_path, url)
69
+ if url
70
+ path = Pathname.new(file_path)
71
+ # Extract the filename without the extension
72
+ filename_without_extension = path.basename(path.extname).to_s
73
+ # Return the final URL
74
+ "#{url}#{filename_without_extension}"
75
+ else
76
+ "file://#{file_path}"
77
+ end
78
+ end
79
+
80
+ VARIANT_PROMPT = <<~PROMPT
81
+ You generate a few alternative short search queries for exact text match.
82
+ Return a JSON array of strings with three different variants.
83
+ PROMPT
84
+
85
+ def expand_variants(q)
86
+ msgs = [
87
+ { role: ROLE_SYSTEM, content: VARIANT_PROMPT },
88
+ { role: ROLE_USER, content: q },
89
+ ]
90
+ JSON.parse(chat(msgs)) rescue []
91
+ end
92
+
93
+ def retrieve_by_text(lookup_paths, q)
94
+ entries = []
95
+ lookup_paths.each do |p|
96
+ STDOUT << "Reading text index: #{p.name}\n"
97
+
98
+ index_file = File.expand_path(p.out)
99
+ next unless File.exist?(index_file)
100
+
101
+ reader_cls = get_reader(p.reader)
102
+ next if reader_cls.nil?
103
+
104
+ file_cache = {}
105
+ File.foreach(index_file) do |line|
106
+ item = JSON.parse(line)
107
+ reader = file_cache[item["path"]] ||= reader_cls.new(item["path"]).load
108
+ chunk_text = reader.get_chunk(item["chunk"])
109
+ next unless chunk_text&.include?(q)
110
+
111
+ item["score"] = 1.0
112
+ item["lookup"] = p.name
113
+ item["id"] = extract_id(item["path"])
114
+ item["url"] = extract_url(item["path"], p.url)
115
+ item["reader"] = reader
116
+
117
+ entries << item
118
+ end
119
+
120
+ STDOUT << "Matched num: #{entries.length}\n"
121
+ end
122
+
123
+ entries
124
+ end
@@ -0,0 +1,20 @@
1
+ SUM_PROMPT = """You are an expert at combining notes.
2
+ Given a collection of notes, synthesize them into a concise new note capturing the key points.
3
+ """
4
+
5
+ require_relative "../llm/openai"
6
+
7
+ # notes: array of strings
8
+ # Returns summary text
9
+ def synthesize_notes(notes)
10
+ return "" if notes.nil? || notes.empty?
11
+
12
+ msgs = [{ role: ROLE_SYSTEM, content: SUM_PROMPT }]
13
+ content = "Notes:\n"
14
+ notes.each do |n|
15
+ content << "<note>\n#{n}\n</note>\n"
16
+ end
17
+ msgs << { role: ROLE_USER, content: content }
18
+
19
+ chat(msgs)
20
+ end
data/storage/mem.rb ADDED
@@ -0,0 +1,48 @@
1
+ require 'json'
2
+
3
+ class MemStorage
4
+ def initialize
5
+ @storage = {}
6
+ end
7
+
8
+ # Load or create a table from a JSON line file
9
+ def load_or_create(table, filepath)
10
+ @storage[table] ||= {}
11
+ File.readlines(filepath).each do |line|
12
+ data = JSON.parse(line)
13
+ @storage[table][data['id']] = data
14
+ end
15
+ rescue Errno::ENOENT
16
+ puts "File not found: #{filepath}"
17
+ end
18
+
19
+ # Add an entry to a specific table
20
+ def add(table, entryid, entry)
21
+ @storage[table] ||= {}
22
+ @storage[table][entryid] = entry
23
+ end
24
+
25
+ # Get an entry by ID from a specific table
26
+ def get(table, entryid)
27
+ @storage.dig(table, entryid)
28
+ end
29
+
30
+ # Locate an entry across all tables
31
+ def locate(entryid)
32
+ @storage.each do |table, entries|
33
+ return { table: table, entry: entries[entryid] } if entries.has_key?(entryid)
34
+ end
35
+ nil
36
+ end
37
+
38
+ # Scan a table and apply a lambda to each entry
39
+ def scan(table)
40
+ if block_given?
41
+ @storage[table]&.each do |entryid, entry|
42
+ yield entryid, entry
43
+ end
44
+ else
45
+ raise ArgumentError, "No block given"
46
+ end
47
+ end
48
+ end
metadata ADDED
@@ -0,0 +1,93 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: simple-rag-zc
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Zhuochun
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2025-06-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: sinatra
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '4.1'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '4.1'
27
+ - !ruby/object:Gem::Dependency
28
+ name: puma
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '6.5'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '6.5'
41
+ description: Simple retrieval-augmented generation on markdown files
42
+ email:
43
+ - zhuochun@hotmail.com
44
+ executables:
45
+ - run-index
46
+ - run-server
47
+ extensions: []
48
+ extra_rdoc_files: []
49
+ files:
50
+ - README.md
51
+ - example_config.json
52
+ - exe/public/q.html
53
+ - exe/run-index
54
+ - exe/run-server
55
+ - lib/simple_rag.rb
56
+ - lib/simple_rag/version.rb
57
+ - llm/embedding.rb
58
+ - llm/http.rb
59
+ - llm/ollama.rb
60
+ - llm/openai.rb
61
+ - readers/check-reader.rb
62
+ - readers/note.rb
63
+ - readers/reader.rb
64
+ - readers/text.rb
65
+ - server/cache.rb
66
+ - server/discuss.rb
67
+ - server/retriever.rb
68
+ - server/synthesizer.rb
69
+ - storage/mem.rb
70
+ homepage: https://github.com/zhuochun/simple-rag
71
+ licenses:
72
+ - MIT
73
+ metadata: {}
74
+ post_install_message:
75
+ rdoc_options: []
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '3.0'
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ requirements: []
89
+ rubygems_version: 3.3.7
90
+ signing_key:
91
+ specification_version: 4
92
+ summary: RAG on Markdown Files
93
+ test_files: []