simple-rag-zc 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: d8b0204bb64f55c075ecb1287b983fda160b1ffcf7e552e346372ab7f95bb3b8
4
+ data.tar.gz: 330043e72800a113dcc4df223dbfb978c26449ca2c2d477b13db82c7e5c2e743
5
+ SHA512:
6
+ metadata.gz: 074af0f36149c2e9d5c0b7cd0dacf369d1c21bb4f812bd3c63f4b41772b5a6cb05f3ff8f75f9ed3a5d28f5c9846adb45e11ec0d1b18b7ada77f3a845ca5a989f
7
+ data.tar.gz: 6c34c79345703bc0cfb83bff5373b9d04a03bbba1c5549a96749b26986b2e49ba56d2e7dc63ba1bf749a0df01a9aec0ac66cb96f5ec03ed1c822a3c73a4379fc
data/README.md ADDED
@@ -0,0 +1,28 @@
1
+ # simple-rag
2
+
3
+ RAG on Markdown Files
4
+
5
+ - Use **Search** for standard retrieval
6
+ - Use **Search+** for agentic query expansion and fast text match
7
+ - Use **Synthesize** to combine retrieved notes
8
+
9
+ ## Setup
10
+
11
+ - Setup Config JSON
12
+ - Run `run-index config.json`
13
+ - Run `run-server config.json` and open `http://localhost:4567/q.html`
14
+
15
+ ## Publishing
16
+
17
+ To release a new version to [RubyGems](https://rubygems.org), run:
18
+
19
+ ```bash
20
+ gem build simple-rag.gemspec
21
+ gem push simple-rag-$(ruby -Ilib -e 'require "simple_rag/version"; puts SimpleRag::VERSION').gem
22
+ ```
23
+
24
+ Install the gem directly:
25
+
26
+ ```bash
27
+ gem install simple-rag
28
+ ```
@@ -0,0 +1,21 @@
1
+ {
2
+ "chat": {
3
+ "provider": "openai",
4
+ "url": "",
5
+ "model": "gpt-3.5-turbo-16k"
6
+ },
7
+ "embedding": {
8
+ "provider": "openai",
9
+ "url": "",
10
+ "model": "text-embedding-3-small"
11
+ },
12
+ "paths": [
13
+ {
14
+ "name": "temp",
15
+ "reader": "text",
16
+ "threshold": 0.3,
17
+ "dir": "D:\\Studies\\tmp\\learning",
18
+ "out": "D:\\Studies\\tmp\\learning-gpt1.dt"
19
+ }
20
+ ]
21
+ }
data/exe/public/q.html ADDED
@@ -0,0 +1,381 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Search Page</title>
7
+ <style>
8
+ body {
9
+ display: flex;
10
+ margin: 0;
11
+ padding: 0;
12
+ font-family: Arial, sans-serif;
13
+ }
14
+ #main-content {
15
+ flex-grow: 1;
16
+ padding: 20px;
17
+ }
18
+ #paths-container {
19
+ margin-bottom: 20px;
20
+ }
21
+ #paths-list {
22
+ list-style-type: none;
23
+ padding: 0;
24
+ display: flex;
25
+ flex-wrap: wrap;
26
+ gap: 10px;
27
+ }
28
+ #paths-list li {
29
+ display: flex;
30
+ align-items: center;
31
+ }
32
+ #paths-list label {
33
+ margin-left: 5px;
34
+ }
35
+ #search-container {
36
+ display: flex;
37
+ margin-bottom: 20px;
38
+ }
39
+ #search-input {
40
+ flex-grow: 1;
41
+ height: 60px;
42
+ font-size: 16px;
43
+ padding: 0 10px;
44
+ }
45
+ #search-button {
46
+ width: 100px;
47
+ height: 66px;
48
+ font-size: 16px;
49
+ margin-left: 10px;
50
+ }
51
+ #search-plus-button {
52
+ width: 100px;
53
+ height: 66px;
54
+ font-size: 16px;
55
+ margin-left: 10px;
56
+ }
57
+ #synthesize-button {
58
+ width: 120px;
59
+ height: 66px;
60
+ font-size: 16px;
61
+ margin-left: 10px;
62
+ }
63
+ #response-container {
64
+ display: flex;
65
+ flex-wrap: wrap;
66
+ gap: 10px;
67
+ }
68
+ .response-item {
69
+ border: 1px solid #ccc;
70
+ padding: 10px;
71
+ border-radius: 5px;
72
+ width: calc(50% - 5px);
73
+ box-sizing: border-box;
74
+ }
75
+ .discuss-button {
76
+ margin-top: 10px;
77
+ }
78
+ </style>
79
+ <script src="https://cdn.jsdelivr.net/npm/marked/marked.min.js"></script>
80
+ </head>
81
+ <body>
82
+ <div id="main-content">
83
+ <div id="search-container">
84
+ <input type="text" id="search-input" placeholder="Enter your search query">
85
+ <button id="search-button">Search</button>
86
+ <button id="search-plus-button">Search+</button>
87
+ <button id="synthesize-button">Synthesize</button>
88
+ </div>
89
+ <div id="paths-container">
90
+ <ul id="paths-list"></ul>
91
+ </div>
92
+ <div id="response-container"></div>
93
+ </div>
94
+
95
+ <script>
96
+ document.addEventListener('DOMContentLoaded', function() {
97
+ const pathsList = document.getElementById('paths-list');
98
+ const searchInput = document.getElementById('search-input');
99
+ const searchButton = document.getElementById('search-button');
100
+ const searchPlusButton = document.getElementById('search-plus-button');
101
+ const synthesizeButton = document.getElementById('synthesize-button');
102
+ const responseContainer = document.getElementById('response-container');
103
+ let retrievedNotes = [];
104
+
105
+ // Fetch paths and render sidebar
106
+ fetch('http://localhost:4567/paths')
107
+ .then(response => response.json())
108
+ .then(data => {
109
+ data.forEach(item => {
110
+ const li = document.createElement('li');
111
+ li.style.backgroundColor = textToLightColor(item.name);
112
+
113
+ const checkbox = document.createElement('input');
114
+ checkbox.type = 'checkbox';
115
+ checkbox.id = item.name;
116
+ checkbox.name = item.name;
117
+ checkbox.checked = true;
118
+
119
+ const label = document.createElement('label');
120
+ label.htmlFor = item.name;
121
+ label.appendChild(document.createTextNode(item.name));
122
+
123
+ li.appendChild(checkbox);
124
+ li.appendChild(label);
125
+ pathsList.appendChild(li);
126
+ });
127
+ })
128
+ .catch(error => console.error('Error fetching paths:', error));
129
+
130
+ // Search function
131
+ function performSearch() {
132
+ const query = searchInput.value;
133
+ const checkedPaths = Array.from(pathsList.querySelectorAll('input[type="checkbox"]:checked'))
134
+ .map(checkbox => checkbox.name);
135
+
136
+ fetch('http://localhost:4567/q', {
137
+ method: 'POST',
138
+ headers: {
139
+ 'Content-Type': 'application/json',
140
+ },
141
+ body: JSON.stringify({
142
+ q: query,
143
+ paths: checkedPaths,
144
+ })
145
+ })
146
+ .then(response => response.json())
147
+ .then(resp => {
148
+ responseContainer.innerHTML = '';
149
+ retrievedNotes = [];
150
+
151
+ resp.data.forEach(item => {
152
+ const div = document.createElement('div');
153
+ div.className = 'response-item';
154
+ div.style.backgroundColor = textToLightColor(item.lookup);
155
+ div.dataset.note = item.text;
156
+ div.innerHTML = `
157
+ <div><strong>Path:</strong> <a href="${item.url}">${item.id}</a></div>
158
+ <div><strong>Score:</strong> ${item.score}</div>
159
+ <div class="markdown-content">${marked.parse(item.text)}</div>
160
+ `;
161
+ const btn = document.createElement('button');
162
+ btn.className = 'discuss-button';
163
+ btn.textContent = 'Discuss';
164
+ btn.addEventListener('click', () => discussCard(div));
165
+ div.appendChild(btn);
166
+ responseContainer.appendChild(div);
167
+ retrievedNotes.push(item.text);
168
+ });
169
+ })
170
+ .catch(error => console.error('Error performing search:', error));
171
+ }
172
+
173
+ function performAgentSearch() {
174
+ const query = searchInput.value;
175
+ const checkedPaths = Array.from(pathsList.querySelectorAll('input[type="checkbox"]:checked'))
176
+ .map(checkbox => checkbox.name);
177
+
178
+ fetch('http://localhost:4567/q_plus', {
179
+ method: 'POST',
180
+ headers: {
181
+ 'Content-Type': 'application/json',
182
+ },
183
+ body: JSON.stringify({
184
+ q: query,
185
+ paths: checkedPaths,
186
+ })
187
+ })
188
+ .then(response => response.json())
189
+ .then(resp => {
190
+ responseContainer.innerHTML = '';
191
+ retrievedNotes = [];
192
+
193
+ if (!!resp.expanded) {
194
+ const div = document.createElement('div');
195
+ div.className = 'response-item';
196
+ div.style.backgroundColor = textToLightColor('expanded');
197
+ div.innerHTML = `<div><strong>Expanded Query:</strong> ${resp.expanded}</div>`;
198
+ responseContainer.appendChild(div);
199
+ }
200
+
201
+ if (resp.variants && resp.variants.length > 0) {
202
+ const div = document.createElement('div');
203
+ div.className = 'response-item';
204
+ div.style.backgroundColor = textToLightColor('variants');
205
+ div.innerHTML = `<div><strong>Variants:</strong> ${resp.variants.join(', ')}</div>`;
206
+ responseContainer.appendChild(div);
207
+ }
208
+
209
+ resp.data.forEach(item => {
210
+ const div = document.createElement('div');
211
+ div.className = 'response-item';
212
+ div.style.backgroundColor = textToLightColor(item.lookup);
213
+ div.dataset.note = item.text;
214
+ div.innerHTML = `
215
+ <div><strong>Path:</strong> <a href="${item.url}">${item.id}</a></div>
216
+ <div><strong>Score:</strong> ${item.score}</div>
217
+ <div class="markdown-content">${marked.parse(item.text)}</div>
218
+ `;
219
+ const btn = document.createElement('button');
220
+ btn.className = 'discuss-button';
221
+ btn.textContent = 'Discuss';
222
+ btn.addEventListener('click', () => discussCard(div));
223
+ div.appendChild(btn);
224
+ responseContainer.appendChild(div);
225
+ retrievedNotes.push(item.text);
226
+ });
227
+ })
228
+ .catch(error => console.error('Error performing agent search:', error));
229
+ }
230
+
231
+ function performAgentSearch() {
232
+ const query = searchInput.value;
233
+ const configExperiment = configExperimentCheckbox.checked
234
+ const checkedPaths = Array.from(pathsList.querySelectorAll('input[type="checkbox"]:checked'))
235
+ .map(checkbox => checkbox.name);
236
+
237
+ fetch('http://localhost:4567/q_plus', {
238
+ method: 'POST',
239
+ headers: {
240
+ 'Content-Type': 'application/json',
241
+ },
242
+ body: JSON.stringify({
243
+ q: query,
244
+ paths: checkedPaths,
245
+ experiment: configExperiment,
246
+ })
247
+ })
248
+ .then(response => response.json())
249
+ .then(resp => {
250
+ responseContainer.innerHTML = '';
251
+
252
+ if (!!resp.expanded) {
253
+ const div = document.createElement('div');
254
+ div.className = 'response-item';
255
+ div.style.backgroundColor = textToLightColor("expanded");
256
+ div.innerHTML = `<div><strong>Expanded Query:</strong> ${resp.expanded}</div>`;
257
+ responseContainer.appendChild(div);
258
+ }
259
+
260
+ if (resp.variants && resp.variants.length > 0) {
261
+ const div = document.createElement('div');
262
+ div.className = 'response-item';
263
+ div.style.backgroundColor = textToLightColor("variants");
264
+ div.innerHTML = `
265
+ <div><strong>Variants:</strong> ${resp.variants.join(', ')}</div>
266
+ `;
267
+ responseContainer.appendChild(div);
268
+ }
269
+
270
+ if (!!resp.eval) {
271
+ const div = document.createElement('div');
272
+ div.className = 'response-item';
273
+ div.style.backgroundColor = textToLightColor("experiment");
274
+ div.innerHTML = `
275
+ <div class="markdown-content">${marked.parse(resp.eval)}</div>
276
+ `;
277
+ responseContainer.appendChild(div);
278
+ }
279
+
280
+ resp.data.forEach(item => {
281
+ const div = document.createElement('div');
282
+ div.className = 'response-item';
283
+ div.style.backgroundColor = textToLightColor(item.lookup);
284
+ div.dataset.note = item.text;
285
+ div.innerHTML = `
286
+ <div><strong>Path:</strong> <a href="${item.url}">${item.id}</a></div>
287
+ <div><strong>Score:</strong> ${item.score}</div>
288
+ <div class="markdown-content">${marked.parse(item.text)}</div>
289
+ `;
290
+ const btn = document.createElement('button');
291
+ btn.className = 'discuss-button';
292
+ btn.textContent = 'Discuss';
293
+ btn.addEventListener('click', () => discussCard(div));
294
+ div.appendChild(btn);
295
+ responseContainer.appendChild(div);
296
+ });
297
+ })
298
+ .catch(error => console.error('Error performing agent search:', error));
299
+ }
300
+
301
+ function textToLightColor(text) {
302
+ // Generate a hash from the text
303
+ let hash = 0;
304
+ for (let i = 0; i < text.length; i++) {
305
+ hash = text.charCodeAt(i) + ((hash << 5) - hash);
306
+ }
307
+
308
+ // Use the hash to generate RGB values
309
+ let r = (hash & 0xFF) % 64 + 192; // 192-255
310
+ let g = ((hash >> 8) & 0xFF) % 64 + 192; // 192-255
311
+ let b = ((hash >> 16) & 0xFF) % 64 + 192; // 192-255
312
+
313
+ // Convert to hex and return
314
+ return `#${r.toString(16).padStart(2, '0')}${g.toString(16).padStart(2, '0')}${b.toString(16).padStart(2, '0')}`;
315
+ }
316
+
317
+ function synthesizeNotes() {
318
+ if (retrievedNotes.length === 0) {
319
+ return;
320
+ }
321
+
322
+ fetch('http://localhost:4567/synthesize', {
323
+ method: 'POST',
324
+ headers: {
325
+ 'Content-Type': 'application/json',
326
+ },
327
+ body: JSON.stringify({
328
+ notes: retrievedNotes,
329
+ })
330
+ })
331
+ .then(response => response.json())
332
+ .then(resp => {
333
+ const div = document.createElement('div');
334
+ div.className = 'response-item';
335
+ div.style.backgroundColor = textToLightColor('synthesize');
336
+ div.innerHTML = `
337
+ <div class="markdown-content">${marked.parse(resp.note)}</div>
338
+ `;
339
+ responseContainer.prepend(div);
340
+ })
341
+ .catch(error => console.error('Error synthesizing notes:', error));
342
+ }
343
+
344
+ function discussCard(div) {
345
+ const note = div.dataset.note;
346
+ if (!note) {
347
+ return;
348
+ }
349
+
350
+ fetch('http://localhost:4567/discuss', {
351
+ method: 'POST',
352
+ headers: {
353
+ 'Content-Type': 'application/json',
354
+ },
355
+ body: JSON.stringify({
356
+ note: note,
357
+ })
358
+ })
359
+ .then(response => response.json())
360
+ .then(resp => {
361
+ const mdDiv = div.querySelector('.markdown-content');
362
+ mdDiv.innerHTML += marked.parse(resp.discussion);
363
+ })
364
+ .catch(error => console.error('Error discussing note:', error));
365
+ }
366
+
367
+ // Event listeners
368
+ searchButton.addEventListener('click', performSearch);
369
+ searchPlusButton.addEventListener('click', performAgentSearch);
370
+ synthesizeButton.addEventListener('click', synthesizeNotes);
371
+
372
+ searchInput.addEventListener('keypress', function(e) {
373
+ if (e.key === 'Enter') {
374
+ performSearch();
375
+ }
376
+ });
377
+
378
+ });
379
+ </script>
380
+ </body>
381
+ </html>
data/exe/run-index ADDED
@@ -0,0 +1,95 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ # Index all markdown files in a directory
5
+ #
6
+ # Usage: run-index config.json
7
+ #
8
+ # Requires OpenAI API Key stored in DOT_OPENAI_KEY
9
+
10
+ require "json"
11
+ require "ostruct"
12
+ require "digest"
13
+
14
+ require_relative "../llm/openai"
15
+ require_relative "../llm/embedding"
16
+ require_relative "../readers/reader"
17
+
18
+ if ARGV.length != 1
19
+ STDOUT << "Invalid arguments received, need a config file\n"
20
+ exit 1
21
+ end
22
+
23
+ config = JSON.parse(File.read(ARGV[0]))
24
+ CONFIG = OpenStruct.new(config)
25
+ CONFIG.paths = CONFIG.paths.map { |p| OpenStruct.new(p) }
26
+
27
+ OPENAI_KEY = ENV["DOT_OPENAI_KEY"] || ""
28
+ if OPENAI_KEY.empty?
29
+ STDOUT << "Remember to set env DOT_OPENAI_KEY\n"
30
+ exit 9
31
+ end
32
+
33
+ CONFIG.paths.each do |path|
34
+ STDOUT << "Read path name: #{path.name}, reader: #{path.reader}\n"
35
+
36
+ # Read existing index
37
+ STDOUT << "Read existing index: #{path.out}, time: @#{Time.now}\n"
38
+ index_db = {}
39
+ index_file = File.expand_path(path.out)
40
+
41
+ File.foreach(index_file) do |line|
42
+ item = JSON.parse(line)
43
+ index_db[item["hash"]] = item
44
+ end if File.exist?(index_file)
45
+ STDOUT << "Found index: #{index_db.length}\n"
46
+
47
+ # Scan directory
48
+ name_match = path.nameMatch || "*.{md,markdown}"
49
+ dir_blob = File.join(File.expand_path(path.dir), "**", name_match)
50
+ files = Dir[dir_blob]
51
+ STDOUT << "Scan dir: #{dir_blob}, Found: #{files.length}\n"
52
+
53
+ # Get reader class
54
+ reader_class = get_reader(path.reader)
55
+ if reader_class.nil?
56
+ STDOUT << "Reader undefined: #{path.reader}\n"
57
+ exit 9
58
+ end
59
+
60
+ # Build index
61
+ STDOUT << "Building index @#{Time.now}\n["
62
+ skipped = 0
63
+ created = 0
64
+ File.open(index_file, "w") do |index_newdb|
65
+ files.each_with_index do |file, file_idx|
66
+ chunks = reader_class.new(file).load.chunks
67
+
68
+ chunks.each_with_index do |chunk, chunk_idx|
69
+ hash = Digest::SHA256.hexdigest(chunk)
70
+
71
+ if index_db[hash] # found in old DB
72
+ index_newdb.puts(index_db[hash].to_json)
73
+
74
+ skipped += 1
75
+ next
76
+ end
77
+
78
+ created += 1
79
+ embedding = embedding(chunk)
80
+
81
+ line = { path: file, hash: hash, chunk: chunk_idx, embedding: embedding }
82
+ index_newdb.puts(line.to_json)
83
+ end
84
+
85
+ if file_idx % 50 == 0 # flush the file writes
86
+ index_newdb.flush
87
+ STDOUT << file_idx
88
+ else
89
+ STDOUT << "."
90
+ end
91
+ end
92
+ end
93
+
94
+ STDOUT << "]\nDone @#{Time.now}, Created: #{created}, Skipped: #{skipped}\n"
95
+ end
data/exe/run-server ADDED
@@ -0,0 +1,167 @@
1
+ #!/usr/bin/env ruby
2
+ # encoding: utf-8
3
+
4
+ # Query and answer questions based on an index file
5
+ #
6
+ # Usage: run-server config.json
7
+ #
8
+ # Requires OpenAI API Key stored in DOT_OPENAI_KEY
9
+
10
+ require "json"
11
+ require "ostruct"
12
+ require "sinatra"
13
+
14
+ require_relative "../server/retriever"
15
+ require_relative "../server/synthesizer"
16
+ require_relative "../server/discuss"
17
+
18
+ if ARGV.length != 1
19
+ STDOUT << "Invalid arguments received, need a config file\n"
20
+ exit 1
21
+ end
22
+
23
+ config = JSON.parse(File.read(ARGV[0]))
24
+ CONFIG = OpenStruct.new(config)
25
+ CONFIG.paths = CONFIG.paths.map { |p| OpenStruct.new(p) }
26
+ CONFIG.path_map = {}
27
+ CONFIG.paths.each { |p| CONFIG.path_map[p.name] = p }
28
+
29
+ OPENAI_KEY = ENV["DOT_OPENAI_KEY"] || ""
30
+ if OPENAI_KEY.empty?
31
+ STDOUT << "Remember to set env DOT_OPENAI_KEY\n"
32
+ exit 9
33
+ end
34
+
35
+ # list all the paths that can be searched
36
+ get '/paths' do
37
+ content_type :json
38
+
39
+ resp = []
40
+ CONFIG.paths.each do |p|
41
+ resp << { "name": p.name }
42
+ end
43
+ resp.to_json
44
+ end
45
+
46
+ # query within the paths
47
+ post '/q' do
48
+ content_type :json
49
+
50
+ data = JSON.parse(request.body.read)
51
+
52
+ lookup_paths = (data["paths"] || CONFIG.paths_map.keys).map do |name|
53
+ CONFIG.path_map[name]
54
+ end
55
+
56
+ topN = (data["topN"] || 20).to_i
57
+
58
+ q = data["q"]
59
+ entries = retrieve_by_embedding(lookup_paths, q)
60
+ if q.to_s.strip.length < 5 && q.to_s.split(/\s+/).length < 5
61
+ entries.concat(retrieve_by_text(lookup_paths, q))
62
+
63
+ unique = {}
64
+ entries.each do |e|
65
+ key = [e["path"], e["chunk"]]
66
+ if unique[key]
67
+ unique[key]["score"] = (unique[key]["score"] || 0) + (e["score"] || 0)
68
+ else
69
+ unique[key] = e
70
+ end
71
+ end
72
+
73
+ entries = unique.values
74
+ end
75
+ entries = entries.sort_by { |item| -item["score"] }.take(topN)
76
+
77
+ resp = {
78
+ data: [],
79
+ }
80
+
81
+ entries.each do |item|
82
+ resp[:data] << {
83
+ path: item["path"],
84
+ lookup: item["lookup"],
85
+ id: item["id"],
86
+ url: item["url"],
87
+ text: item["reader"].load.get_chunk(item["chunk"]),
88
+ score: item["score"],
89
+ }
90
+ end
91
+
92
+ resp.to_json
93
+ end
94
+
95
+ # agentic query - expand the query using LLM before searching
96
+ post '/q_plus' do
97
+ content_type :json
98
+
99
+ data = JSON.parse(request.body.read)
100
+
101
+ lookup_paths = (data["paths"] || CONFIG.paths_map.keys).map do |name|
102
+ CONFIG.path_map[name]
103
+ end
104
+
105
+ topN = (data["topN"] || 20).to_i
106
+
107
+ expanded_q = expand_query(data["q"])
108
+ variants = expand_variants(data["q"])
109
+
110
+ entries = []
111
+ entries.concat(retrieve_by_embedding(lookup_paths, data["q"]))
112
+ entries.concat(retrieve_by_embedding(lookup_paths, expanded_q))
113
+ variants.each { |v| entries.concat(retrieve_by_text(lookup_paths, v)) }
114
+
115
+ unique = {}
116
+ entries.each do |e|
117
+ key = [e["path"], e["chunk"]]
118
+ if unique[key]
119
+ unique[key]["score"] = (unique[key]["score"] || 0) + (e["score"] || 0)
120
+ else
121
+ unique[key] = e
122
+ end
123
+ end
124
+
125
+ ordered = unique.values.sort_by { |item| -item["score"] }.take(topN)
126
+
127
+ resp = {
128
+ data: [],
129
+ expanded: expanded_q,
130
+ variants: variants,
131
+ }
132
+
133
+ ordered.each do |item|
134
+ resp[:data] << {
135
+ path: item["path"],
136
+ lookup: item["lookup"],
137
+ id: item["id"],
138
+ url: item["url"],
139
+ text: item["reader"].load.get_chunk(item["chunk"]),
140
+ score: item["score"],
141
+ }
142
+ end
143
+
144
+ resp.to_json
145
+ end
146
+
147
+ # synthesize notes into a summary
148
+ post '/synthesize' do
149
+ content_type :json
150
+
151
+ data = JSON.parse(request.body.read)
152
+
153
+ summary = synthesize_notes(data["notes"])
154
+
155
+ { note: summary }.to_json
156
+ end
157
+
158
+ # generate discussion for a single note
159
+ post '/discuss' do
160
+ content_type :json
161
+
162
+ data = JSON.parse(request.body.read)
163
+
164
+ discussion = discuss_note(data["note"])
165
+
166
+ { discussion: discussion }.to_json
167
+ end
@@ -0,0 +1,3 @@
1
+ module SimpleRag
2
+ VERSION = "0.1.0"
3
+ end
data/lib/simple_rag.rb ADDED
@@ -0,0 +1,17 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "simple_rag/version"
4
+
5
+ # Adjust load path so require_relative works from gem
6
+ $LOAD_PATH.unshift File.expand_path("..", __dir__)
7
+
8
+ module SimpleRag
9
+ end
10
+
11
+ require "llm/openai"
12
+ require "llm/embedding"
13
+ require "readers/reader"
14
+ require "server/retriever"
15
+ require "server/synthesizer"
16
+ require "server/discuss"
17
+ require "storage/mem"
data/llm/embedding.rb ADDED
@@ -0,0 +1,19 @@
1
+
2
+ def cosine_similarity(array1, array2)
3
+ dot_product = 0.0
4
+ norm_a = 0.0
5
+ norm_b = 0.0
6
+
7
+ array1.each_with_index do |value1, index|
8
+ value2 = array2[index]
9
+
10
+ dot_product += value1 * value2
11
+ norm_a += value1 * value1
12
+ norm_b += value2 * value2
13
+ end
14
+
15
+ norm_a = Math.sqrt(norm_a)
16
+ norm_b = Math.sqrt(norm_b)
17
+
18
+ cosine_similarity = dot_product / (norm_a * norm_b)
19
+ end
data/llm/http.rb ADDED
@@ -0,0 +1,18 @@
1
+ require "net/http"
2
+ require "json"
3
+
4
+ def http_post(uri, auth, reqData)
5
+ url = URI(uri)
6
+
7
+ http = Net::HTTP.new(url.host, url.port)
8
+ http.use_ssl = true unless auth.nil?
9
+ http.read_timeout = 600 # Time in seconds
10
+
11
+ headers = { "Content-Type" => "application/json" }
12
+ headers["Authorization"] = "Bearer #{auth}" unless auth.nil?
13
+
14
+ request = Net::HTTP::Post.new(url, headers)
15
+ request.body = reqData.to_json
16
+
17
+ return http.request(request)
18
+ end
data/llm/ollama.rb ADDED
@@ -0,0 +1,19 @@
1
+ require_relative "http"
2
+
3
+ def embedding_ollama(txts, opts = {})
4
+ data = {
5
+ "model" => "nomic-embed-text",
6
+ "prompt" => txts
7
+ }.merge(opts)
8
+
9
+ uri = "http://localhost:11434/api/embeddings"
10
+ response = http_post(uri, nil, data)
11
+
12
+ if response.code != "200"
13
+ STDOUT << "Embedding error: #{response}\n"
14
+ exit 1
15
+ end
16
+
17
+ result = JSON.parse(response.body)
18
+ result["embedding"]
19
+ end
data/llm/openai.rb ADDED
@@ -0,0 +1,44 @@
1
+ require_relative "http"
2
+
3
+ ROLE_SYSTEM = "system"
4
+ ROLE_USER = "user"
5
+ ROLE_ASSISTANT = "assistant"
6
+ NEXT_ROLE = ->(role) { role != ROLE_USER ? ROLE_USER : ROLE_ASSISTANT }
7
+
8
+ def chat(messages, opts = {})
9
+ data = {
10
+ "model" => "gpt-4o-mini",
11
+ "messages" => messages
12
+ }.merge(opts)
13
+
14
+ uri = "https://api.openai.com/v1/chat/completions"
15
+ response = http_post(uri, OPENAI_KEY, data)
16
+
17
+ if response.code != "200"
18
+ STDOUT << "Chat error: #{response}\n"
19
+ exit 1
20
+ end
21
+
22
+ result = JSON.parse(response.body)
23
+ STDOUT << "Chat usage: #{result["usage"]}, model: #{data["model"]}\n"
24
+
25
+ result["choices"][0]["message"]["content"]
26
+ end
27
+
28
+ def embedding(txts, opts = {})
29
+ data = {
30
+ "model" => "text-embedding-3-small",
31
+ "input" => txts
32
+ }.merge(opts)
33
+
34
+ uri = "https://api.openai.com/v1/embeddings"
35
+ response = http_post(uri, OPENAI_KEY, data)
36
+
37
+ if response.code != "200"
38
+ STDOUT << "Embedding error: #{response.body}\n"
39
+ exit 1
40
+ end
41
+
42
+ result = JSON.parse(response.body)
43
+ result["data"][0]["embedding"]
44
+ end
@@ -0,0 +1,18 @@
1
+ require_relative "reader"
2
+
3
+ # check-reader reader filepath
4
+
5
+ reader = get_reader(ARGV[0])
6
+ if reader.nil?
7
+ STDOUT << "Reader #{ARGV[0]} not found\n"
8
+ exit 1
9
+ end
10
+
11
+ file = reader.new(ARGV[1])
12
+ file.load
13
+
14
+ STDOUT << "Print chunks #{ARGV[1]} [#{file.chunks.length}]:\n"
15
+
16
+ file.chunks.each do |chunk|
17
+ STDOUT << chunk << "\n---\n"
18
+ end
data/readers/note.rb ADDED
@@ -0,0 +1,74 @@
1
+
2
+ class NoteReader
3
+ HEADER_CONF = /^## (.+?) \[(.+?)\]$/
4
+ LINK = /^- \[([ xX])\] /
5
+
6
+ Note = Struct.new(:lineno, :body, :title, :done)
7
+
8
+ attr_accessor :file, :chunks, :notes
9
+
10
+ def initialize(file)
11
+ @file = file
12
+ @loaded = false
13
+ @chunks = []
14
+ @notes = []
15
+ end
16
+
17
+ def load
18
+ return self if @loaded
19
+
20
+ File.open(@file) do |file|
21
+ parse_conf(file)
22
+ end
23
+
24
+ @notes.each do |note|
25
+ next unless note.done
26
+ chunks << note.body.join("\n")
27
+ end
28
+
29
+ @loaded = true
30
+ self
31
+ end
32
+
33
+ # ## Title [Author - Conf]
34
+ #
35
+ # - [x] http://link
36
+ #
37
+ # **Summary:**
38
+ def parse_conf(file)
39
+ note = nil
40
+
41
+ file.each_line do |line|
42
+ line = line.chomp # remove crlf chars
43
+
44
+ if line =~ HEADER_CONF
45
+ # close the previous note
46
+ if !note.nil?
47
+ @notes << note
48
+ note = nil
49
+ end
50
+
51
+ note = Note.new
52
+ note.lineno = file.lineno
53
+ note.title = $1
54
+ note.body = [line]
55
+ elsif !note.nil?
56
+ if line =~ LINK # skip links in body
57
+ note.done = ($1 != ' ')
58
+ else
59
+ note.body << line unless line.strip.empty?
60
+ end
61
+ end
62
+ end
63
+
64
+ # append the last parsed note if the file does not end with another header
65
+ if !note.nil?
66
+ @notes << note
67
+ note = nil
68
+ end
69
+ end
70
+
71
+ def get_chunk(idx)
72
+ @chunks[idx || 0]
73
+ end
74
+ end
data/readers/reader.rb ADDED
@@ -0,0 +1,12 @@
1
+ def get_reader(name)
2
+ case name.downcase
3
+ when "text"
4
+ require_relative "text"
5
+ return TextReader
6
+ when "note"
7
+ require_relative "note"
8
+ return NoteReader
9
+ else
10
+ return nil
11
+ end
12
+ end
data/readers/text.rb ADDED
@@ -0,0 +1,34 @@
1
+
2
+ class TextReader
3
+ attr_accessor :file, :chunks
4
+
5
+ def initialize(file)
6
+ @file = file
7
+ @loaded = false
8
+ @chunks = []
9
+ end
10
+
11
+ def load
12
+ return self if @loaded
13
+
14
+ chunk = ""
15
+ File.foreach(@file) do |line|
16
+ if line.start_with?(/- .+:/) || line.start_with?(' - [[') # yaml like
17
+ next
18
+ elsif line.start_with?('<') # html like
19
+ next
20
+ else
21
+ chunk << line unless line.strip.empty?
22
+ end
23
+ end
24
+
25
+ @chunks << chunk
26
+ @loaded = true
27
+
28
+ self
29
+ end
30
+
31
+ def get_chunk(idx)
32
+ @chunks[idx || 0]
33
+ end
34
+ end
data/server/cache.rb ADDED
@@ -0,0 +1,30 @@
1
+ class MemCache
2
+ attr_accessor :data
3
+
4
+ def initialize
5
+ @data = {}
6
+ end
7
+
8
+ def set(data, val)
9
+ hash = Digest::SHA256.hexdigest(data)
10
+ @data[hash] = val
11
+ end
12
+
13
+ def get(data)
14
+ hash = Digest::SHA256.hexdigest(data)
15
+ @data[hash]
16
+ end
17
+
18
+ def get_or_set(data, fn)
19
+ hash = Digest::SHA256.hexdigest(data)
20
+ return @data[hash] if @data[hash]
21
+
22
+ STDOUT << "Set then get cache #{hash}\n"
23
+
24
+ val = fn.call(data)
25
+ @data[hash] = val
26
+ return val
27
+ end
28
+ end
29
+
30
+ CACHE = MemCache.new
data/server/discuss.rb ADDED
@@ -0,0 +1,19 @@
1
+ DISCUSS_PROMPT = <<~PROMPT
2
+ You provide a short discussion of a note from multiple perspectives.
3
+ Focus on explaining key concepts succinctly.
4
+ PROMPT
5
+
6
+ require_relative "../llm/openai"
7
+
8
+ # note: string
9
+ # Returns discussion text
10
+ def discuss_note(note)
11
+ return "" if note.nil? || note.strip.empty?
12
+
13
+ msgs = [
14
+ { role: ROLE_SYSTEM, content: DISCUSS_PROMPT },
15
+ { role: ROLE_USER, content: note },
16
+ ]
17
+
18
+ chat(msgs)
19
+ end
@@ -0,0 +1,124 @@
1
+ require "pathname"
2
+
3
+ require_relative "cache"
4
+
5
+ require_relative "../llm/openai"
6
+ require_relative "../llm/embedding"
7
+
8
+ require_relative "../readers/reader"
9
+
10
+ AGENT_PROMPT = <<~PROMPT
11
+ You expand a short search query so it is easier to retrieve related markdown
12
+ documents. Return only the expanded query in a single line.
13
+ PROMPT
14
+
15
+ def expand_query(q)
16
+ msgs = [
17
+ { role: ROLE_SYSTEM, content: AGENT_PROMPT },
18
+ { role: ROLE_USER, content: q },
19
+ ]
20
+ chat(msgs).strip
21
+ end
22
+
23
+ def retrieve_by_embedding(lookup_paths, q)
24
+ qe = CACHE.get_or_set(q, method(:embedding).to_proc)
25
+
26
+ entries = []
27
+ lookup_paths.each do |p|
28
+ STDOUT << "Reading index: #{p.name}\n"
29
+
30
+ index_file = File.expand_path(p.out)
31
+ unless File.exist?(index_file)
32
+ STDOUT << "Path not exists! path: #{index_file}\n"
33
+ next
34
+ end
35
+
36
+ reader = get_reader(p.reader)
37
+ if reader.nil?
38
+ STDOUT << "Reader undefinied! reader: #{path.reader}\n"
39
+ next
40
+ end
41
+
42
+ File.foreach(index_file) do |line|
43
+ item = JSON.parse(line)
44
+
45
+ score = cosine_similarity(qe, item["embedding"])
46
+ next if score < p.threshold
47
+
48
+ item["score"] = score
49
+ item["lookup"] = p.name
50
+ item["id"] = extract_id(item["path"])
51
+ item["url"] = extract_url(item["path"], p.url)
52
+ item["reader"] = reader.new(item["path"])
53
+
54
+ entries << item
55
+ end
56
+
57
+ STDOUT << "Matched num: #{entries.length}\n"
58
+ end
59
+
60
+ entries
61
+ end
62
+
63
+ def extract_id(file_path)
64
+ path = Pathname.new(file_path)
65
+ File.join(path.each_filename.to_a[-2..-1])
66
+ end
67
+
68
+ def extract_url(file_path, url)
69
+ if url
70
+ path = Pathname.new(file_path)
71
+ # Extract the filename without the extension
72
+ filename_without_extension = path.basename(path.extname).to_s
73
+ # Return the final URL
74
+ "#{url}#{filename_without_extension}"
75
+ else
76
+ "file://#{file_path}"
77
+ end
78
+ end
79
+
80
+ VARIANT_PROMPT = <<~PROMPT
81
+ You generate a few alternative short search queries for exact text match.
82
+ Return a JSON array of strings with three different variants.
83
+ PROMPT
84
+
85
+ def expand_variants(q)
86
+ msgs = [
87
+ { role: ROLE_SYSTEM, content: VARIANT_PROMPT },
88
+ { role: ROLE_USER, content: q },
89
+ ]
90
+ JSON.parse(chat(msgs)) rescue []
91
+ end
92
+
93
+ def retrieve_by_text(lookup_paths, q)
94
+ entries = []
95
+ lookup_paths.each do |p|
96
+ STDOUT << "Reading text index: #{p.name}\n"
97
+
98
+ index_file = File.expand_path(p.out)
99
+ next unless File.exist?(index_file)
100
+
101
+ reader_cls = get_reader(p.reader)
102
+ next if reader_cls.nil?
103
+
104
+ file_cache = {}
105
+ File.foreach(index_file) do |line|
106
+ item = JSON.parse(line)
107
+ reader = file_cache[item["path"]] ||= reader_cls.new(item["path"]).load
108
+ chunk_text = reader.get_chunk(item["chunk"])
109
+ next unless chunk_text&.include?(q)
110
+
111
+ item["score"] = 1.0
112
+ item["lookup"] = p.name
113
+ item["id"] = extract_id(item["path"])
114
+ item["url"] = extract_url(item["path"], p.url)
115
+ item["reader"] = reader
116
+
117
+ entries << item
118
+ end
119
+
120
+ STDOUT << "Matched num: #{entries.length}\n"
121
+ end
122
+
123
+ entries
124
+ end
@@ -0,0 +1,20 @@
1
+ SUM_PROMPT = """You are an expert at combining notes.
2
+ Given a collection of notes, synthesize them into a concise new note capturing the key points.
3
+ """
4
+
5
+ require_relative "../llm/openai"
6
+
7
+ # notes: array of strings
8
+ # Returns summary text
9
+ def synthesize_notes(notes)
10
+ return "" if notes.nil? || notes.empty?
11
+
12
+ msgs = [{ role: ROLE_SYSTEM, content: SUM_PROMPT }]
13
+ content = "Notes:\n"
14
+ notes.each do |n|
15
+ content << "<note>\n#{n}\n</note>\n"
16
+ end
17
+ msgs << { role: ROLE_USER, content: content }
18
+
19
+ chat(msgs)
20
+ end
data/storage/mem.rb ADDED
@@ -0,0 +1,48 @@
1
+ require 'json'
2
+
3
+ class MemStorage
4
+ def initialize
5
+ @storage = {}
6
+ end
7
+
8
+ # Load or create a table from a JSON line file
9
+ def load_or_create(table, filepath)
10
+ @storage[table] ||= {}
11
+ File.readlines(filepath).each do |line|
12
+ data = JSON.parse(line)
13
+ @storage[table][data['id']] = data
14
+ end
15
+ rescue Errno::ENOENT
16
+ puts "File not found: #{filepath}"
17
+ end
18
+
19
+ # Add an entry to a specific table
20
+ def add(table, entryid, entry)
21
+ @storage[table] ||= {}
22
+ @storage[table][entryid] = entry
23
+ end
24
+
25
+ # Get an entry by ID from a specific table
26
+ def get(table, entryid)
27
+ @storage.dig(table, entryid)
28
+ end
29
+
30
+ # Locate an entry across all tables
31
+ def locate(entryid)
32
+ @storage.each do |table, entries|
33
+ return { table: table, entry: entries[entryid] } if entries.has_key?(entryid)
34
+ end
35
+ nil
36
+ end
37
+
38
+ # Scan a table and apply a lambda to each entry
39
+ def scan(table)
40
+ if block_given?
41
+ @storage[table]&.each do |entryid, entry|
42
+ yield entryid, entry
43
+ end
44
+ else
45
+ raise ArgumentError, "No block given"
46
+ end
47
+ end
48
+ end
metadata ADDED
@@ -0,0 +1,93 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: simple-rag-zc
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Zhuochun
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2025-06-07 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: sinatra
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '4.1'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '4.1'
27
+ - !ruby/object:Gem::Dependency
28
+ name: puma
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '6.5'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '6.5'
41
+ description: Simple retrieval-augmented generation on markdown files
42
+ email:
43
+ - zhuochun@hotmail.com
44
+ executables:
45
+ - run-index
46
+ - run-server
47
+ extensions: []
48
+ extra_rdoc_files: []
49
+ files:
50
+ - README.md
51
+ - example_config.json
52
+ - exe/public/q.html
53
+ - exe/run-index
54
+ - exe/run-server
55
+ - lib/simple_rag.rb
56
+ - lib/simple_rag/version.rb
57
+ - llm/embedding.rb
58
+ - llm/http.rb
59
+ - llm/ollama.rb
60
+ - llm/openai.rb
61
+ - readers/check-reader.rb
62
+ - readers/note.rb
63
+ - readers/reader.rb
64
+ - readers/text.rb
65
+ - server/cache.rb
66
+ - server/discuss.rb
67
+ - server/retriever.rb
68
+ - server/synthesizer.rb
69
+ - storage/mem.rb
70
+ homepage: https://github.com/zhuochun/simple-rag
71
+ licenses:
72
+ - MIT
73
+ metadata: {}
74
+ post_install_message:
75
+ rdoc_options: []
76
+ require_paths:
77
+ - lib
78
+ required_ruby_version: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '3.0'
83
+ required_rubygems_version: !ruby/object:Gem::Requirement
84
+ requirements:
85
+ - - ">="
86
+ - !ruby/object:Gem::Version
87
+ version: '0'
88
+ requirements: []
89
+ rubygems_version: 3.3.7
90
+ signing_key:
91
+ specification_version: 4
92
+ summary: RAG on Markdown Files
93
+ test_files: []