raqa 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
raqa-0.0.1/LICENSE ADDED
File without changes
raqa-0.0.1/PKG-INFO ADDED
@@ -0,0 +1,123 @@
1
+ Metadata-Version: 2.4
2
+ Name: raqa
3
+ Version: 0.0.1
4
+ Author-email: Jordi Carrera Ventura <jordi.carrera.ventura@gmail.com>
5
+ Project-URL: GitHub repository, https://github.com
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: Operating System :: OS Independent
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: build
12
+ Requires-Dist: faiss-cpu
13
+ Requires-Dist: numpy
14
+ Requires-Dist: openai
15
+ Requires-Dist: python-frontmatter
16
+ Requires-Dist: sentence-transformers
17
+ Requires-Dist: tqdm
18
+ Requires-Dist: typer[all]
19
+ Requires-Dist: twine
20
+ Dynamic: license-file
21
+
22
+ # RAQA
23
+
24
+ **R**etrieval-**A**ugmented **Q**uestion-**A**nswering
25
+
26
+ Retrieval-augmented, pip-installable, CLI-based question answering over arbitrary document collections.
27
+
28
+ # Usage
29
+
30
+ ## Installation
31
+
32
+ ```
33
+ pip install raqa
34
+ ```
35
+
36
+ **Locally**
37
+
38
+ `pip install -e .`
39
+
40
+ ## Run
41
+
42
+ ### BASH via Python interpreter
43
+
44
+ 1. Build DB
45
+
46
+ `python cli.py build --path ./docs`
47
+
48
+ 2. Chat
49
+
50
+ `python cli.py chat`
51
+
52
+ 3. One-shot retrieval
53
+
54
+ `python cli.py search "what is retrieval augmented generation?"`
55
+
56
+ 4. Rebuild and chat
57
+
58
+ `python cli.py rebuild-and-chat`
59
+
60
+ 5. Get stats
61
+
62
+ `python cli.py stats`
63
+
64
+ ### BASH natively
65
+
66
+ ```
67
+ raqa build --path ./markdown_files
68
+ raqa chat
69
+ raqa search "what is RAG?"
70
+ raqa stats
71
+ raqa rebuild-and-chat
72
+ ```
73
+
74
+
75
+ ## Python
76
+
77
+ ### Build database
78
+
79
+ ```
80
+ from db import VectorDB
81
+ from config import MARKDOWN_ROOT
82
+
83
+ db = VectorDB()
84
+ db.build(MARKDOWN_ROOT)
85
+ ```
86
+
87
+ ### Run
88
+
89
+ ```
90
+ from agent import RAGAgent
91
+
92
+ agent = RAGAgent()
93
+ agent.chat()
94
+ ```
95
+
96
+
97
+ ## Build instructions
98
+
99
+ Next steps:
100
+
101
+ 1. If any changes are made, update `pyproject.toml`.
102
+ 2. Build the package before uploading:
103
+ `cd raqa; python -m build`.
104
+ 3. Upload the package to pypi:
105
+ `python -m twine upload --repository {pypi|testpypi} dist/*`
106
+
107
+ ## Next steps
108
+
109
+ ### Real tool-calling (instead of implicit RAG)
110
+
111
+ Define OpenAI tool:
112
+
113
+ ```
114
+ {
115
+ "name": "search_docs",
116
+ "description": "...",
117
+ "parameters": { "query": "string" }
118
+ }
119
+ ```
120
+
121
+ ### Hybrid search
122
+
123
+ Combine BM25 (rank-bm25) + embeddings
raqa-0.0.1/README.md ADDED
@@ -0,0 +1,102 @@
1
+ # RAQA
2
+
3
+ **R**etrieval-**A**ugmented **Q**uestion-**A**nswering
4
+
5
+ Retrieval-augmented, pip-installable, CLI-based question answering over arbitrary document collections.
6
+
7
+ # Usage
8
+
9
+ ## Installation
10
+
11
+ ```
12
+ pip install raqa
13
+ ```
14
+
15
+ **Locally**
16
+
17
+ `pip install -e .`
18
+
19
+ ## Run
20
+
21
+ ### BASH via Python interpreter
22
+
23
+ 1. Build DB
24
+
25
+ `python cli.py build --path ./docs`
26
+
27
+ 2. Chat
28
+
29
+ `python cli.py chat`
30
+
31
+ 3. One-shot retrieval
32
+
33
+ `python cli.py search "what is retrieval augmented generation?"`
34
+
35
+ 4. Rebuild and chat
36
+
37
+ `python cli.py rebuild-and-chat`
38
+
39
+ 5. Get stats
40
+
41
+ `python cli.py stats`
42
+
43
+ ### BASH natively
44
+
45
+ ```
46
+ raqa build --path ./markdown_files
47
+ raqa chat
48
+ raqa search "what is RAG?"
49
+ raqa stats
50
+ raqa rebuild-and-chat
51
+ ```
52
+
53
+
54
+ ## Python
55
+
56
+ ### Build database
57
+
58
+ ```
59
+ from db import VectorDB
60
+ from config import MARKDOWN_ROOT
61
+
62
+ db = VectorDB()
63
+ db.build(MARKDOWN_ROOT)
64
+ ```
65
+
66
+ ### Run
67
+
68
+ ```
69
+ from agent import RAGAgent
70
+
71
+ agent = RAGAgent()
72
+ agent.chat()
73
+ ```
74
+
75
+
76
+ ## Build instructions
77
+
78
+ Next steps:
79
+
80
+ 1. If any changes are made, update `pyproject.toml`.
81
+ 2. Build the package before uploading:
82
+ `cd raqa; python -m build`.
83
+ 3. Upload the package to pypi:
84
+ `python -m twine upload --repository {pypi|testpypi} dist/*`
85
+
86
+ ## Next steps
87
+
88
+ ### Real tool-calling (instead of implicit RAG)
89
+
90
+ Define OpenAI tool:
91
+
92
+ ```
93
+ {
94
+ "name": "search_docs",
95
+ "description": "...",
96
+ "parameters": { "query": "string" }
97
+ }
98
+ ```
99
+
100
+ ### Hybrid search
101
+
102
+ Combine BM25 (rank-bm25) + embeddings
@@ -0,0 +1,34 @@
1
+ [build-system]
2
+ requires = ["setuptools>=75.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "raqa"
7
+ version = "0.0.1"
8
+ authors = [
9
+ { name="Jordi Carrera Ventura", email="jordi.carrera.ventura@gmail.com" },
10
+ ]
11
+ description = ""
12
+ readme = "README.md"
13
+ requires-python = ">=3.10"
14
+ classifiers = [
15
+ "Programming Language :: Python :: 3",
16
+ "Operating System :: OS Independent",
17
+ ]
18
+ dependencies = [
19
+ "build",
20
+ "faiss-cpu",
21
+ "numpy",
22
+ "openai",
23
+ "python-frontmatter",
24
+ "sentence-transformers",
25
+ "tqdm",
26
+ "typer[all]",
27
+ "twine"
28
+ ]
29
+
30
+ [project.urls]
31
+ "GitHub repository" = "https://github.com"
32
+
33
+ [project.scripts]
34
+ raqa = "raqa.cli:app"
raqa-0.0.1/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,56 @@
1
+ from openai import OpenAI
2
+
3
+ from .db import VectorDB
4
+ from .utils import get_openai_key
5
+
6
class RAGAgent:
    """Conversational agent that augments each user turn with retrieved context.

    Every question is sent to the LLM together with a block of
    nucleus-filtered chunks retrieved from the local ``VectorDB``.
    """

    def __init__(self):
        # Load the persisted vector database; VectorDB.load() raises
        # FileNotFoundError if `build` has never been run.
        self.db = VectorDB()
        self.db.load()

        api_key = get_openai_key()
        self.client = OpenAI(api_key=api_key)

    def retrieve(self, query: str):
        """Return a single context string of nucleus-filtered chunks for *query*.

        Each chunk is rendered as ``[source]\\ntext`` and chunks are joined
        with blank lines.
        """
        results = self.db.search(query)
        filtered = self.db.nucleus_filter(results)

        context = "\n\n".join(
            f"[{r['data']['source']}]\n{r['data']['text']}"
            for r in filtered
        )

        return context

    def chat(self):
        """Run an interactive REPL; 'exit'/'quit' (or EOF / Ctrl-C) ends it."""
        print("💬 RAG Agent ready. Type 'exit' to quit.")

        messages = [{"role": "system", "content": "You are a helpful assistant."}]

        while True:
            try:
                user_input = input("\nYou: ")
            except (EOFError, KeyboardInterrupt):
                # Without this guard, Ctrl-D / Ctrl-C kills the session
                # with a traceback instead of exiting cleanly.
                print()
                break
            if user_input.lower() in ("exit", "quit"):
                break

            context = self.retrieve(user_input)

            # Build the prompt explicitly instead of an indented triple-quoted
            # f-string, which would embed the method's source indentation into
            # every line sent to the model.
            augmented_prompt = (
                "Use the context below if relevant:\n\n"
                f"{context}\n\n"
                "User question:\n"
                f"{user_input}"
            )

            messages.append({"role": "user", "content": augmented_prompt})

            # Full history (including previously injected contexts) is resent
            # each turn; fine for short sessions, may hit context limits in
            # long ones.
            response = self.client.chat.completions.create(
                model="gpt-4.1-mini",
                messages=messages,
            )

            reply = response.choices[0].message.content
            messages.append({"role": "assistant", "content": reply})

            print("\nAssistant:", reply)
@@ -0,0 +1,105 @@
1
+ import typer
2
+ from typing import Optional
3
+
4
+ from .db import VectorDB
5
+ from .agent import RAGAgent
6
+ from .config import MARKDOWN_ROOT
7
+
8
# Single Typer application object; also exposed as the installed `raqa`
# console script via [project.scripts] in pyproject.toml.
app = typer.Typer(help="📚 Markdown RAG CLI")
9
+
10
+
11
+ # ---------------------------
12
+ # BUILD DATABASE
13
+ # ---------------------------
14
@app.command()
def build(
    path: str = typer.Option(MARKDOWN_ROOT, help="Path to markdown folder")
):
    """
    Build vector database from markdown files.
    """
    database = VectorDB()
    database.build(path)
    typer.secho("✅ Database built successfully.", fg=typer.colors.GREEN)
28
+
29
+
30
+ # ---------------------------
31
+ # SEARCH ONLY (DEBUG TOOL)
32
+ # ---------------------------
33
@app.command()
def search(
    query: str = typer.Argument(..., help="Search query"),
    k: int = typer.Option(10, help="Top K results")
):
    """
    Run retrieval without LLM (debugging).
    """
    database = VectorDB()
    database.load()

    hits = database.search(query, k=k)

    typer.secho("\n🔎 Raw Results:\n", fg=typer.colors.BLUE)

    # enumerate(..., start=1) yields the same 1-based numbering as i+1.
    for position, hit in enumerate(hits, start=1):
        typer.echo(f"\n--- Result {position} ---")
        typer.echo(f"Score: {hit['score']:.4f}")
        typer.echo(f"Source: {hit['data']['source']}")
        typer.echo(hit["data"]["text"][:500])
53
+
54
+
55
+ # ---------------------------
56
+ # CHAT (MAIN ENTRYPOINT)
57
+ # ---------------------------
58
@app.command()
def chat():
    """
    Start conversational RAG agent.
    """
    RAGAgent().chat()
65
+
66
+
67
+ # ---------------------------
68
+ # REBUILD + CHAT (CONVENIENCE)
69
+ # ---------------------------
70
@app.command()
def rebuild_and_chat(
    path: str = typer.Option(MARKDOWN_ROOT, help="Markdown folder")
):
    """
    Rebuild database and immediately start chat.
    """
    VectorDB().build(path)

    typer.secho("\n🚀 Starting chat...\n", fg=typer.colors.GREEN)

    # RAGAgent loads the freshly written database from disk in __init__.
    RAGAgent().chat()
87
+
88
+
89
+ # ---------------------------
90
+ # INSPECT DB
91
+ # ---------------------------
92
@app.command()
def stats():
    """
    Show database stats.
    """
    database = VectorDB()
    database.load()

    typer.echo("📊 Database Stats:")
    typer.echo(f"Total chunks: {len(database.metadata)}")
102
+
103
+
104
if __name__ == "__main__":
    # Allow `python cli.py ...` in addition to the installed `raqa` script.
    app()
@@ -0,0 +1,24 @@
1
from pathlib import Path

# User folder for raqa configs
HOME = Path.home()
RAQA_HOME = HOME / ".raqa"

# OpenAI credential file (plain text; read/written by utils.get_openai_key)
ENV_FILE = RAQA_HOME / "env"

# Database folder (moved from project data); holds index.faiss + meta.json
DATA_DIR = RAQA_HOME / "data"

# Embeddings & chunk config
# NOTE(review): "joeddav/xlm-roberta-large-xnli" is an XNLI classification
# model, not a sentence-embedding model — confirm this is the intended
# encoder for SentenceTransformer.
EMBEDDING_MODEL = "joeddav/xlm-roberta-large-xnli"
# Sentences of context on each side of the center sentence per chunk.
CHUNK_WINDOW = 3
# Default number of nearest neighbours returned by VectorDB.search.
TOP_K = 50
# Maximum score gap from the best hit tolerated by nucleus_filter.
SIMILARITY_RADIUS = 0.4

# Default markdown folder (can override via CLI)
MARKDOWN_ROOT = "./markdown_files"

# Ensure directories exist (import-time side effect, deliberate)
RAQA_HOME.mkdir(parents=True, exist_ok=True)
DATA_DIR.mkdir(parents=True, exist_ok=True)
@@ -0,0 +1,119 @@
1
+ import os
2
+ import json
3
+ from pathlib import Path
4
+
5
+ import faiss
6
+ import numpy as np
7
+ import frontmatter
8
+ from sentence_transformers import SentenceTransformer
9
+
10
+ from .utils import split_sentences, window_chunks
11
+ from .config import *
12
+
13
# os.environ values must be strings — assigning the boolean True raises
# TypeError at import time. Use the string "false" to silence HuggingFace
# tokenizers' fork-parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
14
+
15
+
16
class VectorDB:
    """FAISS-backed vector store over sentence-window chunks of markdown files.

    Persists the index (index.faiss) and chunk metadata (meta.json) under
    DATA_DIR; `build` creates both, `load` restores them.
    """

    def __init__(self):
        # Encoder is loaded eagerly; EMBEDDING_MODEL comes from config.
        self.model = SentenceTransformer(EMBEDDING_MODEL)
        self.index = None
        self.metadata = []

    def ingest_markdown(self, root_dir: str):
        """Parse every *.md under *root_dir* into overlapping sentence-window chunks.

        Returns a list of dicts with keys: text, source, meta, position.
        """
        all_chunks = []

        for path in Path(root_dir).rglob("*.md"):
            with open(path, "r", encoding="utf-8") as f:
                post = frontmatter.load(f)

            sentences = split_sentences(post.content)
            chunks = window_chunks(sentences, CHUNK_WINDOW)

            for c in chunks:
                all_chunks.append({
                    "text": c["text"],
                    "source": str(path),
                    # NOTE(review): frontmatter values must be
                    # JSON-serializable for save() — confirm (e.g. dates
                    # would break json.dump).
                    "meta": post.metadata,
                    "position": c["index"],
                })

        return all_chunks

    def build(self, root_dir: str):
        """Ingest, embed, index, and persist all markdown under *root_dir*."""
        print("📥 Ingesting markdown...")
        chunks = self.ingest_markdown(root_dir)

        print("🧠 Encoding...")
        texts = [c["text"] for c in chunks]
        embeddings = self.model.encode(texts, show_progress_bar=True)

        dim = embeddings.shape[1]
        # IndexHNSWFlat defaults to the L2 metric; vectors are L2-normalized
        # below so L2 distance is monotonic in cosine similarity
        # (d = 2 - 2*cos for unit vectors). search() converts distances back
        # to cosine-style scores.
        self.index = faiss.IndexHNSWFlat(dim, 32)

        faiss.normalize_L2(embeddings)
        self.index.add(embeddings)

        self.metadata = chunks

        self.save()

    def save(self):
        """Write index.faiss and meta.json under DATA_DIR."""
        # parents=True so save works even if RAQA_HOME was removed after import.
        DATA_DIR.mkdir(parents=True, exist_ok=True)

        faiss.write_index(self.index, str(DATA_DIR / "index.faiss"))

        with open(DATA_DIR / "meta.json", "w", encoding="utf-8") as f:
            json.dump(self.metadata, f)

    def load(self):
        """Restore index and metadata from DATA_DIR.

        Raises:
            FileNotFoundError: if the database was never built.
        """
        if not (DATA_DIR / "index.faiss").exists():
            raise FileNotFoundError(f"No database found at {DATA_DIR}. Please run build first.")

        self.index = faiss.read_index(str(DATA_DIR / "index.faiss"))

        with open(DATA_DIR / "meta.json", "r", encoding="utf-8") as f:
            self.metadata = json.load(f)

    def search(self, query: str, k=TOP_K):
        """Return up to *k* hits as dicts with 'score' (higher=better) and 'data'."""
        q_emb = self.model.encode([query])
        faiss.normalize_L2(q_emb)

        distances, indices = self.index.search(q_emb, k)

        results = []
        for dist, idx in zip(distances[0], indices[0]):
            if idx < 0:
                # faiss pads with -1 when fewer than k vectors exist; without
                # this guard, -1 silently indexed metadata[-1].
                continue
            # Convert the (squared) L2 distance between unit vectors to a
            # cosine-style similarity: cos = 1 - d/2. nucleus_filter assumes
            # higher-is-better scores, which raw L2 distances violate.
            results.append({
                "score": float(1.0 - dist / 2.0),
                "data": self.metadata[idx],
            })

        return results

    def nucleus_filter(self, results):
        """Keep hits within SIMILARITY_RADIUS of the best score, then sample.

        Sampling is softmax-weighted without replacement, at most 10 hits;
        note this is intentionally non-deterministic.
        """
        if not results:
            return []

        # results arrive sorted best-first, so results[0] holds the top score.
        best = results[0]["score"]

        filtered = [
            r for r in results
            if (best - r["score"]) <= SIMILARITY_RADIUS
        ]

        # softmax sampling
        scores = np.array([r["score"] for r in filtered])
        probs = np.exp(scores) / np.sum(np.exp(scores))

        sampled_indices = np.random.choice(
            len(filtered),
            size=min(len(filtered), 10),
            replace=False,
            p=probs,
        )

        return [filtered[i] for i in sampled_indices]
@@ -0,0 +1,49 @@
1
+ import os
2
+ import re
3
+ from pathlib import Path
4
+ from typing import List
5
+
6
+ from .config import ENV_FILE
7
+
8
def split_sentences(text: str) -> List[str]:
    """Split *text* into sentences at '.', '!' or '?' followed by whitespace."""
    pieces = (piece.strip() for piece in re.split(r'(?<=[.!?])\s+', text))
    return [piece for piece in pieces if piece]
12
+
13
+
14
def window_chunks(sentences: List[str], window: int = 3):
    """Build one chunk per sentence: the sentence plus up to *window*
    neighbours on each side, joined with spaces.

    Each chunk is a dict with keys 'text', 'center', and 'index'.
    """
    chunks = []

    for idx, center in enumerate(sentences):
        lo = max(0, idx - window)
        neighborhood = sentences[lo:idx] + [center] + sentences[idx + 1: idx + 1 + window]

        chunks.append({
            "text": " ".join(neighborhood),
            "center": center,
            "index": idx,
        })

    return chunks
31
+
32
+
33
def get_openai_key() -> str:
    """
    Load the OpenAI API key from ENV_FILE, prompting the user if missing.

    Returns:
        A non-empty API key string.
    """
    if ENV_FILE.exists():
        key = ENV_FILE.read_text().strip()
        if key:
            return key

    # Prompt until the user supplies a non-empty key; previously an empty
    # answer was saved to disk and returned, breaking every later OpenAI call.
    print(f"🔑 OpenAI API key not found. Enter your key (it will be saved at {ENV_FILE}):")
    key = input("API Key: ").strip()
    while not key:
        key = input("API Key: ").strip()

    # Persist the credential; restrict to owner-only permissions since this
    # is a secret (best effort on platforms without POSIX modes).
    ENV_FILE.parent.mkdir(parents=True, exist_ok=True)
    ENV_FILE.write_text(key)
    try:
        os.chmod(ENV_FILE, 0o600)
    except OSError:
        pass
    print(f"✅ Key saved at {ENV_FILE}")
    return key
@@ -0,0 +1,123 @@
1
+ Metadata-Version: 2.4
2
+ Name: raqa
3
+ Version: 0.0.1
4
+ Author-email: Jordi Carrera Ventura <jordi.carrera.ventura@gmail.com>
5
+ Project-URL: GitHub repository, https://github.com
6
+ Classifier: Programming Language :: Python :: 3
7
+ Classifier: Operating System :: OS Independent
8
+ Requires-Python: >=3.10
9
+ Description-Content-Type: text/markdown
10
+ License-File: LICENSE
11
+ Requires-Dist: build
12
+ Requires-Dist: faiss-cpu
13
+ Requires-Dist: numpy
14
+ Requires-Dist: openai
15
+ Requires-Dist: python-frontmatter
16
+ Requires-Dist: sentence-transformers
17
+ Requires-Dist: tqdm
18
+ Requires-Dist: typer[all]
19
+ Requires-Dist: twine
20
+ Dynamic: license-file
21
+
22
+ # RAQA
23
+
24
+ **R**etrieval-**A**ugmented **Q**uestion-**A**nswering
25
+
26
+ Retrieval-augmented, pip-installable, CLI-based question answering over arbitrary document collections.
27
+
28
+ # Usage
29
+
30
+ ## Installation
31
+
32
+ ```
33
+ pip install raqa
34
+ ```
35
+
36
+ **Locally**
37
+
38
+ `pip install -e .`
39
+
40
+ ## Run
41
+
42
+ ### BASH via Python interpreter
43
+
44
+ 1. Build DB
45
+
46
+ `python cli.py build --path ./docs`
47
+
48
+ 2. Chat
49
+
50
+ `python cli.py chat`
51
+
52
+ 3. One-shot retrieval
53
+
54
+ `python cli.py search "what is retrieval augmented generation?"`
55
+
56
+ 4. Rebuild and chat
57
+
58
+ `python cli.py rebuild-and-chat`
59
+
60
+ 5. Get stats
61
+
62
+ `python cli.py stats`
63
+
64
+ ### BASH natively
65
+
66
+ ```
67
+ raqa build --path ./markdown_files
68
+ raqa chat
69
+ raqa search "what is RAG?"
70
+ raqa stats
71
+ raqa rebuild-and-chat
72
+ ```
73
+
74
+
75
+ ## Python
76
+
77
+ ### Build database
78
+
79
+ ```
80
+ from db import VectorDB
81
+ from config import MARKDOWN_ROOT
82
+
83
+ db = VectorDB()
84
+ db.build(MARKDOWN_ROOT)
85
+ ```
86
+
87
+ ### Run
88
+
89
+ ```
90
+ from agent import RAGAgent
91
+
92
+ agent = RAGAgent()
93
+ agent.chat()
94
+ ```
95
+
96
+
97
+ ## Build instructions
98
+
99
+ Next steps:
100
+
101
+ 1. If any changes are made, update `pyproject.toml`.
102
+ 2. Build the package before uploading:
103
+ `cd raqa; python -m build`.
104
+ 3. Upload the package to pypi:
105
+ `python -m twine upload --repository {pypi|testpypi} dist/*`
106
+
107
+ ## Next steps
108
+
109
+ ### Real tool-calling (instead of implicit RAG)
110
+
111
+ Define OpenAI tool:
112
+
113
+ ```
114
+ {
115
+ "name": "search_docs",
116
+ "description": "...",
117
+ "parameters": { "query": "string" }
118
+ }
119
+ ```
120
+
121
+ ### Hybrid search
122
+
123
+ Combine BM25 (rank-bm25) + embeddings
@@ -0,0 +1,15 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/raqa/__init__.py
5
+ src/raqa/agent.py
6
+ src/raqa/cli.py
7
+ src/raqa/config.py
8
+ src/raqa/db.py
9
+ src/raqa/utils.py
10
+ src/raqa.egg-info/PKG-INFO
11
+ src/raqa.egg-info/SOURCES.txt
12
+ src/raqa.egg-info/dependency_links.txt
13
+ src/raqa.egg-info/entry_points.txt
14
+ src/raqa.egg-info/requires.txt
15
+ src/raqa.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ raqa = raqa.cli:app
@@ -0,0 +1,9 @@
1
+ build
2
+ faiss-cpu
3
+ numpy
4
+ openai
5
+ python-frontmatter
6
+ sentence-transformers
7
+ tqdm
8
+ typer[all]
9
+ twine
@@ -0,0 +1 @@
1
+ raqa