raqa 0.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- raqa-0.0.1/LICENSE +0 -0
- raqa-0.0.1/PKG-INFO +123 -0
- raqa-0.0.1/README.md +102 -0
- raqa-0.0.1/pyproject.toml +34 -0
- raqa-0.0.1/setup.cfg +4 -0
- raqa-0.0.1/src/raqa/__init__.py +0 -0
- raqa-0.0.1/src/raqa/agent.py +56 -0
- raqa-0.0.1/src/raqa/cli.py +105 -0
- raqa-0.0.1/src/raqa/config.py +24 -0
- raqa-0.0.1/src/raqa/db.py +119 -0
- raqa-0.0.1/src/raqa/utils.py +49 -0
- raqa-0.0.1/src/raqa.egg-info/PKG-INFO +123 -0
- raqa-0.0.1/src/raqa.egg-info/SOURCES.txt +15 -0
- raqa-0.0.1/src/raqa.egg-info/dependency_links.txt +1 -0
- raqa-0.0.1/src/raqa.egg-info/entry_points.txt +2 -0
- raqa-0.0.1/src/raqa.egg-info/requires.txt +9 -0
- raqa-0.0.1/src/raqa.egg-info/top_level.txt +1 -0
raqa-0.0.1/LICENSE
ADDED
|
File without changes
|
raqa-0.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: raqa
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Author-email: Jordi Carrera Ventura <jordi.carrera.ventura@gmail.com>
|
|
5
|
+
Project-URL: GitHub repository, https://github.com
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: build
|
|
12
|
+
Requires-Dist: faiss-cpu
|
|
13
|
+
Requires-Dist: numpy
|
|
14
|
+
Requires-Dist: openai
|
|
15
|
+
Requires-Dist: python-frontmatter
|
|
16
|
+
Requires-Dist: sentence-transformers
|
|
17
|
+
Requires-Dist: tqdm
|
|
18
|
+
Requires-Dist: typer[all]
|
|
19
|
+
Requires-Dist: twine
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# RAQA
|
|
23
|
+
|
|
24
|
+
**R**etrieval-**A**ugmented **Q**uestion-**A**nswering
|
|
25
|
+
|
|
26
|
+
Retrieval-augmented, pip-installable, CLI-based question answering over arbitrary document collections.
|
|
27
|
+
|
|
28
|
+
# Usage
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
pip install raqa
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
**Locally**
|
|
37
|
+
|
|
38
|
+
`pip install -e .`
|
|
39
|
+
|
|
40
|
+
## Run
|
|
41
|
+
|
|
42
|
+
### BASH via Python interpreter
|
|
43
|
+
|
|
44
|
+
1. Build DB
|
|
45
|
+
|
|
46
|
+
`python cli.py build --path ./docs`
|
|
47
|
+
|
|
48
|
+
2. Chat
|
|
49
|
+
|
|
50
|
+
`python cli.py chat`
|
|
51
|
+
|
|
52
|
+
3. One-shot retrieval
|
|
53
|
+
|
|
54
|
+
`python cli.py search "what is retrieval augmented generation?"`
|
|
55
|
+
|
|
56
|
+
4. Rebuild and chat
|
|
57
|
+
|
|
58
|
+
`python cli.py rebuild-and-chat`
|
|
59
|
+
|
|
60
|
+
5. Get stats
|
|
61
|
+
|
|
62
|
+
`python cli.py stats`
|
|
63
|
+
|
|
64
|
+
### BASH natively
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
raqa build --path ./markdown_files
|
|
68
|
+
raqa chat
|
|
69
|
+
raqa search "what is RAG?"
|
|
70
|
+
raqa stats
|
|
71
|
+
raqa rebuild-and-chat
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
## Python
|
|
76
|
+
|
|
77
|
+
### Build database
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
from db import VectorDB
|
|
81
|
+
from config import MARKDOWN_ROOT
|
|
82
|
+
|
|
83
|
+
db = VectorDB()
|
|
84
|
+
db.build(MARKDOWN_ROOT)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Run
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
from agent import RAGAgent
|
|
91
|
+
|
|
92
|
+
agent = RAGAgent()
|
|
93
|
+
agent.chat()
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
## Build instructions
|
|
98
|
+
|
|
99
|
+
Next steps:
|
|
100
|
+
|
|
101
|
+
1. If any changes are made, update `pyproject.toml`.
|
|
102
|
+
2. Building the package before uploading:
|
|
103
|
+
`cd raqa; python -m build`.
|
|
104
|
+
3. Upload the package to pypi:
|
|
105
|
+
`python -m twine upload --repository {pypi|testpypi} dist/*`
|
|
106
|
+
|
|
107
|
+
## Next steps
|
|
108
|
+
|
|
109
|
+
### Real tool-calling (instead of implicit RAG)
|
|
110
|
+
|
|
111
|
+
Define OpenAI tool:
|
|
112
|
+
|
|
113
|
+
```
|
|
114
|
+
{
|
|
115
|
+
"name": "search_docs",
|
|
116
|
+
"description": "...",
|
|
117
|
+
"parameters": { "query": "string" }
|
|
118
|
+
}
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Hybrid search
|
|
122
|
+
|
|
123
|
+
Combine BM25 (rank-bm25) + embeddings
|
raqa-0.0.1/README.md
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# RAQA
|
|
2
|
+
|
|
3
|
+
**R**etrieval-**A**ugmented **Q**uestion-**A**nswering
|
|
4
|
+
|
|
5
|
+
Retrieval-augmented, pip-installable, CLI-based question answering over arbitrary document collections.
|
|
6
|
+
|
|
7
|
+
# Usage
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
pip install raqa
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
**Locally**
|
|
16
|
+
|
|
17
|
+
`pip install -e .`
|
|
18
|
+
|
|
19
|
+
## Run
|
|
20
|
+
|
|
21
|
+
### BASH via Python interpreter
|
|
22
|
+
|
|
23
|
+
1. Build DB
|
|
24
|
+
|
|
25
|
+
`python -m raqa.cli build --path ./docs`
|
|
26
|
+
|
|
27
|
+
2. Chat
|
|
28
|
+
|
|
29
|
+
`python cli.py chat`
|
|
30
|
+
|
|
31
|
+
3. One-shot retrieval
|
|
32
|
+
|
|
33
|
+
`python cli.py search "what is retrieval augmented generation?"`
|
|
34
|
+
|
|
35
|
+
4. Rebuild and chat
|
|
36
|
+
|
|
37
|
+
`python cli.py rebuild-and-chat`
|
|
38
|
+
|
|
39
|
+
5. Get stats
|
|
40
|
+
|
|
41
|
+
`python cli.py stats`
|
|
42
|
+
|
|
43
|
+
### BASH natively
|
|
44
|
+
|
|
45
|
+
```
|
|
46
|
+
raqa build --path ./markdown_files
|
|
47
|
+
raqa chat
|
|
48
|
+
raqa search "what is RAG?"
|
|
49
|
+
raqa stats
|
|
50
|
+
raqa rebuild-and-chat
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
## Python
|
|
55
|
+
|
|
56
|
+
### Build database
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
from db import VectorDB
|
|
60
|
+
from config import MARKDOWN_ROOT
|
|
61
|
+
|
|
62
|
+
db = VectorDB()
|
|
63
|
+
db.build(MARKDOWN_ROOT)
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### Run
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
from agent import RAGAgent
|
|
70
|
+
|
|
71
|
+
agent = RAGAgent()
|
|
72
|
+
agent.chat()
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
## Build instructions
|
|
77
|
+
|
|
78
|
+
Next steps:
|
|
79
|
+
|
|
80
|
+
1. If any changes are made, update `pyproject.toml`.
|
|
81
|
+
2. Building the package before uploading:
|
|
82
|
+
`cd raqa; python -m build`.
|
|
83
|
+
3. Upload the package to pypi:
|
|
84
|
+
`python -m twine upload --repository {pypi|testpypi} dist/*`
|
|
85
|
+
|
|
86
|
+
## Next steps
|
|
87
|
+
|
|
88
|
+
### Real tool-calling (instead of implicit RAG)
|
|
89
|
+
|
|
90
|
+
Define OpenAI tool:
|
|
91
|
+
|
|
92
|
+
```
|
|
93
|
+
{
|
|
94
|
+
"name": "search_docs",
|
|
95
|
+
"description": "...",
|
|
96
|
+
"parameters": { "query": "string" }
|
|
97
|
+
}
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Hybrid search
|
|
101
|
+
|
|
102
|
+
Combine BM25 (rank-bm25) + embeddings
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=75.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "raqa"
|
|
7
|
+
version = "0.0.1"
|
|
8
|
+
authors = [
|
|
9
|
+
{ name="Jordi Carrera Ventura", email="jordi.carrera.ventura@gmail.com" },
|
|
10
|
+
]
|
|
11
|
+
description = ""
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
requires-python = ">=3.10"
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Operating System :: OS Independent",
|
|
17
|
+
]
|
|
18
|
+
dependencies = [
|
|
19
|
+
"build",
|
|
20
|
+
"faiss-cpu",
|
|
21
|
+
"numpy",
|
|
22
|
+
"openai",
|
|
23
|
+
"python-frontmatter",
|
|
24
|
+
"sentence-transformers",
|
|
25
|
+
"tqdm",
|
|
26
|
+
"typer[all]",
|
|
27
|
+
"twine"
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.urls]
|
|
31
|
+
"GitHub repository" = "https://github.com"
|
|
32
|
+
|
|
33
|
+
[project.scripts]
|
|
34
|
+
raqa = "raqa.cli:app"
|
raqa-0.0.1/setup.cfg
ADDED
|
File without changes
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
from openai import OpenAI
|
|
2
|
+
|
|
3
|
+
from .db import VectorDB
|
|
4
|
+
from .utils import get_openai_key
|
|
5
|
+
|
|
6
|
+
class RAGAgent:
    """Conversational agent that augments each user turn with retrieved chunks."""

    def __init__(self):
        # Load the persisted vector database; raises if `build` was never run.
        self.db = VectorDB()
        self.db.load()

        # OpenAI client, keyed from ~/.raqa/env (prompting the user if absent).
        api_key = get_openai_key()
        self.client = OpenAI(api_key=api_key)

    def retrieve(self, query: str):
        """Return a newline-joined context string of nucleus-filtered search hits."""
        hits = self.db.search(query)
        kept = self.db.nucleus_filter(hits)

        sections = [
            f"[{hit['data']['source']}]\n{hit['data']['text']}"
            for hit in kept
        ]
        return "\n\n".join(sections)

    def chat(self):
        """Run an interactive REPL loop until the user types 'exit' or 'quit'."""
        print("💬 RAG Agent ready. Type 'exit' to quit.")

        messages = [{"role": "system", "content": "You are a helpful assistant."}]

        while True:
            user_input = input("\nYou: ")
            if user_input.lower() in ("exit", "quit"):
                break

            context = self.retrieve(user_input)

            augmented_prompt = f"""
Use the context below if relevant:

{context}

User question:
{user_input}
"""

            # Full history (including past retrieved contexts) is resent each turn.
            messages.append({"role": "user", "content": augmented_prompt})
            response = self.client.chat.completions.create(
                model="gpt-4.1-mini",
                messages=messages
            )

            reply = response.choices[0].message.content
            messages.append({"role": "assistant", "content": reply})

            print("\nAssistant:", reply)
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import typer
|
|
2
|
+
from typing import Optional
|
|
3
|
+
|
|
4
|
+
from .db import VectorDB
|
|
5
|
+
from .agent import RAGAgent
|
|
6
|
+
from .config import MARKDOWN_ROOT
|
|
7
|
+
|
|
8
|
+
app = typer.Typer(help="📚 Markdown RAG CLI")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
# ---------------------------
# BUILD DATABASE
# ---------------------------
@app.command()
def build(
    path: str = typer.Option(MARKDOWN_ROOT, help="Path to markdown folder")
):
    """
    Build vector database from markdown files.
    """
    # Ingest, embed and persist in one shot; VectorDB.build saves to disk itself.
    database = VectorDB()
    database.build(path)
    typer.secho("✅ Database built successfully.", fg=typer.colors.GREEN)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# ---------------------------
# SEARCH ONLY (DEBUG TOOL)
# ---------------------------
@app.command()
def search(
    query: str = typer.Argument(..., help="Search query"),
    k: int = typer.Option(10, help="Top K results")
):
    """
    Run retrieval without LLM (debugging).
    """
    database = VectorDB()
    database.load()

    hits = database.search(query, k=k)

    typer.secho("\n🔎 Raw Results:\n", fg=typer.colors.BLUE)

    # Print each hit: score, originating file, and a 500-char text preview.
    for rank, hit in enumerate(hits, start=1):
        typer.echo(f"\n--- Result {rank} ---")
        typer.echo(f"Score: {hit['score']:.4f}")
        typer.echo(f"Source: {hit['data']['source']}")
        typer.echo(hit["data"]["text"][:500])
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# ---------------------------
# CHAT (MAIN ENTRYPOINT)
# ---------------------------
@app.command()
def chat():
    """
    Start conversational RAG agent.
    """
    # RAGAgent.__init__ loads the DB and API key; .chat() blocks on the REPL.
    RAGAgent().chat()
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# ---------------------------
# REBUILD + CHAT (CONVENIENCE)
# ---------------------------
@app.command()
def rebuild_and_chat(
    path: str = typer.Option(MARKDOWN_ROOT, help="Markdown folder")
):
    """
    Rebuild database and immediately start chat.
    """
    # Fresh build first so the agent below loads the just-written index.
    database = VectorDB()
    database.build(path)

    typer.secho("\n🚀 Starting chat...\n", fg=typer.colors.GREEN)

    RAGAgent().chat()
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# ---------------------------
# INSPECT DB
# ---------------------------
@app.command()
def stats():
    """
    Show database stats.
    """
    database = VectorDB()
    database.load()

    # One metadata entry per indexed chunk, so its length is the chunk count.
    typer.echo("📊 Database Stats:")
    typer.echo(f"Total chunks: {len(database.metadata)}")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# Allow running this module directly (e.g. `python -m raqa.cli`); the installed
# `raqa` console script points at `app` via pyproject's [project.scripts].
if __name__ == "__main__":
    app()
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""Package-wide configuration constants for raqa (paths, model, retrieval knobs)."""

from pathlib import Path

# User folder for raqa configs (created below on import).
HOME = Path.home()
RAQA_HOME = HOME / ".raqa"

# OpenAI credential file: a single plain-text API key (read/written by utils.get_openai_key).
ENV_FILE = RAQA_HOME / "env"

# Database folder holding the FAISS index and chunk metadata (moved from project data).
DATA_DIR = RAQA_HOME / "data"

# Embeddings & chunk config.
# NOTE(review): "joeddav/xlm-roberta-large-xnli" is an NLI classification
# checkpoint, not a sentence-embedding model — confirm retrieval quality or
# switch to a sentence-transformers embedding model.
EMBEDDING_MODEL = "joeddav/xlm-roberta-large-xnli"
CHUNK_WINDOW = 3        # sentences of context on each side of the center sentence
TOP_K = 50              # candidates fetched per query before nucleus filtering
SIMILARITY_RADIUS = 0.4 # max score gap from the best hit kept by nucleus_filter

# Default markdown folder (can override via CLI)
MARKDOWN_ROOT = "./markdown_files"

# Ensure directories exist (side effect at import time, by design).
RAQA_HOME.mkdir(parents=True, exist_ok=True)
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import faiss
|
|
6
|
+
import numpy as np
|
|
7
|
+
import frontmatter
|
|
8
|
+
from sentence_transformers import SentenceTransformer
|
|
9
|
+
|
|
10
|
+
from .utils import split_sentences, window_chunks
|
|
11
|
+
from .config import *
|
|
12
|
+
|
|
13
|
+
# os.environ values must be strings — assigning the bool True raises
# "TypeError: str expected, not bool" at import time. "false" also silences
# the HuggingFace tokenizers fork-parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class VectorDB:
    """FAISS-backed vector store over sentence-windowed markdown chunks.

    `metadata` is a list parallel to the index rows: entry i describes the
    chunk embedded at index position i.
    """

    def __init__(self):
        self.model = SentenceTransformer(EMBEDDING_MODEL)
        self.index = None
        self.metadata = []

    def ingest_markdown(self, root_dir: str):
        """Recursively parse every *.md under root_dir into overlapping chunks.

        Returns a list of dicts with keys: text, source, meta (frontmatter
        metadata dict), position (center-sentence index in its file).
        """
        all_chunks = []

        for path in Path(root_dir).rglob("*.md"):
            with open(path, "r", encoding="utf-8") as f:
                post = frontmatter.load(f)

            text = post.content
            meta = post.metadata

            sentences = split_sentences(text)
            chunks = window_chunks(sentences, CHUNK_WINDOW)

            for c in chunks:
                all_chunks.append({
                    "text": c["text"],
                    "source": str(path),
                    "meta": meta,
                    "position": c["index"]
                })

        return all_chunks

    def build(self, root_dir: str):
        """Encode all chunks, build the HNSW index, and persist to DATA_DIR."""
        print("📥 Ingesting markdown...")
        chunks = self.ingest_markdown(root_dir)
        if not chunks:
            # Fail with a clear message instead of crashing later on an
            # empty embedding matrix (embeddings.shape[1] would raise).
            raise ValueError(f"No markdown chunks found under {root_dir!r}.")

        print("🧠 Encoding...")
        texts = [c["text"] for c in chunks]
        embeddings = self.model.encode(texts, show_progress_bar=True)

        dim = embeddings.shape[1]
        # Use the inner-product metric: with L2-normalized vectors this yields
        # cosine similarity (higher = better), which search()/nucleus_filter()
        # assume. The IndexHNSWFlat default (METRIC_L2) returns distances,
        # inverting the score ordering and making the radius filter a no-op.
        self.index = faiss.IndexHNSWFlat(dim, 32, faiss.METRIC_INNER_PRODUCT)

        # normalize for cosine similarity
        faiss.normalize_L2(embeddings)
        self.index.add(embeddings)

        self.metadata = chunks

        self.save()

    def save(self):
        """Write the FAISS index and chunk metadata under DATA_DIR."""
        DATA_DIR.mkdir(exist_ok=True)

        faiss.write_index(self.index, str(DATA_DIR / "index.faiss"))

        with open(DATA_DIR / "meta.json", "w", encoding="utf-8") as f:
            # Frontmatter metadata commonly contains datetime/date values,
            # which json cannot serialize; stringify them rather than crash.
            json.dump(self.metadata, f, default=str)

    def load(self):
        """Load a previously built index/metadata; raise FileNotFoundError if absent."""
        if not (DATA_DIR / "index.faiss").exists():
            raise FileNotFoundError(f"No database found at {DATA_DIR}. Please run build first.")

        self.index = faiss.read_index(str(DATA_DIR / "index.faiss"))

        with open(DATA_DIR / "meta.json", "r", encoding="utf-8") as f:
            self.metadata = json.load(f)

    def search(self, query: str, k=TOP_K):
        """Return up to k chunks ranked by cosine similarity to query (best first)."""
        q_emb = self.model.encode([query])
        faiss.normalize_L2(q_emb)

        scores, indices = self.index.search(q_emb, k)

        results = []
        for score, idx in zip(scores[0], indices[0]):
            if idx < 0:
                # FAISS pads with -1 when fewer than k results are available;
                # metadata[-1] would silently return the wrong chunk.
                continue
            results.append({
                "score": float(score),
                "data": self.metadata[idx]
            })

        return results

    def nucleus_filter(self, results):
        """Keep hits within SIMILARITY_RADIUS of the best score, then
        softmax-sample at most 10 of them.

        Assumes `results` is sorted best-first with higher score = more
        similar (true for the inner-product index built above).
        """
        if not results:
            return []

        best = results[0]["score"]

        filtered = [
            r for r in results
            if (best - r["score"]) <= SIMILARITY_RADIUS
        ]

        # softmax sampling: more-similar chunks are proportionally more
        # likely to be kept, without always returning the same top-10.
        scores = np.array([r["score"] for r in filtered])
        probs = np.exp(scores) / np.sum(np.exp(scores))

        sampled_indices = np.random.choice(
            len(filtered),
            size=min(len(filtered), 10),
            replace=False,
            p=probs
        )

        return [filtered[i] for i in sampled_indices]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import List
|
|
5
|
+
|
|
6
|
+
from .config import ENV_FILE
|
|
7
|
+
|
|
8
|
+
def split_sentences(text: str) -> List[str]:
    """Split *text* into sentences at whitespace following '.', '!' or '?'."""
    # Lightweight regex heuristic — good enough here, no NLP dependency.
    pieces = re.split(r'(?<=[.!?])\s+', text)
    return [piece.strip() for piece in pieces if piece.strip()]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def window_chunks(sentences: List[str], window: int = 3):
    """Build one chunk per sentence: the sentence plus up to `window`
    neighbouring sentences on each side, joined with spaces.

    Each chunk dict has keys: text, center (the focal sentence), index.
    """
    chunks = []

    for idx, center in enumerate(sentences):
        start = max(0, idx - window)
        neighbourhood = sentences[start:idx] + [center] + sentences[idx + 1: idx + 1 + window]

        chunks.append({
            "text": " ".join(neighbourhood),
            "center": center,
            "index": idx,
        })

    return chunks
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def get_openai_key() -> str:
    """
    Load the OpenAI API key from ENV_FILE, prompt user if missing.
    """
    # Reuse a previously saved key when the file exists and is non-empty.
    if ENV_FILE.exists():
        stored = ENV_FILE.read_text().strip()
        if stored:
            return stored

    # Otherwise ask once interactively and persist the answer for next time.
    print(f"🔑 OpenAI API key not found. Enter your key (it will be saved at {ENV_FILE}):")
    entered = input("API Key: ").strip()

    ENV_FILE.write_text(entered)
    print(f"✅ Key saved at {ENV_FILE}")
    return entered
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: raqa
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Author-email: Jordi Carrera Ventura <jordi.carrera.ventura@gmail.com>
|
|
5
|
+
Project-URL: GitHub repository, https://github.com
|
|
6
|
+
Classifier: Programming Language :: Python :: 3
|
|
7
|
+
Classifier: Operating System :: OS Independent
|
|
8
|
+
Requires-Python: >=3.10
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Requires-Dist: build
|
|
12
|
+
Requires-Dist: faiss-cpu
|
|
13
|
+
Requires-Dist: numpy
|
|
14
|
+
Requires-Dist: openai
|
|
15
|
+
Requires-Dist: python-frontmatter
|
|
16
|
+
Requires-Dist: sentence-transformers
|
|
17
|
+
Requires-Dist: tqdm
|
|
18
|
+
Requires-Dist: typer[all]
|
|
19
|
+
Requires-Dist: twine
|
|
20
|
+
Dynamic: license-file
|
|
21
|
+
|
|
22
|
+
# RAQA
|
|
23
|
+
|
|
24
|
+
**R**etrieval-**A**ugmented **Q**uestion-**A**nswering
|
|
25
|
+
|
|
26
|
+
Retrieval-augmented, pip-installable, CLI-based question answering over arbitrary document collections.
|
|
27
|
+
|
|
28
|
+
# Usage
|
|
29
|
+
|
|
30
|
+
## Installation
|
|
31
|
+
|
|
32
|
+
```
|
|
33
|
+
pip install raqa
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
**Locally**
|
|
37
|
+
|
|
38
|
+
`pip install -e .`
|
|
39
|
+
|
|
40
|
+
## Run
|
|
41
|
+
|
|
42
|
+
### BASH via Python interpreter
|
|
43
|
+
|
|
44
|
+
1. Build DB
|
|
45
|
+
|
|
46
|
+
`python cli.py build --path ./docs`
|
|
47
|
+
|
|
48
|
+
2. Chat
|
|
49
|
+
|
|
50
|
+
`python cli.py chat`
|
|
51
|
+
|
|
52
|
+
3. One-shot retrieval
|
|
53
|
+
|
|
54
|
+
`python cli.py search "what is retrieval augmented generation?"`
|
|
55
|
+
|
|
56
|
+
4. Rebuild and chat
|
|
57
|
+
|
|
58
|
+
`python cli.py rebuild-and-chat`
|
|
59
|
+
|
|
60
|
+
5. Get stats
|
|
61
|
+
|
|
62
|
+
`python cli.py stats`
|
|
63
|
+
|
|
64
|
+
### BASH natively
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
raqa build --path ./markdown_files
|
|
68
|
+
raqa chat
|
|
69
|
+
raqa search "what is RAG?"
|
|
70
|
+
raqa stats
|
|
71
|
+
raqa rebuild-and-chat
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
## Python
|
|
76
|
+
|
|
77
|
+
### Build database
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
from db import VectorDB
|
|
81
|
+
from config import MARKDOWN_ROOT
|
|
82
|
+
|
|
83
|
+
db = VectorDB()
|
|
84
|
+
db.build(MARKDOWN_ROOT)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### Run
|
|
88
|
+
|
|
89
|
+
```
|
|
90
|
+
from agent import RAGAgent
|
|
91
|
+
|
|
92
|
+
agent = RAGAgent()
|
|
93
|
+
agent.chat()
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
## Build instructions
|
|
98
|
+
|
|
99
|
+
Next steps:
|
|
100
|
+
|
|
101
|
+
1. If any changes are made, update `pyproject.toml`.
|
|
102
|
+
2. Building the package before uploading:
|
|
103
|
+
`cd raqa; python -m build`.
|
|
104
|
+
3. Upload the package to pypi:
|
|
105
|
+
`python -m twine upload --repository {pypi|testpypi} dist/*`
|
|
106
|
+
|
|
107
|
+
## Next steps
|
|
108
|
+
|
|
109
|
+
### Real tool-calling (instead of implicit RAG)
|
|
110
|
+
|
|
111
|
+
Define OpenAI tool:
|
|
112
|
+
|
|
113
|
+
```
|
|
114
|
+
{
|
|
115
|
+
"name": "search_docs",
|
|
116
|
+
"description": "...",
|
|
117
|
+
"parameters": { "query": "string" }
|
|
118
|
+
}
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Hybrid search
|
|
122
|
+
|
|
123
|
+
Combine BM25 (rank-bm25) + embeddings
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/raqa/__init__.py
|
|
5
|
+
src/raqa/agent.py
|
|
6
|
+
src/raqa/cli.py
|
|
7
|
+
src/raqa/config.py
|
|
8
|
+
src/raqa/db.py
|
|
9
|
+
src/raqa/utils.py
|
|
10
|
+
src/raqa.egg-info/PKG-INFO
|
|
11
|
+
src/raqa.egg-info/SOURCES.txt
|
|
12
|
+
src/raqa.egg-info/dependency_links.txt
|
|
13
|
+
src/raqa.egg-info/entry_points.txt
|
|
14
|
+
src/raqa.egg-info/requires.txt
|
|
15
|
+
src/raqa.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
raqa
|