raqa 1.0.2__tar.gz → 3.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {raqa-1.0.2/src/raqa.egg-info → raqa-3.0.0}/PKG-INFO +8 -10
- {raqa-1.0.2 → raqa-3.0.0}/README.md +3 -3
- {raqa-1.0.2 → raqa-3.0.0}/pyproject.toml +6 -9
- raqa-3.0.0/src/raqa/__main__.py +3 -0
- raqa-3.0.0/src/raqa/_agent.py +63 -0
- raqa-3.0.0/src/raqa/_app.py +41 -0
- raqa-3.0.0/src/raqa/_data.py +48 -0
- raqa-3.0.0/src/raqa/cli.py +10 -0
- {raqa-1.0.2 → raqa-3.0.0/src/raqa.egg-info}/PKG-INFO +8 -10
- {raqa-1.0.2 → raqa-3.0.0}/src/raqa.egg-info/SOURCES.txt +4 -4
- raqa-3.0.0/src/raqa.egg-info/entry_points.txt +2 -0
- raqa-3.0.0/src/raqa.egg-info/requires.txt +7 -0
- raqa-1.0.2/src/raqa/agent.py +0 -54
- raqa-1.0.2/src/raqa/cli.py +0 -148
- raqa-1.0.2/src/raqa/config.py +0 -27
- raqa-1.0.2/src/raqa/db.py +0 -197
- raqa-1.0.2/src/raqa/utils.py +0 -49
- raqa-1.0.2/src/raqa.egg-info/entry_points.txt +0 -2
- raqa-1.0.2/src/raqa.egg-info/requires.txt +0 -10
- {raqa-1.0.2 → raqa-3.0.0}/LICENSE +0 -0
- {raqa-1.0.2 → raqa-3.0.0}/setup.cfg +0 -0
- {raqa-1.0.2 → raqa-3.0.0}/src/raqa/__init__.py +0 -0
- {raqa-1.0.2 → raqa-3.0.0}/src/raqa.egg-info/dependency_links.txt +0 -0
- {raqa-1.0.2 → raqa-3.0.0}/src/raqa.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: raqa
|
|
3
|
-
Version:
|
|
3
|
+
Version: 3.0.0
|
|
4
|
+
Summary: RAG-powered document Q&A — Streamlit + OpenAI Agents SDK
|
|
4
5
|
Author-email: Jordi Carrera Ventura <jordi.carrera.ventura@gmail.com>
|
|
5
6
|
Project-URL: GitHub repository, https://github.com/JordiCarreraVentura/raqa
|
|
6
7
|
Project-URL: PyPI, https://pypi.org/project/raqa
|
|
@@ -10,14 +11,11 @@ Requires-Python: >=3.10
|
|
|
10
11
|
Description-Content-Type: text/markdown
|
|
11
12
|
License-File: LICENSE
|
|
12
13
|
Requires-Dist: build
|
|
13
|
-
Requires-Dist: faiss-cpu
|
|
14
|
-
Requires-Dist: joblib
|
|
15
14
|
Requires-Dist: numpy
|
|
16
15
|
Requires-Dist: openai
|
|
17
|
-
Requires-Dist:
|
|
18
|
-
Requires-Dist:
|
|
19
|
-
Requires-Dist:
|
|
20
|
-
Requires-Dist: typer[all]
|
|
16
|
+
Requires-Dist: openai-agents
|
|
17
|
+
Requires-Dist: python-dotenv
|
|
18
|
+
Requires-Dist: streamlit
|
|
21
19
|
Requires-Dist: twine
|
|
22
20
|
Dynamic: license-file
|
|
23
21
|
|
|
@@ -53,7 +51,7 @@ pip install raqa
|
|
|
53
51
|
|
|
54
52
|
3. One-shot retrieval
|
|
55
53
|
|
|
56
|
-
`python cli.py search "what is retrieval augmented generation?"`
|
|
54
|
+
`python cli.py search DATABASE_NAME "what is retrieval augmented generation?"`
|
|
57
55
|
|
|
58
56
|
4. Rebuild and chat
|
|
59
57
|
|
|
@@ -73,8 +71,8 @@ pip install raqa
|
|
|
73
71
|
```
|
|
74
72
|
raqa build DATABASE_NAME PATH/TO/FOLDER/WITH/MARKDOWNS
|
|
75
73
|
raqa chat DATABASE_NAME
|
|
76
|
-
raqa search "what is RAG?"
|
|
77
|
-
raqa list
|
|
74
|
+
raqa search DATABASE_NAME "what is RAG?"
|
|
75
|
+
raqa list (DATABASE_NAME)
|
|
78
76
|
raqa stats (DATABASE_NAME)
|
|
79
77
|
raqa rebuild-and-chat DATABASE_NAME PATH/TO/FOLDER/WITH/MARKDOWNS
|
|
80
78
|
```
|
|
@@ -30,7 +30,7 @@ pip install raqa
|
|
|
30
30
|
|
|
31
31
|
3. One-shot retrieval
|
|
32
32
|
|
|
33
|
-
`python cli.py search "what is retrieval augmented generation?"`
|
|
33
|
+
`python cli.py search DATABASE_NAME "what is retrieval augmented generation?"`
|
|
34
34
|
|
|
35
35
|
4. Rebuild and chat
|
|
36
36
|
|
|
@@ -50,8 +50,8 @@ pip install raqa
|
|
|
50
50
|
```
|
|
51
51
|
raqa build DATABASE_NAME PATH/TO/FOLDER/WITH/MARKDOWNS
|
|
52
52
|
raqa chat DATABASE_NAME
|
|
53
|
-
raqa search "what is RAG?"
|
|
54
|
-
raqa list
|
|
53
|
+
raqa search DATABASE_NAME "what is RAG?"
|
|
54
|
+
raqa list (DATABASE_NAME)
|
|
55
55
|
raqa stats (DATABASE_NAME)
|
|
56
56
|
raqa rebuild-and-chat DATABASE_NAME PATH/TO/FOLDER/WITH/MARKDOWNS
|
|
57
57
|
```
|
|
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "raqa"
|
|
7
|
-
version = "
|
|
7
|
+
version = "3.0.0"
|
|
8
8
|
authors = [
|
|
9
9
|
{ name="Jordi Carrera Ventura", email="jordi.carrera.ventura@gmail.com" },
|
|
10
10
|
]
|
|
11
|
-
description = ""
|
|
11
|
+
description = "RAG-powered document Q&A — Streamlit + OpenAI Agents SDK"
|
|
12
12
|
readme = "README.md"
|
|
13
13
|
requires-python = ">=3.10"
|
|
14
14
|
classifiers = [
|
|
@@ -17,14 +17,11 @@ classifiers = [
|
|
|
17
17
|
]
|
|
18
18
|
dependencies = [
|
|
19
19
|
"build",
|
|
20
|
-
"faiss-cpu",
|
|
21
|
-
"joblib",
|
|
22
20
|
"numpy",
|
|
23
21
|
"openai",
|
|
24
|
-
"
|
|
25
|
-
"
|
|
26
|
-
"
|
|
27
|
-
"typer[all]",
|
|
22
|
+
"openai-agents",
|
|
23
|
+
"python-dotenv",
|
|
24
|
+
"streamlit",
|
|
28
25
|
"twine"
|
|
29
26
|
]
|
|
30
27
|
|
|
@@ -33,4 +30,4 @@ dependencies = [
|
|
|
33
30
|
"PyPI" = "https://pypi.org/project/raqa"
|
|
34
31
|
|
|
35
32
|
[project.scripts]
|
|
36
|
-
raqa = "raqa.cli:
|
|
33
|
+
raqa = "raqa.cli:main"
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import numpy as np
|
|
2
|
+
from dotenv import load_dotenv
|
|
3
|
+
from openai import OpenAI
|
|
4
|
+
from agents import Agent, function_tool
|
|
5
|
+
|
|
6
|
+
from ._data import load_documents
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
load_dotenv()
|
|
10
|
+
|
|
11
|
+
_client = None
|
|
12
|
+
chunks = []
|
|
13
|
+
embeddings = None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _get_client() -> OpenAI:
    """Return the shared OpenAI client, constructing it on the first call.

    The instance is cached in the module-level ``_client`` variable, so every
    later call hands back the same object instead of building a new client.
    """
    global _client
    if _client is not None:
        return _client
    _client = OpenAI()
    return _client
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def initialize(data_dir: str = "data", batch_size: int = 512):
    """Load every document chunk under *data_dir* and embed it.

    Results are published through the module-level ``chunks`` (list of chunk
    dicts) and ``embeddings`` (float32 matrix, one row per chunk) globals.

    Args:
        data_dir: Directory handed to ``load_documents``.
        batch_size: Maximum number of texts sent per embeddings request.
            The endpoint limits how many inputs one request may carry
            (2048 at the time of writing -- confirm against the OpenAI API
            reference), so a large corpus has to be sent in slices.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    global chunks, embeddings
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    chunks = load_documents(data_dir)
    if not chunks:
        # Keep ``embeddings`` an ndarray so ``embeddings.size`` stays valid.
        embeddings = np.array([], dtype="float32").reshape(0, 0)
        return

    texts = [c["text"] for c in chunks]
    client = _get_client()
    vectors = []
    for start in range(0, len(texts), batch_size):
        response = client.embeddings.create(
            input=texts[start:start + batch_size],
            model="text-embedding-3-small",
        )
        # Rows are appended in request order so row i matches chunks[i].
        vectors.extend(d.embedding for d in response.data)
    embeddings = np.array(vectors, dtype="float32")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@function_tool
def search_docs(query: str) -> str:
    """Search the loaded documents for information relevant to the query."""
    # NOTE: the docstring above is kept verbatim; function_tool presumably
    # exposes it to the model as the tool description -- confirm before editing.
    nothing_loaded = (not chunks) or embeddings.size == 0
    if nothing_loaded:
        return "No documents loaded."

    # Embed the query with the same model used for the corpus.
    reply = _get_client().embeddings.create(
        input=[query], model="text-embedding-3-small"
    )
    query_vector = np.array(reply.data[0].embedding, dtype="float32")

    # One dot product per chunk; keep the (at most five) highest, best first.
    similarity = embeddings @ query_vector
    keep = min(5, len(chunks))
    ranked = np.argsort(similarity)[-keep:][::-1]

    passages = []
    for position in ranked:
        hit = chunks[position]
        passages.append(f"[{hit['source']}]\n{hit['text']}")
    return "\n\n---\n\n".join(passages)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def create_agent() -> Agent:
    """Build the "RAQA" agent: a fixed instruction prompt plus ``search_docs``."""
    system_prompt = (
        "You are a helpful assistant. "
        "Answer questions based on the provided documents. "
        "If the search results are not relevant, say so."
    )
    agent = Agent(name="RAQA", instructions=system_prompt, tools=[search_docs])
    return agent
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import asyncio
|
|
3
|
+
|
|
4
|
+
import streamlit as st
|
|
5
|
+
from agents import Runner
|
|
6
|
+
|
|
7
|
+
from raqa._agent import initialize, create_agent
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
st.set_page_config(page_title="RAQA", page_icon="📚", layout="centered")

# Document folder: RAQA_DATA_DIR if set, otherwise ./data.
data_dir = os.getenv("RAQA_DATA_DIR", "data")

# One-time setup, guarded by the "ready" flag kept in the session state.
if "ready" not in st.session_state:
    with st.spinner("Loading documents..."):
        initialize(data_dir=data_dir)
        st.session_state.agent = create_agent()
        st.session_state.messages = []   # transcript rendered in the UI
        st.session_state.history = None  # agent-side input list of earlier turns
        st.session_state.ready = True

st.title("📚 RAQA")

# Replay the stored transcript.
for past in st.session_state.messages:
    with st.chat_message(past["role"]):
        st.markdown(past["content"])

prompt = st.chat_input("Ask a question about your documents")
if prompt:
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            history = st.session_state.history
            # First turn: send the bare prompt. Later turns: extend the
            # input list returned by the previous run.
            if history:
                input_for = history + [{"role": "user", "content": prompt}]
            else:
                input_for = prompt
            result = asyncio.run(Runner.run(st.session_state.agent, input_for))
            st.session_state.history = result.to_input_list()
            response = result.final_output
        st.markdown(response)
    st.session_state.messages.append({"role": "assistant", "content": response})
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import csv
|
|
2
|
+
import re
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import List, Dict
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def load_documents(data_dir: str = "data") -> List[Dict]:
    """Collect text chunks from every .md, .txt and .csv file under *data_dir*.

    Markdown and plain-text files are split into paragraphs on blank lines
    (Markdown is first passed through ``_remove_markdown``); each paragraph
    becomes one chunk.  A CSV file becomes a single chunk whose lines are the
    rows with cells joined by " | ".

    Args:
        data_dir: Root directory, searched recursively.

    Returns:
        A list of ``{"text", "source", "index"}`` dicts in sorted path order.
        The list is empty when the directory is missing (a warning is
        printed) or contains no supported files.
    """
    chunks: List[Dict] = []
    data_path = Path(data_dir)
    if not data_path.exists():
        print(f"⚠️ Data directory '{data_dir}' not found.")
        return chunks

    for path in sorted(data_path.rglob("*")):
        # rglob("*") also yields directories; one named e.g. "notes.md"
        # would otherwise make read_text() raise.
        if not path.is_file():
            continue

        # Compare case-insensitively so "README.MD" is picked up too.
        suffix = path.suffix.lower()
        if suffix in (".md", ".txt"):
            text = path.read_text(encoding="utf-8")
            if suffix == ".md":
                text = _remove_markdown(text)
            paras = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
            chunks.extend(
                {"text": para, "source": str(path), "index": i}
                for i, para in enumerate(paras)
            )
        elif suffix == ".csv":
            with open(path, newline="", encoding="utf-8") as f:
                lines = [" | ".join(row) for row in csv.reader(f)]
            if lines:
                chunks.append({"text": "\n".join(lines), "source": str(path), "index": 0})

    return chunks
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _remove_markdown(text: str) -> str:
|
|
37
|
+
text = re.sub(r'!\[[^\]]*\]\([^\)]+\)', '', text)
|
|
38
|
+
text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
|
|
39
|
+
text = re.sub(r'(\*\*|__)(.*?)\1', r'\2', text)
|
|
40
|
+
text = re.sub(r'(\*|_)(.*?)\1', r'\2', text)
|
|
41
|
+
text = re.sub(r'`([^`]+)`', r'\1', text)
|
|
42
|
+
text = re.sub(r'^\s*#+\s+(.*)$', r'\1', text, flags=re.MULTILINE)
|
|
43
|
+
text = re.sub(r'^\s*>\s+', '', text, flags=re.MULTILINE)
|
|
44
|
+
text = re.sub(r'^\s*[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
|
|
45
|
+
text = re.sub(r'^\s*[\-\*]\s+', '', text, flags=re.MULTILINE)
|
|
46
|
+
text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
|
|
47
|
+
text = re.sub(r'\n{3,}', '\n\n', text)
|
|
48
|
+
return text.strip()
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: raqa
|
|
3
|
-
Version:
|
|
3
|
+
Version: 3.0.0
|
|
4
|
+
Summary: RAG-powered document Q&A — Streamlit + OpenAI Agents SDK
|
|
4
5
|
Author-email: Jordi Carrera Ventura <jordi.carrera.ventura@gmail.com>
|
|
5
6
|
Project-URL: GitHub repository, https://github.com/JordiCarreraVentura/raqa
|
|
6
7
|
Project-URL: PyPI, https://pypi.org/project/raqa
|
|
@@ -10,14 +11,11 @@ Requires-Python: >=3.10
|
|
|
10
11
|
Description-Content-Type: text/markdown
|
|
11
12
|
License-File: LICENSE
|
|
12
13
|
Requires-Dist: build
|
|
13
|
-
Requires-Dist: faiss-cpu
|
|
14
|
-
Requires-Dist: joblib
|
|
15
14
|
Requires-Dist: numpy
|
|
16
15
|
Requires-Dist: openai
|
|
17
|
-
Requires-Dist:
|
|
18
|
-
Requires-Dist:
|
|
19
|
-
Requires-Dist:
|
|
20
|
-
Requires-Dist: typer[all]
|
|
16
|
+
Requires-Dist: openai-agents
|
|
17
|
+
Requires-Dist: python-dotenv
|
|
18
|
+
Requires-Dist: streamlit
|
|
21
19
|
Requires-Dist: twine
|
|
22
20
|
Dynamic: license-file
|
|
23
21
|
|
|
@@ -53,7 +51,7 @@ pip install raqa
|
|
|
53
51
|
|
|
54
52
|
3. One-shot retrieval
|
|
55
53
|
|
|
56
|
-
`python cli.py search "what is retrieval augmented generation?"`
|
|
54
|
+
`python cli.py search DATABASE_NAME "what is retrieval augmented generation?"`
|
|
57
55
|
|
|
58
56
|
4. Rebuild and chat
|
|
59
57
|
|
|
@@ -73,8 +71,8 @@ pip install raqa
|
|
|
73
71
|
```
|
|
74
72
|
raqa build DATABASE_NAME PATH/TO/FOLDER/WITH/MARKDOWNS
|
|
75
73
|
raqa chat DATABASE_NAME
|
|
76
|
-
raqa search "what is RAG?"
|
|
77
|
-
raqa list
|
|
74
|
+
raqa search DATABASE_NAME "what is RAG?"
|
|
75
|
+
raqa list (DATABASE_NAME)
|
|
78
76
|
raqa stats (DATABASE_NAME)
|
|
79
77
|
raqa rebuild-and-chat DATABASE_NAME PATH/TO/FOLDER/WITH/MARKDOWNS
|
|
80
78
|
```
|
|
@@ -2,11 +2,11 @@ LICENSE
|
|
|
2
2
|
README.md
|
|
3
3
|
pyproject.toml
|
|
4
4
|
src/raqa/__init__.py
|
|
5
|
-
src/raqa/
|
|
5
|
+
src/raqa/__main__.py
|
|
6
|
+
src/raqa/_agent.py
|
|
7
|
+
src/raqa/_app.py
|
|
8
|
+
src/raqa/_data.py
|
|
6
9
|
src/raqa/cli.py
|
|
7
|
-
src/raqa/config.py
|
|
8
|
-
src/raqa/db.py
|
|
9
|
-
src/raqa/utils.py
|
|
10
10
|
src/raqa.egg-info/PKG-INFO
|
|
11
11
|
src/raqa.egg-info/SOURCES.txt
|
|
12
12
|
src/raqa.egg-info/dependency_links.txt
|
raqa-1.0.2/src/raqa/agent.py
DELETED
|
@@ -1,54 +0,0 @@
|
|
|
1
|
-
from openai import OpenAI
|
|
2
|
-
|
|
3
|
-
from .db import VectorDB
|
|
4
|
-
from .utils import get_openai_key
|
|
5
|
-
|
|
6
|
-
class RAGAgent:
    """Terminal chat loop that prepends retrieved context to each question."""

    def __init__(self, db: VectorDB):
        """Store the vector database and create an OpenAI client.

        Args:
            db: Database used for retrieval; must offer ``search`` and
                ``nucleus_filter``.
        """
        self.db = db
        api_key = get_openai_key()
        self.client = OpenAI(api_key=api_key)

    def retrieve(self, query: str):
        """Return the context string for *query*.

        Search results are passed through ``nucleus_filter`` and rendered as
        "[source]\\ntext" entries separated by blank lines.
        """
        results = self.db.search(query)
        filtered = self.db.nucleus_filter(results)
        return "\n\n".join(
            f"[{r['data']['source']}]\n{r['data']['text']}"
            for r in filtered
        )

    def chat(self):
        """Run the interactive loop until the user types 'exit' or 'quit'.

        End-of-input and Ctrl-C end the session quietly; blank lines are
        ignored instead of being sent to the API.
        """
        print("💬 RAG Agent ready. Type 'exit' to quit.")

        messages = [{"role": "system", "content": "You are a helpful assistant."}]

        while True:
            try:
                user_input = input("\nYou: ")
            except (EOFError, KeyboardInterrupt):
                # Closed stdin or Ctrl-C: leave without a traceback.
                print()
                break

            command = user_input.strip()
            if command.lower() in ("exit", "quit"):
                break
            if not command:
                # Nothing to retrieve or answer; avoid two pointless API calls.
                continue

            context = self.retrieve(user_input)

            augmented_prompt = (
                "\nUse the context below if relevant:\n\n"
                f"{context}\n\n"
                "User question:\n"
                f"{user_input}\n"
            )

            # The augmented prompt (not the raw question) is what gets
            # stored in the running conversation.
            messages.append({"role": "user", "content": augmented_prompt})

            response = self.client.chat.completions.create(
                model="gpt-4.1-mini",
                messages=messages
            )

            reply = response.choices[0].message.content
            messages.append({"role": "assistant", "content": reply})

            print("\nAssistant:", reply)
|
raqa-1.0.2/src/raqa/cli.py
DELETED
|
@@ -1,148 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
from typing import Optional
|
|
3
|
-
|
|
4
|
-
import typer
|
|
5
|
-
|
|
6
|
-
from .db import VectorDB
|
|
7
|
-
from .agent import RAGAgent
|
|
8
|
-
from .config import (
|
|
9
|
-
DB_BASE_DIR,
|
|
10
|
-
MARKDOWN_ROOT
|
|
11
|
-
)
|
|
12
|
-
|
|
13
|
-
app = typer.Typer(help="📚 Markdown RAG CLI")
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
# ---------------------------
|
|
17
|
-
# BUILD DATABASE
|
|
18
|
-
# ---------------------------
|
|
19
|
-
@app.command()
def build(
    db_name: str = typer.Argument(..., help="Name of the database to create"),
    markdown_path: str = typer.Argument(MARKDOWN_ROOT, help="Path to markdown files"),
):
    """Build a database with a user-given name"""
    # Docstring kept verbatim: Typer shows it as the command's help text.
    database = VectorDB(db_name=db_name)
    database.build(markdown_path)
    location = database.db_path
    typer.echo(f"✅ Database '{db_name}' built at {location}")
|
|
28
|
-
|
|
29
|
-
# ---------------------------
|
|
30
|
-
# SEARCH ONLY (DEBUG TOOL)
|
|
31
|
-
# ---------------------------
|
|
32
|
-
@app.command()
def search(
    db_name: str = typer.Argument(..., help="Database name to search within"),
    query: str = typer.Argument(..., help="Search query"),
    k: int = typer.Option(10, help="Top K results"),
):
    """
    Run retrieval without LLM (debugging).
    """
    # Docstring kept verbatim: Typer shows it as the command's help text.
    database = VectorDB(db_name=db_name)
    database.load()
    hits = database.search(query, k=k)

    typer.secho(f"\n🔎 Raw Results for '{db_name}':\n", fg=typer.colors.BLUE)

    # Print rank, score, source and the first 500 characters of each hit.
    for rank, hit in enumerate(hits, start=1):
        record = hit["data"]
        typer.echo(f"\n--- Result {rank} ---")
        typer.echo(f"Score: {hit['score']:.4f}")
        typer.echo(f"Source: {record['source']}")
        typer.echo(record["text"][:500])
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
# ---------------------------
|
|
56
|
-
# CHAT (MAIN ENTRYPOINT)
|
|
57
|
-
# ---------------------------
|
|
58
|
-
@app.command()
def chat(
    db_name: str = typer.Argument("default", help="Database name to use"),
):
    """Start a chat using a specific database"""
    # Docstring kept verbatim: Typer shows it as the command's help text.
    database = VectorDB(db_name=db_name)
    database.load()
    RAGAgent(db=database).chat()
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
# ---------------------------
|
|
71
|
-
# REBUILD + CHAT (CONVENIENCE)
|
|
72
|
-
# ---------------------------
|
|
73
|
-
@app.command()
def rebuild_and_chat(
    db_name: str = typer.Argument(..., help="Database name"),
    markdown_path: str = typer.Argument(..., help="Markdown folder")
):
    """
    Rebuild a named database and immediately start chat.
    """
    from .config import DB_EMBEDDINGS_CACHE

    # Drop the on-disk embeddings cache. On a fresh install (or after a
    # previous rebuild) the file may not exist, which must not abort the
    # command.
    try:
        os.remove(DB_EMBEDDINGS_CACHE)
    except FileNotFoundError:
        pass

    db = VectorDB(db_name=db_name)

    typer.echo(f"🔄 Rebuilding database '{db_name}'...")
    db.build(markdown_path)

    typer.secho("✅ Build complete. Starting chat...\n", fg=typer.colors.GREEN)

    agent = RAGAgent(db=db)
    agent.chat()
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
# ---------------------------
|
|
95
|
-
# INSPECT DB
|
|
96
|
-
# ---------------------------
|
|
97
|
-
@app.command()
def stats(
    db_name: str = typer.Argument(None, help="Database name (optional)"),
):
    """
    Show stats for one or all databases.
    """
    # Docstring kept verbatim: Typer shows it as the command's help text.
    if not db_name:
        # No name given: summarise every directory under DB_BASE_DIR.
        typer.echo("📊 All databases:\n")
        for entry in DB_BASE_DIR.iterdir():
            if not entry.is_dir():
                continue
            name = entry.name
            try:
                candidate = VectorDB(name)
                candidate.load()
                typer.echo(f"• {name}: {len(candidate.metadata)} chunks")
            except Exception:
                # Best effort: a database that fails to load is reported,
                # not fatal.
                typer.echo(f"• {name}: ⚠️ corrupted or incomplete")
        return

    database = VectorDB(db_name)
    database.load()

    typer.echo(f"📊 Stats for '{db_name}':")
    typer.echo(f"Chunks: {len(database.metadata)}")
    typer.echo(f"Location: {database.db_path}")
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
@app.command()
def list():
    """
    List all available databases.
    """
    # Docstring kept verbatim: Typer shows it as the command's help text.
    from .config import DB_BASE_DIR

    typer.echo("📚 Available databases:\n")

    # Every sub-directory of DB_BASE_DIR counts as a database.
    names = [entry.name for entry in DB_BASE_DIR.iterdir() if entry.is_dir()]
    for name in names:
        typer.echo(f"• {name}")

    if not names:
        typer.echo("No databases found. Use `raqa build` first.")
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
if __name__ == "__main__":
|
|
148
|
-
app()
|
raqa-1.0.2/src/raqa/config.py
DELETED
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
from pathlib import Path
|
|
2
|
-
|
|
3
|
-
# Per-user raqa folder (~/.raqa); created on import so later writes succeed.
HOME = Path.home()
RAQA_HOME = HOME / ".raqa"
RAQA_HOME.mkdir(parents=True, exist_ok=True)

# OpenAI credential file
ENV_FILE = RAQA_HOME / "env"

# Base folder for databases (each database is a sub-folder), plus the
# JSON file used as the embeddings cache.
DB_BASE_DIR = RAQA_HOME / "databases"
DB_BASE_DIR.mkdir(parents=True, exist_ok=True)
DB_EMBEDDINGS_CACHE = RAQA_HOME / "embeddings_cache.json"

# Embeddings & chunk config
EMBEDDING_MODEL = "text-embedding-3-small"
CHUNK_WINDOW = 1
TOP_K = 20
SIMILARITY_RADIUS = 0.7

# Default markdown folder (can override via CLI)
MARKDOWN_ROOT = "./markdown_files"
|
raqa-1.0.2/src/raqa/db.py
DELETED
|
@@ -1,197 +0,0 @@
|
|
|
1
|
-
import json
|
|
2
|
-
import os
|
|
3
|
-
import re
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
from typing import Any
|
|
6
|
-
|
|
7
|
-
import faiss
|
|
8
|
-
import numpy as np
|
|
9
|
-
import snippyts
|
|
10
|
-
from joblib import Memory
|
|
11
|
-
from dotenv import load_dotenv
|
|
12
|
-
from openai import OpenAI # Added OpenAI import
|
|
13
|
-
from tqdm import tqdm
|
|
14
|
-
|
|
15
|
-
from .utils import split_sentences, window_chunks
|
|
16
|
-
from .config import (
|
|
17
|
-
DB_BASE_DIR,
|
|
18
|
-
EMBEDDING_MODEL,
|
|
19
|
-
CHUNK_WINDOW,
|
|
20
|
-
DB_EMBEDDINGS_CACHE,
|
|
21
|
-
TOP_K,
|
|
22
|
-
SIMILARITY_RADIUS
|
|
23
|
-
)
|
|
24
|
-
|
|
25
|
-
load_dotenv()
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# In-memory embeddings cache: read from DB_EMBEDDINGS_CACHE when that file
# exists, otherwise start empty.
EMBEDDINGS_CACHE = (
    snippyts.from_json(DB_EMBEDDINGS_CACHE)
    if os.path.exists(DB_EMBEDDINGS_CACHE)
    else {}
)
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
def remove_markdown(text: str) -> str:
    """
    Removes basic Markdown formatting from a string.

    Handles horizontal rules, headers, blockquote and list markers, images,
    links, bold/italic emphasis and inline code.  Blank lines between
    paragraphs are preserved.
    """
    # Line-level constructs go first, and use [ \t] rather than \s for
    # indentation: \s also matches "\n", which would let a pattern anchored
    # at an empty line swallow the blank line in front of a header or list.

    # 1. Remove horizontal rules: ---, ***, ___
    text = re.sub(r'^[ \t]*[-*_]{3,}[ \t]*$', '', text, flags=re.MULTILINE)

    # 2. Remove headers: # Header -> Header
    text = re.sub(r'^[ \t]*#+[ \t]+(.*)$', r'\1', text, flags=re.MULTILINE)

    # 3. Remove blockquotes: > quote -> quote
    text = re.sub(r'^[ \t]*>[ \t]+', '', text, flags=re.MULTILINE)

    # 4. Remove list markers: - item, * item, 1. item
    #    (before emphasis, or "* item *x*" is read as an emphasis pair)
    text = re.sub(r'^[ \t]*[\-\*][ \t]+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[ \t]*\d+\.[ \t]+', '', text, flags=re.MULTILINE)

    # 5. Remove images: ![alt text](url) -> ""
    text = re.sub(r'!\[[^\]]*\]\([^\)]+\)', '', text)

    # 6. Remove links but keep the text: [link text](url) -> "link text"
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

    # 7. Remove bold and italics: **text**, __text__, *text*, _text_
    #    Underscore emphasis only counts at word boundaries, so identifiers
    #    such as my_var_name keep their underscores.
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)
    text = re.sub(r'(?<!\w)__(.*?)__(?!\w)', r'\1', text)
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    text = re.sub(r'(?<!\w)_(.*?)_(?!\w)', r'\1', text)

    # 8. Remove inline code: `code` -> code
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # Clean up excess whitespace/newlines
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
class VectorDB:
    """A named FAISS index plus chunk metadata stored under DB_BASE_DIR."""

    def __init__(self, db_name: str = "default"):
        """Create the database folder if needed and set up an OpenAI client.

        Args:
            db_name: Name of the database; also its folder name.
        """
        self.name = db_name
        self.db_path = DB_BASE_DIR / self.name
        self.db_path.mkdir(parents=True, exist_ok=True)

        self.client = OpenAI()
        self.index = None      # FAISS index, set by build() or load()
        self.metadata = []     # one dict per indexed chunk, same order as the index

    def _get_embeddings(self, texts: list[str]) -> np.ndarray:
        """Fetch embeddings from OpenAI with batching and caching.

        Texts already in EMBEDDINGS_CACHE are served from it; the rest are
        requested in batches of 100.  The cache is written back to
        DB_EMBEDDINGS_CACHE before returning.

        Returns:
            A float32 array with one row per input text, in input order.

        Raises:
            ValueError: If the collected vectors differ in length.
        """
        indexed_texts = list(enumerate(texts))

        # Cached entries carry the stored vector; new entries carry the text.
        old_texts = [(idx, EMBEDDINGS_CACHE[text]) for idx, text in indexed_texts if text in EMBEDDINGS_CACHE]
        new_texts = [(idx, text) for idx, text in indexed_texts if text not in EMBEDDINGS_CACHE]

        all_vectors = old_texts
        batch_size = 100
        batches = snippyts.batched(new_texts, batch_size)

        for batch in tqdm(batches, colour="green", desc="Fetching Embeddings"):
            # Ensure we don't send empty strings to the API
            clean_batch = [t if t.strip() else " " for _, t in batch]

            response = self.client.embeddings.create(
                input=clean_batch,
                model=EMBEDDING_MODEL
            )

            res_data = response.data if hasattr(response, 'data') else response

            for (idx, text), item in zip(batch, res_data):
                vector = item.embedding if hasattr(item, 'embedding') else item

                # Unwrap a singly nested list ([[v0, v1, ...]]).
                if isinstance(vector, list) and len(vector) > 0 and isinstance(vector[0], list):
                    vector = vector[0]

                EMBEDDINGS_CACHE[text] = vector
                all_vectors.append((idx, vector))

        # Sort back to original order
        all_vectors.sort(key=lambda x: x[0])
        final_vectors = [vec for _, vec in all_vectors]

        # Mixed lengths cannot form a matrix; the likely cause is a cache
        # holding vectors from a different model.
        if final_vectors:
            dims = [len(v) for v in final_vectors]
            if len(set(dims)) > 1:
                raise ValueError(f"Inconsistent embedding dimensions: {set(dims)}. Clear your cache.")

        # Save cache and return matrix
        snippyts.to_json(EMBEDDINGS_CACHE, DB_EMBEDDINGS_CACHE)
        return np.array(final_vectors).astype('float32')

    def build(self, markdown_root: str):
        """Ingest markdown files, chunk, encode via OpenAI, and store.

        Args:
            markdown_root: Folder searched recursively for ``*.md`` files.
        """
        chunks = []

        for path in Path(markdown_root).rglob("*.md"):
            with open(path, "r", encoding="utf-8") as f:
                content = f.read()
            clean_text = remove_markdown(content)

            sentences = split_sentences(clean_text)
            for c in window_chunks(sentences, CHUNK_WINDOW):
                # Skip chunks that became empty after markdown removal
                if not c["text"].strip():
                    continue

                chunks.append({
                    "text": c["text"],
                    "source": str(path),
                    "position": c["index"]
                })

        if not chunks:
            print("⚠️ No valid text found in markdown files.")
            return

        texts = [c["text"] for c in chunks]
        embeddings = self._get_embeddings(texts)

        # Inner-product index over L2-normalised vectors.
        self.index = faiss.IndexFlatIP(embeddings.shape[1])
        faiss.normalize_L2(embeddings)
        self.index.add(embeddings)

        self.metadata = chunks
        self.save()

    def save(self):
        """Write the index to ``index.faiss`` and the metadata to ``meta.json``."""
        faiss.write_index(self.index, str(self.db_path / "index.faiss"))
        with open(self.db_path / "meta.json", "w", encoding="utf-8") as f:
            json.dump(self.metadata, f)

    def load(self):
        """Read the index and metadata back from the database folder.

        Raises:
            FileNotFoundError: If ``index.faiss`` does not exist.
        """
        if not (self.db_path / "index.faiss").exists():
            raise FileNotFoundError(f"No database found at {self.db_path}. Please build it first.")
        self.index = faiss.read_index(str(self.db_path / "index.faiss"))
        with open(self.db_path / "meta.json", "r", encoding="utf-8") as f:
            self.metadata = json.load(f)

    def search(self, query: str, k=TOP_K):
        """Return up to *k* ``{"score", "data"}`` hits for *query*, best first."""
        # Encode query using OpenAI
        q_emb = self._get_embeddings([query])
        faiss.normalize_L2(q_emb)
        scores, indices = self.index.search(q_emb, k)
        # FAISS pads the result with label -1 when the index holds fewer
        # than k vectors; metadata[-1] would then repeat the last chunk as
        # a bogus hit, so those slots are dropped.
        return [
            {"score": float(s), "data": self.metadata[i]}
            for s, i in zip(scores[0], indices[0])
            if i >= 0
        ]

    def nucleus_filter(self, results):
        """Sample up to 10 results from those scoring close to the best one.

        Results within SIMILARITY_RADIUS of the first result's score are
        kept, then drawn without replacement with softmax-of-score weights.
        """
        if not results:
            return []
        best = results[0]["score"]
        filtered = [r for r in results if (best - r["score"]) <= SIMILARITY_RADIUS]
        scores = np.array([r["score"] for r in filtered])
        probs = np.exp(scores) / np.sum(np.exp(scores))
        sampled_indices = np.random.choice(len(filtered), size=min(len(filtered), 10), replace=False, p=probs)
        return [filtered[i] for i in sampled_indices]
|
raqa-1.0.2/src/raqa/utils.py
DELETED
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
import re
|
|
3
|
-
from pathlib import Path
|
|
4
|
-
from typing import List
|
|
5
|
-
|
|
6
|
-
from .config import ENV_FILE
|
|
7
|
-
|
|
8
|
-
def split_sentences(text: str) -> List[str]:
    """Split *text* into sentences.

    A boundary is any run of whitespace that directly follows '.', '!' or
    '?'.  Pieces are stripped and empty ones are dropped.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text)
    cleaned = []
    for piece in pieces:
        trimmed = piece.strip()
        if trimmed:
            cleaned.append(trimmed)
    return cleaned
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
def window_chunks(sentences: List[str], window: int = 3):
    """Build one overlapping chunk per sentence.

    Each chunk joins the sentence with up to *window* neighbours on either
    side and records the centre sentence and its position.

    Returns:
        A list of ``{"text", "center", "index"}`` dicts, one per sentence.
    """
    def around(position, sentence):
        # Neighbours before and after, clipped at the list boundaries.
        before = sentences[max(0, position - window): position]
        after = sentences[position + 1: position + 1 + window]
        return {
            "text": " ".join(before + [sentence] + after),
            "center": sentence,
            "index": position,
        }

    return [around(position, sentence) for position, sentence in enumerate(sentences)]
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
def get_openai_key() -> str:
    """
    Load the OpenAI API key from ENV_FILE, prompt user if missing.

    A key typed at the prompt is saved to ENV_FILE, which is restricted to
    the owner (mode 0o600) because it holds a secret.

    Returns:
        The API key, stripped of surrounding whitespace.

    Raises:
        ValueError: If the user submits an empty key; nothing is saved then.
    """
    if ENV_FILE.exists():
        key = ENV_FILE.read_text().strip()
        if key:
            return key

    # Prompt user
    print(f"🔑 OpenAI API key not found. Enter your key (it will be saved at {ENV_FILE}):")
    key = input("API Key: ").strip()
    if not key:
        # Persisting an empty key would only postpone the failure to the
        # first API call.
        raise ValueError("No OpenAI API key was entered.")

    # Create/restrict the file before writing so the secret is never
    # readable by other users, even briefly.
    ENV_FILE.touch(mode=0o600, exist_ok=True)
    ENV_FILE.chmod(0o600)
    ENV_FILE.write_text(key)
    print(f"✅ Key saved at {ENV_FILE}")
    return key
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|