raqa 1.0.2__tar.gz → 3.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: raqa
3
- Version: 1.0.2
3
+ Version: 3.0.0
4
+ Summary: RAG-powered document Q&A — Streamlit + OpenAI Agents SDK
4
5
  Author-email: Jordi Carrera Ventura <jordi.carrera.ventura@gmail.com>
5
6
  Project-URL: GitHub repository, https://github.com/JordiCarreraVentura/raqa
6
7
  Project-URL: PyPI, https://pypi.org/project/raqa
@@ -10,14 +11,11 @@ Requires-Python: >=3.10
10
11
  Description-Content-Type: text/markdown
11
12
  License-File: LICENSE
12
13
  Requires-Dist: build
13
- Requires-Dist: faiss-cpu
14
- Requires-Dist: joblib
15
14
  Requires-Dist: numpy
16
15
  Requires-Dist: openai
17
- Requires-Dist: python-frontmatter
18
- Requires-Dist: sentence-transformers
19
- Requires-Dist: tqdm
20
- Requires-Dist: typer[all]
16
+ Requires-Dist: openai-agents
17
+ Requires-Dist: python-dotenv
18
+ Requires-Dist: streamlit
21
19
  Requires-Dist: twine
22
20
  Dynamic: license-file
23
21
 
@@ -53,7 +51,7 @@ pip install raqa
53
51
 
54
52
  3. One-shot retrieval
55
53
 
56
- `python cli.py search "what is retrieval augmented generation?"`
54
+ `python cli.py search DATABASE_NAME "what is retrieval augmented generation?"`
57
55
 
58
56
  4. Rebuild and chat
59
57
 
@@ -73,8 +71,8 @@ pip install raqa
73
71
  ```
74
72
  raqa build DATABASE_NAME PATH/TO/FOLDER/WITH/MARKDOWNS
75
73
  raqa chat DATABASE_NAME
76
- raqa search "what is RAG?"
77
- raqa list
74
+ raqa search DATABASE_NAME "what is RAG?"
75
+ raqa list (DATABASE_NAME)
78
76
  raqa stats (DATABASE_NAME)
79
77
  raqa rebuild-and-chat DATABASE_NAME PATH/TO/FOLDER/WITH/MARKDOWNS
80
78
  ```
@@ -30,7 +30,7 @@ pip install raqa
30
30
 
31
31
  3. One-shot retrieval
32
32
 
33
- `python cli.py search "what is retrieval augmented generation?"`
33
+ `python cli.py search DATABASE_NAME "what is retrieval augmented generation?"`
34
34
 
35
35
  4. Rebuild and chat
36
36
 
@@ -50,8 +50,8 @@ pip install raqa
50
50
  ```
51
51
  raqa build DATABASE_NAME PATH/TO/FOLDER/WITH/MARKDOWNS
52
52
  raqa chat DATABASE_NAME
53
- raqa search "what is RAG?"
54
- raqa list
53
+ raqa search DATABASE_NAME "what is RAG?"
54
+ raqa list (DATABASE_NAME)
55
55
  raqa stats (DATABASE_NAME)
56
56
  raqa rebuild-and-chat DATABASE_NAME PATH/TO/FOLDER/WITH/MARKDOWNS
57
57
  ```
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "raqa"
7
- version = "1.0.2"
7
+ version = "3.0.0"
8
8
  authors = [
9
9
  { name="Jordi Carrera Ventura", email="jordi.carrera.ventura@gmail.com" },
10
10
  ]
11
- description = ""
11
+ description = "RAG-powered document Q&A — Streamlit + OpenAI Agents SDK"
12
12
  readme = "README.md"
13
13
  requires-python = ">=3.10"
14
14
  classifiers = [
@@ -17,14 +17,11 @@ classifiers = [
17
17
  ]
18
18
  dependencies = [
19
19
  "build",
20
- "faiss-cpu",
21
- "joblib",
22
20
  "numpy",
23
21
  "openai",
24
- "python-frontmatter",
25
- "sentence-transformers",
26
- "tqdm",
27
- "typer[all]",
22
+ "openai-agents",
23
+ "python-dotenv",
24
+ "streamlit",
28
25
  "twine"
29
26
  ]
30
27
 
@@ -33,4 +30,4 @@ dependencies = [
33
30
  "PyPI" = "https://pypi.org/project/raqa"
34
31
 
35
32
  [project.scripts]
36
- raqa = "raqa.cli:app"
33
+ raqa = "raqa.cli:main"
@@ -0,0 +1,3 @@
1
"""Entry point used when the package is executed with ``python -m raqa``."""

from .cli import main

# Guarded so that merely importing ``raqa.__main__`` (for example by tooling
# that walks the package) does not call main() as a side effect.
if __name__ == "__main__":
    main()
@@ -0,0 +1,63 @@
1
+ import numpy as np
2
+ from dotenv import load_dotenv
3
+ from openai import OpenAI
4
+ from agents import Agent, function_tool
5
+
6
+ from ._data import load_documents
7
+
8
+
9
# Runs at import time, before any OpenAI client is constructed.
# NOTE(review): presumably this reads a local .env file into the environment
# (python-dotenv default behaviour) - confirm.
load_dotenv()

# Shared OpenAI client; stays None until _get_client() creates it.
_client = None
# Document chunks produced by load_documents(); assigned by initialize().
chunks = []
# Embedding matrix for `chunks`; stays None until initialize() has run.
embeddings = None
14
+
15
+
16
def _get_client() -> OpenAI:
    """Return the module-wide OpenAI client, constructing it on first call."""
    global _client
    if _client is not None:
        return _client
    _client = OpenAI()
    return _client
21
+
22
+
23
def initialize(data_dir: str = "data", batch_size: int = 512):
    """Load the documents under *data_dir* and embed every chunk.

    The module-level ``chunks`` and ``embeddings`` globals are replaced with
    the freshly loaded data. When no chunks are found, ``embeddings`` becomes
    an empty ``(0, 0)`` float32 array.

    Args:
        data_dir: Directory handed to ``load_documents``.
        batch_size: Maximum number of chunk texts sent per embeddings request.
            The embeddings endpoint caps how many inputs a single request may
            carry, so large corpora have to be split across several calls.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    global chunks, embeddings
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    loaded = load_documents(data_dir)
    if not loaded:
        chunks = loaded
        embeddings = np.array([], dtype="float32").reshape(0, 0)
        return

    client = _get_client()
    vectors = []
    for start in range(0, len(loaded), batch_size):
        batch = [c["text"] for c in loaded[start:start + batch_size]]
        response = client.embeddings.create(input=batch, model="text-embedding-3-small")
        vectors.extend(d.embedding for d in response.data)

    # Publish both globals only after every request succeeded, so a failed
    # call never leaves `chunks` populated next to a stale/None `embeddings`.
    chunks = loaded
    embeddings = np.array(vectors, dtype="float32")
33
+
34
+
35
@function_tool
def search_docs(query: str) -> str:
    """Search the loaded documents for information relevant to the query."""
    # `embeddings` is still None when initialize() has not completed; check it
    # explicitly instead of failing with AttributeError on `.size`.
    if not chunks or embeddings is None or embeddings.size == 0:
        return "No documents loaded."

    q_emb = _get_client().embeddings.create(
        input=[query], model="text-embedding-3-small"
    ).data[0].embedding
    q_vec = np.array(q_emb, dtype="float32")

    # NOTE(review): the raw dot product is used as the similarity score; that
    # equals cosine similarity only if the vectors are unit-length - confirm
    # for the chosen embedding model.
    scores = embeddings @ q_vec
    top_k = min(5, len(chunks))
    # argsort is ascending: take the last top_k entries and reverse them so the
    # best match comes first.
    indices = np.argsort(scores)[-top_k:][::-1]

    results = [f"[{chunks[i]['source']}]\n{chunks[i]['text']}" for i in indices]
    return "\n\n---\n\n".join(results)
52
+
53
+
54
def create_agent() -> Agent:
    """Build the "RAQA" agent, wired to the ``search_docs`` tool."""
    instructions = (
        "You are a helpful assistant. "
        "Answer questions based on the provided documents. "
        "If the search results are not relevant, say so."
    )
    return Agent(name="RAQA", instructions=instructions, tools=[search_docs])
@@ -0,0 +1,41 @@
1
+ import os
2
+ import asyncio
3
+
4
+ import streamlit as st
5
+ from agents import Runner
6
+
7
+ from raqa._agent import initialize, create_agent
8
+
9
+
10
st.set_page_config(page_title="RAQA", page_icon="📚", layout="centered")

data_dir = os.environ.get("RAQA_DATA_DIR", "data")


@st.cache_resource(show_spinner="Loading documents...")
def _load_index(directory: str) -> bool:
    """Load and embed the documents in *directory* once per server process."""
    initialize(data_dir=directory)
    return True


# initialize() fills module-level state in raqa._agent, which every browser
# session shares. Caching the call avoids re-embedding the whole corpus each
# time a new session starts.
_load_index(data_dir)

# Per-session state: the agent, the rendered transcript and the agent-side
# conversation history.
if "ready" not in st.session_state:
    st.session_state.agent = create_agent()
    st.session_state.messages = []
    st.session_state.history = None
    st.session_state.ready = True

st.title("📚 RAQA")

# Streamlit re-runs this script on every interaction, so replay the transcript.
for msg in st.session_state.messages:
    with st.chat_message(msg["role"]):
        st.markdown(msg["content"])

if prompt := st.chat_input("Ask a question about your documents"):
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    with st.chat_message("assistant"):
        with st.spinner("Thinking..."):
            history = st.session_state.history
            # First turn: pass the bare prompt. Later turns: extend the input
            # list returned by the previous run with the new user message.
            if history:
                input_for = history + [{"role": "user", "content": prompt}]
            else:
                input_for = prompt
            result = asyncio.run(Runner.run(st.session_state.agent, input_for))
            st.session_state.history = result.to_input_list()
            response = result.final_output
        st.markdown(response)
    st.session_state.messages.append({"role": "assistant", "content": response})
@@ -0,0 +1,48 @@
1
+ import csv
2
+ import re
3
+ from pathlib import Path
4
+ from typing import List, Dict
5
+
6
+
7
def load_documents(data_dir: str = "data") -> List[Dict]:
    """Collect text chunks from every .md, .txt and .csv file under *data_dir*.

    Markdown and plain-text files are split into paragraphs on blank lines
    (markdown is first passed through ``_remove_markdown``); each paragraph
    becomes one chunk. A CSV file becomes a single chunk with its cells joined
    by " | " and its rows joined by newlines.

    Args:
        data_dir: Directory that is searched recursively.

    Returns:
        A list of ``{"text", "source", "index"}`` dicts, ordered by sorted file
        path. The list is empty when *data_dir* does not exist.
    """
    chunks = []
    data_path = Path(data_dir)
    if not data_path.exists():
        print(f"⚠️ Data directory '{data_dir}' not found.")
        return chunks

    for path in sorted(data_path.rglob("*")):
        # rglob("*") also yields directories; a folder called "notes.md" must
        # not be opened as if it were a document.
        if not path.is_file():
            continue

        # Compare case-insensitively so "README.MD" or "DATA.CSV" are picked up.
        suffix = path.suffix.lower()
        if suffix in (".md", ".txt"):
            text = path.read_text(encoding="utf-8")
            if suffix == ".md":
                text = _remove_markdown(text)
            paras = [p.strip() for p in re.split(r'\n\s*\n', text) if p.strip()]
            chunks.extend(
                {"text": para, "source": str(path), "index": i}
                for i, para in enumerate(paras)
            )
        elif suffix == ".csv":
            with open(path, newline="", encoding="utf-8") as f:
                rows = list(csv.reader(f))
            if rows:
                lines = [" | ".join(r) for r in rows]
                chunks.append({"text": "\n".join(lines), "source": str(path), "index": 0})

    return chunks
34
+
35
+
36
+ def _remove_markdown(text: str) -> str:
37
+ text = re.sub(r'!\[[^\]]*\]\([^\)]+\)', '', text)
38
+ text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
39
+ text = re.sub(r'(\*\*|__)(.*?)\1', r'\2', text)
40
+ text = re.sub(r'(\*|_)(.*?)\1', r'\2', text)
41
+ text = re.sub(r'`([^`]+)`', r'\1', text)
42
+ text = re.sub(r'^\s*#+\s+(.*)$', r'\1', text, flags=re.MULTILINE)
43
+ text = re.sub(r'^\s*>\s+', '', text, flags=re.MULTILINE)
44
+ text = re.sub(r'^\s*[-*_]{3,}\s*$', '', text, flags=re.MULTILINE)
45
+ text = re.sub(r'^\s*[\-\*]\s+', '', text, flags=re.MULTILINE)
46
+ text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)
47
+ text = re.sub(r'\n{3,}', '\n\n', text)
48
+ return text.strip()
@@ -0,0 +1,10 @@
1
+ import sys
2
+ from pathlib import Path
3
+
4
+
5
def main():
    """Run Streamlit on the bundled ``_app.py`` and exit with its result."""
    from streamlit.web import cli as stcli

    script = Path(__file__).parent / "_app.py"
    # The command line for Streamlit is supplied by replacing sys.argv before
    # its CLI entry point is invoked.
    sys.argv = ["streamlit", "run", str(script)]
    sys.exit(stcli.main())
@@ -1,6 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: raqa
3
- Version: 1.0.2
3
+ Version: 3.0.0
4
+ Summary: RAG-powered document Q&A — Streamlit + OpenAI Agents SDK
4
5
  Author-email: Jordi Carrera Ventura <jordi.carrera.ventura@gmail.com>
5
6
  Project-URL: GitHub repository, https://github.com/JordiCarreraVentura/raqa
6
7
  Project-URL: PyPI, https://pypi.org/project/raqa
@@ -10,14 +11,11 @@ Requires-Python: >=3.10
10
11
  Description-Content-Type: text/markdown
11
12
  License-File: LICENSE
12
13
  Requires-Dist: build
13
- Requires-Dist: faiss-cpu
14
- Requires-Dist: joblib
15
14
  Requires-Dist: numpy
16
15
  Requires-Dist: openai
17
- Requires-Dist: python-frontmatter
18
- Requires-Dist: sentence-transformers
19
- Requires-Dist: tqdm
20
- Requires-Dist: typer[all]
16
+ Requires-Dist: openai-agents
17
+ Requires-Dist: python-dotenv
18
+ Requires-Dist: streamlit
21
19
  Requires-Dist: twine
22
20
  Dynamic: license-file
23
21
 
@@ -53,7 +51,7 @@ pip install raqa
53
51
 
54
52
  3. One-shot retrieval
55
53
 
56
- `python cli.py search "what is retrieval augmented generation?"`
54
+ `python cli.py search DATABASE_NAME "what is retrieval augmented generation?"`
57
55
 
58
56
  4. Rebuild and chat
59
57
 
@@ -73,8 +71,8 @@ pip install raqa
73
71
  ```
74
72
  raqa build DATABASE_NAME PATH/TO/FOLDER/WITH/MARKDOWNS
75
73
  raqa chat DATABASE_NAME
76
- raqa search "what is RAG?"
77
- raqa list
74
+ raqa search DATABASE_NAME "what is RAG?"
75
+ raqa list (DATABASE_NAME)
78
76
  raqa stats (DATABASE_NAME)
79
77
  raqa rebuild-and-chat DATABASE_NAME PATH/TO/FOLDER/WITH/MARKDOWNS
80
78
  ```
@@ -2,11 +2,11 @@ LICENSE
2
2
  README.md
3
3
  pyproject.toml
4
4
  src/raqa/__init__.py
5
- src/raqa/agent.py
5
+ src/raqa/__main__.py
6
+ src/raqa/_agent.py
7
+ src/raqa/_app.py
8
+ src/raqa/_data.py
6
9
  src/raqa/cli.py
7
- src/raqa/config.py
8
- src/raqa/db.py
9
- src/raqa/utils.py
10
10
  src/raqa.egg-info/PKG-INFO
11
11
  src/raqa.egg-info/SOURCES.txt
12
12
  src/raqa.egg-info/dependency_links.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ raqa = raqa.cli:main
@@ -0,0 +1,7 @@
1
+ build
2
+ numpy
3
+ openai
4
+ openai-agents
5
+ python-dotenv
6
+ streamlit
7
+ twine
@@ -1,54 +0,0 @@
1
- from openai import OpenAI
2
-
3
- from .db import VectorDB
4
- from .utils import get_openai_key
5
-
6
class RAGAgent:
    """Terminal chat loop that augments each question with retrieved context."""

    def __init__(self, db: "VectorDB"):
        """Store *db* and create an OpenAI client from the saved API key.

        The annotation is a forward reference so the class can be defined
        without evaluating ``VectorDB`` at definition time.
        """
        self.db = db
        api_key = get_openai_key()
        self.client = OpenAI(api_key=api_key)

    def retrieve(self, query: str):
        """Return the context block for *query*.

        The database results are passed through ``nucleus_filter`` and each
        kept hit is rendered as ``[source]`` followed by its text; hits are
        separated by a blank line.
        """
        results = self.db.search(query)
        filtered = self.db.nucleus_filter(results)
        return "\n\n".join(
            f"[{r['data']['source']}]\n{r['data']['text']}"
            for r in filtered
        )

    def chat(self):
        """Run the interactive loop until the user types 'exit' or 'quit'.

        End-of-input (Ctrl-D) and Ctrl-C also end the session, and blank lines
        are ignored rather than sent to the model.
        """
        print("💬 RAG Agent ready. Type 'exit' to quit.")

        messages = [{"role": "system", "content": "You are a helpful assistant."}]

        while True:
            try:
                user_input = input("\nYou: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Closed stdin or Ctrl-C: leave the loop without a traceback.
                print()
                break

            if not user_input:
                continue
            if user_input.lower() in ("exit", "quit"):
                break

            context = self.retrieve(user_input)

            augmented_prompt = (
                "\nUse the context below if relevant:\n\n"
                f"{context}\n\n"
                "User question:\n"
                f"{user_input}\n"
            )

            # The augmented prompt (context included) is what gets stored in
            # the running conversation.
            messages.append({"role": "user", "content": augmented_prompt})

            response = self.client.chat.completions.create(
                model="gpt-4.1-mini",
                messages=messages
            )

            reply = response.choices[0].message.content
            messages.append({"role": "assistant", "content": reply})

            print("\nAssistant:", reply)
@@ -1,148 +0,0 @@
1
- import os
2
- from typing import Optional
3
-
4
- import typer
5
-
6
- from .db import VectorDB
7
- from .agent import RAGAgent
8
- from .config import (
9
- DB_BASE_DIR,
10
- MARKDOWN_ROOT
11
- )
12
-
13
# Root Typer application; the command functions in this module register
# themselves on it through @app.command().
app = typer.Typer(help="📚 Markdown RAG CLI")
14
-
15
-
16
- # ---------------------------
17
- # BUILD DATABASE
18
- # ---------------------------
19
@app.command()
def build(
    db_name: str = typer.Argument(..., help="Name of the database to create"),
    markdown_path: str = typer.Argument(MARKDOWN_ROOT, help="Path to markdown files"),
):
    """Build a database with a user-given name"""
    # Index the markdown folder into the named database, then report where
    # the database lives.
    database = VectorDB(db_name=db_name)
    database.build(markdown_path)
    location = database.db_path
    typer.echo(f"✅ Database '{db_name}' built at {location}")
28
-
29
- # ---------------------------
30
- # SEARCH ONLY (DEBUG TOOL)
31
- # ---------------------------
32
@app.command()
def search(
    db_name: str = typer.Argument(..., help="Database name to search within"),
    query: str = typer.Argument(..., help="Search query"),
    k: int = typer.Option(10, help="Top K results"),
):
    """
    Run retrieval without LLM (debugging).
    """
    database = VectorDB(db_name=db_name)
    database.load()
    hits = database.search(query, k=k)

    typer.secho(f"\n🔎 Raw Results for '{db_name}':\n", fg=typer.colors.BLUE)

    # One section per hit: rank, score, source file and at most the first
    # 500 characters of the chunk text.
    for rank, hit in enumerate(hits, start=1):
        typer.echo(f"\n--- Result {rank} ---")
        typer.echo(f"Score: {hit['score']:.4f}")
        typer.echo(f"Source: {hit['data']['source']}")
        typer.echo(hit["data"]["text"][:500])
53
-
54
-
55
- # ---------------------------
56
- # CHAT (MAIN ENTRYPOINT)
57
- # ---------------------------
58
@app.command()
def chat(
    db_name: str = typer.Argument("default", help="Database name to use"),
):
    """Start a chat using a specific database"""
    # Load the stored index first, then hand it to the interactive agent.
    database = VectorDB(db_name=db_name)
    database.load()
    RAGAgent(db=database).chat()
68
-
69
-
70
- # ---------------------------
71
- # REBUILD + CHAT (CONVENIENCE)
72
- # ---------------------------
73
@app.command()
def rebuild_and_chat(
    db_name: str = typer.Argument(..., help="Database name"),
    markdown_path: str = typer.Argument(..., help="Markdown folder")
):
    """
    Rebuild a named database and immediately start chat.
    """
    from .config import DB_EMBEDDINGS_CACHE
    from .db import EMBEDDINGS_CACHE

    # The cache file does not exist before the first build; that is not an
    # error for a rebuild.
    try:
        os.remove(DB_EMBEDDINGS_CACHE)
    except FileNotFoundError:
        pass
    # The db module read the cache into memory at import time and writes the
    # whole mapping back after embedding. Clear it too, otherwise the old
    # vectors are reused and the deleted file is simply recreated.
    EMBEDDINGS_CACHE.clear()

    db = VectorDB(db_name=db_name)

    typer.echo(f"🔄 Rebuilding database '{db_name}'...")
    db.build(markdown_path)

    # build() returns without creating an index when it finds no text, and a
    # chat session over a missing index would fail on the first question.
    if db.index is None:
        typer.secho("❌ Nothing was indexed; chat not started.", fg=typer.colors.RED)
        raise typer.Exit(code=1)

    typer.secho("✅ Build complete. Starting chat...\n", fg=typer.colors.GREEN)

    agent = RAGAgent(db=db)
    agent.chat()
92
-
93
-
94
- # ---------------------------
95
- # INSPECT DB
96
- # ---------------------------
97
@app.command()
def stats(
    db_name: str = typer.Argument(None, help="Database name (optional)"),
):
    """
    Show stats for one or all databases.
    """
    if not db_name:
        # No name given: summarise every sub-directory of the database folder.
        typer.echo("📊 All databases:\n")

        for entry in DB_BASE_DIR.iterdir():
            if not entry.is_dir():
                continue
            name = entry.name

            try:
                candidate = VectorDB(name)
                candidate.load()
                typer.echo(f"• {name}: {len(candidate.metadata)} chunks")
            except Exception:
                # Any failure while opening a database is reported per entry
                # so the listing continues with the next one.
                typer.echo(f"• {name}: ⚠️ corrupted or incomplete")
        return

    selected = VectorDB(db_name)
    selected.load()

    typer.echo(f"📊 Stats for '{db_name}':")
    typer.echo(f"Chunks: {len(selected.metadata)}")
    typer.echo(f"Location: {selected.db_path}")
125
-
126
-
127
@app.command()
def list():
    """
    List all available databases.
    """
    from .config import DB_BASE_DIR

    typer.echo("📚 Available databases:\n")

    # NOTE: this function's name shadows the builtin ``list`` in this module,
    # so the names are gathered with a comprehension rather than ``list(...)``.
    names = [entry.name for entry in DB_BASE_DIR.iterdir() if entry.is_dir()]
    for name in names:
        typer.echo(f"• {name}")

    if not names:
        typer.echo("No databases found. Use `raqa build` first.")
145
-
146
-
147
# Run the Typer application when this module is executed directly as a script.
if __name__ == "__main__":
    app()
@@ -1,27 +0,0 @@
1
- from pathlib import Path
2
-
3
# User folder for raqa configs
HOME = Path.home()
RAQA_HOME = HOME / ".raqa"

# OpenAI credential file
ENV_FILE = RAQA_HOME / "env"

# Default DB base folder (databases will be subfolders)
DB_BASE_DIR = RAQA_HOME / "databases"
# JSON file holding the embeddings cache
DB_EMBEDDINGS_CACHE = RAQA_HOME / "embeddings_cache.json"

# Embeddings & chunk config
EMBEDDING_MODEL = "text-embedding-3-small"
CHUNK_WINDOW = 1
TOP_K = 20
SIMILARITY_RADIUS = 0.7

# Default markdown folder (can override via CLI)
MARKDOWN_ROOT = "./markdown_files"

# Ensure directories exist. DB_BASE_DIR lives inside RAQA_HOME, so creating it
# with parents=True creates both in one call.
DB_BASE_DIR.mkdir(parents=True, exist_ok=True)
raqa-1.0.2/src/raqa/db.py DELETED
@@ -1,197 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- from pathlib import Path
5
- from typing import Any
6
-
7
- import faiss
8
- import numpy as np
9
- import snippyts
10
- from joblib import Memory
11
- from dotenv import load_dotenv
12
- from openai import OpenAI # Added OpenAI import
13
- from tqdm import tqdm
14
-
15
- from .utils import split_sentences, window_chunks
16
- from .config import (
17
- DB_BASE_DIR,
18
- EMBEDDING_MODEL,
19
- CHUNK_WINDOW,
20
- DB_EMBEDDINGS_CACHE,
21
- TOP_K,
22
- SIMILARITY_RADIUS
23
- )
24
-
25
load_dotenv()
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Text -> embedding vector mapping shared by every VectorDB in the process.
# It starts from the JSON cache file when one exists, otherwise empty.
EMBEDDINGS_CACHE = (
    snippyts.from_json(DB_EMBEDDINGS_CACHE)
    if os.path.exists(DB_EMBEDDINGS_CACHE)
    else {}
)
32
-
33
-
34
def remove_markdown(text: str) -> str:
    """
    Removes basic Markdown formatting from a string, keeping paragraph breaks.

    Line-level constructs (rules, headers, quotes, list markers) are handled
    before inline emphasis so that a leading ``*`` bullet or a ``***`` rule is
    not mistaken for an emphasis delimiter. Line-anchored patterns only allow
    spaces and tabs around the marker: ``\\s`` would also match newlines and
    swallow the blank line that separates the construct from the previous
    paragraph.
    """
    # 1. Remove images: ![alt](url) -> ""
    text = re.sub(r'!\[[^\]]*\]\([^\)]+\)', '', text)

    # 2. Remove links but keep the text: [link text](url) -> "link text"
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

    # 3. Remove horizontal rules: ---, ***, ___
    text = re.sub(r'^[ \t]*[-*_]{3,}[ \t]*$', '', text, flags=re.MULTILINE)

    # 4. Remove headers: # Header -> Header
    text = re.sub(r'^[ \t]*#+[ \t]+(.*)$', r'\1', text, flags=re.MULTILINE)

    # 5. Remove blockquotes: > quote -> quote
    text = re.sub(r'^[ \t]*>[ \t]+', '', text, flags=re.MULTILINE)

    # 6. Remove list markers: - item, * item, 1. item
    text = re.sub(r'^[ \t]*[\-\*][ \t]+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[ \t]*\d+\.[ \t]+', '', text, flags=re.MULTILINE)

    # 7. Remove bold and italics: **text**, __text__, *text*, _text_
    text = re.sub(r'(\*\*|__)(.*?)\1', r'\2', text)
    text = re.sub(r'\*(.*?)\*', r'\1', text)
    # Underscores inside a word (snake_case identifiers) are not emphasis.
    text = re.sub(r'(?<!\w)_(.*?)_(?!\w)', r'\1', text)

    # 8. Remove inline code: `code` -> code
    text = re.sub(r'`([^`]+)`', r'\1', text)

    # Clean up excess whitespace/newlines
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()
68
-
69
-
70
class VectorDB:
    """A named FAISS inner-product index over OpenAI embeddings.

    Each database lives in ``DB_BASE_DIR/<name>`` as ``index.faiss`` plus a
    ``meta.json`` list describing the indexed chunks.
    """

    def __init__(self, db_name: str = "default"):
        """Create the database folder if needed; nothing is loaded yet."""
        self.name = db_name
        self.db_path = DB_BASE_DIR / self.name
        self.db_path.mkdir(parents=True, exist_ok=True)

        self.client = OpenAI()
        self.index = None
        self.metadata = []

    def _get_embeddings(self, texts: list[str]) -> np.ndarray:
        """Fetch embeddings from OpenAI with batching, caching, and robust parsing.

        Returns a float32 matrix with one row per entry of *texts*, in order.

        Raises:
            ValueError: If the collected vectors do not all have the same length.
        """
        indexed_texts = list(enumerate(texts))

        # Cached entries contribute their stored vector; the rest are fetched.
        cached = [(idx, EMBEDDINGS_CACHE[text]) for idx, text in indexed_texts if text in EMBEDDINGS_CACHE]
        missing = [(idx, text) for idx, text in indexed_texts if text not in EMBEDDINGS_CACHE]

        all_vectors = cached
        batch_size = 100
        batches = snippyts.batched(missing, batch_size)

        for batch in tqdm(batches, colour="green", desc="Fetching Embeddings"):
            # Ensure we don't send empty strings to the API
            clean_batch = [t if t.strip() else " " for _, t in batch]

            response = self.client.embeddings.create(
                input=clean_batch,
                model=EMBEDDING_MODEL
            )

            res_data = response.data if hasattr(response, 'data') else response

            for (idx, text), item in zip(batch, res_data):
                vector = item.embedding if hasattr(item, 'embedding') else item

                # Handle nested lists returned by some API proxies (e.g. [[val, val...]])
                if isinstance(vector, list) and len(vector) > 0 and isinstance(vector[0], list):
                    vector = vector[0]

                EMBEDDINGS_CACHE[text] = vector
                all_vectors.append((idx, vector))

        # Sort back to original order
        all_vectors.sort(key=lambda x: x[0])
        final_vectors = [vec for _, vec in all_vectors]

        # Validate dimensions before converting to NumPy: mixed lengths mean
        # the cache holds vectors from a different model.
        if final_vectors:
            dims = [len(v) for v in final_vectors]
            if len(set(dims)) > 1:
                raise ValueError(f"Inconsistent embedding dimensions: {set(dims)}. Clear your cache.")

        # Only rewrite the cache file when this call actually added entries.
        if missing:
            snippyts.to_json(EMBEDDINGS_CACHE, DB_EMBEDDINGS_CACHE)
        return np.array(final_vectors).astype('float32')

    def build(self, markdown_root: str):
        """Ingest markdown files, chunk, encode via OpenAI, and store.

        Returns without creating an index when no non-empty chunk is found.
        """
        chunks = []

        for path in Path(markdown_root).rglob("*.md"):
            with open(path, "r", encoding="utf-8") as f:
                content = f.read()
            clean_text = remove_markdown(content)

            sentences = split_sentences(clean_text)
            for c in window_chunks(sentences, CHUNK_WINDOW):
                # Skip chunks that became empty after markdown removal
                if not c["text"].strip():
                    continue

                chunks.append({
                    "text": c["text"],
                    "source": str(path),
                    "position": c["index"]
                })

        if not chunks:
            print("⚠️ No valid text found in markdown files.")
            return

        texts = [c["text"] for c in chunks]
        embeddings = self._get_embeddings(texts)

        # Inner product over L2-normalised rows
        self.index = faiss.IndexFlatIP(embeddings.shape[1])
        faiss.normalize_L2(embeddings)
        self.index.add(embeddings)

        self.metadata = chunks
        self.save()

    def save(self):
        """Write the index and the chunk metadata into the database folder."""
        faiss.write_index(self.index, str(self.db_path / "index.faiss"))
        with open(self.db_path / "meta.json", "w", encoding="utf-8") as f:
            json.dump(self.metadata, f)

    def load(self):
        """Read the index and metadata from disk.

        Raises:
            FileNotFoundError: If ``index.faiss`` is missing from the folder.
        """
        if not (self.db_path / "index.faiss").exists():
            raise FileNotFoundError(f"No database found at {self.db_path}. Please build it first.")
        self.index = faiss.read_index(str(self.db_path / "index.faiss"))
        with open(self.db_path / "meta.json", "r", encoding="utf-8") as f:
            self.metadata = json.load(f)

    def search(self, query: str, k=TOP_K):
        """Return up to *k* hits for *query* as ``{"score", "data"}`` dicts.

        Raises:
            RuntimeError: If no index has been built or loaded yet.
        """
        if self.index is None:
            raise RuntimeError(
                f"Database '{self.name}' has no index loaded. Call load() or build() first."
            )

        # Asking FAISS for more neighbours than it stores pads the result with
        # label -1, which would silently index the *last* metadata entry.
        k = min(k, self.index.ntotal)
        if k <= 0:
            return []

        # Encode query using OpenAI
        q_emb = self._get_embeddings([query])
        faiss.normalize_L2(q_emb)
        scores, indices = self.index.search(q_emb, k)
        return [
            {"score": float(s), "data": self.metadata[i]}
            for s, i in zip(scores[0], indices[0])
            if i >= 0
        ]

    def nucleus_filter(self, results):
        """Keep hits close to the best score, then sample at most 10 of them.

        Hits whose score is within SIMILARITY_RADIUS of the first result are
        kept; from those, up to 10 are drawn without replacement with
        softmax-of-score probabilities, so the returned order is random.
        """
        if not results:
            return []
        best = results[0]["score"]
        filtered = [r for r in results if (best - r["score"]) <= SIMILARITY_RADIUS]
        scores = np.array([r["score"] for r in filtered])
        probs = np.exp(scores) / np.sum(np.exp(scores))
        sampled_indices = np.random.choice(len(filtered), size=min(len(filtered), 10), replace=False, p=probs)
        return [filtered[i] for i in sampled_indices]
@@ -1,49 +0,0 @@
1
- import os
2
- import re
3
- from pathlib import Path
4
- from typing import List
5
-
6
- from .config import ENV_FILE
7
-
8
def split_sentences(text: str) -> List[str]:
    """Split *text* after '.', '!' or '?' followed by whitespace.

    Each piece is stripped and empty pieces are discarded.
    """
    pieces = re.split(r'(?<=[.!?])\s+', text)
    stripped = (piece.strip() for piece in pieces)
    return [sentence for sentence in stripped if sentence]
12
-
13
-
14
def window_chunks(sentences: List[str], window: int = 3):
    """Build one chunk per sentence, padded with its neighbours.

    Every chunk joins up to *window* sentences before and after the centre
    sentence with single spaces and records the centre and its position.
    """
    return [
        {
            "text": " ".join(
                sentences[max(0, pos - window): pos]
                + [sentence]
                + sentences[pos + 1: pos + 1 + window]
            ),
            "center": sentence,
            "index": pos,
        }
        for pos, sentence in enumerate(sentences)
    ]
31
-
32
-
33
def get_openai_key() -> str:
    """
    Load the OpenAI API key from ENV_FILE, prompt user if missing.

    A key entered at the prompt is written to ENV_FILE with owner-only
    permissions.

    Raises:
        ValueError: If the user submits an empty key.
    """
    if ENV_FILE.exists():
        key = ENV_FILE.read_text().strip()
        if key:
            return key

    # Prompt user
    print(f"🔑 OpenAI API key not found. Enter your key (it will be saved at {ENV_FILE}):")
    key = input("API Key: ").strip()
    if not key:
        # Persisting an empty key would only defer the failure to the first
        # API call.
        raise ValueError("No OpenAI API key provided.")

    # The file holds a secret: make sure it exists with owner-only permissions
    # before the key is written into it.
    ENV_FILE.touch(mode=0o600, exist_ok=True)
    ENV_FILE.chmod(0o600)
    ENV_FILE.write_text(key)
    print(f"✅ Key saved at {ENV_FILE}")
    return key
@@ -1,2 +0,0 @@
1
- [console_scripts]
2
- raqa = raqa.cli:app
@@ -1,10 +0,0 @@
1
- build
2
- faiss-cpu
3
- joblib
4
- numpy
5
- openai
6
- python-frontmatter
7
- sentence-transformers
8
- tqdm
9
- typer[all]
10
- twine
File without changes
File without changes
File without changes