rag_server-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rag_server/__init__.py +9 -0
- rag_server/server.py +50 -0
- rag_server/utils/__init__.py +0 -0
- rag_server/utils/llm.py +9 -0
- rag_server/utils/vector/__init__.py +0 -0
- rag_server/utils/vector/misc.py +50 -0
- rag_server/utils/vector/store.py +67 -0
- rag_server-0.0.1.dist-info/METADATA +46 -0
- rag_server-0.0.1.dist-info/RECORD +11 -0
- rag_server-0.0.1.dist-info/WHEEL +4 -0
- rag_server-0.0.1.dist-info/entry_points.txt +3 -0
rag_server/__init__.py
ADDED
rag_server/server.py
ADDED
@@ -0,0 +1,50 @@
# server.py
import uuid
from typing import Optional

from fastmcp import FastMCP

from rag_server.utils.vector.misc import chunk_text, extract_text_from_url
from rag_server.utils.vector.store import VectorStore

# Initialize the MCP server
mcp = FastMCP(name="syne_rag_server", instructions="You are a helpful assistant that can answer questions about the documents in the session.")

# In-memory sessions: mapping session_id -> VectorStore
_sessions = {}

@mcp.tool(
    description="Ingest a list of public URLs (PDF, DOCX, DOC) into an ephemeral session. Returns a session_id to use for querying. You can pass in a session_id to ingest into a specific session."
)
def ingest_urls(urls: list[str], session: Optional[str] = None) -> str:
    """
    Ingest a list of public URLs (PDF, DOCX, DOC) into an ephemeral session.
    Returns a session_id to use for querying.
    """
    session_id = str(uuid.uuid4() if session is None else session)
    vs = VectorStore()
    for url in urls:
        text = extract_text_from_url(url)
        chunks = chunk_text(text)
        vs.add(chunks)
    _sessions[session_id] = vs
    return session_id

@mcp.tool(
    description="Query the ingested documents in the given session using RAG. Returns a generated answer."
)
def query_knowledge(session_id: str, question: str) -> str:
    """
    Query the ingested documents in the given session using RAG.
    Returns a generated answer.
    """
    vs = _sessions.get(session_id)
    if not vs:
        return f"Session ID {session_id} not found. Please call ingest_urls first."
    docs = vs.search(question)
    context = "\n\n".join(docs)
    return context

def main():
    # Run the server
    mcp.run()
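Together the two tools define a simple ingest-then-query flow: ingest_urls builds a per-session VectorStore and returns its session_id, and query_knowledge retrieves the top chunks for that session. A minimal in-process sketch of the same flow using the package's own helpers (the URL is a placeholder; running it needs network access and an OPENAI_API_KEY for the embedding calls):

```python
# Illustrative only: mirrors what ingest_urls and query_knowledge do, without MCP.
# https://example.com/report.pdf is a placeholder, not something the package ships.
from rag_server.utils.vector.misc import chunk_text, extract_text_from_url
from rag_server.utils.vector.store import VectorStore

vs = VectorStore()                                               # per-session index
text = extract_text_from_url("https://example.com/report.pdf")  # download + extract text
vs.add(chunk_text(text))                                         # embed and index the chunks
for chunk in vs.search("What does the report conclude?"):        # hybrid retrieval
    print(chunk[:120])
```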
rag_server/utils/__init__.py
File without changes
rag_server/utils/llm.py
ADDED
rag_server/utils/vector/__init__.py
File without changes
rag_server/utils/vector/misc.py
ADDED
@@ -0,0 +1,50 @@
import io

import docx
import requests
import textract
from PyPDF2 import PdfReader
from openai.types import CreateEmbeddingResponse

from rag_server.utils.llm import openai_client


def extract_text_from_url(url: str) -> str:
    """Download the file at the given URL and extract its text."""
    resp = requests.get(url)
    resp.raise_for_status()
    content = resp.content
    ext = url.split(".")[-1].lower()
    if ext == "pdf":
        reader = PdfReader(io.BytesIO(content))
        return "\n".join(p.extract_text() or "" for p in reader.pages)
    elif ext == "docx":
        doc = docx.Document(io.BytesIO(content))
        return "\n".join(p.text for p in doc.paragraphs)
    elif ext == "doc":
        return textract.process(io.BytesIO(content), extension="doc").decode("utf-8", errors="ignore")
    else:
        return content.decode("utf-8", errors="ignore")


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split text into chunks of approximately chunk_size words with overlap."""
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        chunk = " ".join(words[start: start + chunk_size])
        chunks.append(chunk)
        start += chunk_size - overlap
    return chunks


def embed_texts(texts: list[str]) -> list[list[float]]:
    """Embed a list of texts using OpenAI embeddings."""
    resp: CreateEmbeddingResponse = openai_client.embeddings.create(input=texts, model="text-embedding-ada-002")
    return [d.embedding for d in resp.data]


def get_embedding(text: str) -> list[float]:
    """Embed a single text."""
    resp: CreateEmbeddingResponse = openai_client.embeddings.create(input=text, model="text-embedding-ada-002")
    return resp.data[0].embedding
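chunk_text is a plain sliding-window splitter: with the defaults each chunk holds up to 500 words and the window advances by 500 - 50 = 450 words, so consecutive chunks share roughly 50 words. A toy check with small, non-default parameters (importing misc assumes rag_server.utils.llm and its OpenAI client import cleanly):

```python
from rag_server.utils.vector.misc import chunk_text

words = " ".join(str(i) for i in range(10))     # "0 1 2 ... 9"
print(chunk_text(words, chunk_size=4, overlap=1))
# Window starts at 0, 3, 6, 9 (step = chunk_size - overlap = 3):
# ['0 1 2 3', '3 4 5 6', '6 7 8 9', '9']
```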
rag_server/utils/vector/store.py
ADDED
@@ -0,0 +1,67 @@
import faiss
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from rag_server.utils.vector.misc import get_embedding, embed_texts


class VectorStore:
    """Simple in-memory vector store using FAISS."""
    def __init__(self, dim: int = 1536):
        self.dim = dim
        # Use an HNSW approximate nearest neighbor index (no training needed)
        self.index = faiss.index_factory(dim, "HNSW32")
        # Configure HNSW parameters for construction and search quality
        try:
            self.index.hnsw.efConstruction = 200
            self.index.hnsw.efSearch = 128
        except AttributeError:
            pass
        self.texts: list[str] = []
        # Initialize TF-IDF vectorizer and matrix
        self.vectorizer = TfidfVectorizer()
        self.tfidf_matrix = None

    def add(self, chunks: list[str]) -> None:
        embeddings = embed_texts(chunks)
        arr = np.array(embeddings, dtype="float32")
        self.index.add(arr)
        self.texts.extend(chunks)
        # Update TF-IDF matrix
        self.tfidf_matrix = self.vectorizer.fit_transform(self.texts)

    def search(self, query: str, top_k: int = 5, alpha: float = 0.5) -> list[str]:
        """Perform hybrid search combining semantic (FAISS) and lexical (TF-IDF) scores."""
        # Semantic search via FAISS
        q_emb = np.array([get_embedding(query)], dtype="float32")
        D, I = self.index.search(q_emb, top_k)
        vect_ids = I[0].tolist()
        vect_scores = [-d for d in D[0]]
        # Lexical search via TF-IDF
        if self.tfidf_matrix is None:
            self.tfidf_matrix = self.vectorizer.fit_transform(self.texts)
        q_tfidf = self.vectorizer.transform([query])
        tfidf_scores_all = q_tfidf.dot(self.tfidf_matrix.T).toarray()[0]
        tfidf_top = np.argsort(-tfidf_scores_all)[:top_k].tolist()
        # Combine candidate document indices
        candidate_ids = set(vect_ids + tfidf_top)
        vect_min = min(vect_scores) if vect_scores else 0.0
        scores = []
        for idx in candidate_ids:
            vs = vect_scores[vect_ids.index(idx)] if idx in vect_ids else vect_min
            ts = float(tfidf_scores_all[idx])
            scores.append((idx, vs, ts))
        # Normalize and blend scores
        vs_vals = [v for _, v, _ in scores]
        ts_vals = [t for _, _, t in scores]
        vmin, vmax = min(vs_vals), max(vs_vals)
        tmin, tmax = min(ts_vals), max(ts_vals)
        blended = []
        for idx, vs, ts in scores:
            vn = (vs - vmin) / (vmax - vmin) if vmax > vmin else 0.0
            tn = (ts - tmin) / (tmax - tmin) if tmax > tmin else 0.0
            combined = alpha * vn + (1 - alpha) * tn
            blended.append((idx, combined))
        # Sort by blended score and return top_k chunks
        top = sorted(blended, key=lambda x: x[1], reverse=True)[:top_k]
        return [self.texts[i] for i, _ in top]
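The final blend in search is ordinary min-max normalisation of both score lists over the candidate set, followed by a weighted average, so it can be checked by hand. A standalone sketch with made-up scores (alpha = 0.7 here rather than the default 0.5, to make the semantic weighting visible):

```python
# Toy numbers, not index output: (chunk id, semantic score = -L2 distance, TF-IDF score)
alpha = 0.7
scores = [(0, -0.10, 0.20), (1, -0.30, 0.90), (2, -0.20, 0.40)]

vs_vals = [v for _, v, _ in scores]
ts_vals = [t for _, _, t in scores]
vmin, vmax = min(vs_vals), max(vs_vals)
tmin, tmax = min(ts_vals), max(ts_vals)

for idx, vs, ts in scores:
    vn = (vs - vmin) / (vmax - vmin)           # 1.0, 0.0, 0.5
    tn = (ts - tmin) / (tmax - tmin)           # 0.0, 1.0, ~0.29
    print(idx, alpha * vn + (1 - alpha) * tn)  # 0.70, 0.30, ~0.44 -> ranking 0, 2, 1
```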
rag_server-0.0.1.dist-info/METADATA
ADDED
@@ -0,0 +1,46 @@
Metadata-Version: 2.4
Name: rag_server
Version: 0.0.1
Summary: A FastMCP-based RAG server for dynamic document ingestion
Project-URL: Homepage, https://github.com/synehq/mcp-hybrid-rag
Project-URL: Bug Tracker, https://github.com/synehq/mcp-hybrid-rag/issues
Author-email: SyneHQ <human@synehq.com>
License-Expression: MIT
Requires-Python: >=3.10
Requires-Dist: faiss-cpu
Requires-Dist: fastmcp
Requires-Dist: numpy
Requires-Dist: openai
Requires-Dist: pypdf2
Requires-Dist: python-docx
Requires-Dist: requests
Requires-Dist: scikit-learn
Requires-Dist: textract
Description-Content-Type: text/markdown

# RAG Server

A FastMCP-based Retrieval-Augmented Generation server for dynamically ingesting public documents and querying them on-the-fly.

## Installation

```bash
pip install -r requirements.txt
```

Ensure you set your OpenAI API key:

```bash
export OPENAI_API_KEY=your_key_here
```

## Running the server

```bash
python -m rag_server.server
```

## API Tools

- ingest_urls(urls: List[str], session_id: Optional[str]) -> session_id
- query_knowledge(session_id: str, question: str) -> answer
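In normal use these tools are driven by an MCP client rather than imported as functions. A test-style sketch, assuming the installed fastmcp exposes a Client that accepts the FastMCP instance for in-memory calls (the exact shape of the returned result varies by fastmcp version, so the session_id below is left as a placeholder):

```python
# Illustrative only; assumes fastmcp.Client and its in-memory transport are available.
import asyncio

from fastmcp import Client

from rag_server.server import mcp


async def demo() -> None:
    async with Client(mcp) as client:
        ingest = await client.call_tool(
            "ingest_urls", {"urls": ["https://example.com/handbook.pdf"]}  # placeholder URL
        )
        print(ingest)  # the tool's return value is the session_id
        answer = await client.call_tool(
            "query_knowledge",
            {"session_id": "<session_id from the call above>",
             "question": "What are the key policies?"},
        )
        print(answer)


asyncio.run(demo())
```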
rag_server-0.0.1.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
rag_server/__init__.py,sha256=KNZ1bD9ZGfyZwlv91Ueeega_1lsRDLs2fYQDgNbBdtc,212
rag_server/server.py,sha256=75IV2Ggowcx30LEtFy1stRbJGodsgvsD-CKObhbCeg4,1699
rag_server/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
rag_server/utils/llm.py,sha256=yEmxoRQ750LGu8ufWu38RoX0umBRWw8q0GQxzFmqAy8,158
rag_server/utils/vector/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
rag_server/utils/vector/misc.py,sha256=NbBRzU6RBc4A5Pu0cl76dutuZZfj_abwuAkKjM-LD6k,1768
rag_server/utils/vector/store.py,sha256=b7GtzjnXuqDVpQHMZ4Otms4wIY4zB0y6aLBCu58DSNE,2929
rag_server-0.0.1.dist-info/METADATA,sha256=i9DFzwVljGdfABtAK21WFGM9JoxE0hdAPloJlpE0za0,1104
rag_server-0.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
rag_server-0.0.1.dist-info/entry_points.txt,sha256=sWdH-o-5Mge0fcw28bZ-lAMdlVq3PJOsXTZSzZy_ndc,76
rag_server-0.0.1.dist-info/RECORD,,