rag_server 0.0.1__tar.gz → 0.0.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -17,3 +17,5 @@
 
  *.pyw
 
+
+ chroma_db
@@ -0,0 +1,64 @@
+ Metadata-Version: 2.4
+ Name: rag_server
+ Version: 0.0.2
+ Summary: A FastMCP-based RAG server for dynamic document ingestion
+ Project-URL: Homepage, https://github.com/synehq/mcp-hybrid-rag
+ Project-URL: Bug Tracker, https://github.com/synehq/mcp-hybrid-rag/issues
+ Author-email: SyneHQ <human@synehq.com>
+ License-Expression: MIT
+ Requires-Python: >=3.12
+ Requires-Dist: chromadb
+ Requires-Dist: faiss-cpu
+ Requires-Dist: fastmcp
+ Requires-Dist: langchain-google-genai
+ Requires-Dist: numpy
+ Requires-Dist: openai
+ Requires-Dist: pypdf2
+ Requires-Dist: python-docx
+ Requires-Dist: requests
+ Requires-Dist: six
+ Requires-Dist: textract-py3
+ Provides-Extra: dev
+ Requires-Dist: pytest; extra == 'dev'
+ Description-Content-Type: text/markdown
+
+ # RAG Server
+
+ A FastMCP-based Retrieval-Augmented Generation server for dynamically ingesting public documents and querying them on the fly. This server implements the Model Context Protocol (MCP) to enable seamless integration between AI models and external data sources.
+
+ ## Features
+
+ - Document ingestion from public URLs (PDF, DOCX, DOC)
+ - Hybrid vector search using both OpenAI and Google Gemini embeddings
+ - Session-based context management via MCP
+ - Automatic fallback and retry mechanisms for embedding generation
+ - Support for chunking and overlapping text segments
+
+ ## Installation
+
+ ```
+ uv pip install -e .
+ ```
+
+ ## Tools
+
+ The server exposes the following MCP tools, defined in `src/rag_server/server.py`:
+
+ ### `ingest_urls`
+
+ **Description**: Ingest a list of public URLs (PDF, DOCX, DOC) into an ephemeral session. Returns a `session_id` for querying. You can pass an existing `session_id` to ingest into a specific session.
+
+ **Signature**: `ingest_urls(urls: list[str], session_id: Optional[str] = None) -> str`
+
+ - `urls`: List of public document URLs to ingest.
+ - `session_id` _(optional)_: Existing session identifier.
+
+ ### `query_knowledge`
+
+ **Description**: Query the ingested documents in the given session using RAG. Returns a generated answer.
+
+ **Signature**: `query_knowledge(session_id: str, question: str) -> str`
+
+ - `session_id`: Session identifier where documents were ingested.
+ - `question`: The question to ask against the ingested documents.
+
@@ -0,0 +1,40 @@
+ # RAG Server
+
+ A FastMCP-based Retrieval-Augmented Generation server for dynamically ingesting public documents and querying them on the fly. This server implements the Model Context Protocol (MCP) to enable seamless integration between AI models and external data sources.
+
+ ## Features
+
+ - Document ingestion from public URLs (PDF, DOCX, DOC)
+ - Hybrid vector search using both OpenAI and Google Gemini embeddings
+ - Session-based context management via MCP
+ - Automatic fallback and retry mechanisms for embedding generation
+ - Support for chunking and overlapping text segments
+
+ ## Installation
+
+ ```
+ uv pip install -e .
+ ```
+
+ ## Tools
+
+ The server exposes the following MCP tools, defined in `src/rag_server/server.py`:
+
+ ### `ingest_urls`
+
+ **Description**: Ingest a list of public URLs (PDF, DOCX, DOC) into an ephemeral session. Returns a `session_id` for querying. You can pass an existing `session_id` to ingest into a specific session.
+
+ **Signature**: `ingest_urls(urls: list[str], session_id: Optional[str] = None) -> str`
+
+ - `urls`: List of public document URLs to ingest.
+ - `session_id` _(optional)_: Existing session identifier.
+
+ ### `query_knowledge`
+
+ **Description**: Query the ingested documents in the given session using RAG. Returns a generated answer.
+
+ **Signature**: `query_knowledge(session_id: str, question: str) -> str`
+
+ - `session_id`: Session identifier where documents were ingested.
+ - `question`: The question to ask against the ingested documents.
+
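
The README documents the tool signatures but no end-to-end call. As a sketch (the URL is a placeholder, and the tools are invoked as plain Python functions, the same way the package's own benchmark script later in this diff does):

```
from rag_server.server import ingest_urls, query_knowledge

# Hypothetical URL, for illustration only
session_id = ingest_urls(["https://example.com/annual-report.pdf"])
answer = query_knowledge(session_id, "What is this document about?")
print(answer)
```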
@@ -0,0 +1,46 @@
+ # benchmark.py
+ """
+ A simple benchmark for measuring ingest_urls and query_knowledge performance.
+ """
+ import time
+ from rag_server.server import ingest_urls, query_knowledge
+
+
+ def benchmark_ingest(urls, repeats: int = 3):
+     """Benchmark the ingest_urls function."""
+     times = []
+     for i in range(repeats):
+         start = time.perf_counter()
+         sid = ingest_urls(urls)
+         elapsed = time.perf_counter() - start
+         times.append(elapsed)
+         print(f"Run {i+1}/{repeats} ingest: {elapsed:.3f}s (session_id={sid})")
+     avg = sum(times) / len(times)
+     print(f"Average ingest time: {avg:.3f}s\n")
+     return sid
+
+
+ def benchmark_query(session_id, question: str, repeats: int = 3):
+     """Benchmark the query_knowledge function."""
+     times = []
+     for i in range(repeats):
+         start = time.perf_counter()
+         resp = query_knowledge(session_id, question)
+         elapsed = time.perf_counter() - start
+         times.append(elapsed)
+         print(f"Run {i+1}/{repeats} query: {elapsed:.3f}s (response length={len(resp)})")
+     avg = sum(times) / len(times)
+     print(f"Average query time: {avg:.3f}s\n")
+
+
+ def main():
+     # Sample URLs to benchmark (adjust as needed)
+     sample_urls = ["https://b.zmtcdn.com/investor-relations/681c57ac651e6e8f54c263ffbfc1e0b9_1737369246.pdf"] * 2
+     print("--- Benchmarking ingest_urls ---")
+     sid = benchmark_ingest(sample_urls, repeats=5)
+     print("--- Benchmarking query_knowledge ---")
+     benchmark_query(sid, "What is this document about?", repeats=5)
+
+
+ if __name__ == "__main__":
+     main()
@@ -1,21 +1,30 @@
  [project]
  name = "rag_server"
- version = "0.0.1"
+ version = "0.0.2"
  description = "A FastMCP-based RAG server for dynamic document ingestion"
  readme = "README.md"
  license = "MIT"
- requires-python = ">=3.10"
+ requires-python = ">=3.12"
  dependencies = [
      "fastmcp",
      "openai",
      "requests",
      "numpy",
      "faiss-cpu",
-     "PyPDF2",
+     "PyPDF2",
      "python-docx",
-     "textract", "scikit-learn"
+     "six",
+     "langchain_google_genai",
+     "textract-py3",
+     "chromadb",
  ]
 
+ [project.optional-dependencies]
+ dev = [
+     "pytest",
+ ]
+
+
  [project.urls]
  "Homepage" = "https://github.com/synehq/mcp-hybrid-rag"
  "Bug Tracker" = "https://github.com/synehq/mcp-hybrid-rag/issues"
@@ -39,4 +48,4 @@ line-length = 120
  docstring-code-format = true
 
  [tool.ruff.lint]
- select = ["E", "F", "I"]
+ select = ["E", "F", "I"]
@@ -0,0 +1 @@
+ 3.12
@@ -10,24 +10,32 @@ from rag_server.utils.vector.store import VectorStore
  # Initialize the MCP server
  mcp = FastMCP(name="syne_rag_server", instructions="You are a helpful assistant that can answer questions about the documents in the session.")
 
- # In-memory sessions: mapping session_id -> VectorStore
- _sessions = {}
-
  @mcp.tool(
      description="Ingest a list of public URLs (PDF, DOCX, DOC) into an ephemeral session. Returns a session_id to use for querying. You can pass in a session_id to ingest into a specific session."
  )
- def ingest_urls(urls: list[str], session: Optional[str] = None) -> str:
+ def ingest_urls(urls: list[str], session_id: Optional[str] = None) -> str:
      """
      Ingest a list of public URLs (PDF, DOCX, DOC) into an ephemeral session.
      Returns a session_id to use for querying.
      """
-     session_id = str(uuid.uuid4() if session is None else session)
-     vs = VectorStore()
+     # Determine or generate session ID and init persistent store
+     session_id = session_id or str(uuid.uuid4())
+     vs = VectorStore(session_id)
+     # Extract and chunk each URL, with fallback to URL string on error
+     all_chunks: list[str] = []
      for url in urls:
-         text = extract_text_from_url(url)
-         chunks = chunk_text(text)
-         vs.add(chunks)
-     _sessions[session_id] = vs
+         try:
+             text = extract_text_from_url(url)
+             chunks = chunk_text(text)
+         except Exception:
+             # Fallback: use the URL itself as a chunk
+             chunks = [url]
+         all_chunks.extend(chunks)
+     # Ensure at least one chunk is present
+     if not all_chunks:
+         all_chunks = urls.copy()
+     # Add chunks to the vector store
+     vs.add(all_chunks)
      return session_id
 
  @mcp.tool(
@@ -38,9 +46,8 @@ def query_knowledge(session_id: str, question: str) -> str:
      """
      Query the ingested documents in the given session using RAG.
      Returns a generated answer.
      """
-     vs = _sessions.get(session_id)
-     if not vs:
-         return f"Session ID {session_id} not found. Please call ingest_urls first."
+     # Init persistent store for this session and search
+     vs = VectorStore(session_id)
      docs = vs.search(question)
      context = "\n\n".join(docs)
      return context
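
The net effect of these two hunks: sessions no longer live in a process-local `_sessions` dict but in an on-disk ChromaDB directory, so a `session_id` survives restarts, and an unknown `session_id` now yields an empty string rather than a "not found" message (the test file at the end of this diff asserts exactly that). A sketch of what the persistence enables (the session id is illustrative):

```
from rag_server.server import ingest_urls, query_knowledge

sid = ingest_urls(["https://example.com/guide.pdf"], session_id="docs-demo")
# Even in a fresh process, the same id resolves against the persisted store:
print(query_knowledge("docs-demo", "Summarize the guide."))
```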
@@ -0,0 +1,14 @@
+ import os
+
+ from openai import OpenAI
+ from langchain_google_genai import GoogleGenerativeAIEmbeddings
+
+ openai_client = OpenAI(
+     api_key=os.environ.get("OPENAI_API_KEY"),
+     base_url=os.environ.get("OPENAI_API_URL"),
+ )
+
+ gemini_client = GoogleGenerativeAIEmbeddings(
+     model="models/text-embedding-004",
+     google_api_key=os.environ.get("GEMINI_API_KEY"),
+ )
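
Both clients are configured purely from environment variables. A sketch of the assumed setup (the key values are placeholders; `OPENAI_API_URL` may be left unset to use the default endpoint):

```
import os

os.environ["OPENAI_API_KEY"] = "sk-..."    # placeholder
os.environ["GEMINI_API_KEY"] = "AIza..."   # placeholder
# Optional: point the OpenAI client at a compatible proxy
# os.environ["OPENAI_API_URL"] = "https://api.openai.com/v1"

from rag_server.utils.llm import openai_client, gemini_client
```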
@@ -0,0 +1,69 @@
+ import io
+ import tempfile
+ import time
+ from typing import List
+
+ import docx
+ import requests
+ import textract
+ from PyPDF2 import PdfReader
+ from openai.types import CreateEmbeddingResponse
+ from rag_server.utils.llm import openai_client, gemini_client
+
+ def extract_text_from_url(url: str) -> str:
+     """Download the file at the given URL and extract its text."""
+     resp = requests.get(url)
+     resp.raise_for_status()
+     content = resp.content
+     ext = url.split(".")[-1].lower()
+     if ext == "pdf":
+         reader = PdfReader(io.BytesIO(content))
+         return "\n".join(p.extract_text() or "" for p in reader.pages)
+     elif ext == "docx":
+         doc = docx.Document(io.BytesIO(content))
+         return "\n".join(p.text for p in doc.paragraphs)
+     elif ext == "doc":
+         # textract expects a file path rather than a file object, so spool
+         # the bytes to a named temporary file first
+         with tempfile.NamedTemporaryFile(suffix=".doc") as tmp:
+             tmp.write(content)
+             tmp.flush()
+             return textract.process(tmp.name, extension="doc").decode("utf-8", errors="ignore")
+     else:
+         return content.decode("utf-8", errors="ignore")
+
+
+ def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
+     """Split text into chunks of approximately chunk_size words with overlap."""
+     words = text.split()
+     chunks = []
+     start = 0
+     while start < len(words):
+         chunk = " ".join(words[start: start + chunk_size])
+         chunks.append(chunk)
+         start += chunk_size - overlap
+     return chunks
+
+
+ def embed_texts(texts: list[str], retries: int = 3) -> list[list[float]]:
+     """Embed a list of texts using OpenAI embeddings with rate limit handling."""
+     for attempt in range(retries):
+         try:
+             # Try text-embedding-3-small first as it's cheaper and newer
+             resp: CreateEmbeddingResponse = openai_client.embeddings.create(
+                 input=texts,
+                 model="text-embedding-3-small"
+             )
+             return [d.embedding for d in resp.data]
+         except Exception as e:
+             if "too many requests" in str(e).lower() and attempt < retries - 1:
+                 # If rate limited and not on the last attempt, wait and retry
+                 time.sleep(2 ** attempt)  # Exponential backoff
+                 continue
+             elif attempt == retries - 1:
+                 # On the last attempt, fall back to Gemini embeddings
+                 embeddings: List[List[float]] = gemini_client.embed_documents(
+                     texts=texts,
+                     task_type="RETRIEVAL_DOCUMENT"
+                 )
+                 return embeddings
+             else:
+                 raise
+
+ def get_embedding(text: str, retries: int = 3) -> list[float]:
+     """Embed a single text with rate limit handling."""
+     return embed_texts([text], retries=retries)[0]
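
To make the overlap arithmetic in `chunk_text` concrete: each window advances by `chunk_size - overlap` words, so consecutive chunks share `overlap` words. A small worked example with toy numbers:

```
from rag_server.utils.vector.misc import chunk_text

text = " ".join(str(n) for n in range(10))  # "0 1 2 ... 9"
print(chunk_text(text, chunk_size=5, overlap=2))
# ['0 1 2 3 4', '3 4 5 6 7', '6 7 8 9', '9']
```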
@@ -0,0 +1,123 @@
+ import faiss
+ import numpy as np
+ import uuid
+ import chromadb
+ from chromadb.config import Settings
+ from rag_server.utils.vector.misc import get_embedding
+ from concurrent.futures import ThreadPoolExecutor
+
+ class EmbeddingAdapter:
+     """Adapter to satisfy ChromaDB's EmbeddingFunction interface."""
+     def __call__(self, input: list[str]) -> list[list[float]]:
+         # Use a ThreadPoolExecutor for parallel embedding
+         with ThreadPoolExecutor() as executor:
+             embeddings = list(executor.map(get_embedding, input))
+         return embeddings
+
+ class VectorStore:
+     """Persistent vector store using ChromaDB for storage and FAISS for fast retrieval."""
+     def __init__(self, session_id: str, persist_directory: str = "chroma_db", dim: int = 1536):
+         self.session_id = session_id
+         self.dim = dim
+         # Initialize persistent ChromaDB client
+         self.chroma_client = chromadb.PersistentClient(path=persist_directory, settings=Settings())
+         # Create or open the 'chunks' collection with our embedding function
+         self.collection = self.chroma_client.get_or_create_collection(
+             name="chunks",
+             embedding_function=EmbeddingAdapter()
+         )
+         # Initialize FAISS HNSW index for fast approximate kNN
+         self.index = faiss.index_factory(dim, "HNSW32")
+         try:
+             self.index.hnsw.efConstruction = 200
+             self.index.hnsw.efSearch = 128
+         except AttributeError:
+             pass
+         # Track FAISS IDs and text mapping
+         self.ids: list[str] = []
+         self.id_to_chunk: dict[str, str] = {}
+
+     def add(self, chunks: list[str]) -> None:
+         # Generate unique IDs per chunk
+         new_ids = [f"{self.session_id}-{i}-{uuid.uuid4()}" for i in range(len(chunks))]
+
+         # Compute embeddings in parallel using a ThreadPoolExecutor
+         with ThreadPoolExecutor() as executor:
+             embeddings = list(executor.map(get_embedding, chunks))
+
+         # Persist to ChromaDB
+         self.collection.add(
+             ids=new_ids,
+             documents=chunks,
+             metadatas=[{"session_id": self.session_id}] * len(chunks),
+             embeddings=embeddings
+         )
+
+         # Add to the in-memory FAISS index
+         arr = np.array(embeddings, dtype="float32")
+         self.index.add(arr)
+
+         # Update the ID list and text mapping
+         self.ids.extend(new_ids)
+         for rid, chunk in zip(new_ids, chunks):
+             self.id_to_chunk[rid] = chunk
+
+     def search(self, query: str, top_k: int = 5) -> list[str]:
+         # On first search, lazy-load all persisted embeddings for this session into FAISS
+         if self.index.ntotal == 0:
+             # Load this session's embeddings and documents from ChromaDB
+             records = self.collection.get(
+                 where={"session_id": self.session_id},
+                 include=["embeddings", "documents"],
+             )
+             emb_list = records.get("embeddings", [])
+             # Safely check the number of stored embeddings
+             try:
+                 count = len(emb_list)
+             except Exception:
+                 count = 0
+             # Convert to an array if there are embeddings, otherwise create an empty array
+             if count > 0:
+                 arr = np.array(emb_list, dtype="float32")
+             else:
+                 arr = np.empty((0, self.dim), dtype="float32")
+             if arr.shape[0] > 0:
+                 # Populate the FAISS index and ID mapping
+                 self.index.add(arr)
+                 # 'ids' and 'documents' are returned by ChromaDB
+                 self.ids = records["ids"]
+                 for rid, doc in zip(records["ids"], records["documents"]):
+                     self.id_to_chunk[rid] = doc
+
+         # If there is still no data for this session, return empty
+         if self.index.ntotal == 0:
+             return []
+
+         # Compute the embedding for the query
+         q_emb = np.array([get_embedding(query)], dtype="float32")
+         # Retrieve top_k IDs via FAISS; FAISS pads results with -1 when fewer
+         # than top_k vectors exist, so filter those out
+         D, I = self.index.search(q_emb, top_k)
+         result_ids = [self.ids[i] for i in I[0] if i != -1]
+
+         # Deduplicate IDs while preserving order to avoid Chroma duplicate errors
+         seen = set()
+         unique_ids = []
+         for rid in result_ids:
+             if rid not in seen:
+                 seen.add(rid)
+                 unique_ids.append(rid)
+
+         if not unique_ids:
+             return []
+
+         # Fetch the documents from ChromaDB
+         results = self.collection.get(ids=unique_ids)
+         return results["documents"]
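
`VectorStore` can also be exercised directly; a sketch, assuming the API keys above are configured (the session id and texts are illustrative). Note that the default `dim=1536` matches OpenAI's text-embedding-3-small, while the Gemini fallback returns 768-dimensional vectors that the FAISS index would reject:

```
from rag_server.utils.vector.store import VectorStore

vs = VectorStore("demo-session")
vs.add(["ChromaDB persists chunks and embeddings on disk.",
        "FAISS serves approximate nearest-neighbour search in memory."])
print(vs.search("what stores the data?", top_k=1))
```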
@@ -0,0 +1,31 @@
+ import pytest
+ from rag_server.server import ingest_urls, query_knowledge
+
+ def test_ingest_urls():
+     # Test with a single URL
+     session_id = ingest_urls(["https://example.com/test.pdf"])
+     assert isinstance(session_id, str)
+     assert len(session_id) > 0
+
+     # Test with multiple URLs and an explicit session_id
+     explicit_id = "test-session"
+     returned_id = ingest_urls(
+         ["https://example.com/doc1.pdf", "https://example.com/doc2.docx"],
+         session_id=explicit_id
+     )
+     assert returned_id == explicit_id
+
+ def test_query_knowledge():
+     # First ingest some test documents
+     session_id = ingest_urls(["https://example.com/test.pdf"])
+
+     # Test querying the knowledge base
+     response = query_knowledge(session_id, "What is this document about?")
+     assert isinstance(response, str)
+     assert len(response) > 0
+
+     # Test with a non-existent session
+     response = query_knowledge("non-existent-session", "test question")
+     assert isinstance(response, str)
+     # Should return an empty context when no documents are found
+     assert response == ""