rag_server-0.0.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,52 @@
+ name: Publish Python distribution to PyPI
+
+ on:
+   push:
+     branches:
+       - sudo
+     tags:
+       - 'v*'
+
+ jobs:
+   run:
+     name: "Build and publish release"
+     runs-on: ubuntu-latest
+     if: startsWith(github.ref, 'refs/tags/v')
+
+     steps:
+       - uses: actions/checkout@v4
+         with:
+           # Fetch full history so `git describe --tags` can see the release tags
+           fetch-depth: 0
+
+       - name: Install uv
+         uses: astral-sh/setup-uv@v3
+         with:
+           enable-cache: true
+           cache-dependency-glob: uv.lock
+
+       - name: Set up Python
+         run: uv python install 3.10
+
+       - name: Get latest tag
+         id: latest_tag
+         run: |
+           TAG=$(git describe --tags $(git rev-list --tags --max-count=1))
+           echo "tag=$TAG" >> $GITHUB_ENV
+
+       - name: Update version
+         run: |
+           TAG=$(git describe --tags $(git rev-list --tags --max-count=1))
+           if [[ $TAG =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
+             VERSION=${TAG#v}  # Remove the 'v' prefix
+             sed -i "s/^version = .*/version = \"$VERSION\"/" pyproject.toml
+           else
+             echo "Invalid tag format. Expected format: v<MAJOR>.<MINOR>.<PATCH>"
+             exit 1
+           fi
+
+       - name: Build
+         run: uv build
+
+       - name: Publish
+         run: uv publish --token ${{ secrets.PYPI_TOKEN }}
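The version-sync step above validates the release tag against `v<MAJOR>.<MINOR>.<PATCH>` and strips the `v` prefix before rewriting `pyproject.toml`. A standalone sketch of the same check for local testing (an illustrative helper, not part of the package):

```python
import re


def version_from_tag(tag: str) -> str:
    """Mirror the workflow's tag validation: v1.2.3 -> 1.2.3."""
    if not re.fullmatch(r"v\d+\.\d+\.\d+", tag):
        raise ValueError("Invalid tag format. Expected format: v<MAJOR>.<MINOR>.<PATCH>")
    return tag.removeprefix("v")


assert version_from_tag("v0.0.1") == "0.0.1"
```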
@@ -0,0 +1,19 @@
+ .venv
+
+ *.egg-info
+ __pycache__/
+
+ .idea/*
+
+ .ruff_cache/*
+
+ .mypy_cache/*
+
+ *.pyc
+
+ *.pyo
+
+ *.pyd
+
+ *.pyw
+
@@ -0,0 +1,46 @@
+ Metadata-Version: 2.4
+ Name: rag_server
+ Version: 0.0.1
+ Summary: A FastMCP-based RAG server for dynamic document ingestion
+ Project-URL: Homepage, https://github.com/synehq/mcp-hybrid-rag
+ Project-URL: Bug Tracker, https://github.com/synehq/mcp-hybrid-rag/issues
+ Author-email: SyneHQ <human@synehq.com>
+ License-Expression: MIT
+ Requires-Python: >=3.10
+ Requires-Dist: faiss-cpu
+ Requires-Dist: fastmcp
+ Requires-Dist: numpy
+ Requires-Dist: openai
+ Requires-Dist: pypdf2
+ Requires-Dist: python-docx
+ Requires-Dist: requests
+ Requires-Dist: scikit-learn
+ Requires-Dist: textract
+ Description-Content-Type: text/markdown
+
+ # RAG Server
+
+ A FastMCP-based Retrieval-Augmented Generation server for dynamically ingesting public documents and querying them on the fly.
+
+ ## Installation
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ Ensure you set your OpenAI API key:
+
+ ```bash
+ export OPENAI_API_KEY=your_key_here
+ ```
+
+ ## Running the server
+
+ ```bash
+ python -m rag_server.server
+ ```
+
+ ## API Tools
+
+ - ingest_urls(urls: List[str], session: Optional[str]) -> session_id
+ - query_knowledge(session_id: str, question: str) -> relevant context
@@ -0,0 +1,26 @@
+ # RAG Server
+
+ A FastMCP-based Retrieval-Augmented Generation server for dynamically ingesting public documents and querying them on the fly.
+
+ ## Installation
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ Ensure you set your OpenAI API key:
+
+ ```bash
+ export OPENAI_API_KEY=your_key_here
+ ```
+
+ ## Running the server
+
+ ```bash
+ python -m rag_server.server
+ ```
+
+ ## API Tools
+
+ - ingest_urls(urls: List[str], session: Optional[str]) -> session_id
+ - query_knowledge(session_id: str, question: str) -> relevant context
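For orientation, here is a minimal client-side sketch for the API tools listed above. It assumes the FastMCP Python client (`fastmcp.Client`) connecting to the server script over stdio; the script path, URL, and result handling are illustrative and may need adjusting to your fastmcp version:

```python
import asyncio

from fastmcp import Client


async def demo() -> None:
    # Connect to the server by script path (stdio transport inferred).
    async with Client("rag_server/server.py") as client:
        ingested = await client.call_tool(
            "ingest_urls", {"urls": ["https://example.com/whitepaper.pdf"]}
        )
        print(ingested)  # the tool's text payload is the session_id
        answer = await client.call_tool(
            "query_knowledge",
            {"session_id": "<session_id from above>", "question": "What is this about?"},
        )
        print(answer)


asyncio.run(demo())
```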
@@ -0,0 +1,43 @@
+ [project]
+ name = "rag_server"
+ version = "0.0.1"
+ description = "A FastMCP-based RAG server for dynamic document ingestion"
+ readme = "README.md"
+ license = "MIT"
+ requires-python = ">=3.10"
+ dependencies = [
+     "fastmcp",
+     "openai",
+     "requests",
+     "numpy",
+     "faiss-cpu",
+     "PyPDF2",
+     "python-docx",
+     "textract",
+     "scikit-learn",
+ ]
+
+ [project.urls]
+ "Homepage" = "https://github.com/synehq/mcp-hybrid-rag"
+ "Bug Tracker" = "https://github.com/synehq/mcp-hybrid-rag/issues"
+
+ [[project.authors]]
+ name = "SyneHQ"
+ email = "human@synehq.com"
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [project.scripts]
+ rag_server = "rag_server:main"
+ "rag-server" = "rag_server:main"
+
+ [tool.ruff]
+ line-length = 120
+
+ [tool.ruff.format]
+ docstring-code-format = true
+
+ [tool.ruff.lint]
+ select = ["E", "F", "I"]
@@ -0,0 +1,12 @@
+ from . import server
+
+
+ def main():
+     """Main entry point for the package."""
+     # server.main() is synchronous (mcp.run() blocks), so call it directly
+     # instead of wrapping it in asyncio.run(), which expects a coroutine.
+     server.main()
+
+
+ # Optionally expose other important items at package level
+ __all__ = ["main", "server"]
@@ -0,0 +1,60 @@
+ # server.py
+ import uuid
+ from typing import Optional
+
+ from fastmcp import FastMCP
+
+ from rag_server.utils.vector.misc import chunk_text, extract_text_from_url
+ from rag_server.utils.vector.store import VectorStore
+
+ # Initialize the MCP server
+ mcp = FastMCP(
+     name="syne_rag_server",
+     instructions="You are a helpful assistant that can answer questions about the documents in the session.",
+ )
+
+ # In-memory sessions: mapping session_id -> VectorStore
+ _sessions: dict[str, VectorStore] = {}
+
+
+ @mcp.tool(
+     description="Ingest a list of public URLs (PDF, DOCX, DOC) into an ephemeral session. Returns a session_id to use for querying. Pass an existing session_id to add more documents to that session."
+ )
+ def ingest_urls(urls: list[str], session: Optional[str] = None) -> str:
+     """
+     Ingest a list of public URLs (PDF, DOCX, DOC) into an ephemeral session.
+     Returns a session_id to use for querying.
+     """
+     session_id = session or str(uuid.uuid4())
+     # Reuse the session's existing store so repeated calls accumulate documents.
+     vs = _sessions.setdefault(session_id, VectorStore())
+     for url in urls:
+         text = extract_text_from_url(url)
+         chunks = chunk_text(text)
+         vs.add(chunks)
+     return session_id
+
+
+ @mcp.tool(
+     description="Query the ingested documents in the given session using hybrid retrieval. Returns the most relevant context chunks."
+ )
+ def query_knowledge(session_id: str, question: str) -> str:
+     """
+     Query the ingested documents in the given session using hybrid retrieval.
+     Returns the most relevant context chunks joined into a single string.
+     """
+     vs = _sessions.get(session_id)
+     if not vs:
+         return f"Session ID {session_id} not found. Please call ingest_urls first."
+     docs = vs.search(question)
+     context = "\n\n".join(docs)
+     return context
+
+
+ def main():
+     # Run the server
+     mcp.run()
+
+
+ if __name__ == "__main__":
+     main()
File without changes
@@ -0,0 +1,9 @@
+ import os
+
+ from openai import OpenAI
+
+ openai_client = OpenAI(
+     api_key=os.environ.get("OPENAI_API_KEY"),
+     base_url=os.environ.get("OPENAI_API_URL"),
+ )
+
@@ -0,0 +1,62 @@
+ import io
+ import os
+ import tempfile
+
+ import docx
+ import requests
+ import textract
+ from openai.types import CreateEmbeddingResponse
+ from PyPDF2 import PdfReader
+
+ from rag_server.utils.llm import openai_client
+
+
+ def extract_text_from_url(url: str) -> str:
+     """Download the file at the given URL and extract its text."""
+     resp = requests.get(url)
+     resp.raise_for_status()
+     content = resp.content
+     # Derive the extension from the URL path, ignoring any query string.
+     ext = url.split("?")[0].split(".")[-1].lower()
+     if ext == "pdf":
+         reader = PdfReader(io.BytesIO(content))
+         return "\n".join(p.extract_text() or "" for p in reader.pages)
+     elif ext == "docx":
+         doc = docx.Document(io.BytesIO(content))
+         return "\n".join(p.text for p in doc.paragraphs)
+     elif ext == "doc":
+         # textract operates on file paths rather than file objects, so
+         # spill the download to a temporary file first.
+         with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as tmp:
+             tmp.write(content)
+             tmp_path = tmp.name
+         try:
+             return textract.process(tmp_path).decode("utf-8", errors="ignore")
+         finally:
+             os.remove(tmp_path)
+     else:
+         return content.decode("utf-8", errors="ignore")
+
+
+ def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
+     """Split text into chunks of approximately chunk_size words with overlap."""
+     words = text.split()
+     chunks = []
+     start = 0
+     while start < len(words):
+         chunk = " ".join(words[start : start + chunk_size])
+         chunks.append(chunk)
+         start += chunk_size - overlap
+     return chunks
+
+
+ def embed_texts(texts: list[str]) -> list[list[float]]:
+     """Embed a list of texts using OpenAI embeddings."""
+     resp: CreateEmbeddingResponse = openai_client.embeddings.create(input=texts, model="text-embedding-ada-002")
+     return [d.embedding for d in resp.data]
+
+
+ def get_embedding(text: str) -> list[float]:
+     """Embed a single text."""
+     resp: CreateEmbeddingResponse = openai_client.embeddings.create(input=text, model="text-embedding-ada-002")
+     return resp.data[0].embedding
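As a sanity check on the sliding-window arithmetic in `chunk_text`: with `chunk_size=500` and `overlap=50`, the start index advances by 450 words per step, so a 1,200-word text yields chunks beginning at words 0, 450, and 900 (the last one shorter). A standalone illustration:

```python
# Stand-in for text.split() on a 1,200-word document.
words = [f"w{i}" for i in range(1200)]
chunk_size, overlap = 500, 50

starts = list(range(0, len(words), chunk_size - overlap))
print(starts)                            # [0, 450, 900]
print(len(words[900:900 + chunk_size]))  # 300 -- the final, shorter chunk
```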
@@ -0,0 +1,70 @@
+ import faiss
+ import numpy as np
+ from sklearn.feature_extraction.text import TfidfVectorizer
+
+ from rag_server.utils.vector.misc import embed_texts, get_embedding
+
+
+ class VectorStore:
+     """Simple in-memory vector store using FAISS."""
+
+     def __init__(self, dim: int = 1536):
+         self.dim = dim
+         # Use an HNSW approximate nearest neighbor index (no training needed)
+         self.index = faiss.index_factory(dim, "HNSW32")
+         # Configure HNSW parameters for construction and search quality
+         try:
+             self.index.hnsw.efConstruction = 200
+             self.index.hnsw.efSearch = 128
+         except AttributeError:
+             pass
+         self.texts: list[str] = []
+         # Initialize TF-IDF vectorizer and matrix
+         self.vectorizer = TfidfVectorizer()
+         self.tfidf_matrix = None
+
+     def add(self, chunks: list[str]) -> None:
+         embeddings = embed_texts(chunks)
+         arr = np.array(embeddings, dtype="float32")
+         self.index.add(arr)
+         self.texts.extend(chunks)
+         # Refit the TF-IDF matrix over the full corpus
+         self.tfidf_matrix = self.vectorizer.fit_transform(self.texts)
+
+     def search(self, query: str, top_k: int = 5, alpha: float = 0.5) -> list[str]:
+         """Perform hybrid search combining semantic (FAISS) and lexical (TF-IDF) scores."""
+         # Semantic search via FAISS; negate L2 distances so larger means more similar
+         q_emb = np.array([get_embedding(query)], dtype="float32")
+         D, I = self.index.search(q_emb, top_k)
+         # FAISS pads missing neighbors with -1 when fewer than top_k exist; drop them
+         pairs = [(i, -d) for i, d in zip(I[0].tolist(), D[0].tolist()) if i >= 0]
+         vect_ids = [i for i, _ in pairs]
+         vect_scores = [s for _, s in pairs]
+         # Lexical search via TF-IDF
+         if self.tfidf_matrix is None:
+             self.tfidf_matrix = self.vectorizer.fit_transform(self.texts)
+         q_tfidf = self.vectorizer.transform([query])
+         tfidf_scores_all = q_tfidf.dot(self.tfidf_matrix.T).toarray()[0]
+         tfidf_top = np.argsort(-tfidf_scores_all)[:top_k].tolist()
+         # Combine candidate document indices
+         candidate_ids = set(vect_ids + tfidf_top)
+         vect_min = min(vect_scores) if vect_scores else 0.0
+         scores = []
+         for idx in candidate_ids:
+             vs = vect_scores[vect_ids.index(idx)] if idx in vect_ids else vect_min
+             ts = float(tfidf_scores_all[idx])
+             scores.append((idx, vs, ts))
+         # Min-max normalize both score sets to [0, 1] and blend with weight alpha
+         vs_vals = [v for _, v, _ in scores]
+         ts_vals = [t for _, _, t in scores]
+         vmin, vmax = min(vs_vals), max(vs_vals)
+         tmin, tmax = min(ts_vals), max(ts_vals)
+         blended = []
+         for idx, vs, ts in scores:
+             vn = (vs - vmin) / (vmax - vmin) if vmax > vmin else 0.0
+             tn = (ts - tmin) / (tmax - tmin) if tmax > tmin else 0.0
+             combined = alpha * vn + (1 - alpha) * tn
+             blended.append((idx, combined))
+         # Sort by blended score and return top_k chunks
+         top = sorted(blended, key=lambda x: x[1], reverse=True)[:top_k]
+         return [self.texts[i] for i, _ in top]
+ return [self.texts[i] for i, _ in top]