PyPI - rag-multiple-files - Versions diffs - 0.1.0__tar.gz - Mend

rag-multiple-files 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

rag_multiple_files-0.1.0/.gitignore +29 -0
rag_multiple_files-0.1.0/BUILD.md +6 -0
rag_multiple_files-0.1.0/PKG-INFO +61 -0
rag_multiple_files-0.1.0/README.md +48 -0
rag_multiple_files-0.1.0/pyproject.toml +28 -0
rag_multiple_files-0.1.0/requirements.txt +3 -0
rag_multiple_files-0.1.0/src/ragmfiles/__init__.py +1 -0
rag_multiple_files-0.1.0/src/ragmfiles/main.py +116 -0

rag_multiple_files-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,29 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# Distribution / packaging
+dist/
+build/
+*.egg-info/
+*.egg
+# Virtual environments
+.venv/
+venv/
+ENV/
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+# Environment variables
+.env
+.env.*
+# OS
+.DS_Store
+Thumbs.db

rag_multiple_files-0.1.0/BUILD.md ADDED Viewed

@@ -0,0 +1,6 @@
+# Build and Publish
+```bash
+uv build
+uv publish --token xxxxxxxxxxxxxxx
+```

rag_multiple_files-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,61 @@
+Metadata-Version: 2.4
+Name: rag-multiple-files
+Version: 0.1.0
+Summary: MCP server for semantic PDF querying using RAG
+Project-URL: Homepage, https://codeberg.org/mionita1980/rag-multiple-files
+Project-URL: Source, https://codeberg.org/mionita1980/rag-multiple-files
+Author-email: Mihai Ionita <mionita1980@proton.me>
+Requires-Python: >=3.11
+Requires-Dist: faiss-cpu
+Requires-Dist: mcp[cli]
+Requires-Dist: sentence-transformers
+Description-Content-Type: text/markdown
+# RAG Multiple Files MCP Server
+A local MCP server that lets you semantically search multiple PDFs using natural language queries.
+## Use from Claude Code
+Configure the MCP server:
+```json
+"mcpServers": {
+    ...
+    "rag-multiple-files": {
+        "type": "stdio",
+        "command": "uvx",
+        "args": [
+            "rag-multiple-files"
+        ],
+        "env": {
+            "PDF_FOLDER": "/path/to/your/folder"
+        }
+    }
+}
+```
+### Sample Request
+Query:
+```
+YYYYYYYYYYYYYYY.pdf: What is the chain slack
+```
+Response:
+```
+Based on the YYYYYYYYYYYYYYY owner's manual:
+    Chain tension (slack): XX mm (ZZ in)
+```
+## How it works
+1. **Querying** — The query is embedded with the same sentence-transformers model that was used to build the index (`all-MiniLM-L6-v2`) and searched against the pre-built FAISS index of the requested PDF. The top-k most similar chunks are returned with their page numbers and similarity scores.
+Parameters of the `query_pdf` tool:
+- query (str) - required - no default - the question or topic to search for
+- file_name (str) - required - no default - the name of the PDF file
+- top_k (int) - optional - default 5 - number of chunks to return

rag_multiple_files-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,48 @@
+# RAG Multiple Files MCP Server
+A local MCP server that lets you semantically search multiple PDFs using natural language queries.
+## Use from Claude Code
+Configure the MCP server:
+```json
+"mcpServers": {
+    ...
+    "rag-multiple-files": {
+        "type": "stdio",
+        "command": "uvx",
+        "args": [
+            "rag-multiple-files"
+        ],
+        "env": {
+            "PDF_FOLDER": "/path/to/your/folder"
+        }
+    }
+}
+```
+### Sample Request
+Query:
+```
+YYYYYYYYYYYYYYY.pdf: What is the chain slack
+```
+Response:
+```
+Based on the YYYYYYYYYYYYYYY owner's manual:
+    Chain tension (slack): XX mm (ZZ in)
+```
+## How it works
+1. **Querying** — The query is embedded with the same sentence-transformers model that was used to build the index (`all-MiniLM-L6-v2`) and searched against the pre-built FAISS index of the requested PDF. The top-k most similar chunks are returned with their page numbers and similarity scores.
+Parameters of the `query_pdf` tool:
+- query (str) - required - no default - the question or topic to search for
+- file_name (str) - required - no default - the name of the PDF file
+- top_k (int) - optional - default 5 - number of chunks to return

rag_multiple_files-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,28 @@
+[project]
+name = "rag-multiple-files"
+version = "0.1.0"
+description = "MCP server for semantic PDF querying using RAG"
+authors = [
+    { name = "Mihai Ionita", email = "mionita1980@proton.me" }
+]
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "mcp[cli]",
+    "sentence-transformers",
+    "faiss-cpu"
+]
+[project.urls]
+Homepage = "https://codeberg.org/mionita1980/rag-multiple-files"
+Source = "https://codeberg.org/mionita1980/rag-multiple-files"
+[project.scripts]
+rag-multiple-files = "ragmfiles.main:main"
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.wheel]
+packages = ["src/ragmfiles"]

rag_multiple_files-0.1.0/requirements.txt ADDED Viewed

@@ -0,0 +1,3 @@
+faiss-cpu==1.13.2
+mcp==1.26.0
+sentence-transformers==5.3.0

rag_multiple_files-0.1.0/src/ragmfiles/__init__.py ADDED Viewed

@@ -0,0 +1 @@
+"""Multiple files RAG MCP Server - Query info from multiple documents."""

rag_multiple_files-0.1.0/src/ragmfiles/main.py ADDED Viewed

@@ -0,0 +1,116 @@
+"""MCP server that answers queries against pre-built PDF indexes.
+Indexes must already exist (run the indexer first for each PDF).
+"""
+import json
+import os
+from pathlib import Path
+import faiss
+import numpy as np
+from mcp.server.fastmcp import FastMCP
+from sentence_transformers import SentenceTransformer
+import logging
+# Configure root logging at INFO. basicConfig's default handler writes to
+# stderr, so log lines stay off stdout, which the stdio transport uses.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Name of the sentence-transformers model loaded lazily by get_model().
+# NOTE(review): presumably must match the model the indexer used — confirm.
+MODEL_NAME = "all-MiniLM-L6-v2"
+# FastMCP server instance; tools are registered on it via @mcp.tool().
+mcp = FastMCP("rag-multiple-files")
+# Cached model instance; stays None until get_model() is first called.
+_model: SentenceTransformer | None = None
+# Maps pdf filename (e.g. "doc.pdf") -> (faiss.Index, list[dict])
+_indexes: dict[str, tuple[faiss.Index, list[dict]]] = {}
+def get_model() -> SentenceTransformer:
+    global _model
+    if _model is None:
+        _model = SentenceTransformer(MODEL_NAME)
+    return _model
+def get_pdf_folder() -> Path:
+    folder = os.environ.get("PDF_FOLDER", "")
+    if not folder:
+        raise ValueError("PDF_FOLDER environment variable is not set")
+    path = Path(folder)
+    if not path.is_dir():
+        raise FileNotFoundError(f"Folder not found: {path}")
+    return path
+def load_all_indexes() -> None:
+    """Scan PDF_FOLDER for *.index subdirs and load each one."""
+    folder = get_pdf_folder()
+    for index_dir in sorted(folder.glob("*.index")):
+        faiss_file = index_dir / "index.faiss"
+        chunks_file = index_dir / "chunks.json"
+        if not faiss_file.exists() or not chunks_file.exists():
+            logger.warning(f"Skipping incomplete index at {index_dir}")
+            continue
+        # Reconstruct the original PDF filename from the index dir name
+        pdf_name = index_dir.stem + ".pdf"
+        if pdf_name in _indexes:
+            continue
+        logger.info(f"Loading index for {pdf_name}...")
+        index = faiss.read_index(str(faiss_file))
+        chunks = json.loads(chunks_file.read_text())
+        _indexes[pdf_name] = (index, chunks)
+    if not _indexes:
+        logger.warning("No indexes found in PDF_FOLDER. Run the indexer first.")
+@mcp.tool()
+def list_indexed_files() -> str:
+    """List all PDF files that have a loaded index available for querying."""
+    if not _indexes:
+        return "No indexes loaded."
+    return "\n".join(sorted(_indexes.keys()))
+@mcp.tool()
+def query_pdf(query: str, file_name: str, top_k: int = 5) -> str:
+    """Search the indexed PDF for chunks most relevant to the query.
+    Args:
+        query: The question or topic to search for.
+        file_name: The PDF filename to query (e.g. "document.pdf").
+        top_k: Number of chunks to return (default: 5).
+    """
+    logger.info(f"query_pdf file={file_name} query={query!r} top_k={top_k}")
+    if file_name not in _indexes:
+        available = ", ".join(sorted(_indexes.keys())) or "none"
+        return f"No index loaded for '{file_name}'. Available: {available}"
+    index, chunks = _indexes[file_name]
+    model = get_model()
+    query_vec = model.encode([query], normalize_embeddings=True)
+    query_vec = np.array(query_vec, dtype=np.float32)
+    scores, indices = index.search(query_vec, min(top_k, len(chunks)))
+    results = []
+    for score, idx in zip(scores[0], indices[0]):
+        if idx == -1:
+            continue
+        chunk = chunks[idx]
+        results.append(
+            f"[Page {chunk['page']}] (score: {score:.3f})\n{chunk['text']}"
+        )
+    if not results:
+        return "No matching chunks found."
+    return (
+        f"Found {len(results)} relevant chunks from: {file_name}\n\n"
+        + "\n\n---\n\n".join(results)
+    )
+def main():
+    """Entry point for the RAG MCP server."""
+    logger.info("Starting MCP server...")
+    load_all_indexes()
+    mcp.run(transport="stdio")
+# Allow running this module directly as a script; calls the same main()
+# that the package's console-script entry point refers to.
+if __name__ == "__main__":
+    main()