rag-multiple-files 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,29 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Distribution / packaging
7
+ dist/
8
+ build/
9
+ *.egg-info/
10
+ *.egg
11
+
12
+ # Virtual environments
13
+ .venv/
14
+ venv/
15
+ ENV/
16
+
17
+ # IDE
18
+ .idea/
19
+ .vscode/
20
+ *.swp
21
+ *.swo
22
+
23
+ # Environment variables
24
+ .env
25
+ .env.*
26
+
27
+ # OS
28
+ .DS_Store
29
+ Thumbs.db
@@ -0,0 +1,6 @@
1
+ # Build and Publish
2
+
3
+ ```bash
4
+ uv build
5
+ uv publish --token xxxxxxxxxxxxxxx
6
+ ```
@@ -0,0 +1,61 @@
1
+ Metadata-Version: 2.4
2
+ Name: rag-multiple-files
3
+ Version: 0.1.0
4
+ Summary: MCP server for semantic PDF querying using RAG
5
+ Project-URL: Homepage, https://codeberg.org/mionita1980/rag-multiple-files
6
+ Project-URL: Source, https://codeberg.org/mionita1980/rag-multiple-files
7
+ Author-email: Mihai Ionita <mionita1980@proton.me>
8
+ Requires-Python: >=3.11
9
+ Requires-Dist: faiss-cpu
10
+ Requires-Dist: mcp[cli]
11
+ Requires-Dist: sentence-transformers
12
+ Description-Content-Type: text/markdown
13
+
14
+ # RAG Multiple Files MCP Server
15
+
16
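+ A local MCP server that lets you semantically search multiple PDFs using natural language queries. Each PDF must already have a pre-built index (a `<name>.index/` directory containing `index.faiss` and `chunks.json`) inside `PDF_FOLDER`; run the indexer first for each PDF.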
17
+
18
+ ## Use from Claude Code
19
+
20
+ Configure the MCP server:
21
+
22
+ ```json
23
+ "mcpServers": {
24
+ ...
25
+ "rag-multiple-files": {
26
+ "type": "stdio",
27
+ "command": "uvx",
28
+ "args": [
29
+ "rag-multiple-files"
30
+ ],
31
+ "env": {
32
+ "PDF_FOLDER": "/path/to/your/folder"
33
+ }
34
+ }
35
+ }
36
+ ```
37
+
38
+ ### Sample Request
39
+
40
+ Query:
41
+
42
+ ```
43
+ YYYYYYYYYYYYYYY.pdf: What is the chain slack
44
+ ```
45
+
46
+ Response:
47
+
48
+ ```
49
+ Based on the YYYYYYYYYYYYYYY owner's manual:
50
+ Chain tension (slack): XX mm (ZZ in)
51
+ ```
52
+
53
+ ## How it works
54
+
55
+ 1. **Querying** — The query is embedded with the same sentence-transformers model that was used to build the index (`all-MiniLM-L6-v2`) and searched against the selected PDF's FAISS index. The top-k most similar chunks are returned with their page numbers and similarity scores.
56
+
57
+ Parameters of the `query_pdf` tool:
58
+
59
+ - query (str) - required - no default - the question or topic to search for
60
+ - file_name (str) - required - no default - the name of the PDF file
61
+ - top_k (int) - optional - default: 5 - number of chunks to return
@@ -0,0 +1,48 @@
1
+ # RAG Multiple Files MCP Server
2
+
3
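+ A local MCP server that lets you semantically search multiple PDFs using natural language queries. Each PDF must already have a pre-built index (a `<name>.index/` directory containing `index.faiss` and `chunks.json`) inside `PDF_FOLDER`; run the indexer first for each PDF.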
4
+
5
+ ## Use from Claude Code
6
+
7
+ Configure the MCP server:
8
+
9
+ ```json
10
+ "mcpServers": {
11
+ ...
12
+ "rag-multiple-files": {
13
+ "type": "stdio",
14
+ "command": "uvx",
15
+ "args": [
16
+ "rag-multiple-files"
17
+ ],
18
+ "env": {
19
+ "PDF_FOLDER": "/path/to/your/folder"
20
+ }
21
+ }
22
+ }
23
+ ```
24
+
25
+ ### Sample Request
26
+
27
+ Query:
28
+
29
+ ```
30
+ YYYYYYYYYYYYYYY.pdf: What is the chain slack
31
+ ```
32
+
33
+ Response:
34
+
35
+ ```
36
+ Based on the YYYYYYYYYYYYYYY owner's manual:
37
+ Chain tension (slack): XX mm (ZZ in)
38
+ ```
39
+
40
+ ## How it works
41
+
42
+ 1. **Querying** — The query is embedded with the same sentence-transformers model that was used to build the index (`all-MiniLM-L6-v2`) and searched against the selected PDF's FAISS index. The top-k most similar chunks are returned with their page numbers and similarity scores.
43
+
44
+ Parameters of the `query_pdf` tool:
45
+
46
+ - query (str) - required - no default - the question or topic to search for
47
+ - file_name (str) - required - no default - the name of the PDF file
48
+ - top_k (int) - optional - default: 5 - number of chunks to return
@@ -0,0 +1,28 @@
1
+ [project]
2
+ name = "rag-multiple-files"
3
+ version = "0.1.0"
4
+ description = "MCP server for semantic PDF querying using RAG"
5
+ authors = [
6
+ { name = "Mihai Ionita", email = "mionita1980@proton.me" }
7
+ ]
8
+ readme = "README.md"
9
+ requires-python = ">=3.11"
10
+ dependencies = [
11
+ "mcp[cli]",
12
+ "sentence-transformers",
13
+ "faiss-cpu"
14
+ ]
15
+
16
+ [project.urls]
17
+ Homepage = "https://codeberg.org/mionita1980/rag-multiple-files"
18
+ Source = "https://codeberg.org/mionita1980/rag-multiple-files"
19
+
20
+ [project.scripts]
21
+ rag-multiple-files = "ragmfiles.main:main"
22
+
23
+ [build-system]
24
+ requires = ["hatchling"]
25
+ build-backend = "hatchling.build"
26
+
27
+ [tool.hatch.build.targets.wheel]
28
+ packages = ["src/ragmfiles"]
@@ -0,0 +1,3 @@
1
+ faiss-cpu==1.13.2
2
+ mcp==1.26.0
3
+ sentence-transformers==5.3.0
@@ -0,0 +1 @@
1
+ """Multiple files RAG MCP Server - Query info from multiple documents."""
@@ -0,0 +1,116 @@
1
+ """MCP server that answers queries against pre-built PDF indexes.
2
+
3
+ Indexes must already exist (run the indexer first for each PDF).
4
+ """
5
+
6
+ import json
7
+ import os
8
+ from pathlib import Path
9
+ import faiss
10
+ import numpy as np
11
+ from mcp.server.fastmcp import FastMCP
12
+ from sentence_transformers import SentenceTransformer
13
+ import logging
14
+
15
+ logging.basicConfig(level=logging.INFO)
16
+ logger = logging.getLogger(__name__)
17
+
18
+ MODEL_NAME = "all-MiniLM-L6-v2"
19
+
20
+ mcp = FastMCP("rag-multiple-files")
21
+
22
+ _model: SentenceTransformer | None = None
23
+ # Maps pdf filename (e.g. "doc.pdf") -> (faiss.Index, list[dict])
24
+ _indexes: dict[str, tuple[faiss.Index, list[dict]]] = {}
25
+
26
+
27
+ def get_model() -> SentenceTransformer:
28
+ global _model
29
+ if _model is None:
30
+ _model = SentenceTransformer(MODEL_NAME)
31
+ return _model
32
+
33
+
34
+ def get_pdf_folder() -> Path:
35
+ folder = os.environ.get("PDF_FOLDER", "")
36
+ if not folder:
37
+ raise ValueError("PDF_FOLDER environment variable is not set")
38
+ path = Path(folder)
39
+ if not path.is_dir():
40
+ raise FileNotFoundError(f"Folder not found: {path}")
41
+ return path
42
+
43
+
44
+ def load_all_indexes() -> None:
45
+ """Scan PDF_FOLDER for *.index subdirs and load each one."""
46
+ folder = get_pdf_folder()
47
+ for index_dir in sorted(folder.glob("*.index")):
48
+ faiss_file = index_dir / "index.faiss"
49
+ chunks_file = index_dir / "chunks.json"
50
+ if not faiss_file.exists() or not chunks_file.exists():
51
+ logger.warning(f"Skipping incomplete index at {index_dir}")
52
+ continue
53
+ # Reconstruct the original PDF filename from the index dir name
54
+ pdf_name = index_dir.stem + ".pdf"
55
+ if pdf_name in _indexes:
56
+ continue
57
+ logger.info(f"Loading index for {pdf_name}...")
58
+ index = faiss.read_index(str(faiss_file))
59
+ chunks = json.loads(chunks_file.read_text())
60
+ _indexes[pdf_name] = (index, chunks)
61
+ if not _indexes:
62
+ logger.warning("No indexes found in PDF_FOLDER. Run the indexer first.")
63
+
64
+
65
+ @mcp.tool()
66
+ def list_indexed_files() -> str:
67
+ """List all PDF files that have a loaded index available for querying."""
68
+ if not _indexes:
69
+ return "No indexes loaded."
70
+ return "\n".join(sorted(_indexes.keys()))
71
+
72
+
73
+ @mcp.tool()
74
+ def query_pdf(query: str, file_name: str, top_k: int = 5) -> str:
75
+ """Search the indexed PDF for chunks most relevant to the query.
76
+
77
+ Args:
78
+ query: The question or topic to search for.
79
+ file_name: The PDF filename to query (e.g. "document.pdf").
80
+ top_k: Number of chunks to return (default: 5).
81
+ """
82
+ logger.info(f"query_pdf file={file_name} query={query!r} top_k={top_k}")
83
+ if file_name not in _indexes:
84
+ available = ", ".join(sorted(_indexes.keys())) or "none"
85
+ return f"No index loaded for '{file_name}'. Available: {available}"
86
+
87
+ index, chunks = _indexes[file_name]
88
+ model = get_model()
89
+ query_vec = model.encode([query], normalize_embeddings=True)
90
+ query_vec = np.array(query_vec, dtype=np.float32)
91
+ scores, indices = index.search(query_vec, min(top_k, len(chunks)))
92
+ results = []
93
+ for score, idx in zip(scores[0], indices[0]):
94
+ if idx == -1:
95
+ continue
96
+ chunk = chunks[idx]
97
+ results.append(
98
+ f"[Page {chunk['page']}] (score: {score:.3f})\n{chunk['text']}"
99
+ )
100
+ if not results:
101
+ return "No matching chunks found."
102
+ return (
103
+ f"Found {len(results)} relevant chunks from: {file_name}\n\n"
104
+ + "\n\n---\n\n".join(results)
105
+ )
106
+
107
+
108
+ def main():
109
+ """Entry point for the RAG MCP server."""
110
+ logger.info("Starting MCP server...")
111
+ load_all_indexes()
112
+ mcp.run(transport="stdio")
113
+
114
+
115
+ if __name__ == "__main__":
116
+ main()