codebase-qa-mcp 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_qa_mcp-0.1.0/.dockerignore +6 -0
- codebase_qa_mcp-0.1.0/Dockerfile +22 -0
- codebase_qa_mcp-0.1.0/PKG-INFO +192 -0
- codebase_qa_mcp-0.1.0/README.md +180 -0
- codebase_qa_mcp-0.1.0/pyproject.toml +20 -0
- codebase_qa_mcp-0.1.0/src/codebase_qa_mcp/__init__.py +3 -0
- codebase_qa_mcp-0.1.0/src/codebase_qa_mcp/config.py +39 -0
- codebase_qa_mcp-0.1.0/src/codebase_qa_mcp/indexer.py +354 -0
- codebase_qa_mcp-0.1.0/src/codebase_qa_mcp/retriever.py +125 -0
- codebase_qa_mcp-0.1.0/src/codebase_qa_mcp/server.py +131 -0
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
FROM python:3.12-slim
|
|
2
|
+
|
|
3
|
+
# Git is needed for incremental updates (git diff, git rev-parse)
|
|
4
|
+
RUN apt-get update && apt-get install -y --no-install-recommends git \
|
|
5
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
6
|
+
|
|
7
|
+
WORKDIR /app
|
|
8
|
+
|
|
9
|
+
# Copy project files
|
|
10
|
+
COPY pyproject.toml README.md ./
|
|
11
|
+
COPY src/ src/
|
|
12
|
+
|
|
13
|
+
# Install the package
|
|
14
|
+
RUN pip install --no-cache-dir .
|
|
15
|
+
|
|
16
|
+
# Pre-download the embedding model so first run is instant
|
|
17
|
+
RUN python -c "from sentence_transformers import SentenceTransformer; SentenceTransformer('all-MiniLM-L6-v2')"
|
|
18
|
+
|
|
19
|
+
# ChromaDB data will be stored here — mount a volume for persistence
|
|
20
|
+
ENV CODEBASE_QA_CHROMA_PATH=/data/chroma_db
|
|
21
|
+
|
|
22
|
+
ENTRYPOINT ["codebase-qa-mcp"]
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: codebase-qa-mcp
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: MCP server for codebase Q&A powered by RAG — index any git repo and ask questions about the code
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Requires-Dist: chromadb
|
|
8
|
+
Requires-Dist: langchain-text-splitters
|
|
9
|
+
Requires-Dist: mcp[cli]
|
|
10
|
+
Requires-Dist: sentence-transformers
|
|
11
|
+
Description-Content-Type: text/markdown
|
|
12
|
+
|
|
13
|
+
# Codebase Q&A MCP Server
|
|
14
|
+
|
|
15
|
+
An MCP server that lets AI assistants answer questions about any codebase using RAG (Retrieval-Augmented Generation). Index any git repository locally, then ask natural-language questions — answers are grounded in actual source code.
|
|
16
|
+
|
|
17
|
+
Built with [FastMCP](https://github.com/jlowin/fastmcp), [ChromaDB](https://www.trychroma.com/), and [sentence-transformers](https://www.sbert.net/).
|
|
18
|
+
|
|
19
|
+
## How It Works
|
|
20
|
+
|
|
21
|
+
```
|
|
22
|
+
┌─────────────┐ ┌──────────────────────────────────────────────┐
|
|
23
|
+
│ AI Client │ │ Codebase Q&A MCP Server │
|
|
24
|
+
│ (Claude Code,│ MCP │ │
|
|
25
|
+
│ Claude │◄───►│ ┌─────────┐ ┌──────────┐ ┌───────────┐ │
|
|
26
|
+
│ Desktop, │ │ │ Load & │─►│ Embed │─►│ ChromaDB │ │
|
|
27
|
+
│ Cursor) │ │ │ Chunk │ │(MiniLM) │ │ (persist) │ │
|
|
28
|
+
│ │ │ └─────────┘ └──────────┘ └───────────┘ │
|
|
29
|
+
└─────────────┘ └──────────────────────────────────────────────┘
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
1. **Index** — Point the server at a git repo. It reads all source files, splits them into chunks, generates embeddings with `all-MiniLM-L6-v2`, and stores them in a persistent ChromaDB instance.
|
|
33
|
+
2. **Query** — Ask a natural-language question. The server finds the most relevant code chunks via semantic similarity and returns them with file paths and metadata.
|
|
34
|
+
3. **Update** — After new commits, run an incremental update that only re-indexes changed files (using `git diff`).
|
|
35
|
+
|
|
36
|
+
## Features
|
|
37
|
+
|
|
38
|
+
- **7 MCP tools** for full codebase Q&A workflow
|
|
39
|
+
- **Local embeddings** — no API keys needed (runs `all-MiniLM-L6-v2` on your machine)
|
|
40
|
+
- **Persistent storage** — ChromaDB persists indexes across sessions
|
|
41
|
+
- **Incremental updates** — only re-index files that changed since last commit
|
|
42
|
+
- **Multi-project support** — index multiple repos and switch between them
|
|
43
|
+
- **File filtering** — narrow queries to specific paths (e.g. only `auth`-related files)
|
|
44
|
+
- **25+ file types** supported (Python, JS/TS, Go, Rust, Java, C/C++, Markdown, YAML, SQL, and more)
|
|
45
|
+
|
|
46
|
+
## Installation
|
|
47
|
+
|
|
48
|
+
### Option 1: uvx (recommended)
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
uvx codebase-qa-mcp
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Option 2: pip
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install codebase-qa-mcp
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Option 3: From source
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
git clone https://github.com/gokul-viswanathan/codebase-qa-mcp.git
|
|
64
|
+
cd codebase-qa-mcp
|
|
65
|
+
pip install -e .
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Option 4: agentregistry
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
# Install arctl
|
|
72
|
+
curl -fsSL https://raw.githubusercontent.com/agentregistry-dev/agentregistry/main/scripts/get-arctl | bash
|
|
73
|
+
|
|
74
|
+
# Deploy the server
|
|
75
|
+
arctl deploy codebase-qa-mcp
|
|
76
|
+
|
|
77
|
+
# Auto-configure your IDE
|
|
78
|
+
arctl configure claude-desktop
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Quick Start
|
|
82
|
+
|
|
83
|
+
### 1. Add to Claude Code
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
claude mcp add codebase-qa-mcp -- codebase-qa-mcp
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Or if running from source:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
claude mcp add codebase-qa-mcp -- /path/to/codebase-qa-mcp/.venv/bin/python -m codebase_qa_mcp.server
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### 2. Add to Claude Desktop
|
|
96
|
+
|
|
97
|
+
Add this to your Claude Desktop config (`~/.config/claude-desktop/config.json`):
|
|
98
|
+
|
|
99
|
+
```json
|
|
100
|
+
{
|
|
101
|
+
"mcpServers": {
|
|
102
|
+
"codebase-qa-mcp": {
|
|
103
|
+
"command": "codebase-qa-mcp",
|
|
104
|
+
"args": []
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Or from source:
|
|
111
|
+
|
|
112
|
+
```json
|
|
113
|
+
{
|
|
114
|
+
"mcpServers": {
|
|
115
|
+
"codebase-qa-mcp": {
|
|
116
|
+
"command": "/path/to/codebase-qa-mcp/.venv/bin/python",
|
|
117
|
+
"args": ["-m", "codebase_qa_mcp.server"]
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### 3. Use it
|
|
124
|
+
|
|
125
|
+
Once connected, your AI assistant can use these tools:
|
|
126
|
+
|
|
127
|
+
```
|
|
128
|
+
You: "Index the repo at /home/user/my-project"
|
|
129
|
+
AI: → calls index_repository("/home/user/my-project")
|
|
130
|
+
✓ Indexed 42 files → 380 chunks
|
|
131
|
+
|
|
132
|
+
You: "How does authentication work in this codebase?"
|
|
133
|
+
AI: → calls query_codebase("how does authentication work")
|
|
134
|
+
Returns relevant code chunks from auth-related files
|
|
135
|
+
|
|
136
|
+
You: "What files handle database migrations?"
|
|
137
|
+
AI: → calls query_codebase("database migrations", file_filter="migrations")
|
|
138
|
+
Returns chunks specifically from migration files
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
## MCP Tools Reference
|
|
142
|
+
|
|
143
|
+
| Tool | Description |
|
|
144
|
+
|------|-------------|
|
|
145
|
+
| `index_repository` | Full-index a git repo into ChromaDB. Reads all source files, chunks, embeds, and stores them. |
|
|
146
|
+
| `update_index` | Incrementally update using `git diff`. Only re-indexes changed files. |
|
|
147
|
+
| `query_codebase` | Semantic search for code chunks relevant to a natural-language question. |
|
|
148
|
+
| `switch_project` | Switch to a previously indexed repo without re-indexing. |
|
|
149
|
+
| `list_indexed_files` | List all file paths in the current project's index. |
|
|
150
|
+
| `get_index_stats` | Get stats: total chunks, total files, last indexed commit. |
|
|
151
|
+
| `list_projects` | List all previously indexed repositories. |
|
|
152
|
+
|
|
153
|
+
## Configuration
|
|
154
|
+
|
|
155
|
+
The server uses sensible defaults but you can customize via environment variables:
|
|
156
|
+
|
|
157
|
+
| Variable | Default | Description |
|
|
158
|
+
|----------|---------|-------------|
|
|
159
|
+
| `CODEBASE_QA_CHROMA_PATH` | `~/.local/share/codebase-qa-mcp/chroma_db` | Where ChromaDB stores its data |
|
|
160
|
+
|
|
161
|
+
Built-in defaults:
|
|
162
|
+
- **Chunk size:** 500 characters with 50-character overlap
|
|
163
|
+
- **Embedding model:** `all-MiniLM-L6-v2` (384-dimensional, runs locally)
|
|
164
|
+
- **Top-K results:** 5
|
|
165
|
+
|
|
166
|
+
## Project Structure
|
|
167
|
+
|
|
168
|
+
```
|
|
169
|
+
codebase-qa-mcp/
|
|
170
|
+
├── pyproject.toml # Package config, deps, entry point
|
|
171
|
+
├── Dockerfile # Container build for agentregistry
|
|
172
|
+
└── src/codebase_qa_mcp/
|
|
173
|
+
├── __init__.py
|
|
174
|
+
├── config.py # Constants (chunk size, model, extensions)
|
|
175
|
+
├── indexer.py # Load → chunk → embed → store + incremental updates
|
|
176
|
+
├── retriever.py # Query, list files, stats, list projects
|
|
177
|
+
└── server.py # FastMCP server with 7 tools
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## How RAG Works Here
|
|
181
|
+
|
|
182
|
+
**Retrieval-Augmented Generation (RAG)** grounds AI responses in actual source code rather than relying on the model's training data:
|
|
183
|
+
|
|
184
|
+
1. **Chunking** — Source files are split into ~500-character chunks using `RecursiveCharacterTextSplitter` from LangChain, which respects code boundaries.
|
|
185
|
+
2. **Embedding** — Each chunk is converted to a 384-dimensional vector using `all-MiniLM-L6-v2`, a fast sentence-transformer model that runs entirely on your machine.
|
|
186
|
+
3. **Storage** — Vectors are stored in ChromaDB with metadata (file path, chunk index, language). The database persists to disk so indexes survive restarts.
|
|
187
|
+
4. **Retrieval** — When you ask a question, it's embedded with the same model and compared against stored chunks via cosine similarity. The top-K most relevant chunks are returned.
|
|
188
|
+
5. **Generation** — The AI client (Claude, etc.) receives the relevant code chunks and uses them to generate an accurate, grounded answer.
|
|
189
|
+
|
|
190
|
+
## License
|
|
191
|
+
|
|
192
|
+
MIT
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
# Codebase Q&A MCP Server
|
|
2
|
+
|
|
3
|
+
An MCP server that lets AI assistants answer questions about any codebase using RAG (Retrieval-Augmented Generation). Index any git repository locally, then ask natural-language questions — answers are grounded in actual source code.
|
|
4
|
+
|
|
5
|
+
Built with [FastMCP](https://github.com/jlowin/fastmcp), [ChromaDB](https://www.trychroma.com/), and [sentence-transformers](https://www.sbert.net/).
|
|
6
|
+
|
|
7
|
+
## How It Works
|
|
8
|
+
|
|
9
|
+
```
|
|
10
|
+
┌─────────────┐ ┌──────────────────────────────────────────────┐
|
|
11
|
+
│ AI Client │ │ Codebase Q&A MCP Server │
|
|
12
|
+
│ (Claude Code,│ MCP │ │
|
|
13
|
+
│ Claude │◄───►│ ┌─────────┐ ┌──────────┐ ┌───────────┐ │
|
|
14
|
+
│ Desktop, │ │ │ Load & │─►│ Embed │─►│ ChromaDB │ │
|
|
15
|
+
│ Cursor) │ │ │ Chunk │ │(MiniLM) │ │ (persist) │ │
|
|
16
|
+
│ │ │ └─────────┘ └──────────┘ └───────────┘ │
|
|
17
|
+
└─────────────┘ └──────────────────────────────────────────────┘
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
1. **Index** — Point the server at a git repo. It reads all source files, splits them into chunks, generates embeddings with `all-MiniLM-L6-v2`, and stores them in a persistent ChromaDB instance.
|
|
21
|
+
2. **Query** — Ask a natural-language question. The server finds the most relevant code chunks via semantic similarity and returns them with file paths and metadata.
|
|
22
|
+
3. **Update** — After new commits, run an incremental update that only re-indexes changed files (using `git diff`).
|
|
23
|
+
|
|
24
|
+
## Features
|
|
25
|
+
|
|
26
|
+
- **7 MCP tools** for full codebase Q&A workflow
|
|
27
|
+
- **Local embeddings** — no API keys needed (runs `all-MiniLM-L6-v2` on your machine)
|
|
28
|
+
- **Persistent storage** — ChromaDB persists indexes across sessions
|
|
29
|
+
- **Incremental updates** — only re-index files that changed since last commit
|
|
30
|
+
- **Multi-project support** — index multiple repos and switch between them
|
|
31
|
+
- **File filtering** — narrow queries to specific paths (e.g. only `auth`-related files)
|
|
32
|
+
- **25+ file types** supported (Python, JS/TS, Go, Rust, Java, C/C++, Markdown, YAML, SQL, and more)
|
|
33
|
+
|
|
34
|
+
## Installation
|
|
35
|
+
|
|
36
|
+
### Option 1: uvx (recommended)
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
uvx codebase-qa-mcp
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### Option 2: pip
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install codebase-qa-mcp
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Option 3: From source
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
git clone https://github.com/gokul-viswanathan/codebase-qa-mcp.git
|
|
52
|
+
cd codebase-qa-mcp
|
|
53
|
+
pip install -e .
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Option 4: agentregistry
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
# Install arctl
|
|
60
|
+
curl -fsSL https://raw.githubusercontent.com/agentregistry-dev/agentregistry/main/scripts/get-arctl | bash
|
|
61
|
+
|
|
62
|
+
# Deploy the server
|
|
63
|
+
arctl deploy codebase-qa-mcp
|
|
64
|
+
|
|
65
|
+
# Auto-configure your IDE
|
|
66
|
+
arctl configure claude-desktop
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
## Quick Start
|
|
70
|
+
|
|
71
|
+
### 1. Add to Claude Code
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
claude mcp add codebase-qa-mcp -- codebase-qa-mcp
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Or if running from source:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
claude mcp add codebase-qa-mcp -- /path/to/codebase-qa-mcp/.venv/bin/python -m codebase_qa_mcp.server
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### 2. Add to Claude Desktop
|
|
84
|
+
|
|
85
|
+
Add this to your Claude Desktop config (`~/.config/claude-desktop/config.json`):
|
|
86
|
+
|
|
87
|
+
```json
|
|
88
|
+
{
|
|
89
|
+
"mcpServers": {
|
|
90
|
+
"codebase-qa-mcp": {
|
|
91
|
+
"command": "codebase-qa-mcp",
|
|
92
|
+
"args": []
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Or from source:
|
|
99
|
+
|
|
100
|
+
```json
|
|
101
|
+
{
|
|
102
|
+
"mcpServers": {
|
|
103
|
+
"codebase-qa-mcp": {
|
|
104
|
+
"command": "/path/to/codebase-qa-mcp/.venv/bin/python",
|
|
105
|
+
"args": ["-m", "codebase_qa_mcp.server"]
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### 3. Use it
|
|
112
|
+
|
|
113
|
+
Once connected, your AI assistant can use these tools:
|
|
114
|
+
|
|
115
|
+
```
|
|
116
|
+
You: "Index the repo at /home/user/my-project"
|
|
117
|
+
AI: → calls index_repository("/home/user/my-project")
|
|
118
|
+
✓ Indexed 42 files → 380 chunks
|
|
119
|
+
|
|
120
|
+
You: "How does authentication work in this codebase?"
|
|
121
|
+
AI: → calls query_codebase("how does authentication work")
|
|
122
|
+
Returns relevant code chunks from auth-related files
|
|
123
|
+
|
|
124
|
+
You: "What files handle database migrations?"
|
|
125
|
+
AI: → calls query_codebase("database migrations", file_filter="migrations")
|
|
126
|
+
Returns chunks specifically from migration files
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## MCP Tools Reference
|
|
130
|
+
|
|
131
|
+
| Tool | Description |
|
|
132
|
+
|------|-------------|
|
|
133
|
+
| `index_repository` | Full-index a git repo into ChromaDB. Reads all source files, chunks, embeds, and stores them. |
|
|
134
|
+
| `update_index` | Incrementally update using `git diff`. Only re-indexes changed files. |
|
|
135
|
+
| `query_codebase` | Semantic search for code chunks relevant to a natural-language question. |
|
|
136
|
+
| `switch_project` | Switch to a previously indexed repo without re-indexing. |
|
|
137
|
+
| `list_indexed_files` | List all file paths in the current project's index. |
|
|
138
|
+
| `get_index_stats` | Get stats: total chunks, total files, last indexed commit. |
|
|
139
|
+
| `list_projects` | List all previously indexed repositories. |
|
|
140
|
+
|
|
141
|
+
## Configuration
|
|
142
|
+
|
|
143
|
+
The server uses sensible defaults but you can customize via environment variables:
|
|
144
|
+
|
|
145
|
+
| Variable | Default | Description |
|
|
146
|
+
|----------|---------|-------------|
|
|
147
|
+
| `CODEBASE_QA_CHROMA_PATH` | `~/.local/share/codebase-qa-mcp/chroma_db` | Where ChromaDB stores its data |
|
|
148
|
+
|
|
149
|
+
Built-in defaults:
|
|
150
|
+
- **Chunk size:** 500 characters with 50-character overlap
|
|
151
|
+
- **Embedding model:** `all-MiniLM-L6-v2` (384-dimensional, runs locally)
|
|
152
|
+
- **Top-K results:** 5
|
|
153
|
+
|
|
154
|
+
## Project Structure
|
|
155
|
+
|
|
156
|
+
```
|
|
157
|
+
codebase-qa-mcp/
|
|
158
|
+
├── pyproject.toml # Package config, deps, entry point
|
|
159
|
+
├── Dockerfile # Container build for agentregistry
|
|
160
|
+
└── src/codebase_qa_mcp/
|
|
161
|
+
├── __init__.py
|
|
162
|
+
├── config.py # Constants (chunk size, model, extensions)
|
|
163
|
+
├── indexer.py # Load → chunk → embed → store + incremental updates
|
|
164
|
+
├── retriever.py # Query, list files, stats, list projects
|
|
165
|
+
└── server.py # FastMCP server with 7 tools
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## How RAG Works Here
|
|
169
|
+
|
|
170
|
+
**Retrieval-Augmented Generation (RAG)** grounds AI responses in actual source code rather than relying on the model's training data:
|
|
171
|
+
|
|
172
|
+
1. **Chunking** — Source files are split into ~500-character chunks using `RecursiveCharacterTextSplitter` from LangChain, which respects code boundaries.
|
|
173
|
+
2. **Embedding** — Each chunk is converted to a 384-dimensional vector using `all-MiniLM-L6-v2`, a fast sentence-transformer model that runs entirely on your machine.
|
|
174
|
+
3. **Storage** — Vectors are stored in ChromaDB with metadata (file path, chunk index, language). The database persists to disk so indexes survive restarts.
|
|
175
|
+
4. **Retrieval** — When you ask a question, it's embedded with the same model and compared against stored chunks via cosine similarity. The top-K most relevant chunks are returned.
|
|
176
|
+
5. **Generation** — The AI client (Claude, etc.) receives the relevant code chunks and uses them to generate an accurate, grounded answer.
|
|
177
|
+
|
|
178
|
+
## License
|
|
179
|
+
|
|
180
|
+
MIT
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "codebase-qa-mcp"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "MCP server for codebase Q&A powered by RAG — index any git repo and ask questions about the code"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
requires-python = ">=3.10"
|
|
8
|
+
dependencies = [
|
|
9
|
+
"mcp[cli]",
|
|
10
|
+
"chromadb",
|
|
11
|
+
"sentence-transformers",
|
|
12
|
+
"langchain-text-splitters",
|
|
13
|
+
]
|
|
14
|
+
|
|
15
|
+
[project.scripts]
|
|
16
|
+
codebase-qa-mcp = "codebase_qa_mcp.server:main"
|
|
17
|
+
|
|
18
|
+
[build-system]
|
|
19
|
+
requires = ["hatchling"]
|
|
20
|
+
build-backend = "hatchling.build"
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Configuration constants for the Codebase Q&A MCP server."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
# Chunking
|
|
7
|
+
CHUNK_SIZE = 500
|
|
8
|
+
CHUNK_OVERLAP = 50
|
|
9
|
+
|
|
10
|
+
# Embedding model (runs locally, no API key needed)
|
|
11
|
+
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
|
|
12
|
+
|
|
13
|
+
# Retrieval
|
|
14
|
+
TOP_K = 5
|
|
15
|
+
|
|
16
|
+
# File types to index
|
|
17
|
+
SUPPORTED_EXTENSIONS = {
|
|
18
|
+
".py", ".js", ".ts", ".tsx", ".jsx",
|
|
19
|
+
".md", ".txt", ".rst",
|
|
20
|
+
".java", ".go", ".rs", ".c", ".cpp", ".h", ".hpp",
|
|
21
|
+
".yaml", ".yml", ".json", ".toml",
|
|
22
|
+
".sh", ".bash", ".zsh",
|
|
23
|
+
".html", ".css", ".scss",
|
|
24
|
+
".sql",
|
|
25
|
+
".dockerfile",
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
# Directories to skip during indexing
|
|
29
|
+
IGNORED_DIRS = {
|
|
30
|
+
".git", "node_modules", "__pycache__", ".venv", "venv",
|
|
31
|
+
"dist", "build", ".tox", ".mypy_cache", ".pytest_cache",
|
|
32
|
+
".next", ".nuxt", "target", "vendor", ".eggs", "*.egg-info",
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
# ChromaDB storage (user-level, not inside any repo)
|
|
36
|
+
CHROMA_DB_PATH = os.environ.get(
|
|
37
|
+
"CODEBASE_QA_CHROMA_PATH",
|
|
38
|
+
str(Path.home() / ".local" / "share" / "codebase-qa-mcp" / "chroma_db"),
|
|
39
|
+
)
|
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
"""Codebase ingestion: load files, chunk, embed, store in ChromaDB."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
import subprocess
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
import chromadb
|
|
10
|
+
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
|
|
11
|
+
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter
|
|
12
|
+
|
|
13
|
+
from . import config
|
|
14
|
+
|
|
15
|
+
# Map file extensions to language names
|
|
16
|
+
EXTENSION_TO_LANGUAGE = {
|
|
17
|
+
".py": "python", ".js": "javascript", ".ts": "typescript",
|
|
18
|
+
".tsx": "typescript", ".jsx": "javascript", ".java": "java",
|
|
19
|
+
".go": "go", ".rs": "rust", ".c": "c", ".cpp": "cpp",
|
|
20
|
+
".h": "c", ".hpp": "cpp", ".html": "html", ".css": "css",
|
|
21
|
+
".scss": "css", ".sql": "sql", ".sh": "bash", ".bash": "bash",
|
|
22
|
+
".zsh": "bash", ".md": "markdown", ".txt": "text", ".rst": "text",
|
|
23
|
+
".yaml": "yaml", ".yml": "yaml", ".json": "json", ".toml": "toml",
|
|
24
|
+
".dockerfile": "dockerfile",
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
# Map our language names to LangChain Language enums for code-aware splitting
|
|
28
|
+
_LANGUAGE_TO_LANGCHAIN = {
|
|
29
|
+
"python": Language.PYTHON,
|
|
30
|
+
"javascript": Language.JS,
|
|
31
|
+
"typescript": Language.TS,
|
|
32
|
+
"java": Language.JAVA,
|
|
33
|
+
"go": Language.GO,
|
|
34
|
+
"rust": Language.RUST,
|
|
35
|
+
"c": Language.C,
|
|
36
|
+
"cpp": Language.CPP,
|
|
37
|
+
"html": Language.HTML,
|
|
38
|
+
"markdown": Language.MARKDOWN,
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _get_collection_name(repo_path: str) -> str:
|
|
43
|
+
"""Derive a ChromaDB collection name from a repo path."""
|
|
44
|
+
name = Path(repo_path).resolve().name
|
|
45
|
+
# ChromaDB collection names: 3-63 chars, alphanumeric/underscores/hyphens
|
|
46
|
+
name = re.sub(r"[^a-zA-Z0-9_-]", "-", name).strip("-")
|
|
47
|
+
if len(name) < 3:
|
|
48
|
+
name = name + "-repo"
|
|
49
|
+
return name[:63]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _get_chroma_client() -> chromadb.ClientAPI:
|
|
53
|
+
"""Return a persistent ChromaDB client."""
|
|
54
|
+
db_path = config.CHROMA_DB_PATH
|
|
55
|
+
os.makedirs(db_path, exist_ok=True)
|
|
56
|
+
return chromadb.PersistentClient(path=db_path)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _get_embedding_function() -> SentenceTransformerEmbeddingFunction:
|
|
60
|
+
"""Return the embedding function used for all collections."""
|
|
61
|
+
return SentenceTransformerEmbeddingFunction(model_name=config.EMBEDDING_MODEL)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _get_state_file(collection_name: str) -> Path:
|
|
65
|
+
"""Path to the index state file for a collection."""
|
|
66
|
+
return Path(config.CHROMA_DB_PATH) / f".state_{collection_name}.json"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _read_state(collection_name: str) -> dict:
|
|
70
|
+
"""Read the persisted index state (last commit SHA, repo path)."""
|
|
71
|
+
state_file = _get_state_file(collection_name)
|
|
72
|
+
if state_file.exists():
|
|
73
|
+
return json.loads(state_file.read_text())
|
|
74
|
+
return {}
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _write_state(collection_name: str, state: dict) -> None:
|
|
78
|
+
"""Persist index state."""
|
|
79
|
+
state_file = _get_state_file(collection_name)
|
|
80
|
+
state_file.write_text(json.dumps(state, indent=2))
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _get_head_sha(repo_path: str) -> str | None:
|
|
84
|
+
"""Get the current HEAD commit SHA of a git repo."""
|
|
85
|
+
try:
|
|
86
|
+
result = subprocess.run(
|
|
87
|
+
["git", "rev-parse", "HEAD"],
|
|
88
|
+
cwd=repo_path, capture_output=True, text=True, check=True,
|
|
89
|
+
)
|
|
90
|
+
return result.stdout.strip()
|
|
91
|
+
except (subprocess.CalledProcessError, FileNotFoundError):
|
|
92
|
+
return None
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _should_skip_dir(dir_name: str) -> bool:
|
|
96
|
+
"""Check if a directory should be skipped."""
|
|
97
|
+
return dir_name in config.IGNORED_DIRS or dir_name.endswith(".egg-info")
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def load_files(repo_path: str) -> list[dict]:
|
|
101
|
+
"""Walk a directory and load all supported files."""
|
|
102
|
+
repo = Path(repo_path).resolve()
|
|
103
|
+
files = []
|
|
104
|
+
|
|
105
|
+
for root, dirs, filenames in os.walk(repo):
|
|
106
|
+
# Prune ignored directories in-place
|
|
107
|
+
dirs[:] = [d for d in dirs if not _should_skip_dir(d)]
|
|
108
|
+
|
|
109
|
+
for filename in filenames:
|
|
110
|
+
filepath = Path(root) / filename
|
|
111
|
+
ext = filepath.suffix.lower()
|
|
112
|
+
|
|
113
|
+
if ext not in config.SUPPORTED_EXTENSIONS:
|
|
114
|
+
continue
|
|
115
|
+
|
|
116
|
+
try:
|
|
117
|
+
content = filepath.read_text(encoding="utf-8", errors="ignore")
|
|
118
|
+
except (OSError, PermissionError):
|
|
119
|
+
continue
|
|
120
|
+
|
|
121
|
+
if not content.strip():
|
|
122
|
+
continue
|
|
123
|
+
|
|
124
|
+
rel_path = str(filepath.relative_to(repo))
|
|
125
|
+
files.append({
|
|
126
|
+
"path": rel_path,
|
|
127
|
+
"content": content,
|
|
128
|
+
"language": EXTENSION_TO_LANGUAGE.get(ext, "text"),
|
|
129
|
+
})
|
|
130
|
+
|
|
131
|
+
return files
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def chunk_file(file_dict: dict) -> list[dict]:
|
|
135
|
+
"""Split a file's content into chunks with metadata.
|
|
136
|
+
|
|
137
|
+
Uses language-aware splitting when possible (splits on function/class
|
|
138
|
+
boundaries) and falls back to generic recursive splitting for unsupported
|
|
139
|
+
languages.
|
|
140
|
+
"""
|
|
141
|
+
lang_enum = _LANGUAGE_TO_LANGCHAIN.get(file_dict["language"])
|
|
142
|
+
|
|
143
|
+
if lang_enum is not None:
|
|
144
|
+
splitter = RecursiveCharacterTextSplitter.from_language(
|
|
145
|
+
language=lang_enum,
|
|
146
|
+
chunk_size=config.CHUNK_SIZE,
|
|
147
|
+
chunk_overlap=config.CHUNK_OVERLAP,
|
|
148
|
+
)
|
|
149
|
+
else:
|
|
150
|
+
splitter = RecursiveCharacterTextSplitter(
|
|
151
|
+
chunk_size=config.CHUNK_SIZE,
|
|
152
|
+
chunk_overlap=config.CHUNK_OVERLAP,
|
|
153
|
+
length_function=len,
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
texts = splitter.split_text(file_dict["content"])
|
|
157
|
+
|
|
158
|
+
return [
|
|
159
|
+
{
|
|
160
|
+
"text": text,
|
|
161
|
+
"metadata": {
|
|
162
|
+
"file_path": file_dict["path"],
|
|
163
|
+
"chunk_index": i,
|
|
164
|
+
"language": file_dict["language"],
|
|
165
|
+
},
|
|
166
|
+
}
|
|
167
|
+
for i, text in enumerate(texts)
|
|
168
|
+
]
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def index_repository(repo_path: str) -> dict:
|
|
172
|
+
"""Full-index a codebase into ChromaDB. Replaces any existing index."""
|
|
173
|
+
repo_path = str(Path(repo_path).resolve())
|
|
174
|
+
collection_name = _get_collection_name(repo_path)
|
|
175
|
+
client = _get_chroma_client()
|
|
176
|
+
embed_fn = _get_embedding_function()
|
|
177
|
+
|
|
178
|
+
# Delete existing collection if it exists, then create fresh
|
|
179
|
+
try:
|
|
180
|
+
client.delete_collection(collection_name)
|
|
181
|
+
except (ValueError, Exception):
|
|
182
|
+
pass # Collection doesn't exist yet, that's fine
|
|
183
|
+
collection = client.get_or_create_collection(
|
|
184
|
+
name=collection_name,
|
|
185
|
+
embedding_function=embed_fn,
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
# Load and chunk all files
|
|
189
|
+
files = load_files(repo_path)
|
|
190
|
+
all_chunks = []
|
|
191
|
+
for f in files:
|
|
192
|
+
all_chunks.extend(chunk_file(f))
|
|
193
|
+
|
|
194
|
+
if not all_chunks:
|
|
195
|
+
return {
|
|
196
|
+
"collection": collection_name,
|
|
197
|
+
"repo_path": repo_path,
|
|
198
|
+
"files_indexed": 0,
|
|
199
|
+
"chunks_created": 0,
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
# Add chunks to ChromaDB in batches (ChromaDB limit is ~41666 per add)
|
|
203
|
+
batch_size = 5000
|
|
204
|
+
for i in range(0, len(all_chunks), batch_size):
|
|
205
|
+
batch = all_chunks[i:i + batch_size]
|
|
206
|
+
collection.add(
|
|
207
|
+
ids=[f"{c['metadata']['file_path']}::{c['metadata']['chunk_index']}" for c in batch],
|
|
208
|
+
documents=[c["text"] for c in batch],
|
|
209
|
+
metadatas=[c["metadata"] for c in batch],
|
|
210
|
+
)
|
|
211
|
+
|
|
212
|
+
# Save state
|
|
213
|
+
head_sha = _get_head_sha(repo_path)
|
|
214
|
+
_write_state(collection_name, {
|
|
215
|
+
"repo_path": repo_path,
|
|
216
|
+
"last_commit_sha": head_sha,
|
|
217
|
+
"collection_name": collection_name,
|
|
218
|
+
})
|
|
219
|
+
|
|
220
|
+
return {
|
|
221
|
+
"collection": collection_name,
|
|
222
|
+
"repo_path": repo_path,
|
|
223
|
+
"files_indexed": len(files),
|
|
224
|
+
"chunks_created": len(all_chunks),
|
|
225
|
+
"commit_sha": head_sha,
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def get_changed_files(repo_path: str, last_sha: str) -> dict:
|
|
230
|
+
"""Get files changed between last indexed commit and HEAD."""
|
|
231
|
+
try:
|
|
232
|
+
result = subprocess.run(
|
|
233
|
+
["git", "diff", "--name-status", f"{last_sha}..HEAD"],
|
|
234
|
+
cwd=repo_path, capture_output=True, text=True, check=True,
|
|
235
|
+
)
|
|
236
|
+
except (subprocess.CalledProcessError, FileNotFoundError):
|
|
237
|
+
return {"added": [], "modified": [], "deleted": []}
|
|
238
|
+
|
|
239
|
+
changes = {"added": [], "modified": [], "deleted": []}
|
|
240
|
+
for line in result.stdout.strip().splitlines():
|
|
241
|
+
if not line:
|
|
242
|
+
continue
|
|
243
|
+
parts = line.split("\t", 1)
|
|
244
|
+
if len(parts) != 2:
|
|
245
|
+
continue
|
|
246
|
+
status, filepath = parts[0], parts[1]
|
|
247
|
+
|
|
248
|
+
if status.startswith("A"):
|
|
249
|
+
changes["added"].append(filepath)
|
|
250
|
+
elif status.startswith("M"):
|
|
251
|
+
changes["modified"].append(filepath)
|
|
252
|
+
elif status.startswith("D"):
|
|
253
|
+
changes["deleted"].append(filepath)
|
|
254
|
+
elif status.startswith("R"):
|
|
255
|
+
# Renamed: old\tnew
|
|
256
|
+
rename_parts = filepath.split("\t")
|
|
257
|
+
changes["deleted"].append(rename_parts[0])
|
|
258
|
+
if len(rename_parts) > 1:
|
|
259
|
+
changes["added"].append(rename_parts[1])
|
|
260
|
+
|
|
261
|
+
return changes
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def update_index(repo_path: str) -> dict:
    """Bring the index up to date with HEAD, re-indexing only changed files.

    Falls back to a full index when no prior state exists, and
    short-circuits when HEAD matches the last indexed commit.

    Args:
        repo_path: Absolute path to the git repository.

    Returns:
        Summary dict with per-category file counts, chunk counts, and the
        new commit SHA (or a "already_up_to_date" status).
    """
    repo_path = str(Path(repo_path).resolve())
    collection_name = _get_collection_name(repo_path)
    state = _read_state(collection_name)

    # Without a recorded commit there is nothing to diff against.
    if not state or not state.get("last_commit_sha"):
        return index_repository(repo_path)

    last_sha = state["last_commit_sha"]
    head_sha = _get_head_sha(repo_path)

    if head_sha == last_sha:
        return {
            "collection": collection_name,
            "status": "already_up_to_date",
            "commit_sha": head_sha,
        }

    changes = get_changed_files(repo_path, last_sha)
    stale_paths = changes["deleted"] + changes["modified"]
    fresh_paths = changes["added"] + changes["modified"]

    client = _get_chroma_client()
    embed_fn = _get_embedding_function()
    collection = client.get_or_create_collection(
        name=collection_name,
        embedding_function=embed_fn,
    )

    # Phase 1: drop every chunk belonging to a deleted or modified file.
    chunks_removed = 0
    for rel_path in stale_paths:
        stale = collection.get(where={"file_path": rel_path})
        stale_ids = stale["ids"]
        if stale_ids:
            collection.delete(ids=stale_ids)
            chunks_removed += len(stale_ids)

    # Phase 2: re-chunk and insert every added or modified file that is
    # still present and indexable.
    repo = Path(repo_path).resolve()
    chunks_added = 0
    for rel_path in fresh_paths:
        full_path = repo / rel_path
        if not full_path.exists():
            continue  # removed again since the diffed commit

        ext = full_path.suffix.lower()
        if ext not in config.SUPPORTED_EXTENSIONS:
            continue

        try:
            content = full_path.read_text(encoding="utf-8", errors="ignore")
        except (OSError, PermissionError):
            continue

        if not content.strip():
            continue

        chunks = chunk_file({
            "path": rel_path,
            "content": content,
            "language": EXTENSION_TO_LANGUAGE.get(ext, "text"),
        })
        if not chunks:
            continue

        collection.add(
            ids=[f"{c['metadata']['file_path']}::{c['metadata']['chunk_index']}" for c in chunks],
            documents=[c["text"] for c in chunks],
            metadatas=[c["metadata"] for c in chunks],
        )
        chunks_added += len(chunks)

    # Persist the new HEAD so the next update diffs from this point.
    _write_state(collection_name, {
        "repo_path": repo_path,
        "last_commit_sha": head_sha,
        "collection_name": collection_name,
    })

    return {
        "collection": collection_name,
        "files_updated": len(changes["modified"]),
        "files_added": len(changes["added"]),
        "files_deleted": len(changes["deleted"]),
        "chunks_removed": chunks_removed,
        "chunks_added": chunks_added,
        "commit_sha": head_sha,
    }
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Query engine: search ChromaDB for relevant code chunks."""
|
|
2
|
+
|
|
3
|
+
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
|
|
4
|
+
|
|
5
|
+
from . import config
|
|
6
|
+
from .indexer import _get_chroma_client, _get_collection_name, _get_embedding_function, _read_state
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def query_codebase(
    repo_path: str,
    question: str,
    top_k: int = config.TOP_K,
    file_filter: str | None = None,
) -> list[dict]:
    """Search the indexed codebase for chunks relevant to a question.

    Args:
        repo_path: Path of a previously indexed repository.
        question: Natural-language query to embed and match.
        top_k: Maximum number of chunks to return.
        file_filter: Optional substring; only chunks whose file path
            contains it are searched.

    Returns:
        List of dicts (content, file_path, chunk_index, language,
        distance) ordered best-match first. Empty when the repo is not
        indexed, the index is empty, or the filter matches no files.
    """
    collection_name = _get_collection_name(repo_path)
    client = _get_chroma_client()
    embed_fn = _get_embedding_function()

    try:
        collection = client.get_collection(
            name=collection_name,
            embedding_function=embed_fn,
        )
    except ValueError:
        # Collection does not exist — repo was never indexed.
        # NOTE(review): newer chromadb releases raise their own error types
        # here instead of ValueError — verify against the pinned version.
        return []

    total = collection.count()
    if total == 0:
        # Chroma rejects n_results=0, so bail out before querying.
        return []

    where_filter = None
    if file_filter:
        # Chroma metadata filters have no substring operator ($contains only
        # applies to where_document, i.e. document text), so resolve the
        # substring match ourselves and pass an exact $in over the matches.
        # Costs one full metadata scan; acceptable for typical repo sizes.
        all_meta = collection.get(include=["metadatas"])
        matching = sorted({
            m["file_path"]
            for m in all_meta["metadatas"]
            if "file_path" in m and file_filter in m["file_path"]
        })
        if not matching:
            return []
        where_filter = {"file_path": {"$in": matching}}

    results = collection.query(
        query_texts=[question],
        n_results=min(top_k, total),
        where=where_filter,
    )

    if not results["documents"] or not results["documents"][0]:
        return []

    chunks = []
    for i, doc in enumerate(results["documents"][0]):
        chunks.append({
            "content": doc,
            "file_path": results["metadatas"][0][i].get("file_path", "unknown"),
            "chunk_index": results["metadatas"][0][i].get("chunk_index", 0),
            "language": results["metadatas"][0][i].get("language", "text"),
            "distance": results["distances"][0][i] if results["distances"] else None,
        })

    return chunks
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def list_indexed_files(repo_path: str) -> list[str]:
    """Return the sorted unique file paths stored in the repo's index."""
    collection_name = _get_collection_name(repo_path)
    client = _get_chroma_client()
    embed_fn = _get_embedding_function()

    try:
        collection = client.get_collection(
            name=collection_name,
            embedding_function=embed_fn,
        )
    except ValueError:
        # Repo was never indexed — nothing to list.
        return []

    # Deduplicate via a set comprehension: many chunks share one file path.
    metadatas = collection.get(include=["metadatas"])["metadatas"]
    return sorted({m["file_path"] for m in metadatas if "file_path" in m})
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def get_index_stats(repo_path: str) -> dict:
    """Get statistics about the index for a repo.

    Args:
        repo_path: Path of the repository to inspect.

    Returns:
        On success: collection name, repo path, total chunk and file
        counts, and the last indexed commit SHA. If the repo was never
        indexed: {"collection": ..., "status": "not_indexed"}.
    """
    collection_name = _get_collection_name(repo_path)
    client = _get_chroma_client()
    embed_fn = _get_embedding_function()

    # _read_state can return None/falsy (update_index guards for this);
    # fall back to an empty dict so the .get() lookups below cannot crash
    # when the collection exists but its state file is missing.
    state = _read_state(collection_name) or {}

    try:
        collection = client.get_collection(
            name=collection_name,
            embedding_function=embed_fn,
        )
    except ValueError:
        return {
            "collection": collection_name,
            "status": "not_indexed",
        }

    all_items = collection.get(include=["metadatas"])
    file_paths = set()
    for meta in all_items["metadatas"]:
        if "file_path" in meta:
            file_paths.add(meta["file_path"])

    return {
        "collection": collection_name,
        "repo_path": state.get("repo_path", repo_path),
        "total_chunks": collection.count(),
        "total_files": len(file_paths),
        "last_indexed_commit": state.get("last_commit_sha"),
    }
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def list_projects() -> list[dict]:
    """List all indexed projects.

    Returns:
        One dict per ChromaDB collection with its name, the repo path it
        was built from ("unknown" when state is missing), and the last
        indexed commit SHA (None when unknown).
    """
    client = _get_chroma_client()
    collections = client.list_collections()

    projects = []
    for col in collections:
        # _read_state can return None (e.g. state file deleted while the
        # collection survives); guard so .get() cannot raise.
        state = _read_state(col.name) or {}
        projects.append({
            "collection": col.name,
            "repo_path": state.get("repo_path", "unknown"),
            "last_commit": state.get("last_commit_sha"),
        })

    return projects
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""MCP server exposing codebase Q&A tools."""
|
|
2
|
+
|
|
3
|
+
from mcp.server.fastmcp import FastMCP
|
|
4
|
+
|
|
5
|
+
from . import indexer, retriever
|
|
6
|
+
|
|
7
|
+
mcp = FastMCP("Codebase Q&A")
|
|
8
|
+
|
|
9
|
+
# Track the active project so queries know which collection to search
|
|
10
|
+
_active_repo_path: str | None = None
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@mcp.tool()
def index_repository(repo_path: str) -> dict:
    """Full-index a git repository into the vector store.

    Reads every supported source file, splits it into chunks, embeds
    them, and stores the result in ChromaDB, replacing any existing
    index for the same repo. Also makes this repo the active project
    for subsequent queries.

    Args:
        repo_path: Absolute path to the git repository to index.
    """
    global _active_repo_path
    # Activate the project up front so queries target it even while
    # (or after) indexing runs.
    _active_repo_path = repo_path
    result = indexer.index_repository(repo_path)
    return result
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@mcp.tool()
def update_index(repo_path: str) -> dict:
    """Incrementally update the index using git diff.

    Re-indexes only the files that changed since the last indexing run,
    which is much cheaper than a full rebuild on large repos. Performs a
    full index instead when the repo has never been indexed.

    Args:
        repo_path: Absolute path to the git repository.
    """
    global _active_repo_path
    # Mark this repo active before delegating, matching index_repository.
    _active_repo_path = repo_path
    result = indexer.update_index(repo_path)
    return result
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@mcp.tool()
def query_codebase(
    question: str, top_k: int = 5, file_filter: str | None = None
) -> list[dict]:
    """Search the indexed codebase for code chunks relevant to a question.

    Returns the most relevant source code chunks with their file paths,
    ordered by similarity. Use it to locate functionality, see how
    something is implemented, or gather context on a topic.

    Args:
        question: Natural language question about the codebase.
        top_k: Number of results to return (default 5).
        file_filter: Optional substring to filter by file path (e.g. "auth" to only search auth-related files).
    """
    if not _active_repo_path:
        # Guard clause: nothing to search until a project is indexed/activated.
        return [{"error": "No active project. Call index_repository or switch_project first."}]
    return retriever.query_codebase(_active_repo_path, question, top_k, file_filter)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@mcp.tool()
def switch_project(repo_path: str) -> dict:
    """Switch the active project without re-indexing.

    Use this to query a repo that was indexed earlier. The repo must
    already have been indexed with index_repository.

    Args:
        repo_path: Absolute path to a previously indexed repository.
    """
    global _active_repo_path
    index_stats = retriever.get_index_stats(repo_path)
    # Refuse to activate a repo with no index — queries would find nothing.
    if index_stats.get("status") == "not_indexed":
        return {
            "error": f"Repository not indexed. Run index_repository('{repo_path}') first."
        }
    _active_repo_path = repo_path
    return {"active_project": repo_path, **index_stats}
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@mcp.tool()
def list_indexed_files() -> list[str]:
    """List all file paths currently indexed in the active project.

    Useful for checking what the index contains and verifying that the
    expected files were captured.
    """
    if _active_repo_path:
        return retriever.list_indexed_files(_active_repo_path)
    # No project selected — surface the instruction in-band as the result.
    return ["No active project. Call index_repository or switch_project first."]
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
@mcp.tool()
def get_index_stats() -> dict:
    """Get statistics about the active project's index.

    Returns total chunks, total files, last indexed commit, etc.
    """
    if _active_repo_path:
        return retriever.get_index_stats(_active_repo_path)
    # No project selected — return the instruction as an error payload.
    return {
        "error": "No active project. Call index_repository or switch_project first."
    }
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
@mcp.tool()
def list_projects() -> list[dict]:
    """List all previously indexed projects.

    Shows all repos that have been indexed, with their paths
    and last indexed commit. Use switch_project to activate one.
    """
    # Read-only passthrough: does not touch or require _active_repo_path.
    return retriever.list_projects()
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def main():
    """Entry point for the MCP server.

    Runs the FastMCP server over stdio, the transport used by MCP
    clients that spawn this server as a subprocess.
    """
    mcp.run(transport="stdio")


if __name__ == "__main__":
    main()
|