yacodebase-mcp 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. yacodebase_mcp-0.1.0/.github/workflows/ci.yml +37 -0
  2. yacodebase_mcp-0.1.0/.github/workflows/publish.yml +41 -0
  3. yacodebase_mcp-0.1.0/.gitignore +6 -0
  4. yacodebase_mcp-0.1.0/AGENTS.md +1 -0
  5. yacodebase_mcp-0.1.0/CLAUDE.md +77 -0
  6. yacodebase_mcp-0.1.0/LICENSE +21 -0
  7. yacodebase_mcp-0.1.0/PKG-INFO +179 -0
  8. yacodebase_mcp-0.1.0/README.md +145 -0
  9. yacodebase_mcp-0.1.0/docs/superpowers/plans/2026-05-20-ast-chunking.md +613 -0
  10. yacodebase_mcp-0.1.0/docs/superpowers/plans/2026-05-20-codebase-search-mcp.md +1324 -0
  11. yacodebase_mcp-0.1.0/docs/superpowers/plans/2026-05-20-global-embedding-config.md +962 -0
  12. yacodebase_mcp-0.1.0/docs/superpowers/specs/2026-05-20-ast-chunking-design.md +152 -0
  13. yacodebase_mcp-0.1.0/docs/superpowers/specs/2026-05-20-codebase-search-mcp-design.md +200 -0
  14. yacodebase_mcp-0.1.0/docs/superpowers/specs/2026-05-20-global-embedding-config-design.md +165 -0
  15. yacodebase_mcp-0.1.0/pyproject.toml +64 -0
  16. yacodebase_mcp-0.1.0/src/codebase_mcp/__init__.py +0 -0
  17. yacodebase_mcp-0.1.0/src/codebase_mcp/ast_chunker.py +119 -0
  18. yacodebase_mcp-0.1.0/src/codebase_mcp/cli.py +168 -0
  19. yacodebase_mcp-0.1.0/src/codebase_mcp/indexer.py +156 -0
  20. yacodebase_mcp-0.1.0/src/codebase_mcp/searcher.py +84 -0
  21. yacodebase_mcp-0.1.0/src/codebase_mcp/server.py +30 -0
  22. yacodebase_mcp-0.1.0/src/codebase_mcp/settings.py +57 -0
  23. yacodebase_mcp-0.1.0/src/codebase_mcp/store.py +78 -0
  24. yacodebase_mcp-0.1.0/tests/__init__.py +0 -0
  25. yacodebase_mcp-0.1.0/tests/conftest.py +20 -0
  26. yacodebase_mcp-0.1.0/tests/test_ast_chunker.py +218 -0
  27. yacodebase_mcp-0.1.0/tests/test_cli.py +195 -0
  28. yacodebase_mcp-0.1.0/tests/test_indexer.py +179 -0
  29. yacodebase_mcp-0.1.0/tests/test_integration.py +73 -0
  30. yacodebase_mcp-0.1.0/tests/test_searcher.py +143 -0
  31. yacodebase_mcp-0.1.0/tests/test_settings.py +67 -0
  32. yacodebase_mcp-0.1.0/tests/test_store.py +74 -0
  33. yacodebase_mcp-0.1.0/uv.lock +2033 -0
@@ -0,0 +1,37 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.11", "3.12"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Install uv
20
+ uses: astral-sh/setup-uv@v5
21
+ with:
22
+ enable-cache: true
23
+
24
+ - name: Set up Python ${{ matrix.python-version }}
25
+ run: uv python install ${{ matrix.python-version }}
26
+
27
+ - name: Install dependencies
28
+ run: uv sync --dev
29
+
30
+ - name: Lint
31
+ run: uv run ruff check .
32
+
33
+ - name: Format check
34
+ run: uv run ruff format --check .
35
+
36
+ - name: Run tests
37
+ run: uv run pytest -v
@@ -0,0 +1,41 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ jobs:
8
+ build:
9
+ runs-on: ubuntu-latest
10
+ steps:
11
+ - uses: actions/checkout@v4
12
+
13
+ - name: Install uv
14
+ uses: astral-sh/setup-uv@v5
15
+ with:
16
+ enable-cache: true
17
+
18
+ - name: Build package
19
+ run: uv build
20
+
21
+ - name: Upload dist
22
+ uses: actions/upload-artifact@v4
23
+ with:
24
+ name: dist
25
+ path: dist/
26
+
27
+ publish:
28
+ needs: build
29
+ runs-on: ubuntu-latest
30
+ environment: pypi
31
+ permissions:
32
+ id-token: write # required for Trusted Publishing (OIDC)
33
+ steps:
34
+ - name: Download dist
35
+ uses: actions/download-artifact@v4
36
+ with:
37
+ name: dist
38
+ path: dist/
39
+
40
+ - name: Publish to PyPI
41
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,6 @@
1
+ __pycache__/
2
+ *.pyc
3
+ .venv/
4
+ dist/
5
+ *.egg-info/
6
+ .pytest_cache/
@@ -0,0 +1 @@
1
+ CLAUDE.md
@@ -0,0 +1,77 @@
1
+ # codebase-mcp — dev guide
2
+
3
+ ## Setup
4
+
5
+ ```bash
6
+ uv sync
7
+ ```
8
+
9
+ All commands use `.venv/bin/` prefix or `uv run`.
10
+
11
+ ## Run tests
12
+
13
+ ```bash
14
+ .venv/bin/pytest
15
+ .venv/bin/pytest -v
16
+ .venv/bin/pytest tests/test_ast_chunker.py -v # specific module
17
+ ```
18
+
19
+ ## Lint / format
20
+
21
+ ```bash
22
+ .venv/bin/ruff check src tests
23
+ .venv/bin/ruff format src tests
24
+ ```
25
+
26
+ Line length: 100. Rules: E, F, I (pycodestyle, pyflakes, isort).
27
+
28
+ ## Project structure
29
+
30
+ ```
31
+ src/codebase_mcp/
32
+ cli.py # Click CLI: index, reindex, list, remove, serve, config *
33
+ server.py # FastMCP server — exposes search_codebase + list_indexed_repos
34
+ indexer.py # File walking, chunking, embedding, Qdrant upsert
35
+ ast_chunker.py # tree-sitter AST chunking (function/method boundaries)
36
+ searcher.py # Query embedding + Qdrant search + result formatting
37
+ store.py # Qdrant client, config.json r/w, repo metadata
38
+ settings.py # settings.json r/w (embedding model, vector size, api_key, api_base)
39
+ tests/
40
+ test_ast_chunker.py
41
+ test_cli.py
42
+ test_indexer.py
43
+ test_integration.py
44
+ test_searcher.py
45
+ test_settings.py
46
+ test_store.py
47
+ ```
48
+
49
+ ## Key design decisions
50
+
51
+ **AST chunking → line fallback**: `indexer.chunk_file` tries `ast_chunker.chunk_file_ast` first. If the language is unsupported or tree-sitter fails, it falls back to a 100-line sliding window (20-line overlap). This preserves semantic boundaries for supported languages without breaking on unknown file types.
52
+
53
+ **In-process Qdrant**: No external service needed. `store.get_client()` returns a `QdrantClient` pointing at `~/.codebase-mcp/qdrant/`. One collection per repo, named by `repo_id` (hash of abs path).
54
+
55
+ **OpenAI-compatible embeddings**: `indexer` and `searcher` both instantiate `openai.OpenAI(api_key=..., base_url=...)` from settings. Any OpenAI-compatible provider works by setting `api_base`.
56
+
57
+ **Vector size mismatch detection**: `searcher.search` reads the actual vector dimension from Qdrant and skips repos where it doesn't match the current model's `vector_size`. It emits a warning prompting a reindex.
58
+
59
+ **MAX_CHUNK_CHARS = 16000**: Both `ast_chunker` and `indexer` truncate chunk text at 16k chars (~8000 tokens at 2 chars/token for dense code, which stays just under an 8192-token embedding input limit). Applied post-collection in `indexer.index_repo` as a final safety net.
60
+
61
+ **Config precedence**: `settings.json` fields override env vars. `api_key=None` in settings → falls back to `OPENAI_API_KEY` env var (handled by OpenAI SDK). `api_base=None` → uses OpenAI default.
62
+
63
+ ## Adding a new language
64
+
65
+ 1. Add tree-sitter grammar to `pyproject.toml` dependencies.
66
+ 2. Add entry to `ast_chunker.EXT_TO_LANG` mapping extension → language name.
67
+ 3. Add entry to `ast_chunker.SEMANTIC_NODES` mapping language → relevant node type set.
68
+ 4. Add parser instantiation branch in `ast_chunker._get_parser`.
69
+ 5. Add extension to `indexer.INDEXED_EXTENSIONS`.
70
+
71
+ ## Data dir
72
+
73
+ `~/.codebase-mcp/` — created on first use by `store._data_dir()`.
74
+
75
+ - `config.json` — repo registry: `{abs_path: {repo_id, chunk_count, last_indexed}}`
76
+ - `settings.json` — persistent settings (only non-null, non-default values written)
77
+ - `qdrant/` — Qdrant on-disk storage
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 gzamboni
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,179 @@
1
+ Metadata-Version: 2.4
2
+ Name: yacodebase-mcp
3
+ Version: 0.1.0
4
+ Summary: CLI + MCP server that indexes local codebases into Qdrant using OpenAI-compatible embeddings, then exposes semantic search to Claude Code and other MCP clients
5
+ Project-URL: Homepage, https://github.com/gzamboni/codebase-mcp
6
+ Project-URL: Repository, https://github.com/gzamboni/codebase-mcp
7
+ Project-URL: Issues, https://github.com/gzamboni/codebase-mcp/issues
8
+ Author-email: gzamboni <gzamboni@gmail.com>
9
+ License: MIT
10
+ License-File: LICENSE
11
+ Keywords: claude,codebase,embeddings,mcp,qdrant,semantic-search
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
19
+ Requires-Python: >=3.11
20
+ Requires-Dist: click>=8.0
21
+ Requires-Dist: fastmcp>=2.0
22
+ Requires-Dist: openai>=1.0
23
+ Requires-Dist: qdrant-client>=1.9
24
+ Requires-Dist: rich>=13.0
25
+ Requires-Dist: tree-sitter-go>=0.23
26
+ Requires-Dist: tree-sitter-hcl>=0.23
27
+ Requires-Dist: tree-sitter-java>=0.23
28
+ Requires-Dist: tree-sitter-javascript>=0.23
29
+ Requires-Dist: tree-sitter-python>=0.23
30
+ Requires-Dist: tree-sitter-rust>=0.23
31
+ Requires-Dist: tree-sitter-typescript>=0.23
32
+ Requires-Dist: tree-sitter>=0.23
33
+ Description-Content-Type: text/markdown
34
+
35
+ # codebase-mcp
36
+
37
+ Vector search MCP server for codebases. Index repos locally with AST-aware chunking; let Claude (or any MCP client) search them via semantic similarity.
38
+
39
+ ## How it works
40
+
41
+ 1. **Index** — walks repo files, chunks them using tree-sitter AST (function/method boundaries) with line-based fallback, embeds via OpenAI-compatible API, stores in in-process Qdrant.
42
+ 2. **Serve** — exposes two MCP tools (`search_codebase`, `list_indexed_repos`) over stdio.
43
+ 3. **Search** — embeds the query, retrieves top-8 chunks across all indexed repos (or a specific one), returns ranked results with file path and line numbers.
44
+
45
+ ## Install
46
+
47
+ ```bash
48
+ uv tool install /path/to/codebase-mcp
49
+ ```
50
+
51
+ Or for development:
52
+
53
+ ```bash
54
+ uv sync
55
+ ```
56
+
57
+ ## CLI
58
+
59
+ ```bash
60
+ # Index a repo (fails if already indexed)
61
+ codebase-mcp index ~/Code/myproject
62
+
63
+ # Re-index after changes (replaces existing index)
64
+ codebase-mcp reindex ~/Code/myproject
65
+
66
+ # List indexed repos with chunk counts
67
+ codebase-mcp list
68
+
69
+ # Remove a repo from the index
70
+ codebase-mcp remove ~/Code/myproject
71
+
72
+ # Start MCP server (stdio, used by Claude Code)
73
+ codebase-mcp serve
74
+ ```
75
+
76
+ ### Config commands
77
+
78
+ ```bash
79
+ # Show current settings
80
+ codebase-mcp config list
81
+
82
+ # Set embedding model (known models auto-resolve vector size)
83
+ codebase-mcp config set embedding-model text-embedding-3-large
84
+ codebase-mcp config set embedding-model my-custom-model --vector-size 768
85
+
86
+ # Set API credentials
87
+ codebase-mcp config set api-key sk-...
88
+ codebase-mcp config set api-base https://my-provider.com/v1
89
+
90
+ # Revert a setting to default / env var fallback
91
+ codebase-mcp config unset embedding-model
92
+ codebase-mcp config unset api-key
93
+ codebase-mcp config unset api-base
94
+ ```
95
+
96
+ **Known models** (vector size auto-detected):
97
+
98
+ | Model | Vector size |
99
+ |---|---|
100
+ | `text-embedding-3-small` | 1536 |
101
+ | `text-embedding-3-large` | 3072 |
102
+ | `text-embedding-ada-002` | 1536 |
103
+
104
+ Default: `text-embedding-3-small`.
105
+
106
+ ## Claude Code config
107
+
108
+ Add to `~/.claude.json` (user scope) or to a project-level `.mcp.json`:
109
+
110
+ ```json
111
+ {
112
+ "mcpServers": {
113
+ "codebase-search": {
114
+ "command": "codebase-mcp",
115
+ "args": ["serve"],
116
+ "env": { "OPENAI_API_KEY": "sk-..." }
117
+ }
118
+ }
119
+ }
120
+ ```
121
+
122
+ API key can also be set via `codebase-mcp config set api-key sk-...` (persisted in `~/.codebase-mcp/settings.json`), which takes precedence over the env var.
123
+
124
+ ## MCP tools
125
+
126
+ ### `search_codebase`
127
+
128
+ Search indexed repos for relevant code and docs.
129
+
130
+ | Parameter | Type | Description |
131
+ |---|---|---|
132
+ | `query` | string | Natural language description of what to find |
133
+ | `repo_path` | string (optional) | Absolute path to a specific repo; omit to search all |
134
+
135
+ Returns top-8 results ranked by similarity, each with file path, line range, score, and code block.
136
+
137
+ ### `list_indexed_repos`
138
+
139
+ List all indexed repos with chunk count and last indexed timestamp. No parameters.
140
+
141
+ ## Supported languages (AST chunking)
142
+
143
+ | Language | Extensions | Chunk boundary |
144
+ |---|---|---|
145
+ | Python | `.py` | `function_definition`, `decorated_definition` |
146
+ | TypeScript | `.ts` | `function_declaration`, `method_definition`, `arrow_function` |
147
+ | TSX | `.tsx` | same as TypeScript |
148
+ | JavaScript | `.js`, `.jsx` | `function_declaration`, `method_definition`, `arrow_function` |
149
+ | Go | `.go` | `function_declaration`, `method_declaration` |
150
+ | Rust | `.rs` | `function_item` |
151
+ | Java | `.java` | `method_declaration`, `constructor_declaration` |
152
+ | HCL/Terraform | `.tf` | `block` |
153
+
154
+ Files without AST support (`.md`, `.yaml`, `.toml`, `.json`, `.rb`, `.cpp`, `.c`, `.h`) fall back to a 100-line sliding window with a 20-line overlap.
155
+
156
+ ## Data storage
157
+
158
+ All data lives in `~/.codebase-mcp/`:
159
+
160
+ ```
161
+ ~/.codebase-mcp/
162
+ config.json # indexed repo metadata (paths, repo_ids, chunk counts, timestamps)
163
+ settings.json # embedding model, vector size, api_key, api_base
164
+ qdrant/ # Qdrant in-process storage (one collection per repo)
165
+ ```
166
+
167
+ Each repo gets a stable `repo_id` derived from its absolute path (used as the Qdrant collection name). Reindexing replaces the collection in place.
168
+
169
+ ## OpenAI-compatible providers
170
+
171
+ Set `api-base` to use any OpenAI-compatible embedding API (e.g. Ollama, vLLM, Azure):
172
+
173
+ ```bash
174
+ codebase-mcp config set api-base http://localhost:11434/v1
175
+ codebase-mcp config set api-key ollama
176
+ codebase-mcp config set embedding-model nomic-embed-text --vector-size 768
177
+ ```
178
+
179
+ After changing the model, reindex all repos (vector dimensions must match).
@@ -0,0 +1,145 @@
1
+ # codebase-mcp
2
+
3
+ Vector search MCP server for codebases. Index repos locally with AST-aware chunking; let Claude (or any MCP client) search them via semantic similarity.
4
+
5
+ ## How it works
6
+
7
+ 1. **Index** — walks repo files, chunks them using tree-sitter AST (function/method boundaries) with line-based fallback, embeds via OpenAI-compatible API, stores in in-process Qdrant.
8
+ 2. **Serve** — exposes two MCP tools (`search_codebase`, `list_indexed_repos`) over stdio.
9
+ 3. **Search** — embeds the query, retrieves top-8 chunks across all indexed repos (or a specific one), returns ranked results with file path and line numbers.
10
+
11
+ ## Install
12
+
13
+ ```bash
14
+ uv tool install /path/to/codebase-mcp
15
+ ```
16
+
17
+ Or for development:
18
+
19
+ ```bash
20
+ uv sync
21
+ ```
22
+
23
+ ## CLI
24
+
25
+ ```bash
26
+ # Index a repo (fails if already indexed)
27
+ codebase-mcp index ~/Code/myproject
28
+
29
+ # Re-index after changes (replaces existing index)
30
+ codebase-mcp reindex ~/Code/myproject
31
+
32
+ # List indexed repos with chunk counts
33
+ codebase-mcp list
34
+
35
+ # Remove a repo from the index
36
+ codebase-mcp remove ~/Code/myproject
37
+
38
+ # Start MCP server (stdio, used by Claude Code)
39
+ codebase-mcp serve
40
+ ```
41
+
42
+ ### Config commands
43
+
44
+ ```bash
45
+ # Show current settings
46
+ codebase-mcp config list
47
+
48
+ # Set embedding model (known models auto-resolve vector size)
49
+ codebase-mcp config set embedding-model text-embedding-3-large
50
+ codebase-mcp config set embedding-model my-custom-model --vector-size 768
51
+
52
+ # Set API credentials
53
+ codebase-mcp config set api-key sk-...
54
+ codebase-mcp config set api-base https://my-provider.com/v1
55
+
56
+ # Revert a setting to default / env var fallback
57
+ codebase-mcp config unset embedding-model
58
+ codebase-mcp config unset api-key
59
+ codebase-mcp config unset api-base
60
+ ```
61
+
62
+ **Known models** (vector size auto-detected):
63
+
64
+ | Model | Vector size |
65
+ |---|---|
66
+ | `text-embedding-3-small` | 1536 |
67
+ | `text-embedding-3-large` | 3072 |
68
+ | `text-embedding-ada-002` | 1536 |
69
+
70
+ Default: `text-embedding-3-small`.
71
+
72
+ ## Claude Code config
73
+
74
+ Add to `~/.claude.json` (user scope) or to a project-level `.mcp.json`:
75
+
76
+ ```json
77
+ {
78
+ "mcpServers": {
79
+ "codebase-search": {
80
+ "command": "codebase-mcp",
81
+ "args": ["serve"],
82
+ "env": { "OPENAI_API_KEY": "sk-..." }
83
+ }
84
+ }
85
+ }
86
+ ```
87
+
88
+ API key can also be set via `codebase-mcp config set api-key sk-...` (persisted in `~/.codebase-mcp/settings.json`), which takes precedence over the env var.
89
+
90
+ ## MCP tools
91
+
92
+ ### `search_codebase`
93
+
94
+ Search indexed repos for relevant code and docs.
95
+
96
+ | Parameter | Type | Description |
97
+ |---|---|---|
98
+ | `query` | string | Natural language description of what to find |
99
+ | `repo_path` | string (optional) | Absolute path to a specific repo; omit to search all |
100
+
101
+ Returns top-8 results ranked by similarity, each with file path, line range, score, and code block.
102
+
103
+ ### `list_indexed_repos`
104
+
105
+ List all indexed repos with chunk count and last indexed timestamp. No parameters.
106
+
107
+ ## Supported languages (AST chunking)
108
+
109
+ | Language | Extensions | Chunk boundary |
110
+ |---|---|---|
111
+ | Python | `.py` | `function_definition`, `decorated_definition` |
112
+ | TypeScript | `.ts` | `function_declaration`, `method_definition`, `arrow_function` |
113
+ | TSX | `.tsx` | same as TypeScript |
114
+ | JavaScript | `.js`, `.jsx` | `function_declaration`, `method_definition`, `arrow_function` |
115
+ | Go | `.go` | `function_declaration`, `method_declaration` |
116
+ | Rust | `.rs` | `function_item` |
117
+ | Java | `.java` | `method_declaration`, `constructor_declaration` |
118
+ | HCL/Terraform | `.tf` | `block` |
119
+
120
+ Files without AST support (`.md`, `.yaml`, `.toml`, `.json`, `.rb`, `.cpp`, `.c`, `.h`) fall back to a 100-line sliding window with a 20-line overlap.
121
+
122
+ ## Data storage
123
+
124
+ All data lives in `~/.codebase-mcp/`:
125
+
126
+ ```
127
+ ~/.codebase-mcp/
128
+ config.json # indexed repo metadata (paths, repo_ids, chunk counts, timestamps)
129
+ settings.json # embedding model, vector size, api_key, api_base
130
+ qdrant/ # Qdrant in-process storage (one collection per repo)
131
+ ```
132
+
133
+ Each repo gets a stable `repo_id` derived from its absolute path (used as the Qdrant collection name). Reindexing replaces the collection in place.
134
+
135
+ ## OpenAI-compatible providers
136
+
137
+ Set `api-base` to use any OpenAI-compatible embedding API (e.g. Ollama, vLLM, Azure):
138
+
139
+ ```bash
140
+ codebase-mcp config set api-base http://localhost:11434/v1
141
+ codebase-mcp config set api-key ollama
142
+ codebase-mcp config set embedding-model nomic-embed-text --vector-size 768
143
+ ```
144
+
145
+ After changing the model, reindex all repos (vector dimensions must match).