cocoindex-code 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,45 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ .env
25
+ .venv
26
+ env/
27
+ venv/
28
+ ENV/
29
+
30
+ # IDE
31
+ .idea/
32
+ .vscode/
33
+ *.swp
34
+ *.swo
35
+
36
+ # Testing
37
+ .tox/
38
+ .coverage
39
+ .coverage.*
40
+ htmlcov/
41
+ .pytest_cache/
42
+ .mypy_cache/
43
+
44
+ # CocoIndex
45
+ .cocoindex_code/
@@ -0,0 +1,203 @@
1
+ Metadata-Version: 2.4
2
+ Name: cocoindex-code
3
+ Version: 0.1.0
4
+ Summary: MCP server for indexing and querying codebases using CocoIndex
5
+ Project-URL: Homepage, https://github.com/cocoindex-io/cocoindex-code
6
+ Project-URL: Repository, https://github.com/cocoindex-io/cocoindex-code
7
+ Project-URL: Issues, https://github.com/cocoindex-io/cocoindex-code/issues
8
+ License-Expression: MIT
9
+ Keywords: cocoindex,codebase,indexing,mcp,vector-search
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
+ Requires-Python: >=3.11
19
+ Requires-Dist: cocoindex==1.0.0a10
20
+ Requires-Dist: mcp>=1.0.0
21
+ Requires-Dist: numpy>=1.24.0
22
+ Requires-Dist: pydantic>=2.0.0
23
+ Requires-Dist: sentence-transformers>=2.2.0
24
+ Requires-Dist: sqlite-vec>=0.1.0
25
+ Provides-Extra: dev
26
+ Requires-Dist: mypy>=1.0.0; extra == 'dev'
27
+ Requires-Dist: prek>=0.1.0; extra == 'dev'
28
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
29
+ Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
30
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
31
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # CocoIndex Code
35
+
36
+ An MCP (Model Context Protocol) server for indexing and querying codebases using [CocoIndex](https://cocoindex.io).
37
+
38
+ ## Features
39
+
40
+ - **Semantic Code Search**: Find relevant code using natural language queries
41
+ - **Incremental Indexing**: Only re-indexes changed files for fast updates
42
+ - **Multi-Language Support**: Python, JavaScript/TypeScript, Rust, Go
43
+ - **Vector Embeddings**: Uses sentence-transformers for semantic similarity
44
+ - **SQLite Storage**: Portable, no external database required
45
+
46
+ ## Installation
47
+
48
+ ```bash
49
+ pip install cocoindex-code
50
+ ```
51
+
52
+ Or with uv:
53
+
54
+ ```bash
55
+ uv pip install cocoindex-code
56
+ ```
57
+
58
+ ## Usage with Claude Code
59
+
60
+ Add to your Claude Code MCP configuration (`.claude/mcp_config.json`):
61
+
62
+ ```json
63
+ {
64
+ "mcpServers": {
65
+ "cocoindex-code": {
66
+ "command": "cocoindex-code",
67
+ "env": {
68
+ "COCOINDEX_CODE_ROOT_PATH": "/path/to/your/codebase"
69
+ }
70
+ }
71
+ }
72
+ }
73
+ ```
74
+
75
+ Or without explicit path (auto-discovers from current directory):
76
+
77
+ ```json
78
+ {
79
+ "mcpServers": {
80
+ "cocoindex-code": {
81
+ "command": "cocoindex-code"
82
+ }
83
+ }
84
+ }
85
+ ```
86
+
87
+ ## Configuration
88
+
89
+ Environment variables:
90
+
91
+ | Variable | Description | Default |
92
+ |----------|-------------|---------|
93
+ | `COCOINDEX_CODE_ROOT_PATH` | Root path of the codebase | Auto-discovered (see below) |
94
+ | `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model to use | `sentence-transformers/all-MiniLM-L6-v2` |
95
+
96
+ ### Root Path Discovery
97
+
98
+ If `COCOINDEX_CODE_ROOT_PATH` is not set, the codebase root is discovered by:
99
+
100
+ 1. Finding the nearest parent directory containing `.cocoindex_code/`
101
+ 2. Finding the nearest parent directory containing `.git/`
102
+ 3. Falling back to the current working directory
103
+
104
+ ## MCP Tools
105
+
106
+ ### `query`
107
+
108
+ Search the codebase using semantic similarity.
109
+
110
+ ```
111
+ query(
112
+ query: str, # Natural language query or code snippet
113
+ limit: int = 10, # Maximum results (1-100)
114
+ offset: int = 0, # Pagination offset
115
+ refresh_index: bool = True # Refresh index before querying
116
+ )
117
+ ```
118
+
119
+ The `refresh_index` parameter controls whether the index is refreshed before searching:
120
+
121
+ - `True` (default): Refreshes the index to include any recent changes
122
+ - `False`: Skip refresh for faster consecutive queries
123
+
124
+ Returns matching code chunks with:
125
+
126
+ - File path
127
+ - Language
128
+ - Code content
129
+ - Line numbers (start/end)
130
+ - Similarity score
131
+
132
+ ## Index Storage
133
+
134
+ The index is stored in `.cocoindex_code/` under your codebase root:
135
+
136
+ ```
137
+ your-project/
138
+ ├── .cocoindex_code/
139
+ │ ├── target_sqlite.db # Vector index (SQLite + sqlite-vec)
140
+ │ └── cocoindex.db/ # CocoIndex state
141
+ ├── src/
142
+ │ └── ...
143
+ ```
144
+
145
+ Add `.cocoindex_code/` to your `.gitignore`.
146
+
147
+ ## Supported File Types
148
+
149
+ - **Python**: `.py`, `.pyi`
150
+ - **JavaScript**: `.js`, `.jsx`, `.mjs`, `.cjs`
151
+ - **TypeScript**: `.ts`, `.tsx`
152
+ - **Rust**: `.rs`
153
+ - **Go**: `.go`
154
+
155
+ Hidden directories and common generated or dependency directories (such as `vendor/`) are automatically excluded, including:
156
+
157
+ - `__pycache__/`
158
+ - `node_modules/`
159
+ - `target/`
160
+ - `dist/`
161
+ - `build/`
162
+ - `.git/`
163
+
164
+ ## Development
165
+
166
+ ### Local Testing with Claude Code
167
+
168
+ To test locally without installing the package, use the Claude Code CLI:
169
+
170
+ ```bash
171
+ claude mcp add cocoindex-code \
172
+ -- uv run --project /path/to/cocoindex-code cocoindex-code
173
+ ```
174
+
175
+ Or add to `.mcp.json` in your project root:
176
+
177
+ ```json
178
+ {
179
+ "mcpServers": {
180
+ "cocoindex-code": {
181
+ "command": "uv",
182
+ "args": ["run", "--project", "/path/to/cocoindex-code", "cocoindex-code"]
183
+ }
184
+ }
185
+ }
186
+ ```
187
+
188
+ ### Running Tests
189
+
190
+ ```bash
191
+ # Install dev dependencies
192
+ uv sync --group dev
193
+
194
+ # Run tests
195
+ uv run pytest tests/ -v
196
+
197
+ # Run pre-commit hooks
198
+ uv run prek run --all-files
199
+ ```
200
+
201
+ ## License
202
+
203
+ MIT
@@ -0,0 +1,170 @@
1
+ # CocoIndex Code
2
+
3
+ An MCP (Model Context Protocol) server for indexing and querying codebases using [CocoIndex](https://cocoindex.io).
4
+
5
+ ## Features
6
+
7
+ - **Semantic Code Search**: Find relevant code using natural language queries
8
+ - **Incremental Indexing**: Only re-indexes changed files for fast updates
9
+ - **Multi-Language Support**: Python, JavaScript/TypeScript, Rust, Go
10
+ - **Vector Embeddings**: Uses sentence-transformers for semantic similarity
11
+ - **SQLite Storage**: Portable, no external database required
12
+
13
+ ## Installation
14
+
15
+ ```bash
16
+ pip install cocoindex-code
17
+ ```
18
+
19
+ Or with uv:
20
+
21
+ ```bash
22
+ uv pip install cocoindex-code
23
+ ```
24
+
25
+ ## Usage with Claude Code
26
+
27
+ Add to your Claude Code MCP configuration (`.claude/mcp_config.json`):
28
+
29
+ ```json
30
+ {
31
+ "mcpServers": {
32
+ "cocoindex-code": {
33
+ "command": "cocoindex-code",
34
+ "env": {
35
+ "COCOINDEX_CODE_ROOT_PATH": "/path/to/your/codebase"
36
+ }
37
+ }
38
+ }
39
+ }
40
+ ```
41
+
42
+ Or without explicit path (auto-discovers from current directory):
43
+
44
+ ```json
45
+ {
46
+ "mcpServers": {
47
+ "cocoindex-code": {
48
+ "command": "cocoindex-code"
49
+ }
50
+ }
51
+ }
52
+ ```
53
+
54
+ ## Configuration
55
+
56
+ Environment variables:
57
+
58
+ | Variable | Description | Default |
59
+ |----------|-------------|---------|
60
+ | `COCOINDEX_CODE_ROOT_PATH` | Root path of the codebase | Auto-discovered (see below) |
61
+ | `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model to use | `sentence-transformers/all-MiniLM-L6-v2` |
62
+
63
+ ### Root Path Discovery
64
+
65
+ If `COCOINDEX_CODE_ROOT_PATH` is not set, the codebase root is discovered by:
66
+
67
+ 1. Finding the nearest parent directory containing `.cocoindex_code/`
68
+ 2. Finding the nearest parent directory containing `.git/`
69
+ 3. Falling back to the current working directory
70
+
71
+ ## MCP Tools
72
+
73
+ ### `query`
74
+
75
+ Search the codebase using semantic similarity.
76
+
77
+ ```
78
+ query(
79
+ query: str, # Natural language query or code snippet
80
+ limit: int = 10, # Maximum results (1-100)
81
+ offset: int = 0, # Pagination offset
82
+ refresh_index: bool = True # Refresh index before querying
83
+ )
84
+ ```
85
+
86
+ The `refresh_index` parameter controls whether the index is refreshed before searching:
87
+
88
+ - `True` (default): Refreshes the index to include any recent changes
89
+ - `False`: Skip refresh for faster consecutive queries
90
+
91
+ Returns matching code chunks with:
92
+
93
+ - File path
94
+ - Language
95
+ - Code content
96
+ - Line numbers (start/end)
97
+ - Similarity score
98
+
99
+ ## Index Storage
100
+
101
+ The index is stored in `.cocoindex_code/` under your codebase root:
102
+
103
+ ```
104
+ your-project/
105
+ ├── .cocoindex_code/
106
+ │ ├── target_sqlite.db # Vector index (SQLite + sqlite-vec)
107
+ │ └── cocoindex.db/ # CocoIndex state
108
+ ├── src/
109
+ │ └── ...
110
+ ```
111
+
112
+ Add `.cocoindex_code/` to your `.gitignore`.
113
+
114
+ ## Supported File Types
115
+
116
+ - **Python**: `.py`, `.pyi`
117
+ - **JavaScript**: `.js`, `.jsx`, `.mjs`, `.cjs`
118
+ - **TypeScript**: `.ts`, `.tsx`
119
+ - **Rust**: `.rs`
120
+ - **Go**: `.go`
121
+
122
+ Hidden directories and common generated or dependency directories (such as `vendor/`) are automatically excluded, including:
123
+
124
+ - `__pycache__/`
125
+ - `node_modules/`
126
+ - `target/`
127
+ - `dist/`
128
+ - `build/`
129
+ - `.git/`
130
+
131
+ ## Development
132
+
133
+ ### Local Testing with Claude Code
134
+
135
+ To test locally without installing the package, use the Claude Code CLI:
136
+
137
+ ```bash
138
+ claude mcp add cocoindex-code \
139
+ -- uv run --project /path/to/cocoindex-code cocoindex-code
140
+ ```
141
+
142
+ Or add to `.mcp.json` in your project root:
143
+
144
+ ```json
145
+ {
146
+ "mcpServers": {
147
+ "cocoindex-code": {
148
+ "command": "uv",
149
+ "args": ["run", "--project", "/path/to/cocoindex-code", "cocoindex-code"]
150
+ }
151
+ }
152
+ }
153
+ ```
154
+
155
+ ### Running Tests
156
+
157
+ ```bash
158
+ # Install dev dependencies
159
+ uv sync --group dev
160
+
161
+ # Run tests
162
+ uv run pytest tests/ -v
163
+
164
+ # Run pre-commit hooks
165
+ uv run prek run --all-files
166
+ ```
167
+
168
+ ## License
169
+
170
+ MIT
@@ -0,0 +1,85 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "cocoindex-code"
7
+ version = "0.1.0"
8
+ description = "MCP server for indexing and querying codebases using CocoIndex"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.11"
12
+ keywords = ["mcp", "codebase", "indexing", "vector-search", "cocoindex"]
13
+ classifiers = [
14
+ "Development Status :: 3 - Alpha",
15
+ "Intended Audience :: Developers",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Programming Language :: Python :: 3.13",
21
+ "Topic :: Software Development :: Libraries :: Python Modules",
22
+ ]
23
+
24
+ dependencies = [
25
+ "mcp>=1.0.0",
26
+ "cocoindex==1.0.0a10",
27
+ "sentence-transformers>=2.2.0",
28
+ "sqlite-vec>=0.1.0",
29
+ "pydantic>=2.0.0",
30
+ "numpy>=1.24.0",
31
+ ]
32
+
33
+ [project.optional-dependencies]
34
+ dev = [
35
+ "pytest>=7.0.0",
36
+ "pytest-asyncio>=0.21.0",
37
+ "pytest-cov>=4.0.0",
38
+ "ruff>=0.1.0",
39
+ "mypy>=1.0.0",
40
+ "prek>=0.1.0",
41
+ ]
42
+
43
+ [project.scripts]
44
+ cocoindex-code = "cocoindex_code:main"
45
+
46
+ [project.urls]
47
+ Homepage = "https://github.com/cocoindex-io/cocoindex-code"
48
+ Repository = "https://github.com/cocoindex-io/cocoindex-code"
49
+ Issues = "https://github.com/cocoindex-io/cocoindex-code/issues"
50
+
51
+ [tool.hatch.build.targets.wheel]
52
+ packages = ["src/cocoindex_code"]
53
+
54
+ [tool.hatch.build.targets.sdist]
55
+ include = ["/src", "/README.md", "/LICENSE"]
56
+
57
+ [dependency-groups]
58
+ dev = [
59
+ "pytest>=7.0.0",
60
+ "pytest-asyncio>=0.21.0",
61
+ "pytest-cov>=4.0.0",
62
+ "ruff>=0.1.0",
63
+ "mypy>=1.0.0",
64
+ "prek>=0.1.0",
65
+ ]
66
+
67
+ [tool.uv]
68
+ prerelease = "explicit"
69
+
70
+ [tool.ruff]
71
+ line-length = 100
72
+
73
+ [tool.ruff.lint]
74
+ select = ["E", "F", "I", "N", "W", "UP"]
75
+
76
+ [tool.mypy]
77
+ python_version = "3.11"
78
+ strict = true
79
+ ignore_missing_imports = true
80
+
81
+ [tool.pytest.ini_options]
82
+ testpaths = ["tests"]
83
+ python_files = ["test_*.py"]
84
+ python_functions = ["test_*"]
85
+ addopts = "-v --tb=short"
@@ -0,0 +1,7 @@
1
+ """CocoIndex Code - MCP server for indexing and querying codebases."""
2
+
3
+ from .config import Config
4
+ from .server import main, mcp
5
+
6
# Package version string; pyproject.toml declares the same value.
__version__ = "0.1.0"
# Names re-exported as the package's public API.
__all__ = ["Config", "main", "mcp"]
@@ -0,0 +1,6 @@
1
+ """Entry point for `python -m cocoindex_code`."""
2
+
3
+ from .server import main
4
+
5
# Start the server only when this module is executed, not when it is imported.
if __name__ == "__main__":
    main()
@@ -0,0 +1,87 @@
1
+ """Configuration for CocoIndex Code MCP server."""
2
+
3
+ import os
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+
7
+
8
+ def _find_root_with_marker(start_dir: Path, marker: str) -> Path | None:
9
+ """Find the nearest parent directory containing the given marker directory."""
10
+ current = start_dir.resolve()
11
+ while current != current.parent:
12
+ if (current / marker).is_dir():
13
+ return current
14
+ current = current.parent
15
+ # Check root directory too
16
+ if (current / marker).is_dir():
17
+ return current
18
+ return None
19
+
20
+
21
def _discover_codebase_root() -> Path:
    """
    Locate the directory to treat as the codebase root.

    Markers are tried in priority order, each searched from the current
    working directory upward:
    1. `.cocoindex_code` (an existing index directory)
    2. `.git` (a repository checkout)

    If neither marker is found, the current working directory is returned.
    """
    start = Path.cwd()
    for marker in (".cocoindex_code", ".git"):
        found = _find_root_with_marker(start, marker)
        if found is not None:
            return found
    return start
44
+
45
+
46
@dataclass
class Config:
    """Server settings derived from the process environment."""

    # Directory whose source files are indexed.
    codebase_root_path: Path
    # Name of the embedding model.
    embedding_model: str
    # Directory that holds the index databases.
    index_dir: Path

    @classmethod
    def from_env(cls) -> "Config":
        """Build a ``Config`` from the ``COCOINDEX_CODE_*`` environment variables.

        ``COCOINDEX_CODE_ROOT_PATH`` selects the codebase root; when it is unset
        or empty the root is auto-discovered. ``COCOINDEX_CODE_EMBEDDING_MODEL``
        overrides the default embedding model.
        """
        env = os.environ
        explicit_root = env.get("COCOINDEX_CODE_ROOT_PATH")
        root = Path(explicit_root).resolve() if explicit_root else _discover_codebase_root()
        model_name = env.get(
            "COCOINDEX_CODE_EMBEDDING_MODEL",
            "sentence-transformers/all-MiniLM-L6-v2",
        )
        return cls(
            codebase_root_path=root,
            embedding_model=model_name,
            # The index always lives directly under the codebase root.
            index_dir=root / ".cocoindex_code",
        )

    @property
    def target_sqlite_db_path(self) -> Path:
        """Location of the SQLite file holding the vector index."""
        return self.index_dir / "target_sqlite.db"

    @property
    def cocoindex_db_path(self) -> Path:
        """Location of the CocoIndex state database."""
        return self.index_dir / "cocoindex.db"
@@ -0,0 +1,164 @@
1
+ """CocoIndex app for indexing codebases."""
2
+
3
+ import asyncio
4
+
5
+ import cocoindex.asyncio as coco_aio
6
+ from cocoindex.connectors import localfs, sqlite
7
+ from cocoindex.ops.text import RecursiveSplitter, detect_code_language
8
+ from cocoindex.resources.chunk import Chunk
9
+ from cocoindex.resources.file import PatternFilePathMatcher
10
+ from cocoindex.resources.id import IdGenerator
11
+
12
+ from .shared import SQLITE_DB, CodeChunk, config, embedder
13
+
14
# Glob patterns selecting the source files to index, grouped by language.
INCLUDED_PATTERNS = [
    # Python sources and type stubs
    "*.py",
    "*.pyi",
    # JavaScript / TypeScript, including the React variants
    "*.js",
    "*.jsx",
    "*.ts",
    "*.tsx",
    # JavaScript ES-module and CommonJS files
    "*.mjs",
    "*.cjs",
    # Rust
    "*.rs",
    # Go
    "*.go",
]

# Glob patterns for paths that are never indexed.
EXCLUDED_PATTERNS = [
    # Hidden directories
    ".*/**",
    # Caches, dependencies and build output
    "**/__pycache__/**",
    "**/node_modules/**",
    "**/target/**",
    "**/dist/**",
    "**/build/**",
    "**/vendor/**",
    # Version control and our own index directory
    "**/.git/**",
    "**/.cocoindex_code/**",
    # Minified assets
    "*.min.js",
    "*.min.css",
    # Lock files
    "*.lock",
    "**/package-lock.json",
    "**/yarn.lock",
    "**/Cargo.lock",
    "**/go.sum",
    # Compiled Python and native libraries
    "**/*.pyc",
    "**/*.pyo",
    "**/*.so",
    "**/*.dylib",
    "**/*.dll",
]

# Chunking parameters handed to the splitter.
CHUNK_SIZE, MIN_CHUNK_SIZE, CHUNK_OVERLAP = 1000, 300, 200
56
+
57
# Chunking splitter (stateless, can be module-level).
# One shared instance is used by process_file for every file.
splitter = RecursiveSplitter()
59
+
60
+
61
@coco_aio.function
async def process_chunk(
    file_path: str,
    chunk: Chunk,
    language: str,
    id_gen: IdGenerator,
    table: sqlite.TableTarget,
) -> None:
    """Embed one chunk and declare its row on the target table.

    The row ID and the embedding are both derived from the chunk text and
    are awaited together before the row is declared.
    """
    text = chunk.text
    row_id, vector = await asyncio.gather(
        id_gen.next_id(text),
        embedder.embed(text),
    )
    table.declare_row(
        row=CodeChunk(  # type: ignore[arg-type]
            id=row_id,
            file_path=file_path,
            language=language,
            content=text,
            start_line=chunk.start.line,
            end_line=chunk.end.line,
            embedding=vector,
        )
    )
85
+
86
+
87
@coco_aio.function(memo=True)
async def process_file(
    file: localfs.File,
    table: sqlite.TableTarget,
) -> None:
    """Split one source file into chunks and process every chunk.

    Files that cannot be decoded as text and files that contain only
    whitespace are skipped without writing any rows.
    """
    try:
        source_text = file.read_text()
    except UnicodeDecodeError:
        # Undecodable content is treated as binary and ignored.
        return
    if not source_text.strip():
        return

    path = file.file_path.path
    # Fall back to "text" when no language is detected from the file name.
    lang = detect_code_language(filename=path.name) or "text"

    pieces = splitter.split(
        source_text,
        chunk_size=CHUNK_SIZE,
        min_chunk_size=MIN_CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        language=lang,
    )

    # One generator per file, shared by all of its chunks.
    ids = IdGenerator()
    path_text = str(path)
    pending = [process_chunk(path_text, piece, lang, ids, table) for piece in pieces]
    await asyncio.gather(*pending)
122
+
123
+
124
@coco_aio.function
async def app_main() -> None:
    """Top-level indexing routine: set up the table, then mount one component per file."""
    database = coco_aio.use_context(SQLITE_DB)

    # Declare the `code_chunks` table (keyed by `id`) and wait for its target.
    setup_path = coco_aio.component_subpath("setup", "table")
    chunk_schema = await sqlite.TableSchema.from_class(
        CodeChunk,
        primary_key=["id"],
    )
    table_run = coco_aio.mount_run(
        setup_path,
        database.declare_table_target,
        table_name="code_chunks",
        table_schema=chunk_schema,
    )
    table = await table_run.result()

    # Enumerate the files under the codebase root that match the include/exclude patterns.
    matcher = PatternFilePathMatcher(
        included_patterns=INCLUDED_PATTERNS,
        excluded_patterns=EXCLUDED_PATTERNS,
    )
    source_files = localfs.walk_dir(
        config.codebase_root_path,
        recursive=True,
        path_matcher=matcher,
    )

    # Each file is mounted under a subpath derived from its own path.
    for source_file in source_files:
        file_key = str(source_file.file_path.path)
        coco_aio.mount(
            coco_aio.component_subpath("process", file_key),
            process_file,
            source_file,
            table,
        )
158
+
159
+
160
# Create the app: a CocoIndex App named "CocoIndexCode" whose main function is app_main.
app = coco_aio.App(
    coco_aio.AppConfig(name="CocoIndexCode"),
    app_main,
)
@@ -0,0 +1,64 @@
1
+ """Query implementation for codebase search."""
2
+
3
+ import cocoindex as coco
4
+
5
+ from .schema import QueryResult
6
+ from .shared import SQLITE_DB, config, embedder
7
+
8
+
9
async def query_codebase(
    query: str,
    limit: int = 10,
    offset: int = 0,
) -> list[QueryResult]:
    """
    Run a vector similarity search over the indexed code chunks.

    The query text is embedded and compared against every stored chunk with
    sqlite-vec's vec_distance_cosine. Results come back closest-first, with
    `limit` and `offset` applied by the database.

    Raises RuntimeError when the index database file does not exist yet.
    """
    db_path = config.target_sqlite_db_path
    if not db_path.exists():
        raise RuntimeError(
            f"Index database not found at {db_path}. "
            "Please run a query with refresh_index=True first."
        )

    # The database handle is provided through the CocoIndex environment.
    database = coco.default_env().get_context(SQLITE_DB)

    # sqlite-vec takes the query vector as raw float32 bytes.
    query_vector = await embedder.embed(query)
    query_blob = query_vector.astype("float32").tobytes()

    # vec_distance_cosine is a distance (lower is better); the selected
    # score is 1 - distance so that higher means more similar.
    with database.value.readonly() as conn:
        rows = conn.execute(
            """
            SELECT
                file_path,
                language,
                content,
                start_line,
                end_line,
                (1.0 - vec_distance_cosine(embedding, ?)) as score
            FROM code_chunks
            ORDER BY vec_distance_cosine(embedding, ?) ASC
            LIMIT ? OFFSET ?
            """,
            (query_blob, query_blob, limit, offset),
        ).fetchall()

    return [
        QueryResult(
            file_path=path,
            language=lang,
            content=body,
            start_line=first_line,
            end_line=last_line,
            score=similarity,
        )
        for path, lang, body, first_line, last_line, similarity in rows
    ]
@@ -0,0 +1,29 @@
1
+ """Data models for CocoIndex Code."""
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Any
5
+
6
+
7
# NOTE(review): shared.py defines another `CodeChunk` with the same fields, and that is
# the one indexer.py imports. Confirm whether this copy is still needed.
@dataclass
class CodeChunk:
    """Represents an indexed code chunk stored in SQLite."""

    id: int  # primary key of the chunk row
    file_path: str  # path of the file the chunk was taken from
    language: str  # detected language name
    content: str  # text of the chunk
    start_line: int  # line where the chunk starts
    end_line: int  # line where the chunk ends
    embedding: Any  # NDArray - type hint relaxed for compatibility
18
+
19
+
20
@dataclass
class QueryResult:
    """Result from a vector similarity query.

    One instance is built per row returned by the search in query.py.
    """

    file_path: str  # path of the file containing the chunk
    language: str  # language recorded for the chunk
    content: str  # text of the chunk
    start_line: int  # line where the chunk starts
    end_line: int  # line where the chunk ends
    score: float  # 1 - cosine distance to the query; higher is more similar
@@ -0,0 +1,150 @@
1
+ """MCP server for codebase indexing and querying."""
2
+
3
+ import asyncio
4
+
5
+ from mcp.server.fastmcp import FastMCP
6
+ from pydantic import BaseModel, Field
7
+
8
+ from .indexer import app as indexer_app
9
+ from .query import query_codebase
10
+
11
# Initialize MCP server.
# Tools are registered on this instance with the @mcp.tool decorator further down;
# `instructions` is the usage guidance text handed to FastMCP.
mcp = FastMCP(
    "cocoindex-code",
    instructions="""
This server provides semantic code search for the codebase.
This allows you to quickly and cheaply search for code related to a concept or functionality
across the entire codebase.

Use the `query` tool when you need to:
- Find code related to a concept or functionality
- Search for implementations of specific features
- Discover how something is done in the codebase
- Find similar code patterns

The `query` tool has a `refresh_index` parameter (default: True) that refreshes
the index before searching. Set it to False for consecutive queries to avoid
redundant refreshes.

The search uses vector embeddings for semantic similarity, so you can describe
what you're looking for in natural language rather than exact text matches.
""".strip(),
)

# Lock to prevent concurrent index updates (held by _refresh_index for each update)
_index_lock = asyncio.Lock()
36
+
37
+
38
async def _refresh_index() -> None:
    """Refresh the index. Uses lock to prevent concurrent updates.

    Callers wait on ``_index_lock``, so at most one ``indexer_app.update``
    call runs at a time. Progress reporting to stdout is turned off.
    """
    async with _index_lock:
        await indexer_app.update(report_to_stdout=False)
42
+
43
+
44
+ # === Pydantic Models for Tool Inputs/Outputs ===
45
+
46
+
47
class CodeChunkResult(BaseModel):
    """A single code chunk returned by the `query` tool."""

    file_path: str = Field(description="Relative path to the file")
    language: str = Field(description="Programming language")
    content: str = Field(description="The code content")
    start_line: int = Field(description="Starting line number (1-indexed)")
    end_line: int = Field(description="Ending line number (1-indexed)")
    # The query layer computes the score as 1 - cosine distance, which is the
    # cosine similarity and can therefore be negative, so the advertised
    # range is -1 to 1 rather than 0 to 1.
    score: float = Field(description="Cosine similarity score (-1 to 1, higher is better)")
56
+
57
+
58
class QueryResultModel(BaseModel):
    """Result from query tool.

    On success `results` holds the matches; on failure `message` carries the
    error text and the remaining fields keep their defaults.
    """

    # True when the query ran; False when an exception was caught.
    success: bool
    # Matching chunks, in the order returned by the search.
    results: list[CodeChunkResult] = Field(default_factory=list)
    # Number of entries in `results` (not the total number of matches in the index).
    total_returned: int = Field(default=0)
    # Pagination offset that was applied to the query.
    offset: int = Field(default=0)
    # Error description; set only on failure.
    message: str | None = None
66
+
67
+
68
+ # === MCP Tools ===
69
+
70
+
71
@mcp.tool(
    name="query",
    description=(
        "Search the codebase using semantic similarity. "
        "Returns relevant code chunks with file locations and similarity scores. "
        "Use natural language queries or code snippets to find related code."
    ),
)
async def query(
    query: str = Field(description="Natural language query or code snippet to search for"),
    limit: int = Field(
        default=10,
        ge=1,
        le=100,
        description="Maximum number of results to return (1-100)",
    ),
    offset: int = Field(
        default=0,
        ge=0,
        description="Number of results to skip for pagination",
    ),
    refresh_index: bool = Field(
        default=True,
        description=(
            "Whether to refresh the index before querying. "
            "Set to False for consecutive queries to skip redundant refreshes."
        ),
    ),
) -> QueryResultModel:
    """Search the index, refreshing it first when requested.

    Failures are never raised to the caller; they are reported through a
    result whose ``success`` is False and whose ``message`` holds the error.
    """
    try:
        if refresh_index:
            await _refresh_index()

        hits = await query_codebase(query=query, limit=limit, offset=offset)
        chunks = [
            CodeChunkResult(
                file_path=hit.file_path,
                language=hit.language,
                content=hit.content,
                start_line=hit.start_line,
                end_line=hit.end_line,
                score=hit.score,
            )
            for hit in hits
        ]
        return QueryResultModel(
            success=True,
            results=chunks,
            total_returned=len(hits),
            offset=offset,
        )
    except RuntimeError as exc:
        # RuntimeError text (e.g. the missing-index error) is passed through as is.
        failure_message = str(exc)
    except Exception as exc:
        failure_message = f"Query failed: {exc!s}"

    return QueryResultModel(success=False, message=failure_message)
135
+
136
+
137
async def _async_main() -> None:
    """Async entry point for the MCP server.

    Starts an index refresh in the background so startup isn't blocked, then
    serves MCP requests over stdio until the server stops.
    """
    # Keep a reference to the task: the event loop only holds weak references
    # to tasks, so an unreferenced task may be garbage-collected mid-run.
    refresh_task = asyncio.create_task(_refresh_index())
    try:
        await mcp.run_stdio_async()
    finally:
        # Stop a refresh that is still running when the server exits
        # (a no-op when the refresh has already finished).
        refresh_task.cancel()
142
+
143
+
144
def main() -> None:
    """Entry point for the MCP server.

    Runs ``_async_main`` to completion with ``asyncio.run``. This is the
    callable exposed as the ``cocoindex-code`` console script.
    """
    asyncio.run(_async_main())
147
+
148
+
149
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,52 @@
1
+ """Shared resources for CocoIndex Code."""
2
+
3
+ from collections.abc import Iterator
4
+ from dataclasses import dataclass
5
+ from typing import Annotated
6
+
7
+ import cocoindex as coco
8
+ from cocoindex.connectors import sqlite
9
+ from cocoindex.ops.sentence_transformers import SentenceTransformerEmbedder
10
+ from numpy.typing import NDArray
11
+
12
+ from .config import Config
13
+
14
# Load configuration at module level.
# Importing this module therefore reads the COCOINDEX_CODE_* environment variables
# and, when no root path is set, probes the filesystem for the codebase root.
config = Config.from_env()

# Initialize embedder at module level, using the model name from the configuration
embedder = SentenceTransformerEmbedder(config.embedding_model)

# Context key for SQLite database (connection managed in lifespan)
SQLITE_DB = coco.ContextKey[sqlite.SqliteDatabase]("sqlite_db")
22
+
23
+
24
@coco.lifespan
def coco_lifespan(builder: coco.EnvironmentBuilder) -> Iterator[None]:
    """Set up the database connection for the CocoIndex environment.

    Creates the index directory if needed, points CocoIndex at its state
    database, and provides a SQLite connection (with the vector extension
    loaded) under ``SQLITE_DB``. The connection is closed when the lifespan
    ends, including when setup fails or the generator is closed early.
    """
    # Ensure index directory exists
    config.index_dir.mkdir(parents=True, exist_ok=True)

    # Set CocoIndex state database path
    builder.settings.db_path = config.cocoindex_db_path

    # Connect to SQLite with vector extension
    conn = sqlite.connect(str(config.target_sqlite_db_path), load_vec="auto")
    try:
        builder.provide(SQLITE_DB, sqlite.register_db("index_db", conn))
        yield
    finally:
        # Runs on normal shutdown and on errors alike, so the connection never leaks.
        conn.close()
40
+
41
+
42
@dataclass
class CodeChunk:
    """Schema for storing code chunks in SQLite.

    indexer.py builds the `code_chunks` table schema from this class, with
    `id` as the primary key, and declares one row per chunk.
    """

    id: int  # primary key of the row
    file_path: str  # path of the file the chunk was taken from
    language: str  # detected language name ("text" when detection fails)
    content: str  # text of the chunk
    start_line: int  # line where the chunk starts
    end_line: int  # line where the chunk ends
    # Embedding vector of `content`. Annotated with the module-level embedder;
    # presumably this lets the schema derive the vector column spec - TODO confirm.
    embedding: Annotated[NDArray, embedder]  # type: ignore[type-arg]