cocoindex-code 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cocoindex_code-0.1.0/.gitignore +45 -0
- cocoindex_code-0.1.0/PKG-INFO +203 -0
- cocoindex_code-0.1.0/README.md +170 -0
- cocoindex_code-0.1.0/pyproject.toml +85 -0
- cocoindex_code-0.1.0/src/cocoindex_code/__init__.py +7 -0
- cocoindex_code-0.1.0/src/cocoindex_code/__main__.py +6 -0
- cocoindex_code-0.1.0/src/cocoindex_code/config.py +87 -0
- cocoindex_code-0.1.0/src/cocoindex_code/indexer.py +164 -0
- cocoindex_code-0.1.0/src/cocoindex_code/query.py +64 -0
- cocoindex_code-0.1.0/src/cocoindex_code/schema.py +29 -0
- cocoindex_code-0.1.0/src/cocoindex_code/server.py +150 -0
- cocoindex_code-0.1.0/src/cocoindex_code/shared.py +52 -0
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
*.egg-info/
|
|
20
|
+
.installed.cfg
|
|
21
|
+
*.egg
|
|
22
|
+
|
|
23
|
+
# Virtual environments
|
|
24
|
+
.env
|
|
25
|
+
.venv
|
|
26
|
+
env/
|
|
27
|
+
venv/
|
|
28
|
+
ENV/
|
|
29
|
+
|
|
30
|
+
# IDE
|
|
31
|
+
.idea/
|
|
32
|
+
.vscode/
|
|
33
|
+
*.swp
|
|
34
|
+
*.swo
|
|
35
|
+
|
|
36
|
+
# Testing
|
|
37
|
+
.tox/
|
|
38
|
+
.coverage
|
|
39
|
+
.coverage.*
|
|
40
|
+
htmlcov/
|
|
41
|
+
.pytest_cache/
|
|
42
|
+
.mypy_cache/
|
|
43
|
+
|
|
44
|
+
# CocoIndex
|
|
45
|
+
.cocoindex_code/
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cocoindex-code
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: MCP server for indexing and querying codebases using CocoIndex
|
|
5
|
+
Project-URL: Homepage, https://github.com/cocoindex-io/cocoindex-code
|
|
6
|
+
Project-URL: Repository, https://github.com/cocoindex-io/cocoindex-code
|
|
7
|
+
Project-URL: Issues, https://github.com/cocoindex-io/cocoindex-code/issues
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Keywords: cocoindex,codebase,indexing,mcp,vector-search
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
+
Requires-Python: >=3.11
|
|
19
|
+
Requires-Dist: cocoindex==1.0.0a10
|
|
20
|
+
Requires-Dist: mcp>=1.0.0
|
|
21
|
+
Requires-Dist: numpy>=1.24.0
|
|
22
|
+
Requires-Dist: pydantic>=2.0.0
|
|
23
|
+
Requires-Dist: sentence-transformers>=2.2.0
|
|
24
|
+
Requires-Dist: sqlite-vec>=0.1.0
|
|
25
|
+
Provides-Extra: dev
|
|
26
|
+
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
27
|
+
Requires-Dist: prek>=0.1.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# CocoIndex Code
|
|
35
|
+
|
|
36
|
+
An MCP (Model Context Protocol) server for indexing and querying codebases using [CocoIndex](https://cocoindex.io).
|
|
37
|
+
|
|
38
|
+
## Features
|
|
39
|
+
|
|
40
|
+
- **Semantic Code Search**: Find relevant code using natural language queries
|
|
41
|
+
- **Incremental Indexing**: Only re-indexes changed files for fast updates
|
|
42
|
+
- **Multi-Language Support**: Python, JavaScript/TypeScript, Rust, Go
|
|
43
|
+
- **Vector Embeddings**: Uses sentence-transformers for semantic similarity
|
|
44
|
+
- **SQLite Storage**: Portable, no external database required
|
|
45
|
+
|
|
46
|
+
## Installation
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
pip install cocoindex-code
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Or with uv:
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
uv pip install cocoindex-code
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Usage with Claude Code
|
|
59
|
+
|
|
60
|
+
Add to your Claude Code MCP configuration (`.claude/mcp_config.json`):
|
|
61
|
+
|
|
62
|
+
```json
|
|
63
|
+
{
|
|
64
|
+
"mcpServers": {
|
|
65
|
+
"cocoindex-code": {
|
|
66
|
+
"command": "cocoindex-code",
|
|
67
|
+
"env": {
|
|
68
|
+
"COCOINDEX_CODE_ROOT_PATH": "/path/to/your/codebase"
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Or without explicit path (auto-discovers from current directory):
|
|
76
|
+
|
|
77
|
+
```json
|
|
78
|
+
{
|
|
79
|
+
"mcpServers": {
|
|
80
|
+
"cocoindex-code": {
|
|
81
|
+
"command": "cocoindex-code"
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Configuration
|
|
88
|
+
|
|
89
|
+
Environment variables:
|
|
90
|
+
|
|
91
|
+
| Variable | Description | Default |
|
|
92
|
+
|----------|-------------|---------|
|
|
93
|
+
| `COCOINDEX_CODE_ROOT_PATH` | Root path of the codebase | Auto-discovered (see below) |
|
|
94
|
+
| `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model to use | `sentence-transformers/all-MiniLM-L6-v2` |
|
|
95
|
+
|
|
96
|
+
### Root Path Discovery
|
|
97
|
+
|
|
98
|
+
If `COCOINDEX_CODE_ROOT_PATH` is not set, the codebase root is discovered by:
|
|
99
|
+
|
|
100
|
+
1. Finding the nearest parent directory containing `.cocoindex_code/`
|
|
101
|
+
2. Finding the nearest parent directory containing `.git/`
|
|
102
|
+
3. Falling back to the current working directory
|
|
103
|
+
|
|
104
|
+
## MCP Tools
|
|
105
|
+
|
|
106
|
+
### `query`
|
|
107
|
+
|
|
108
|
+
Search the codebase using semantic similarity.
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
query(
|
|
112
|
+
query: str, # Natural language query or code snippet
|
|
113
|
+
limit: int = 10, # Maximum results (1-100)
|
|
114
|
+
offset: int = 0, # Pagination offset
|
|
115
|
+
refresh_index: bool = True # Refresh index before querying
|
|
116
|
+
)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
The `refresh_index` parameter controls whether the index is refreshed before searching:
|
|
120
|
+
|
|
121
|
+
- `True` (default): Refreshes the index to include any recent changes
|
|
122
|
+
- `False`: Skip refresh for faster consecutive queries
|
|
123
|
+
|
|
124
|
+
Returns matching code chunks with:
|
|
125
|
+
|
|
126
|
+
- File path
|
|
127
|
+
- Language
|
|
128
|
+
- Code content
|
|
129
|
+
- Line numbers (start/end)
|
|
130
|
+
- Similarity score
|
|
131
|
+
|
|
132
|
+
## Index Storage
|
|
133
|
+
|
|
134
|
+
The index is stored in `.cocoindex_code/` under your codebase root:
|
|
135
|
+
|
|
136
|
+
```
|
|
137
|
+
your-project/
|
|
138
|
+
├── .cocoindex_code/
|
|
139
|
+
│ ├── target_sqlite.db # Vector index (SQLite + sqlite-vec)
|
|
140
|
+
│ └── cocoindex.db/ # CocoIndex state
|
|
141
|
+
├── src/
|
|
142
|
+
│ └── ...
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
Add `.cocoindex_code/` to your `.gitignore`.
|
|
146
|
+
|
|
147
|
+
## Supported File Types
|
|
148
|
+
|
|
149
|
+
- **Python**: `.py`, `.pyi`
|
|
150
|
+
- **JavaScript**: `.js`, `.jsx`, `.mjs`, `.cjs`
|
|
151
|
+
- **TypeScript**: `.ts`, `.tsx`
|
|
152
|
+
- **Rust**: `.rs`
|
|
153
|
+
- **Go**: `.go`
|
|
154
|
+
|
|
155
|
+
Common generated directories are automatically excluded:
|
|
156
|
+
|
|
157
|
+
- `__pycache__/`
|
|
158
|
+
- `node_modules/`
|
|
159
|
+
- `target/`
|
|
160
|
+
- `dist/`
|
|
161
|
+
- `build/`
|
|
162
|
+
- `.git/`
|
|
163
|
+
|
|
164
|
+
## Development
|
|
165
|
+
|
|
166
|
+
### Local Testing with Claude Code
|
|
167
|
+
|
|
168
|
+
To test locally without installing the package, use the Claude Code CLI:
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
claude mcp add cocoindex-code \
|
|
172
|
+
-- uv run --project /path/to/cocoindex-code cocoindex-code
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
Or add to `.mcp.json` in your project root:
|
|
176
|
+
|
|
177
|
+
```json
|
|
178
|
+
{
|
|
179
|
+
"mcpServers": {
|
|
180
|
+
"cocoindex-code": {
|
|
181
|
+
"command": "uv",
|
|
182
|
+
"args": ["run", "--project", "/path/to/cocoindex-code", "cocoindex-code"]
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
### Running Tests
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
# Install dev dependencies
|
|
192
|
+
uv sync --group dev
|
|
193
|
+
|
|
194
|
+
# Run tests
|
|
195
|
+
uv run pytest tests/ -v
|
|
196
|
+
|
|
197
|
+
# Run pre-commit hooks
|
|
198
|
+
uv run prek run --all-files
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## License
|
|
202
|
+
|
|
203
|
+
MIT
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
# CocoIndex Code
|
|
2
|
+
|
|
3
|
+
An MCP (Model Context Protocol) server for indexing and querying codebases using [CocoIndex](https://cocoindex.io).
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Semantic Code Search**: Find relevant code using natural language queries
|
|
8
|
+
- **Incremental Indexing**: Only re-indexes changed files for fast updates
|
|
9
|
+
- **Multi-Language Support**: Python, JavaScript/TypeScript, Rust, Go
|
|
10
|
+
- **Vector Embeddings**: Uses sentence-transformers for semantic similarity
|
|
11
|
+
- **SQLite Storage**: Portable, no external database required
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install cocoindex-code
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
Or with uv:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
uv pip install cocoindex-code
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Usage with Claude Code
|
|
26
|
+
|
|
27
|
+
Add to your Claude Code MCP configuration (`.claude/mcp_config.json`):
|
|
28
|
+
|
|
29
|
+
```json
|
|
30
|
+
{
|
|
31
|
+
"mcpServers": {
|
|
32
|
+
"cocoindex-code": {
|
|
33
|
+
"command": "cocoindex-code",
|
|
34
|
+
"env": {
|
|
35
|
+
"COCOINDEX_CODE_ROOT_PATH": "/path/to/your/codebase"
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
Or without explicit path (auto-discovers from current directory):
|
|
43
|
+
|
|
44
|
+
```json
|
|
45
|
+
{
|
|
46
|
+
"mcpServers": {
|
|
47
|
+
"cocoindex-code": {
|
|
48
|
+
"command": "cocoindex-code"
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Configuration
|
|
55
|
+
|
|
56
|
+
Environment variables:
|
|
57
|
+
|
|
58
|
+
| Variable | Description | Default |
|
|
59
|
+
|----------|-------------|---------|
|
|
60
|
+
| `COCOINDEX_CODE_ROOT_PATH` | Root path of the codebase | Auto-discovered (see below) |
|
|
61
|
+
| `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model to use | `sentence-transformers/all-MiniLM-L6-v2` |
|
|
62
|
+
|
|
63
|
+
### Root Path Discovery
|
|
64
|
+
|
|
65
|
+
If `COCOINDEX_CODE_ROOT_PATH` is not set, the codebase root is discovered by:
|
|
66
|
+
|
|
67
|
+
1. Finding the nearest parent directory containing `.cocoindex_code/`
|
|
68
|
+
2. Finding the nearest parent directory containing `.git/`
|
|
69
|
+
3. Falling back to the current working directory
|
|
70
|
+
|
|
71
|
+
## MCP Tools
|
|
72
|
+
|
|
73
|
+
### `query`
|
|
74
|
+
|
|
75
|
+
Search the codebase using semantic similarity.
|
|
76
|
+
|
|
77
|
+
```
|
|
78
|
+
query(
|
|
79
|
+
query: str, # Natural language query or code snippet
|
|
80
|
+
limit: int = 10, # Maximum results (1-100)
|
|
81
|
+
offset: int = 0, # Pagination offset
|
|
82
|
+
refresh_index: bool = True # Refresh index before querying
|
|
83
|
+
)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
The `refresh_index` parameter controls whether the index is refreshed before searching:
|
|
87
|
+
|
|
88
|
+
- `True` (default): Refreshes the index to include any recent changes
|
|
89
|
+
- `False`: Skip refresh for faster consecutive queries
|
|
90
|
+
|
|
91
|
+
Returns matching code chunks with:
|
|
92
|
+
|
|
93
|
+
- File path
|
|
94
|
+
- Language
|
|
95
|
+
- Code content
|
|
96
|
+
- Line numbers (start/end)
|
|
97
|
+
- Similarity score
|
|
98
|
+
|
|
99
|
+
## Index Storage
|
|
100
|
+
|
|
101
|
+
The index is stored in `.cocoindex_code/` under your codebase root:
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
your-project/
|
|
105
|
+
├── .cocoindex_code/
|
|
106
|
+
│ ├── target_sqlite.db # Vector index (SQLite + sqlite-vec)
|
|
107
|
+
│ └── cocoindex.db/ # CocoIndex state
|
|
108
|
+
├── src/
|
|
109
|
+
│ └── ...
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Add `.cocoindex_code/` to your `.gitignore`.
|
|
113
|
+
|
|
114
|
+
## Supported File Types
|
|
115
|
+
|
|
116
|
+
- **Python**: `.py`, `.pyi`
|
|
117
|
+
- **JavaScript**: `.js`, `.jsx`, `.mjs`, `.cjs`
|
|
118
|
+
- **TypeScript**: `.ts`, `.tsx`
|
|
119
|
+
- **Rust**: `.rs`
|
|
120
|
+
- **Go**: `.go`
|
|
121
|
+
|
|
122
|
+
Common generated directories are automatically excluded:
|
|
123
|
+
|
|
124
|
+
- `__pycache__/`
|
|
125
|
+
- `node_modules/`
|
|
126
|
+
- `target/`
|
|
127
|
+
- `dist/`
|
|
128
|
+
- `build/`
|
|
129
|
+
- `.git/`
|
|
130
|
+
|
|
131
|
+
## Development
|
|
132
|
+
|
|
133
|
+
### Local Testing with Claude Code
|
|
134
|
+
|
|
135
|
+
To test locally without installing the package, use the Claude Code CLI:
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
claude mcp add cocoindex-code \
|
|
139
|
+
-- uv run --project /path/to/cocoindex-code cocoindex-code
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
Or add to `.mcp.json` in your project root:
|
|
143
|
+
|
|
144
|
+
```json
|
|
145
|
+
{
|
|
146
|
+
"mcpServers": {
|
|
147
|
+
"cocoindex-code": {
|
|
148
|
+
"command": "uv",
|
|
149
|
+
"args": ["run", "--project", "/path/to/cocoindex-code", "cocoindex-code"]
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
### Running Tests
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
# Install dev dependencies
|
|
159
|
+
uv sync --group dev
|
|
160
|
+
|
|
161
|
+
# Run tests
|
|
162
|
+
uv run pytest tests/ -v
|
|
163
|
+
|
|
164
|
+
# Run pre-commit hooks
|
|
165
|
+
uv run prek run --all-files
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
## License
|
|
169
|
+
|
|
170
|
+
MIT
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "cocoindex-code"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "MCP server for indexing and querying codebases using CocoIndex"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.11"
|
|
12
|
+
keywords = ["mcp", "codebase", "indexing", "vector-search", "cocoindex"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 3 - Alpha",
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
"Programming Language :: Python :: 3.13",
|
|
21
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
dependencies = [
|
|
25
|
+
"mcp>=1.0.0",
|
|
26
|
+
"cocoindex==1.0.0a10",
|
|
27
|
+
"sentence-transformers>=2.2.0",
|
|
28
|
+
"sqlite-vec>=0.1.0",
|
|
29
|
+
"pydantic>=2.0.0",
|
|
30
|
+
"numpy>=1.24.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.optional-dependencies]
|
|
34
|
+
dev = [
|
|
35
|
+
"pytest>=7.0.0",
|
|
36
|
+
"pytest-asyncio>=0.21.0",
|
|
37
|
+
"pytest-cov>=4.0.0",
|
|
38
|
+
"ruff>=0.1.0",
|
|
39
|
+
"mypy>=1.0.0",
|
|
40
|
+
"prek>=0.1.0",
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
[project.scripts]
|
|
44
|
+
cocoindex-code = "cocoindex_code:main"
|
|
45
|
+
|
|
46
|
+
[project.urls]
|
|
47
|
+
Homepage = "https://github.com/cocoindex-io/cocoindex-code"
|
|
48
|
+
Repository = "https://github.com/cocoindex-io/cocoindex-code"
|
|
49
|
+
Issues = "https://github.com/cocoindex-io/cocoindex-code/issues"
|
|
50
|
+
|
|
51
|
+
[tool.hatch.build.targets.wheel]
|
|
52
|
+
packages = ["src/cocoindex_code"]
|
|
53
|
+
|
|
54
|
+
[tool.hatch.build.targets.sdist]
|
|
55
|
+
include = ["/src", "/README.md", "/LICENSE"]
|
|
56
|
+
|
|
57
|
+
[dependency-groups]
|
|
58
|
+
dev = [
|
|
59
|
+
"pytest>=7.0.0",
|
|
60
|
+
"pytest-asyncio>=0.21.0",
|
|
61
|
+
"pytest-cov>=4.0.0",
|
|
62
|
+
"ruff>=0.1.0",
|
|
63
|
+
"mypy>=1.0.0",
|
|
64
|
+
"prek>=0.1.0",
|
|
65
|
+
]
|
|
66
|
+
|
|
67
|
+
[tool.uv]
|
|
68
|
+
prerelease = "explicit"
|
|
69
|
+
|
|
70
|
+
[tool.ruff]
|
|
71
|
+
line-length = 100
|
|
72
|
+
|
|
73
|
+
[tool.ruff.lint]
|
|
74
|
+
select = ["E", "F", "I", "N", "W", "UP"]
|
|
75
|
+
|
|
76
|
+
[tool.mypy]
|
|
77
|
+
python_version = "3.11"
|
|
78
|
+
strict = true
|
|
79
|
+
ignore_missing_imports = true
|
|
80
|
+
|
|
81
|
+
[tool.pytest.ini_options]
|
|
82
|
+
testpaths = ["tests"]
|
|
83
|
+
python_files = ["test_*.py"]
|
|
84
|
+
python_functions = ["test_*"]
|
|
85
|
+
addopts = "-v --tb=short"
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Configuration for CocoIndex Code MCP server."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _find_root_with_marker(start_dir: Path, marker: str) -> Path | None:
|
|
9
|
+
"""Find the nearest parent directory containing the given marker directory."""
|
|
10
|
+
current = start_dir.resolve()
|
|
11
|
+
while current != current.parent:
|
|
12
|
+
if (current / marker).is_dir():
|
|
13
|
+
return current
|
|
14
|
+
current = current.parent
|
|
15
|
+
# Check root directory too
|
|
16
|
+
if (current / marker).is_dir():
|
|
17
|
+
return current
|
|
18
|
+
return None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _discover_codebase_root() -> Path:
    """Work out which directory should be treated as the codebase root.

    Markers are tried in priority order, each searched upward from the
    current working directory:

    1. ``.cocoindex_code`` (an existing index directory)
    2. ``.git``

    Returns:
        The nearest ancestor containing the first marker that is found, or
        the current working directory when neither marker exists.
    """
    start = Path.cwd()
    # An existing index directory takes precedence over a git checkout.
    for marker in (".cocoindex_code", ".git"):
        found = _find_root_with_marker(start, marker)
        if found is not None:
            return found
    # No marker anywhere above us: index the working directory itself.
    return start
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@dataclass
class Config:
    """Configuration loaded from environment variables.

    Build instances with :meth:`from_env`; the derived database locations are
    exposed as read-only properties.
    """

    # Root directory of the codebase to index.
    codebase_root_path: Path
    # Name of the embedding model to use.
    embedding_model: str
    # Directory that holds all index artifacts (``<root>/.cocoindex_code``).
    index_dir: Path

    @classmethod
    def from_env(cls) -> "Config":
        """Load configuration from environment variables.

        Reads:
            COCOINDEX_CODE_ROOT_PATH: Codebase root. When unset or empty the
                root is auto-discovered. A leading ``~`` is expanded to the
                user's home directory before the path is resolved.
            COCOINDEX_CODE_EMBEDDING_MODEL: Embedding model name. When unset
                or empty, ``sentence-transformers/all-MiniLM-L6-v2`` is used.

        Returns:
            A populated ``Config``.
        """
        root_path_str = os.environ.get("COCOINDEX_CODE_ROOT_PATH")
        if root_path_str:
            # Values coming from JSON MCP configs are not shell-expanded, so
            # handle "~" here instead of resolving it as a literal directory.
            root = Path(root_path_str).expanduser().resolve()
        else:
            root = _discover_codebase_root()

        # ``or`` (rather than a ``get`` default) makes an empty value behave
        # like an unset one, matching how the root path is handled above.
        embedding_model = (
            os.environ.get("COCOINDEX_CODE_EMBEDDING_MODEL")
            or "sentence-transformers/all-MiniLM-L6-v2"
        )

        # Index directory is always under the root.
        index_dir = root / ".cocoindex_code"

        return cls(
            codebase_root_path=root,
            embedding_model=embedding_model,
            index_dir=index_dir,
        )

    @property
    def target_sqlite_db_path(self) -> Path:
        """Path to the vector index SQLite database."""
        return self.index_dir / "target_sqlite.db"

    @property
    def cocoindex_db_path(self) -> Path:
        """Path to the CocoIndex state database."""
        return self.index_dir / "cocoindex.db"
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
"""CocoIndex app for indexing codebases."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
|
|
5
|
+
import cocoindex.asyncio as coco_aio
|
|
6
|
+
from cocoindex.connectors import localfs, sqlite
|
|
7
|
+
from cocoindex.ops.text import RecursiveSplitter, detect_code_language
|
|
8
|
+
from cocoindex.resources.chunk import Chunk
|
|
9
|
+
from cocoindex.resources.file import PatternFilePathMatcher
|
|
10
|
+
from cocoindex.resources.id import IdGenerator
|
|
11
|
+
|
|
12
|
+
from .shared import SQLITE_DB, CodeChunk, config, embedder
|
|
13
|
+
|
|
14
|
+
# Glob patterns selecting the source files that get indexed.
INCLUDED_PATTERNS = [
    # Python sources and stubs
    "*.py",
    "*.pyi",
    # JavaScript / TypeScript (including React variants)
    "*.js",
    "*.jsx",
    "*.ts",
    "*.tsx",
    # JavaScript ES modules / CommonJS
    "*.mjs",
    "*.cjs",
    # Rust
    "*.rs",
    # Go
    "*.go",
]

# Glob patterns for paths that are never indexed.
EXCLUDED_PATTERNS = [
    # Hidden directories
    ".*/**",
    # Caches, dependency trees and build output
    "**/__pycache__/**",
    "**/node_modules/**",
    "**/target/**",
    "**/dist/**",
    "**/build/**",
    "**/vendor/**",
    # Version control and our own index directory
    "**/.git/**",
    "**/.cocoindex_code/**",
    # Minified assets
    "*.min.js",
    "*.min.css",
    # Lock / checksum files
    "*.lock",
    "**/package-lock.json",
    "**/yarn.lock",
    "**/Cargo.lock",
    "**/go.sum",
    # Compiled Python and native binaries
    "**/*.pyc",
    "**/*.pyo",
    "**/*.so",
    "**/*.dylib",
    "**/*.dll",
]

# Chunking parameters handed to the splitter.
CHUNK_SIZE = 1000
MIN_CHUNK_SIZE = 300
CHUNK_OVERLAP = 200
|
|
56
|
+
|
|
57
|
+
# Chunking splitter (stateless, can be module-level)
|
|
58
|
+
splitter = RecursiveSplitter()
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@coco_aio.function
async def process_chunk(
    file_path: str,
    chunk: Chunk,
    language: str,
    id_gen: IdGenerator,
    table: sqlite.TableTarget,
) -> None:
    """Embed one chunk and declare the resulting row on the target table.

    Args:
        file_path: Path of the file the chunk came from, stored as-is.
        chunk: The chunk to index; its text and start/end lines are recorded.
        language: Language label stored alongside the chunk.
        id_gen: Generator that supplies the row ID from the chunk text.
        table: Table target the row is declared on.
    """
    text = chunk.text
    # ID generation and embedding do not depend on each other, so await both
    # together.
    chunk_id, vector = await asyncio.gather(
        id_gen.next_id(text),
        embedder.embed(text),
    )
    table.declare_row(
        row=CodeChunk(  # type: ignore[arg-type]
            id=chunk_id,
            file_path=file_path,
            language=language,
            content=text,
            start_line=chunk.start.line,
            end_line=chunk.end.line,
            embedding=vector,
        )
    )
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
@coco_aio.function(memo=True)
async def process_file(
    file: localfs.File,
    table: sqlite.TableTarget,
) -> None:
    """Split one file into chunks and index every chunk.

    Files that cannot be decoded as text, and files containing only
    whitespace, are skipped without declaring any rows.

    Args:
        file: The file to index.
        table: Table target that receives one row per chunk.
    """
    try:
        content = file.read_text()
    except UnicodeDecodeError:
        # Not decodable as text: treat as binary and skip.
        return

    if not content.strip():
        return

    source_path = file.file_path.path
    # Fall back to a generic label when the language cannot be detected from
    # the file name.
    language = detect_code_language(filename=source_path.name) or "text"

    pieces = splitter.split(
        content,
        chunk_size=CHUNK_SIZE,
        min_chunk_size=MIN_CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        language=language,
    )

    # One ID generator per file, shared by all of that file's chunks.
    id_gen = IdGenerator()
    path_str = str(source_path)
    pending = [process_chunk(path_str, piece, language, id_gen, table) for piece in pieces]
    await asyncio.gather(*pending)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
@coco_aio.function
async def app_main() -> None:
    """Entry point of the indexing app: set up the table, then mount each file.

    Declares the ``code_chunks`` table target (primary key ``id``) on the
    SQLite database taken from the CocoIndex context, walks the configured
    codebase root with the include/exclude patterns, and mounts
    ``process_file`` for every file found.
    """
    db = coco_aio.use_context(SQLITE_DB)

    # Declare the table target that stores the chunk rows and embeddings.
    setup_path = coco_aio.component_subpath("setup", "table")
    schema = await sqlite.TableSchema.from_class(
        CodeChunk,
        primary_key=["id"],
    )
    table = await coco_aio.mount_run(
        setup_path,
        db.declare_table_target,
        table_name="code_chunks",
        table_schema=schema,
    ).result()

    # Enumerate candidate source files under the codebase root.
    matcher = PatternFilePathMatcher(
        included_patterns=INCLUDED_PATTERNS,
        excluded_patterns=EXCLUDED_PATTERNS,
    )
    files = localfs.walk_dir(
        config.codebase_root_path,
        recursive=True,
        path_matcher=matcher,
    )

    # Each file is mounted under its own component subpath, keyed by its path.
    for f in files:
        coco_aio.mount(
            coco_aio.component_subpath("process", str(f.file_path.path)),
            process_file,
            f,
            table,
        )
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
# Create the app
|
|
161
|
+
app = coco_aio.App(
|
|
162
|
+
coco_aio.AppConfig(name="CocoIndexCode"),
|
|
163
|
+
app_main,
|
|
164
|
+
)
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
"""Query implementation for codebase search."""
|
|
2
|
+
|
|
3
|
+
import cocoindex as coco
|
|
4
|
+
|
|
5
|
+
from .schema import QueryResult
|
|
6
|
+
from .shared import SQLITE_DB, config, embedder
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
async def query_codebase(
    query: str,
    limit: int = 10,
    offset: int = 0,
) -> list[QueryResult]:
    """Run a vector similarity search over the indexed code chunks.

    The query text is embedded and compared against stored embeddings with
    sqlite-vec's ``vec_distance_cosine``; rows are ordered by ascending
    distance and the reported score is ``1 - distance``.

    Args:
        query: Text to search for.
        limit: Maximum number of rows to return.
        offset: Number of leading rows to skip.

    Returns:
        One ``QueryResult`` per matching row, best match first.

    Raises:
        RuntimeError: If the index database file does not exist yet.
    """
    db_path = config.target_sqlite_db_path
    if not db_path.exists():
        raise RuntimeError(
            f"Index database not found at {db_path}. "
            "Please run a query with refresh_index=True first."
        )

    # The database handle is registered in the CocoIndex environment.
    db = coco.default_env().get_context(SQLITE_DB)

    # sqlite-vec takes the vector as raw float32 bytes.
    query_embedding = await embedder.embed(query)
    embedding_bytes = query_embedding.astype("float32").tobytes()

    # vec_distance_cosine returns a distance (lower is better), so the
    # selected score is converted to a similarity via 1 - distance.
    sql = """
        SELECT
            file_path,
            language,
            content,
            start_line,
            end_line,
            (1.0 - vec_distance_cosine(embedding, ?)) as score
        FROM code_chunks
        ORDER BY vec_distance_cosine(embedding, ?) ASC
        LIMIT ? OFFSET ?
        """
    params = (embedding_bytes, embedding_bytes, limit, offset)

    with db.value.readonly() as conn:
        rows = conn.execute(sql, params).fetchall()

    return [
        QueryResult(
            file_path=file_path,
            language=language,
            content=content,
            start_line=start_line,
            end_line=end_line,
            score=score,
        )
        for file_path, language, content, start_line, end_line, score in rows
    ]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Data models for CocoIndex Code."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class CodeChunk:
    """Row model for one indexed code chunk stored in SQLite."""

    # Row identifier.
    id: int
    # Path of the file the chunk was taken from.
    file_path: str
    # Language label of the chunk.
    language: str
    # The chunk's text.
    content: str
    # Line on which the chunk starts.
    start_line: int
    # Line on which the chunk ends.
    end_line: int
    # Embedding vector; typed as Any (rather than NDArray) for compatibility.
    embedding: Any
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@dataclass
class QueryResult:
    """One hit returned by a vector similarity query."""

    # Path of the file the chunk was taken from.
    file_path: str
    # Language label of the chunk.
    language: str
    # The chunk's text.
    content: str
    # Line on which the chunk starts.
    start_line: int
    # Line on which the chunk ends.
    end_line: int
    # Similarity score of the chunk for the query.
    score: float
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""MCP server for codebase indexing and querying."""
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
|
|
5
|
+
from mcp.server.fastmcp import FastMCP
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
from .indexer import app as indexer_app
|
|
9
|
+
from .query import query_codebase
|
|
10
|
+
|
|
11
|
+
# Initialize MCP server
|
|
12
|
+
mcp = FastMCP(
|
|
13
|
+
"cocoindex-code",
|
|
14
|
+
instructions="""
|
|
15
|
+
This server provides semantic code search for the codebase.
|
|
16
|
+
This allows you to quickly and cheaply search for code related to a concept or functionality
|
|
17
|
+
across the entire codebase.
|
|
18
|
+
|
|
19
|
+
Use the `query` tool when you need to:
|
|
20
|
+
- Find code related to a concept or functionality
|
|
21
|
+
- Search for implementations of specific features
|
|
22
|
+
- Discover how something is done in the codebase
|
|
23
|
+
- Find similar code patterns
|
|
24
|
+
|
|
25
|
+
The `query` tool has a `refresh_index` parameter (default: True) that refreshes
|
|
26
|
+
the index before searching. Set it to False for consecutive queries to avoid
|
|
27
|
+
redundant refreshes.
|
|
28
|
+
|
|
29
|
+
The search uses vector embeddings for semantic similarity, so you can describe
|
|
30
|
+
what you're looking for in natural language rather than exact text matches.
|
|
31
|
+
""".strip(),
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
# Lock to prevent concurrent index updates
|
|
35
|
+
_index_lock = asyncio.Lock()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
async def _refresh_index() -> None:
    """Bring the index up to date, one update at a time.

    The module-level lock serializes callers: a second call waits for the
    update in progress to finish and then runs its own update.
    """
    async with _index_lock:
        # Progress output to stdout is disabled for this update.
        await indexer_app.update(report_to_stdout=False)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# === Pydantic Models for Tool Inputs/Outputs ===
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class CodeChunkResult(BaseModel):
    """A single code chunk result."""

    # NOTE(review): the value is passed through unchanged from the index;
    # confirm that stored paths really are relative, as the description says.
    file_path: str = Field(
        description="Relative path to the file",
    )
    language: str = Field(
        description="Programming language",
    )
    content: str = Field(
        description="The code content",
    )
    start_line: int = Field(
        description="Starting line number (1-indexed)",
    )
    end_line: int = Field(
        description="Ending line number (1-indexed)",
    )
    # NOTE(review): the query computes this as 1 - cosine distance; confirm
    # the documented 0-1 range, since that expression can go below zero.
    score: float = Field(
        description="Similarity score (0-1, higher is better)",
    )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class QueryResultModel(BaseModel):
    """Result from query tool."""

    # True when the query ran; False when ``message`` carries an error.
    success: bool
    # Matching chunks; left empty on failure.
    results: list[CodeChunkResult] = Field(default_factory=list)
    # Number of entries in ``results``.
    total_returned: int = Field(default=0)
    # Pagination offset the query was run with.
    offset: int = Field(default=0)
    # Error text on failure; None on success.
    message: str | None = None
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# === MCP Tools ===
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@mcp.tool(
    name="query",
    description=(
        "Search the codebase using semantic similarity. "
        "Returns relevant code chunks with file locations and similarity scores. "
        "Use natural language queries or code snippets to find related code."
    ),
)
async def query(
    query: str = Field(description="Natural language query or code snippet to search for"),
    limit: int = Field(
        default=10,
        ge=1,
        le=100,
        description="Maximum number of results to return (1-100)",
    ),
    offset: int = Field(
        default=0,
        ge=0,
        description="Number of results to skip for pagination",
    ),
    refresh_index: bool = Field(
        default=True,
        description=(
            "Whether to refresh the index before querying. "
            "Set to False for consecutive queries to skip redundant refreshes."
        ),
    ),
) -> QueryResultModel:
    """Query the codebase index.

    Optionally refreshes the index, runs the similarity search, and wraps the
    hits in a ``QueryResultModel``. Failures are never raised to the caller;
    they come back as ``success=False`` with an explanatory ``message``.
    """
    try:
        if refresh_index:
            await _refresh_index()

        hits = await query_codebase(query=query, limit=limit, offset=offset)

        # Convert the internal dataclass hits into the tool's output model.
        chunks = [
            CodeChunkResult(
                file_path=hit.file_path,
                language=hit.language,
                content=hit.content,
                start_line=hit.start_line,
                end_line=hit.end_line,
                score=hit.score,
            )
            for hit in hits
        ]
        return QueryResultModel(
            success=True,
            results=chunks,
            total_returned=len(hits),
            offset=offset,
        )
    except RuntimeError as e:
        # Raised by query_codebase when the index database does not exist;
        # the message is returned verbatim.
        return QueryResultModel(
            success=False,
            message=str(e),
        )
    except Exception as e:
        return QueryResultModel(
            success=False,
            message=f"Query failed: {e!s}",
        )
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
async def _async_main() -> None:
    """Async entry point for the MCP server.

    Kicks off an initial index refresh as a background task so that server
    startup is not blocked, then runs the MCP server over stdio until it
    exits.
    """
    # Hold a strong reference to the task for as long as the server runs: the
    # event loop keeps only weak references to tasks, so a task whose result
    # is discarded may be garbage-collected before it finishes.
    refresh_task = asyncio.create_task(_refresh_index())  # noqa: F841
    await mcp.run_stdio_async()
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def main() -> None:
    """Run the MCP server: drive ``_async_main`` on a fresh event loop."""
    server_coro = _async_main()
    asyncio.run(server_coro)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# Start the server when this module is executed directly as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Shared resources for CocoIndex Code."""
|
|
2
|
+
|
|
3
|
+
from collections.abc import Iterator
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import Annotated
|
|
6
|
+
|
|
7
|
+
import cocoindex as coco
|
|
8
|
+
from cocoindex.connectors import sqlite
|
|
9
|
+
from cocoindex.ops.sentence_transformers import SentenceTransformerEmbedder
|
|
10
|
+
from numpy.typing import NDArray
|
|
11
|
+
|
|
12
|
+
from .config import Config
|
|
13
|
+
|
|
14
|
+
# Load configuration once, at import time; every importer of this module
# shares this single Config instance.
config = Config.from_env()
|
|
16
|
+
|
|
17
|
+
# Build the embedder at import time from config.embedding_model. The same
# object is attached as Annotated metadata to CodeChunk.embedding below.
embedder = SentenceTransformerEmbedder(config.embedding_model)
|
|
19
|
+
|
|
20
|
+
# Context key under which coco_lifespan provides the SQLite database; the
# underlying connection is opened and closed inside that lifespan function.
SQLITE_DB = coco.ContextKey[sqlite.SqliteDatabase]("sqlite_db")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@coco.lifespan
def coco_lifespan(builder: coco.EnvironmentBuilder) -> Iterator[None]:
    """Set up database connection.

    Creates the index directory if needed, points the CocoIndex state
    database at ``config.cocoindex_db_path``, opens the target SQLite
    database and provides it under ``SQLITE_DB``. Control is then yielded
    until the lifespan ends, and the connection is closed afterwards.

    Args:
        builder: Environment builder whose settings are updated and on which
            the SQLite database is provided.

    Yields:
        None, once setup has completed.
    """
    # Ensure index directory exists
    config.index_dir.mkdir(parents=True, exist_ok=True)

    # Set CocoIndex state database path
    builder.settings.db_path = config.cocoindex_db_path

    # Connect to SQLite with vector extension
    conn = sqlite.connect(str(config.target_sqlite_db_path), load_vec="auto")
    try:
        builder.provide(SQLITE_DB, sqlite.register_db("index_db", conn))
        yield
    finally:
        # Close the connection even if registration fails or an exception is
        # thrown into the generator at the yield; otherwise it would leak.
        conn.close()
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class CodeChunk:
    """Schema for storing code chunks in SQLite."""

    # Integer identifier of the chunk; presumably the table's key — TODO confirm
    # against the indexer that writes these rows.
    id: int
    # File the chunk was taken from, and that file's language.
    file_path: str
    language: str
    # Chunk text and the line span it covers.
    # NOTE(review): line numbers look 1-indexed, matching the query result
    # model's field descriptions — confirm against the indexer.
    content: str
    start_line: int
    end_line: int
    # Embedding vector for the chunk. The module-level `embedder` is attached as
    # Annotated metadata rather than being part of the runtime type.
    embedding: Annotated[NDArray, embedder]  # type: ignore[type-arg]
|