cocoindex-code 0.1.7__tar.gz → 0.1.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {cocoindex_code-0.1.7 → cocoindex_code-0.1.9}/PKG-INFO +21 -2
- {cocoindex_code-0.1.7 → cocoindex_code-0.1.9}/README.md +19 -0
- {cocoindex_code-0.1.7 → cocoindex_code-0.1.9}/pyproject.toml +1 -5
- {cocoindex_code-0.1.7 → cocoindex_code-0.1.9}/src/cocoindex_code/config.py +14 -15
- {cocoindex_code-0.1.7 → cocoindex_code-0.1.9}/src/cocoindex_code/indexer.py +26 -9
- cocoindex_code-0.1.9/src/cocoindex_code/query.py +148 -0
- {cocoindex_code-0.1.7 → cocoindex_code-0.1.9}/src/cocoindex_code/server.py +42 -21
- {cocoindex_code-0.1.7 → cocoindex_code-0.1.9}/src/cocoindex_code/shared.py +12 -7
- cocoindex_code-0.1.7/src/cocoindex_code/embedder.py +0 -117
- cocoindex_code-0.1.7/src/cocoindex_code/query.py +0 -71
- {cocoindex_code-0.1.7 → cocoindex_code-0.1.9}/.gitignore +0 -0
- {cocoindex_code-0.1.7 → cocoindex_code-0.1.9}/LICENSE +0 -0
- {cocoindex_code-0.1.7 → cocoindex_code-0.1.9}/src/cocoindex_code/__init__.py +0 -0
- {cocoindex_code-0.1.7 → cocoindex_code-0.1.9}/src/cocoindex_code/__main__.py +0 -0
- {cocoindex_code-0.1.7 → cocoindex_code-0.1.9}/src/cocoindex_code/schema.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: cocoindex-code
|
|
3
|
-
Version: 0.1.7
|
|
3
|
+
Version: 0.1.9
|
|
4
4
|
Summary: MCP server for indexing and querying codebases using CocoIndex
|
|
5
5
|
Project-URL: Homepage, https://github.com/cocoindex-io/cocoindex-code
|
|
6
6
|
Project-URL: Repository, https://github.com/cocoindex-io/cocoindex-code
|
|
@@ -17,7 +17,7 @@ Classifier: Programming Language :: Python :: 3.12
|
|
|
17
17
|
Classifier: Programming Language :: Python :: 3.13
|
|
18
18
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
19
19
|
Requires-Python: >=3.11
|
|
20
|
-
Requires-Dist: cocoindex[litellm]==1.0.
|
|
20
|
+
Requires-Dist: cocoindex[litellm]==1.0.0a24
|
|
21
21
|
Requires-Dist: einops>=0.8.2
|
|
22
22
|
Requires-Dist: mcp>=1.0.0
|
|
23
23
|
Requires-Dist: numpy>=1.24.0
|
|
@@ -40,6 +40,10 @@ Description-Content-Type: text/markdown
|
|
|
40
40
|
|
|
41
41
|
<h1 align="center">light weight MCP for code that just works </h1>
|
|
42
42
|
|
|
43
|
+

|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
|
|
43
47
|
|
|
44
48
|
A super light-weight, effective embedded MCP **(AST-based)** that understands and searches your codebase that just works! Using [CocoIndex](https://github.com/cocoindex-io/cocoindex) - a Rust-based ultra performant data transformation engine. No blackbox. Works for Claude, Codex, Cursor - any coding agent.
|
|
45
49
|
|
|
@@ -116,6 +120,20 @@ Or use opencode.json:
|
|
|
116
120
|
|
|
117
121
|
Optionally, you can run `cocoindex-code index` to create or update the index. Without running it, the MCP server will automatically build and keep the index up-to-date in the background.
|
|
118
122
|
|
|
123
|
+
## When Is the MCP Triggered?
|
|
124
|
+
|
|
125
|
+
Once configured, your coding agent (Claude Code, Codex, Cursor, etc.) automatically decides when semantic code search is helpful — especially for finding code by description, exploring unfamiliar codebases, fuzzy/conceptual matches, or locating implementations without knowing exact names.
|
|
126
|
+
|
|
127
|
+
You can also nudge the agent explicitly, e.g. *"Use the cocoindex-code MCP to find how user sessions are managed."* For persistent instructions, add guidance to your project's `AGENTS.md` or `CLAUDE.md`:
|
|
128
|
+
|
|
129
|
+
```
|
|
130
|
+
Use the cocoindex-code MCP server for semantic code search when:
|
|
131
|
+
- Searching for code by meaning or description rather than exact text
|
|
132
|
+
- Exploring unfamiliar parts of the codebase
|
|
133
|
+
- Looking for implementations without knowing exact names
|
|
134
|
+
- Finding similar code patterns or related functionality
|
|
135
|
+
```
|
|
136
|
+
|
|
119
137
|
## Features
|
|
120
138
|
- **Semantic Code Search**: Find relevant code using natural language queries when grep doesn't work well, and save tokens immediately.
|
|
121
139
|
- **Ultra Performant to code changes**:⚡ Built on top of ultra performant [Rust indexing engine](https://github.com/cocoindex-io/cocoindex/edit/main/README.md). Only re-indexes changed files for fast updates.
|
|
@@ -131,6 +149,7 @@ Optionally, you can run `cocoindex-code index` to create or update the index. Wi
|
|
|
131
149
|
| `COCOINDEX_CODE_ROOT_PATH` | Root path of the codebase | Auto-discovered (see below) |
|
|
132
150
|
| `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model (see below) | `sbert/sentence-transformers/all-MiniLM-L6-v2` |
|
|
133
151
|
| `COCOINDEX_CODE_BATCH_SIZE` | Max batch size for local embedding model | `16` |
|
|
152
|
+
| `COCOINDEX_CODE_EXTRA_EXTENSIONS` | Additional file extensions to index (comma-separated, e.g. `"inc:php,yaml,toml"` — use `ext:lang` to override language detection) | _(none)_ |
|
|
134
153
|
|
|
135
154
|
|
|
136
155
|
### Root Path Discovery
|
|
@@ -5,6 +5,10 @@
|
|
|
5
5
|
|
|
6
6
|
<h1 align="center">light weight MCP for code that just works </h1>
|
|
7
7
|
|
|
8
|
+

|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
8
12
|
|
|
9
13
|
A super light-weight, effective embedded MCP **(AST-based)** that understands and searches your codebase that just works! Using [CocoIndex](https://github.com/cocoindex-io/cocoindex) - a Rust-based ultra performant data transformation engine. No blackbox. Works for Claude, Codex, Cursor - any coding agent.
|
|
10
14
|
|
|
@@ -81,6 +85,20 @@ Or use opencode.json:
|
|
|
81
85
|
|
|
82
86
|
Optionally, you can run `cocoindex-code index` to create or update the index. Without running it, the MCP server will automatically build and keep the index up-to-date in the background.
|
|
83
87
|
|
|
88
|
+
## When Is the MCP Triggered?
|
|
89
|
+
|
|
90
|
+
Once configured, your coding agent (Claude Code, Codex, Cursor, etc.) automatically decides when semantic code search is helpful — especially for finding code by description, exploring unfamiliar codebases, fuzzy/conceptual matches, or locating implementations without knowing exact names.
|
|
91
|
+
|
|
92
|
+
You can also nudge the agent explicitly, e.g. *"Use the cocoindex-code MCP to find how user sessions are managed."* For persistent instructions, add guidance to your project's `AGENTS.md` or `CLAUDE.md`:
|
|
93
|
+
|
|
94
|
+
```
|
|
95
|
+
Use the cocoindex-code MCP server for semantic code search when:
|
|
96
|
+
- Searching for code by meaning or description rather than exact text
|
|
97
|
+
- Exploring unfamiliar parts of the codebase
|
|
98
|
+
- Looking for implementations without knowing exact names
|
|
99
|
+
- Finding similar code patterns or related functionality
|
|
100
|
+
```
|
|
101
|
+
|
|
84
102
|
## Features
|
|
85
103
|
- **Semantic Code Search**: Find relevant code using natural language queries when grep doesn't work well, and save tokens immediately.
|
|
86
104
|
- **Ultra Performant to code changes**:⚡ Built on top of ultra performant [Rust indexing engine](https://github.com/cocoindex-io/cocoindex/edit/main/README.md). Only re-indexes changed files for fast updates.
|
|
@@ -96,6 +114,7 @@ Optionally, you can run `cocoindex-code index` to create or update the index. Wi
|
|
|
96
114
|
| `COCOINDEX_CODE_ROOT_PATH` | Root path of the codebase | Auto-discovered (see below) |
|
|
97
115
|
| `COCOINDEX_CODE_EMBEDDING_MODEL` | Embedding model (see below) | `sbert/sentence-transformers/all-MiniLM-L6-v2` |
|
|
98
116
|
| `COCOINDEX_CODE_BATCH_SIZE` | Max batch size for local embedding model | `16` |
|
|
117
|
+
| `COCOINDEX_CODE_EXTRA_EXTENSIONS` | Additional file extensions to index (comma-separated, e.g. `"inc:php,yaml,toml"` — use `ext:lang` to override language detection) | _(none)_ |
|
|
99
118
|
|
|
100
119
|
|
|
101
120
|
### Root Path Discovery
|
|
@@ -23,7 +23,7 @@ classifiers = [
|
|
|
23
23
|
|
|
24
24
|
dependencies = [
|
|
25
25
|
"mcp>=1.0.0",
|
|
26
|
-
"cocoindex[litellm]==1.0.
|
|
26
|
+
"cocoindex[litellm]==1.0.0a24",
|
|
27
27
|
"sentence-transformers>=2.2.0",
|
|
28
28
|
"sqlite-vec>=0.1.0",
|
|
29
29
|
"pydantic>=2.0.0",
|
|
@@ -82,10 +82,6 @@ python_version = "3.11"
|
|
|
82
82
|
strict = true
|
|
83
83
|
ignore_missing_imports = true
|
|
84
84
|
|
|
85
|
-
[[tool.mypy.overrides]]
|
|
86
|
-
module = "cocoindex_code.embedder"
|
|
87
|
-
warn_unused_ignores = false
|
|
88
|
-
|
|
89
85
|
[tool.pytest.ini_options]
|
|
90
86
|
testpaths = ["tests"]
|
|
91
87
|
python_files = ["test_*.py"]
|
|
@@ -6,7 +6,6 @@ import os
|
|
|
6
6
|
from dataclasses import dataclass
|
|
7
7
|
from pathlib import Path
|
|
8
8
|
|
|
9
|
-
_SBERT_PREFIX = "sbert/"
|
|
10
9
|
_DEFAULT_MODEL = "sbert/sentence-transformers/all-MiniLM-L6-v2"
|
|
11
10
|
|
|
12
11
|
|
|
@@ -65,7 +64,7 @@ class Config:
|
|
|
65
64
|
index_dir: Path
|
|
66
65
|
device: str
|
|
67
66
|
trust_remote_code: bool
|
|
68
|
-
|
|
67
|
+
extra_extensions: dict[str, str | None]
|
|
69
68
|
|
|
70
69
|
@classmethod
|
|
71
70
|
def from_env(cls) -> Config:
|
|
@@ -100,18 +99,18 @@ class Config:
|
|
|
100
99
|
"yes",
|
|
101
100
|
)
|
|
102
101
|
|
|
103
|
-
#
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
102
|
+
# Extra file extensions (format: "inc:php,yaml,toml" — optional lang after colon)
|
|
103
|
+
raw_extra = os.environ.get("COCOINDEX_CODE_EXTRA_EXTENSIONS", "")
|
|
104
|
+
extra_extensions: dict[str, str | None] = {}
|
|
105
|
+
for token in raw_extra.split(","):
|
|
106
|
+
token = token.strip()
|
|
107
|
+
if not token:
|
|
108
|
+
continue
|
|
109
|
+
if ":" in token:
|
|
110
|
+
ext, lang = token.split(":", 1)
|
|
111
|
+
extra_extensions[f".{ext.strip()}"] = lang.strip() or None
|
|
112
|
+
else:
|
|
113
|
+
extra_extensions[f".{token}"] = None
|
|
115
114
|
|
|
116
115
|
return cls(
|
|
117
116
|
codebase_root_path=root,
|
|
@@ -119,7 +118,7 @@ class Config:
|
|
|
119
118
|
index_dir=index_dir,
|
|
120
119
|
device=device,
|
|
121
120
|
trust_remote_code=trust_remote_code,
|
|
122
|
-
|
|
121
|
+
extra_extensions=extra_extensions,
|
|
123
122
|
)
|
|
124
123
|
|
|
125
124
|
@property
|
|
@@ -2,16 +2,17 @@
|
|
|
2
2
|
|
|
3
3
|
import cocoindex as coco
|
|
4
4
|
from cocoindex.connectors import localfs, sqlite
|
|
5
|
+
from cocoindex.connectors.sqlite import Vec0TableDef
|
|
5
6
|
from cocoindex.ops.text import RecursiveSplitter, detect_code_language
|
|
6
7
|
from cocoindex.resources.chunk import Chunk
|
|
7
8
|
from cocoindex.resources.file import PatternFilePathMatcher
|
|
8
9
|
from cocoindex.resources.id import IdGenerator
|
|
9
10
|
|
|
10
11
|
from .config import config
|
|
11
|
-
from .shared import SQLITE_DB, CodeChunk, embedder
|
|
12
|
+
from .shared import CODEBASE_DIR, SQLITE_DB, CodeChunk, embedder
|
|
12
13
|
|
|
13
14
|
# File patterns for supported languages
|
|
14
|
-
|
|
15
|
+
DEFAULT_INCLUDED_PATTERNS = [
|
|
15
16
|
"**/*.py", # Python
|
|
16
17
|
"**/*.pyi", # Python stubs
|
|
17
18
|
"**/*.js", # JavaScript
|
|
@@ -43,6 +44,13 @@ INCLUDED_PATTERNS = [
|
|
|
43
44
|
"**/*.php", # PHP
|
|
44
45
|
]
|
|
45
46
|
|
|
47
|
+
INCLUDED_PATTERNS = DEFAULT_INCLUDED_PATTERNS + [f"**/*{ext}" for ext in config.extra_extensions]
|
|
48
|
+
|
|
49
|
+
# Language overrides from extra_extensions (e.g. ".inc" -> "php")
|
|
50
|
+
LANGUAGE_OVERRIDES: dict[str, str] = {
|
|
51
|
+
ext: lang for ext, lang in config.extra_extensions.items() if lang is not None
|
|
52
|
+
}
|
|
53
|
+
|
|
46
54
|
EXCLUDED_PATTERNS = [
|
|
47
55
|
"**/.*", # Hidden directories
|
|
48
56
|
"**/__pycache__", # Python cache
|
|
@@ -56,9 +64,9 @@ EXCLUDED_PATTERNS = [
|
|
|
56
64
|
]
|
|
57
65
|
|
|
58
66
|
# Chunking configuration
|
|
59
|
-
CHUNK_SIZE =
|
|
60
|
-
MIN_CHUNK_SIZE =
|
|
61
|
-
CHUNK_OVERLAP =
|
|
67
|
+
CHUNK_SIZE = 2000
|
|
68
|
+
MIN_CHUNK_SIZE = 300
|
|
69
|
+
CHUNK_OVERLAP = 200
|
|
62
70
|
|
|
63
71
|
# Chunking splitter (stateless, can be module-level)
|
|
64
72
|
splitter = RecursiveSplitter()
|
|
@@ -66,7 +74,7 @@ splitter = RecursiveSplitter()
|
|
|
66
74
|
|
|
67
75
|
@coco.fn(memo=True)
|
|
68
76
|
async def process_file(
|
|
69
|
-
file: localfs.
|
|
77
|
+
file: localfs.File,
|
|
70
78
|
table: sqlite.TableTarget[CodeChunk],
|
|
71
79
|
) -> None:
|
|
72
80
|
"""Process a single file: chunk, embed, and store."""
|
|
@@ -81,7 +89,12 @@ async def process_file(
|
|
|
81
89
|
return
|
|
82
90
|
|
|
83
91
|
# Get relative path and detect language
|
|
84
|
-
|
|
92
|
+
suffix = file.file_path.path.suffix
|
|
93
|
+
language = (
|
|
94
|
+
LANGUAGE_OVERRIDES.get(suffix)
|
|
95
|
+
or detect_code_language(filename=file.file_path.path.name)
|
|
96
|
+
or "text"
|
|
97
|
+
)
|
|
85
98
|
|
|
86
99
|
# Split into chunks
|
|
87
100
|
chunks = splitter.split(
|
|
@@ -119,16 +132,20 @@ async def app_main() -> None:
|
|
|
119
132
|
|
|
120
133
|
# Declare the table target for storing embeddings
|
|
121
134
|
table = await db.mount_table_target(
|
|
122
|
-
table_name="
|
|
135
|
+
table_name="code_chunks_vec",
|
|
123
136
|
table_schema=await sqlite.TableSchema.from_class(
|
|
124
137
|
CodeChunk,
|
|
125
138
|
primary_key=["id"],
|
|
126
139
|
),
|
|
140
|
+
virtual_table_def=Vec0TableDef(
|
|
141
|
+
partition_key_columns=["language"],
|
|
142
|
+
auxiliary_columns=["file_path", "content", "start_line", "end_line"],
|
|
143
|
+
),
|
|
127
144
|
)
|
|
128
145
|
|
|
129
146
|
# Walk source directory
|
|
130
147
|
files = localfs.walk_dir(
|
|
131
|
-
|
|
148
|
+
coco.use_context(CODEBASE_DIR),
|
|
132
149
|
recursive=True,
|
|
133
150
|
path_matcher=PatternFilePathMatcher(
|
|
134
151
|
included_patterns=INCLUDED_PATTERNS,
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""Query implementation for codebase search."""
|
|
2
|
+
|
|
3
|
+
import heapq
|
|
4
|
+
import sqlite3
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import cocoindex as coco
|
|
8
|
+
|
|
9
|
+
from .config import config
|
|
10
|
+
from .schema import QueryResult
|
|
11
|
+
from .shared import SQLITE_DB, embedder, query_prompt_name
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _l2_to_score(distance: float) -> float:
|
|
15
|
+
"""Convert L2 distance to cosine similarity (exact for unit vectors)."""
|
|
16
|
+
return 1.0 - distance * distance / 2.0
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _knn_query(
|
|
20
|
+
conn: sqlite3.Connection,
|
|
21
|
+
embedding_bytes: bytes,
|
|
22
|
+
k: int,
|
|
23
|
+
language: str | None = None,
|
|
24
|
+
) -> list[tuple[Any, ...]]:
|
|
25
|
+
"""Run a vec0 KNN query, optionally constrained to a language partition."""
|
|
26
|
+
if language is not None:
|
|
27
|
+
return conn.execute(
|
|
28
|
+
"""
|
|
29
|
+
SELECT file_path, language, content, start_line, end_line, distance
|
|
30
|
+
FROM code_chunks_vec
|
|
31
|
+
WHERE embedding MATCH ? AND k = ? AND language = ?
|
|
32
|
+
ORDER BY distance
|
|
33
|
+
""",
|
|
34
|
+
(embedding_bytes, k, language),
|
|
35
|
+
).fetchall()
|
|
36
|
+
return conn.execute(
|
|
37
|
+
"""
|
|
38
|
+
SELECT file_path, language, content, start_line, end_line, distance
|
|
39
|
+
FROM code_chunks_vec
|
|
40
|
+
WHERE embedding MATCH ? AND k = ?
|
|
41
|
+
ORDER BY distance
|
|
42
|
+
""",
|
|
43
|
+
(embedding_bytes, k),
|
|
44
|
+
).fetchall()
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _full_scan_query(
|
|
48
|
+
conn: sqlite3.Connection,
|
|
49
|
+
embedding_bytes: bytes,
|
|
50
|
+
limit: int,
|
|
51
|
+
offset: int,
|
|
52
|
+
languages: list[str] | None = None,
|
|
53
|
+
paths: list[str] | None = None,
|
|
54
|
+
) -> list[tuple[Any, ...]]:
|
|
55
|
+
"""Full scan with SQL-level distance computation and filtering."""
|
|
56
|
+
conditions: list[str] = []
|
|
57
|
+
params: list[Any] = [embedding_bytes]
|
|
58
|
+
|
|
59
|
+
if languages:
|
|
60
|
+
placeholders = ",".join("?" for _ in languages)
|
|
61
|
+
conditions.append(f"language IN ({placeholders})")
|
|
62
|
+
params.extend(languages)
|
|
63
|
+
|
|
64
|
+
if paths:
|
|
65
|
+
path_clauses = " OR ".join("file_path GLOB ?" for _ in paths)
|
|
66
|
+
conditions.append(f"({path_clauses})")
|
|
67
|
+
params.extend(paths)
|
|
68
|
+
|
|
69
|
+
where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
|
|
70
|
+
params.extend([limit, offset])
|
|
71
|
+
|
|
72
|
+
return conn.execute(
|
|
73
|
+
f"""
|
|
74
|
+
SELECT file_path, language, content, start_line, end_line,
|
|
75
|
+
vec_distance_L2(embedding, ?) as distance
|
|
76
|
+
FROM code_chunks_vec
|
|
77
|
+
{where}
|
|
78
|
+
ORDER BY distance
|
|
79
|
+
LIMIT ? OFFSET ?
|
|
80
|
+
""",
|
|
81
|
+
params,
|
|
82
|
+
).fetchall()
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
async def query_codebase(
|
|
86
|
+
query: str,
|
|
87
|
+
limit: int = 10,
|
|
88
|
+
offset: int = 0,
|
|
89
|
+
languages: list[str] | None = None,
|
|
90
|
+
paths: list[str] | None = None,
|
|
91
|
+
) -> list[QueryResult]:
|
|
92
|
+
"""
|
|
93
|
+
Perform vector similarity search using vec0 KNN index.
|
|
94
|
+
|
|
95
|
+
Uses sqlite-vec's vec0 virtual table for indexed nearest-neighbor search.
|
|
96
|
+
Language filtering uses vec0 partition keys for exact index-level filtering.
|
|
97
|
+
Path filtering triggers a full scan with distance computation.
|
|
98
|
+
"""
|
|
99
|
+
if not config.target_sqlite_db_path.exists():
|
|
100
|
+
raise RuntimeError(
|
|
101
|
+
f"Index database not found at {config.target_sqlite_db_path}. "
|
|
102
|
+
"Please run a query with refresh_index=True first."
|
|
103
|
+
)
|
|
104
|
+
|
|
105
|
+
coco_env = await coco.default_env()
|
|
106
|
+
db = coco_env.get_context(SQLITE_DB)
|
|
107
|
+
|
|
108
|
+
# Generate query embedding.
|
|
109
|
+
query_embedding = await embedder.embed(query, True, query_prompt_name)
|
|
110
|
+
|
|
111
|
+
embedding_bytes = query_embedding.astype("float32").tobytes()
|
|
112
|
+
|
|
113
|
+
with db.value.readonly() as conn:
|
|
114
|
+
if paths:
|
|
115
|
+
# Path filter → full scan (vec0 can't filter on auxiliary columns).
|
|
116
|
+
# LIMIT/OFFSET handled in SQL.
|
|
117
|
+
rows = _full_scan_query(conn, embedding_bytes, limit, offset, languages, paths)
|
|
118
|
+
elif not languages or len(languages) == 1:
|
|
119
|
+
# Single language or no filter: one KNN query.
|
|
120
|
+
lang = languages[0] if languages else None
|
|
121
|
+
rows = _knn_query(conn, embedding_bytes, limit + offset, lang)
|
|
122
|
+
else:
|
|
123
|
+
# Multiple languages: separate KNN per partition, merge by distance.
|
|
124
|
+
fetch_k = limit + offset
|
|
125
|
+
rows = heapq.nsmallest(
|
|
126
|
+
fetch_k,
|
|
127
|
+
(
|
|
128
|
+
row
|
|
129
|
+
for lang in languages
|
|
130
|
+
for row in _knn_query(conn, embedding_bytes, fetch_k, lang)
|
|
131
|
+
),
|
|
132
|
+
key=lambda r: r[5], # distance column
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
if not paths:
|
|
136
|
+
rows = rows[offset:]
|
|
137
|
+
|
|
138
|
+
return [
|
|
139
|
+
QueryResult(
|
|
140
|
+
file_path=file_path,
|
|
141
|
+
language=language,
|
|
142
|
+
content=content,
|
|
143
|
+
start_line=start_line,
|
|
144
|
+
end_line=end_line,
|
|
145
|
+
score=_l2_to_score(distance),
|
|
146
|
+
)
|
|
147
|
+
for file_path, language, content, start_line, end_line, distance in rows
|
|
148
|
+
]
|
|
@@ -2,14 +2,15 @@
|
|
|
2
2
|
|
|
3
3
|
import argparse
|
|
4
4
|
import asyncio
|
|
5
|
-
import sqlite3
|
|
6
5
|
|
|
6
|
+
import cocoindex as coco
|
|
7
7
|
from mcp.server.fastmcp import FastMCP
|
|
8
8
|
from pydantic import BaseModel, Field
|
|
9
9
|
|
|
10
10
|
from .config import config
|
|
11
11
|
from .indexer import app as indexer_app
|
|
12
12
|
from .query import query_codebase
|
|
13
|
+
from .shared import SQLITE_DB
|
|
13
14
|
|
|
14
15
|
# Initialize MCP server
|
|
15
16
|
mcp = FastMCP(
|
|
@@ -76,6 +77,8 @@ class SearchResultModel(BaseModel):
|
|
|
76
77
|
" or code snippets."
|
|
77
78
|
" Returns matching code chunks with file paths,"
|
|
78
79
|
" line numbers, and relevance scores."
|
|
80
|
+
" Start with a small limit (e.g., 5);"
|
|
81
|
+
" if most results look relevant, use offset to paginate for more."
|
|
79
82
|
),
|
|
80
83
|
)
|
|
81
84
|
async def search(
|
|
@@ -89,7 +92,7 @@ async def search(
|
|
|
89
92
|
)
|
|
90
93
|
),
|
|
91
94
|
limit: int = Field(
|
|
92
|
-
default=
|
|
95
|
+
default=5,
|
|
93
96
|
ge=1,
|
|
94
97
|
le=100,
|
|
95
98
|
description="Maximum number of results to return (1-100)",
|
|
@@ -107,6 +110,17 @@ async def search(
|
|
|
107
110
|
" when the codebase hasn't changed."
|
|
108
111
|
),
|
|
109
112
|
),
|
|
113
|
+
languages: list[str] | None = Field(
|
|
114
|
+
default=None,
|
|
115
|
+
description=("Filter by programming language(s). Example: ['python', 'typescript']"),
|
|
116
|
+
),
|
|
117
|
+
paths: list[str] | None = Field(
|
|
118
|
+
default=None,
|
|
119
|
+
description=(
|
|
120
|
+
"Filter by file path pattern(s) using GLOB wildcards (* and ?)."
|
|
121
|
+
" Example: ['src/utils/*', '*.py']"
|
|
122
|
+
),
|
|
123
|
+
),
|
|
110
124
|
) -> SearchResultModel:
|
|
111
125
|
"""Query the codebase index."""
|
|
112
126
|
try:
|
|
@@ -114,7 +128,13 @@ async def search(
|
|
|
114
128
|
if refresh_index:
|
|
115
129
|
await _refresh_index()
|
|
116
130
|
|
|
117
|
-
results = await query_codebase(
|
|
131
|
+
results = await query_codebase(
|
|
132
|
+
query=query,
|
|
133
|
+
limit=limit,
|
|
134
|
+
offset=offset,
|
|
135
|
+
languages=languages,
|
|
136
|
+
paths=paths,
|
|
137
|
+
)
|
|
118
138
|
|
|
119
139
|
return SearchResultModel(
|
|
120
140
|
success=True,
|
|
@@ -155,35 +175,36 @@ async def _async_serve() -> None:
|
|
|
155
175
|
async def _async_index() -> None:
|
|
156
176
|
"""Async entry point for the index command."""
|
|
157
177
|
await indexer_app.update(report_to_stdout=True)
|
|
158
|
-
_print_index_stats()
|
|
178
|
+
await _print_index_stats()
|
|
159
179
|
|
|
160
180
|
|
|
161
|
-
def _print_index_stats() -> None:
|
|
181
|
+
async def _print_index_stats() -> None:
|
|
162
182
|
"""Print index statistics from the database."""
|
|
163
183
|
db_path = config.target_sqlite_db_path
|
|
164
184
|
if not db_path.exists():
|
|
165
185
|
print("No index database found.")
|
|
166
186
|
return
|
|
167
187
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
188
|
+
coco_env = await coco.default_env()
|
|
189
|
+
db = coco_env.get_context(SQLITE_DB)
|
|
190
|
+
|
|
191
|
+
with db.value.readonly() as conn:
|
|
192
|
+
total_chunks = conn.execute("SELECT COUNT(*) FROM code_chunks_vec").fetchone()[0]
|
|
193
|
+
total_files = conn.execute(
|
|
194
|
+
"SELECT COUNT(DISTINCT file_path) FROM code_chunks_vec"
|
|
195
|
+
).fetchone()[0]
|
|
174
196
|
langs = conn.execute(
|
|
175
|
-
"SELECT language, COUNT(*) as cnt FROM
|
|
197
|
+
"SELECT language, COUNT(*) as cnt FROM code_chunks_vec"
|
|
198
|
+
" GROUP BY language ORDER BY cnt DESC"
|
|
176
199
|
).fetchall()
|
|
177
200
|
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
finally:
|
|
186
|
-
conn.close()
|
|
201
|
+
print("\nIndex stats:")
|
|
202
|
+
print(f" Chunks: {total_chunks}")
|
|
203
|
+
print(f" Files: {total_files}")
|
|
204
|
+
if langs:
|
|
205
|
+
print(" Languages:")
|
|
206
|
+
for lang, count in langs:
|
|
207
|
+
print(f" {lang}: {count} chunks")
|
|
187
208
|
|
|
188
209
|
|
|
189
210
|
def main() -> None:
|
|
@@ -9,12 +9,12 @@ from typing import TYPE_CHECKING, Annotated
|
|
|
9
9
|
|
|
10
10
|
import cocoindex as coco
|
|
11
11
|
from cocoindex.connectors import sqlite
|
|
12
|
+
from cocoindex.connectors.localfs import FilePath, register_base_dir
|
|
12
13
|
from numpy.typing import NDArray
|
|
13
14
|
|
|
14
15
|
if TYPE_CHECKING:
|
|
15
16
|
from cocoindex.ops.litellm import LiteLLMEmbedder
|
|
16
|
-
|
|
17
|
-
from .embedder import LocalEmbedder
|
|
17
|
+
from cocoindex.ops.sentence_transformers import SentenceTransformerEmbedder
|
|
18
18
|
|
|
19
19
|
from .config import config
|
|
20
20
|
|
|
@@ -23,22 +23,21 @@ logger = logging.getLogger(__name__)
|
|
|
23
23
|
SBERT_PREFIX = "sbert/"
|
|
24
24
|
|
|
25
25
|
# Initialize embedder at module level based on model prefix
|
|
26
|
-
embedder:
|
|
26
|
+
embedder: SentenceTransformerEmbedder | LiteLLMEmbedder
|
|
27
27
|
if config.embedding_model.startswith(SBERT_PREFIX):
|
|
28
|
-
from .
|
|
28
|
+
from cocoindex.ops.sentence_transformers import SentenceTransformerEmbedder
|
|
29
29
|
|
|
30
30
|
_model_name = config.embedding_model[len(SBERT_PREFIX) :]
|
|
31
31
|
# Models that define a "query" prompt for asymmetric retrieval.
|
|
32
32
|
_QUERY_PROMPT_MODELS = {"nomic-ai/nomic-embed-code", "nomic-ai/CodeRankEmbed"}
|
|
33
|
-
|
|
33
|
+
query_prompt_name: str | None = "query" if _model_name in _QUERY_PROMPT_MODELS else None
|
|
34
34
|
# Models whose custom remote code is known-compatible with transformers 5.x.
|
|
35
35
|
_KNOWN_REMOTE_CODE_MODELS = {"nomic-ai/CodeRankEmbed"}
|
|
36
36
|
_trust = config.trust_remote_code or _model_name in _KNOWN_REMOTE_CODE_MODELS
|
|
37
|
-
embedder =
|
|
37
|
+
embedder = SentenceTransformerEmbedder(
|
|
38
38
|
_model_name,
|
|
39
39
|
device=config.device,
|
|
40
40
|
trust_remote_code=_trust,
|
|
41
|
-
query_prompt_name=_query_prompt_name,
|
|
42
41
|
)
|
|
43
42
|
logger.info(
|
|
44
43
|
"Embedding model: %s | device: %s | trust_remote_code: %s",
|
|
@@ -50,10 +49,13 @@ else:
|
|
|
50
49
|
from cocoindex.ops.litellm import LiteLLMEmbedder
|
|
51
50
|
|
|
52
51
|
embedder = LiteLLMEmbedder(config.embedding_model)
|
|
52
|
+
query_prompt_name = None
|
|
53
53
|
logger.info("Embedding model (LiteLLM): %s", config.embedding_model)
|
|
54
54
|
|
|
55
55
|
# Context key for SQLite database (connection managed in lifespan)
|
|
56
56
|
SQLITE_DB = coco.ContextKey[sqlite.SqliteDatabase]("sqlite_db")
|
|
57
|
+
# Context key for codebase root directory (provided in lifespan)
|
|
58
|
+
CODEBASE_DIR = coco.ContextKey[FilePath]("codebase_dir")
|
|
57
59
|
|
|
58
60
|
|
|
59
61
|
@coco.lifespan
|
|
@@ -65,6 +67,9 @@ def coco_lifespan(builder: coco.EnvironmentBuilder) -> Iterator[None]:
|
|
|
65
67
|
# Set CocoIndex state database path
|
|
66
68
|
builder.settings.db_path = config.cocoindex_db_path
|
|
67
69
|
|
|
70
|
+
# Provide codebase root directory to environment
|
|
71
|
+
builder.provide(CODEBASE_DIR, register_base_dir("codebase", config.codebase_root_path))
|
|
72
|
+
|
|
68
73
|
# Connect to SQLite with vector extension
|
|
69
74
|
conn = sqlite.connect(str(config.target_sqlite_db_path), load_vec="auto")
|
|
70
75
|
builder.provide(SQLITE_DB, sqlite.register_db("index_db", conn))
|
|
@@ -1,117 +0,0 @@
|
|
|
1
|
-
"""Local SentenceTransformer embedder with device and trust_remote_code support."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import threading
|
|
6
|
-
from typing import TYPE_CHECKING, Any
|
|
7
|
-
|
|
8
|
-
import cocoindex as coco
|
|
9
|
-
import numpy as np
|
|
10
|
-
from cocoindex.resources import schema as _schema
|
|
11
|
-
from numpy.typing import NDArray
|
|
12
|
-
|
|
13
|
-
from .config import config as _config
|
|
14
|
-
|
|
15
|
-
if TYPE_CHECKING:
|
|
16
|
-
from sentence_transformers import SentenceTransformer
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
class LocalEmbedder(_schema.VectorSchemaProvider):
|
|
20
|
-
"""SentenceTransformer embedder with explicit device and trust_remote_code support.
|
|
21
|
-
|
|
22
|
-
Drop-in replacement for cocoindex's SentenceTransformerEmbedder that supports:
|
|
23
|
-
- Explicit device selection (e.g. "cuda", "cpu")
|
|
24
|
-
- trust_remote_code for models with custom pooling (e.g. Jina models)
|
|
25
|
-
"""
|
|
26
|
-
|
|
27
|
-
def __init__(
|
|
28
|
-
self,
|
|
29
|
-
model_name_or_path: str,
|
|
30
|
-
*,
|
|
31
|
-
device: str = "cpu",
|
|
32
|
-
trust_remote_code: bool = False,
|
|
33
|
-
normalize_embeddings: bool = True,
|
|
34
|
-
query_prompt_name: str | None = None,
|
|
35
|
-
) -> None:
|
|
36
|
-
self._model_name_or_path = model_name_or_path
|
|
37
|
-
self._device = device
|
|
38
|
-
self._trust_remote_code = trust_remote_code
|
|
39
|
-
self._normalize_embeddings = normalize_embeddings
|
|
40
|
-
self._query_prompt_name = query_prompt_name
|
|
41
|
-
self._model: SentenceTransformer | None = None
|
|
42
|
-
self._lock = threading.Lock()
|
|
43
|
-
|
|
44
|
-
def __getstate__(self) -> dict[str, Any]:
|
|
45
|
-
return {
|
|
46
|
-
"model_name_or_path": self._model_name_or_path,
|
|
47
|
-
"device": self._device,
|
|
48
|
-
"trust_remote_code": self._trust_remote_code,
|
|
49
|
-
"normalize_embeddings": self._normalize_embeddings,
|
|
50
|
-
"query_prompt_name": self._query_prompt_name,
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
def __setstate__(self, state: dict[str, Any]) -> None:
|
|
54
|
-
self._model_name_or_path = state["model_name_or_path"]
|
|
55
|
-
self._device = state["device"]
|
|
56
|
-
self._trust_remote_code = state["trust_remote_code"]
|
|
57
|
-
self._normalize_embeddings = state["normalize_embeddings"]
|
|
58
|
-
self._query_prompt_name = state.get("query_prompt_name")
|
|
59
|
-
self._model = None
|
|
60
|
-
self._lock = threading.Lock()
|
|
61
|
-
|
|
62
|
-
def _get_model(self) -> SentenceTransformer:
|
|
63
|
-
"""Lazy-load the model with thread-safe double-checked locking."""
|
|
64
|
-
if self._model is None:
|
|
65
|
-
with self._lock:
|
|
66
|
-
if self._model is None:
|
|
67
|
-
from sentence_transformers import SentenceTransformer
|
|
68
|
-
|
|
69
|
-
self._model = SentenceTransformer(
|
|
70
|
-
self._model_name_or_path,
|
|
71
|
-
device=self._device,
|
|
72
|
-
trust_remote_code=self._trust_remote_code,
|
|
73
|
-
)
|
|
74
|
-
return self._model
|
|
75
|
-
|
|
76
|
-
@coco.fn.as_async(batching=True, runner=coco.GPU, memo=True, max_batch_size=_config.batch_size)
|
|
77
|
-
def embed(self, texts: list[str]) -> list[NDArray[np.float32]]:
|
|
78
|
-
"""Embed a batch of texts into float32 vectors."""
|
|
79
|
-
model = self._get_model()
|
|
80
|
-
embeddings: NDArray[np.float32] = model.encode(
|
|
81
|
-
texts,
|
|
82
|
-
convert_to_numpy=True,
|
|
83
|
-
normalize_embeddings=self._normalize_embeddings,
|
|
84
|
-
) # type: ignore[assignment]
|
|
85
|
-
return list(embeddings)
|
|
86
|
-
|
|
87
|
-
@coco.fn.as_async(batching=True, runner=coco.GPU, memo=True, max_batch_size=_config.batch_size)
|
|
88
|
-
def embed_query(self, texts: list[str]) -> list[NDArray[np.float32]]:
|
|
89
|
-
"""Embed query texts, applying query_prompt_name if configured."""
|
|
90
|
-
model = self._get_model()
|
|
91
|
-
embeddings: NDArray[np.float32] = model.encode(
|
|
92
|
-
texts,
|
|
93
|
-
prompt_name=self._query_prompt_name,
|
|
94
|
-
convert_to_numpy=True,
|
|
95
|
-
normalize_embeddings=self._normalize_embeddings,
|
|
96
|
-
) # type: ignore[assignment]
|
|
97
|
-
return list(embeddings)
|
|
98
|
-
|
|
99
|
-
@coco.fn.as_async(runner=coco.GPU, memo=True)
|
|
100
|
-
def __coco_vector_schema__(self) -> _schema.VectorSchema:
|
|
101
|
-
"""Return the vector schema (dimension + dtype) for this model."""
|
|
102
|
-
model = self._get_model()
|
|
103
|
-
dim = model.get_sentence_embedding_dimension()
|
|
104
|
-
if dim is None:
|
|
105
|
-
raise RuntimeError(
|
|
106
|
-
f"Embedding dimension is unknown for model {self._model_name_or_path}."
|
|
107
|
-
)
|
|
108
|
-
return _schema.VectorSchema(dtype=np.dtype(np.float32), size=dim)
|
|
109
|
-
|
|
110
|
-
def __coco_memo_key__(self) -> object:
|
|
111
|
-
return (
|
|
112
|
-
self._model_name_or_path,
|
|
113
|
-
self._device,
|
|
114
|
-
self._trust_remote_code,
|
|
115
|
-
self._normalize_embeddings,
|
|
116
|
-
self._query_prompt_name,
|
|
117
|
-
)
|
|
@@ -1,71 +0,0 @@
|
|
|
1
|
-
"""Query implementation for codebase search."""
|
|
2
|
-
|
|
3
|
-
import cocoindex as coco
|
|
4
|
-
|
|
5
|
-
from .config import config
|
|
6
|
-
from .schema import QueryResult
|
|
7
|
-
from .shared import SQLITE_DB, embedder
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
async def query_codebase(
|
|
11
|
-
query: str,
|
|
12
|
-
limit: int = 10,
|
|
13
|
-
offset: int = 0,
|
|
14
|
-
) -> list[QueryResult]:
|
|
15
|
-
"""
|
|
16
|
-
Perform vector similarity search.
|
|
17
|
-
|
|
18
|
-
Uses sqlite-vec's vec_distance_cosine for similarity scoring.
|
|
19
|
-
"""
|
|
20
|
-
if not config.target_sqlite_db_path.exists():
|
|
21
|
-
raise RuntimeError(
|
|
22
|
-
f"Index database not found at {config.target_sqlite_db_path}. "
|
|
23
|
-
"Please run a query with refresh_index=True first."
|
|
24
|
-
)
|
|
25
|
-
|
|
26
|
-
# Get the database connection from CocoIndex environment
|
|
27
|
-
coco_env = await coco.default_env()
|
|
28
|
-
db = coco_env.get_context(SQLITE_DB)
|
|
29
|
-
|
|
30
|
-
# Generate query embedding — use embed_query if available (supports asymmetric
|
|
31
|
-
# prompting for models like nomic-embed-code that use different prefixes for
|
|
32
|
-
# queries vs indexed documents).
|
|
33
|
-
if hasattr(embedder, "embed_query"):
|
|
34
|
-
query_embedding = await embedder.embed_query(query)
|
|
35
|
-
else:
|
|
36
|
-
query_embedding = await embedder.embed(query)
|
|
37
|
-
|
|
38
|
-
# Convert to bytes for sqlite-vec (float32)
|
|
39
|
-
embedding_bytes = query_embedding.astype("float32").tobytes()
|
|
40
|
-
|
|
41
|
-
# Query using sqlite-vec with readonly transaction
|
|
42
|
-
# vec_distance_cosine returns distance (lower is better),
|
|
43
|
-
# so we convert to similarity score (1 - distance)
|
|
44
|
-
with db.value.readonly() as conn:
|
|
45
|
-
cursor = conn.execute(
|
|
46
|
-
"""
|
|
47
|
-
SELECT
|
|
48
|
-
file_path,
|
|
49
|
-
language,
|
|
50
|
-
content,
|
|
51
|
-
start_line,
|
|
52
|
-
end_line,
|
|
53
|
-
(1.0 - vec_distance_cosine(embedding, ?)) as score
|
|
54
|
-
FROM code_chunks
|
|
55
|
-
ORDER BY vec_distance_cosine(embedding, ?) ASC
|
|
56
|
-
LIMIT ? OFFSET ?
|
|
57
|
-
""",
|
|
58
|
-
(embedding_bytes, embedding_bytes, limit, offset),
|
|
59
|
-
)
|
|
60
|
-
|
|
61
|
-
return [
|
|
62
|
-
QueryResult(
|
|
63
|
-
file_path=row[0],
|
|
64
|
-
language=row[1],
|
|
65
|
-
content=row[2],
|
|
66
|
-
start_line=row[3],
|
|
67
|
-
end_line=row[4],
|
|
68
|
-
score=row[5],
|
|
69
|
-
)
|
|
70
|
-
for row in cursor.fetchall()
|
|
71
|
-
]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|