semantic-code-index-mcp 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ mcp.json
@@ -0,0 +1,10 @@
1
+ {
2
+ "mcpServers": {
3
+ "semantic-code-index": {
4
+ "command": "/Users/thinhdo/Documents/semantic-code-index-mcp/.venv/bin/semantic-code-index-mcp",
5
+ "env": {
6
+ "SEMANTIC_CODE_ROOT": "/Users/thinhdo/Documents/semantic-code-index-mcp"
7
+ }
8
+ }
9
+ }
10
+ }
@@ -0,0 +1,17 @@
1
+ ---
2
+ description: Prefer semantic_search MCP tool over Grep/Glob for code exploration
3
+ globs: "**/*"
4
+ ---
5
+
6
+ # Semantic Code Search
7
+
8
+ This project has a semantic code index (semantic-code-index MCP server).
9
+
10
+ When you need to explore, understand, or find code:
11
+ 1. **Use `semantic_search` first** — it finds conceptually related code, not just exact string matches.
12
+ 2. Only fall back to Grep/Glob when you need exact string/pattern matching (e.g. finding a specific variable name or import).
13
+ 3. `semantic_search` auto-syncs changed files before searching — no need to manually sync.
14
+ 4. Run `index_project` once if the project has never been indexed.
15
+
16
+ When spawning sub-agents (Agent tool) for code exploration or research, always include this instruction in the prompt:
17
+ > Use the `mcp__semantic-code-index__semantic_search` tool to search code. It finds conceptually related code via semantic vector search. Prefer it over Grep/Glob.
@@ -0,0 +1,12 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ .pytest_cache/
5
+ .mypy_cache/
6
+ .ruff_cache/
7
+ dist/
8
+ *.egg-info/
9
+ .DS_Store
10
+ *.sqlite3
11
+ .cache/
12
+ .mcp.json
@@ -0,0 +1,64 @@
1
+ Metadata-Version: 2.4
2
+ Name: semantic-code-index-mcp
3
+ Version: 0.2.0
4
+ Summary: MCP server: semantic code search with SQLite + local free embeddings
5
+ Requires-Python: >=3.11
6
+ Requires-Dist: fastembed>=0.4.0
7
+ Requires-Dist: mcp>=1.2.0
8
+ Requires-Dist: numpy>=1.26.0
9
+ Requires-Dist: tiktoken>=0.7.0
10
+ Description-Content-Type: text/markdown
11
+
12
+ # semantic-code-index-mcp
13
+
14
+ MCP server cho Claude / Claude Code: index semantic vào SQLite, embedding chạy local (miễn phí, không API), công cụ tìm kiếm và thống kê token ước lượng.
15
+
16
+ ## Cài đặt
17
+
18
+ ```bash
19
+ cd /path/to/semantic-code-index-mcp
20
+ python3 -m venv .venv
21
+ source .venv/bin/activate
22
+ pip install -e .
23
+ ```
24
+
25
+ Lần đầu chạy, `fastembed` sẽ tải model ONNX (khoảng vài chục MB).
26
+
27
+ ## Biến môi trường
28
+
29
+ - `SEMANTIC_CODE_ROOT` hoặc `WORKSPACE_ROOT`: thư mục gốc project cần index (mặc định: thư mục làm việc của process MCP).
30
+
31
+ ## Claude Code / Cursor MCP
32
+
33
+ Thêm server (stdio), ví dụ trong cấu hình MCP của client:
34
+
35
+ ```json
36
+ {
37
+ "mcpServers": {
38
+ "semantic-code-index": {
39
+ "command": "semantic-code-index-mcp",
40
+ "env": {
41
+ "SEMANTIC_CODE_ROOT": "/absolute/path/to/your/repo"
42
+ }
43
+ }
44
+ }
45
+ }
46
+ ```
47
+
48
+ Nếu không set `SEMANTIC_CODE_ROOT`, đặt `cwd` của server trỏ vào repo hoặc truyền `root_path` trong từng tool call.
49
+
50
+ ## Tools
51
+
52
+ | Tool | Mô tả |
53
+ |------|--------|
54
+ | `index_project` | Index lại toàn bộ |
55
+ | `sync_index` | Chỉ cập nhật file thay đổi / mới / xóa |
56
+ | `semantic_search` | Tìm theo ngôn ngữ tự nhiên |
57
+ | `token_usage_stats` | Ước lượng token “đọc full repo” vs tích lũy từ search |
58
+
59
+ Database: `<root>/.semantic_index/index.sqlite3`.
60
+
61
+ ## Ghi chú
62
+
63
+ - Đếm token dùng `tiktoken` encoding `cl100k_base` (xấp xỉ Claude/GPT-4), không phải billing thực tế.
64
+ - Vector search hiện quét toàn bộ chunk trong SQLite; repo rất lớn có thể cần mở rộng (sqlite-vec / ANN).
@@ -0,0 +1,53 @@
1
+ # semantic-code-index-mcp
2
+
3
+ MCP server cho Claude / Claude Code: index semantic vào SQLite, embedding chạy local (miễn phí, không API), công cụ tìm kiếm và thống kê token ước lượng.
4
+
5
+ ## Cài đặt
6
+
7
+ ```bash
8
+ cd /path/to/semantic-code-index-mcp
9
+ python3 -m venv .venv
10
+ source .venv/bin/activate
11
+ pip install -e .
12
+ ```
13
+
14
+ Lần đầu chạy, `fastembed` sẽ tải model ONNX (khoảng vài chục MB).
15
+
16
+ ## Biến môi trường
17
+
18
+ - `SEMANTIC_CODE_ROOT` hoặc `WORKSPACE_ROOT`: thư mục gốc project cần index (mặc định: thư mục làm việc của process MCP).
19
+
20
+ ## Claude Code / Cursor MCP
21
+
22
+ Thêm server (stdio), ví dụ trong cấu hình MCP của client:
23
+
24
+ ```json
25
+ {
26
+ "mcpServers": {
27
+ "semantic-code-index": {
28
+ "command": "semantic-code-index-mcp",
29
+ "env": {
30
+ "SEMANTIC_CODE_ROOT": "/absolute/path/to/your/repo"
31
+ }
32
+ }
33
+ }
34
+ }
35
+ ```
36
+
37
+ Nếu không set `SEMANTIC_CODE_ROOT`, đặt `cwd` của server trỏ vào repo hoặc truyền `root_path` trong từng tool call.
38
+
39
+ ## Tools
40
+
41
+ | Tool | Mô tả |
42
+ |------|--------|
43
+ | `index_project` | Index lại toàn bộ |
44
+ | `sync_index` | Chỉ cập nhật file thay đổi / mới / xóa |
45
+ | `semantic_search` | Tìm theo ngôn ngữ tự nhiên |
46
+ | `token_usage_stats` | Ước lượng token “đọc full repo” vs tích lũy từ search |
47
+
48
+ Database: `<root>/.semantic_index/index.sqlite3`.
49
+
50
+ ## Ghi chú
51
+
52
+ - Đếm token dùng `tiktoken` encoding `cl100k_base` (xấp xỉ Claude/GPT-4), không phải billing thực tế.
53
+ - Vector search hiện quét toàn bộ chunk trong SQLite; repo rất lớn có thể cần mở rộng (sqlite-vec / ANN).
@@ -0,0 +1,102 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { readFileSync, writeFileSync, mkdirSync, existsSync } from "fs";
4
+ import { resolve, join } from "path";
5
+
6
// Markdown written by `install` to <root>/.claude/rules/semantic-search.md.
// Kept as one array entry per output line so the generated file can be read
// here without template-literal backtick escapes; the trailing "" yields the
// final newline.
const RULE = [
  "---",
  "description: Prefer semantic_search MCP tool over Grep/Glob for code exploration",
  'globs: "**/*"',
  "---",
  "",
  "# Semantic Code Search",
  "",
  "This project has a semantic code index (semantic-code-index MCP server).",
  "",
  "When you need to explore, understand, or find code:",
  "1. **Use `semantic_search` first** — it finds conceptually related code, not just exact string matches.",
  "2. Only fall back to Grep/Glob when you need exact string/pattern matching (e.g. finding a specific variable name or import).",
  "3. `semantic_search` auto-syncs changed files before searching — no need to manually sync.",
  "4. Run `index_project` once if the project has never been indexed.",
  "",
  "When spawning sub-agents (Agent tool) for code exploration or research, always include this instruction in the prompt:",
  "> Use the `mcp__semantic-code-index__semantic_search` tool to search code. It finds conceptually related code via semantic vector search. Prefer it over Grep/Glob.",
  "",
].join("\n");
24
+
25
/**
 * Merge `entry` into the `mcpServers` object of the JSON config at `path`,
 * creating the file (and its parent directory) when it does not exist.
 *
 * An existing file that cannot be parsed is no longer discarded silently:
 * its raw text is saved next to it as `<path>.bak` and a warning is printed
 * before a fresh config is written. A JSON root or `mcpServers` value that is
 * not a plain object is replaced by an empty object instead of crashing
 * (`null` root) or silently dropping the entry (primitive `mcpServers`).
 *
 * @param {string} path  Config file to create or update.
 * @param {object} entry Server definitions keyed by server name.
 */
function mergeJson(path, entry) {
  const isPlainObject = (value) =>
    value !== null && typeof value === "object" && !Array.isArray(value);

  let config = {};
  if (existsSync(path)) {
    let raw;
    try {
      raw = readFileSync(path, "utf8");
      const parsed = JSON.parse(raw);
      if (isPlainObject(parsed)) {
        config = parsed;
      } else {
        console.warn(`! ${path} does not contain a JSON object; replacing it`);
      }
    } catch (err) {
      console.warn(`! Could not read ${path} as JSON (${err.message})`);
      if (typeof raw === "string") {
        // Keep the user's original text so nothing is lost by the rewrite.
        const backup = `${path}.bak`;
        writeFileSync(backup, raw);
        console.warn(`! Saved the previous contents to ${backup}`);
      }
    }
  }

  if (!isPlainObject(config.mcpServers)) {
    config.mcpServers = {};
  }
  Object.assign(config.mcpServers, entry);

  mkdirSync(resolve(path, ".."), { recursive: true });
  writeFileSync(path, JSON.stringify(config, null, 2) + "\n");
  console.log(`✓ Wrote ${path}`);
}
38
+
39
/**
 * Make sure the gitignore file at `path` contains the pattern `entry`.
 *
 * The pattern is appended to the existing text, so the user's blank lines,
 * comments and ordering are left untouched (the file used to be rewritten
 * with every empty line removed). Lines are compared with trailing whitespace
 * and carriage returns stripped, so a CRLF file that already lists the
 * pattern does not get a duplicate.
 *
 * @param {string} path  Path to the .gitignore file (created if missing).
 * @param {string} entry Ignore pattern to add.
 */
function ensureGitignore(path, entry) {
  const existing = existsSync(path) ? readFileSync(path, "utf8") : "";
  const alreadyListed = existing
    .split("\n")
    .some((line) => line.trimEnd() === entry);
  if (alreadyListed) {
    return;
  }
  // Only insert a separator when the file does not already end with a newline.
  const separator = existing === "" || existing.endsWith("\n") ? "" : "\n";
  writeFileSync(path, existing + separator + entry + "\n");
}
49
+
50
/**
 * Wire the semantic-code-index MCP server into the project at `root`.
 *
 * Writes the server entry to both `.claude/mcp.json` and `.mcp.json`, adds
 * `.mcp.json` and `.claude/` to `.gitignore`, writes the rule file under
 * `.claude/rules/`, and prints what was done.
 *
 * @param {string} root            Project directory to install into.
 * @param {string|null} localBin   Path to a local server binary; when falsy
 *                                 the entry launches the server through `uvx`.
 */
function install(root, localBin) {
  const env = { SEMANTIC_CODE_ROOT: root };
  let entry;
  if (localBin) {
    entry = { command: resolve(localBin), env };
  } else {
    entry = {
      command: "uvx",
      args: ["--from", "semantic-code-index-mcp", "semantic-code-index-mcp"],
      env,
    };
  }
  const serverEntry = { "semantic-code-index": entry };

  const configPaths = [join(root, ".claude", "mcp.json"), join(root, ".mcp.json")];
  for (const configPath of configPaths) {
    mergeJson(configPath, serverEntry);
  }

  const gitignorePath = join(root, ".gitignore");
  for (const pattern of [".mcp.json", ".claude/"]) {
    ensureGitignore(gitignorePath, pattern);
  }

  const ruleFile = join(root, ".claude", "rules", "semantic-search.md");
  const ruleDir = resolve(ruleFile, "..");
  mkdirSync(ruleDir, { recursive: true });
  writeFileSync(ruleFile, RULE);
  console.log(`✓ Wrote ${ruleFile}`);

  console.log(`  root: ${root}`);
  console.log(
    "\nDone! Open Claude Code in this project — it will auto-detect the MCP server."
  );
  console.log(
    'First time? Ask Claude: "index this project with semantic_search"'
  );
}
79
+
80
// Command-line entry point: `install [path] [--local <bin>]`, anything else
// prints usage (exit code 1 for an unknown command, 0 when none was given).
const args = process.argv.slice(2);
const cmd = args[0];

if (cmd === "install") {
  let target = ".";
  let localBin = null;
  for (let i = 1; i < args.length; i++) {
    if (args[i] === "--local") {
      // A dangling `--local` used to fall through and become the install
      // directory named "--local"; reject it instead.
      if (!args[i + 1]) {
        console.error("Error: --local requires a path to the server binary");
        process.exit(1);
      }
      localBin = args[++i];
    } else {
      // Any other argument is the target directory; the last one wins.
      target = args[i];
    }
  }
  install(resolve(target), localBin);
} else {
  console.log("Usage: npx semantic-code-index-mcp install [path] [--local bin]");
  console.log("");
  console.log("  Install semantic code search MCP server into a project.");
  console.log("");
  console.log("Options:");
  console.log("  --local <bin>  Use local binary instead of uvx (for dev)");
  process.exit(cmd ? 1 : 0);
}
@@ -0,0 +1,15 @@
1
+ {
2
+ "name": "semantic-code-index-mcp",
3
+ "version": "0.2.0",
4
+ "description": "MCP server: semantic code search with SQLite + local embeddings. One command install for Claude Code.",
5
+ "bin": {
6
+ "semantic-code-index-mcp": "bin/cli.mjs"
7
+ },
8
+ "type": "module",
9
+ "license": "MIT",
10
+ "repository": {
11
+ "type": "git",
12
+ "url": "https://github.com/thinhdo/semantic-code-index-mcp"
13
+ },
14
+ "keywords": ["mcp", "claude", "semantic-search", "code-index", "embeddings"]
15
+ }
@@ -0,0 +1,22 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "semantic-code-index-mcp"
7
+ version = "0.2.0"
8
+ description = "MCP server: semantic code search with SQLite + local free embeddings"
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ dependencies = [
12
+ "mcp>=1.2.0",
13
+ "fastembed>=0.4.0",
14
+ "numpy>=1.26.0",
15
+ "tiktoken>=0.7.0",
16
+ ]
17
+
18
+ [project.scripts]
19
+ semantic-code-index-mcp = "semantic_code_index_mcp.server:main"
20
+
21
+ [tool.hatch.build.targets.wheel]
22
+ packages = ["semantic_code_index_mcp"]
@@ -0,0 +1 @@
1
+ __version__ = "0.2.0"
@@ -0,0 +1,834 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import os
5
+ import sqlite3
6
+ import time
7
+ from contextlib import contextmanager
8
+ from dataclasses import dataclass, field
9
+ from pathlib import Path
10
+ from typing import Generator
11
+
12
+ import numpy as np
13
+ import tiktoken
14
+
15
+ try:
16
+ from fastembed import TextEmbedding
17
+ except ImportError:
18
+ TextEmbedding = None # type: ignore[misc, assignment]
19
+
20
+ # ---------------------------------------------------------------------------
21
+ # Constants
22
+ # ---------------------------------------------------------------------------
23
+
24
# Embedding model used when the caller does not choose one.
DEFAULT_EMBED_MODEL = "BAAI/bge-small-en-v1.5"
# Files larger than this many bytes (512 KB) are not indexed.
MAX_FILE_SIZE_BYTES = 512 * 1000
# Search snippets are cut to this many characters.
SNIPPET_MAX_CHARS = 500
# Number of chunk texts embedded per call while indexing.
EMBED_BATCH_SIZE = 64
# Constant k of reciprocal rank fusion: each list adds 1 / (k + rank + 1).
RRF_K = 60

# Directory names that are never descended into while collecting files.
DEFAULT_SKIP_DIR_NAMES = frozenset(
    (
        # version control and this tool's own data
        ".git", ".semantic_index",
        # dependency and virtual-environment directories
        "node_modules", ".venv", "venv",
        # caches
        "__pycache__", ".mypy_cache", ".pytest_cache", ".ruff_cache",
        # build output
        "dist", "build", ".next", ".turbo", "target",
        # editor settings
        ".idea", ".vscode",
    )
)

# File suffixes (lower-case, with the dot) that are indexed.
DEFAULT_EXTENSIONS = frozenset(
    (
        ".py", ".pyi",
        ".ts", ".tsx", ".js", ".jsx", ".mjs", ".cjs",
        ".vue", ".svelte",
        ".go", ".rs",
        ".java", ".kt", ".cs",
        ".rb", ".php", ".swift", ".scala",
        ".md",
        ".json", ".yaml", ".yml", ".toml",
        ".html", ".css", ".scss",
        ".sql",
        ".sh", ".bash", ".zsh",
        ".dockerfile",
    )
)
67
+
68
+ # ---------------------------------------------------------------------------
69
+ # Helpers
70
+ # ---------------------------------------------------------------------------
71
+
72
+
73
+ def _sha256_text(text: str) -> str:
74
+ return hashlib.sha256(text.encode("utf-8")).hexdigest()
75
+
76
+
77
+ def _normalize_path(p: Path, root: Path) -> str:
78
+ try:
79
+ rel = p.resolve().relative_to(root.resolve())
80
+ except ValueError:
81
+ rel = p
82
+ return rel.as_posix()
83
+
84
+
85
+ def _fts_sanitize(query: str) -> str | None:
86
+ """Convert natural-language query to safe FTS5 MATCH expression."""
87
+ words = []
88
+ for word in query.split():
89
+ clean = "".join(c for c in word if c.isalnum() or c == "_")
90
+ if clean:
91
+ words.append(f'"{clean}"')
92
+ return " OR ".join(words) if words else None
93
+
94
+
95
+ # ---------------------------------------------------------------------------
96
+ # Data classes
97
+ # ---------------------------------------------------------------------------
98
+
99
+
100
@dataclass
class SearchHit:
    """One ranked result: where a chunk lives, its fused score, and a preview."""

    # Path of the chunk's file as stored in the index.
    path: str
    # First and last line of the chunk (1-based, inclusive).
    start_line: int
    end_line: int
    # Reciprocal-rank-fusion score; higher ranks first.
    score: float
    # Chunk text, truncated for display.
    snippet: str
107
+
108
+
109
+ @dataclass
110
+ class _VecCache:
111
+ """In-memory cache: all chunk embeddings as a single numpy matrix."""
112
+
113
+ chunk_ids: list[int] = field(default_factory=list)
114
+ paths: list[str] = field(default_factory=list)
115
+ matrix: np.ndarray | None = None # (n, dim)
116
+ version: int = -1
117
+
118
+
119
+ # ---------------------------------------------------------------------------
120
+ # CodeIndexer
121
+ # ---------------------------------------------------------------------------
122
+
123
+
124
+ class CodeIndexer:
125
+ """Chunk code, embed locally, store in SQLite, hybrid search."""
126
+
127
def __init__(
    self,
    root: Path,
    *,
    db_path: Path | None = None,
    embed_model: str = DEFAULT_EMBED_MODEL,
    max_chunk_lines: int = 100,
    overlap_lines: int = 15,
) -> None:
    """Configure an indexer for the project rooted at *root*.

    Args:
        root: Project directory; stored resolved.
        db_path: SQLite file to use; when omitted a per-root default path
            is derived by ``_default_db_path``.
        embed_model: Name of the embedding model to load on first use.
        max_chunk_lines: Maximum number of lines per chunk.
        overlap_lines: Number of lines shared by consecutive chunks.
    """
    resolved_root = root.resolve()
    self.root = resolved_root
    self.db_path = db_path if db_path else self._default_db_path(resolved_root)
    self.embed_model = embed_model
    self.max_chunk_lines = max_chunk_lines
    self.overlap_lines = overlap_lines

    # The embedder is created lazily by _get_embedder().
    self._embedder: TextEmbedding | None = None
    self._enc = tiktoken.get_encoding("cl100k_base")
    # Cache starts at version -1 so the first search loads it.
    self._vec_cache = _VecCache()
    self._index_version = 0
    # None = FTS5 support not probed yet; init_db() resolves it.
    self._fts_available: bool | None = None
147
+
148
+ @staticmethod
149
+ def _default_db_path(root: Path) -> Path:
150
+ """~/.cache/semantic-code-index/<hash>/index.sqlite3"""
151
+ h = hashlib.sha256(str(root).encode()).hexdigest()[:16]
152
+ return Path.home() / ".cache" / "semantic-code-index" / h / "index.sqlite3"
153
+
154
+ # -- connection ----------------------------------------------------------
155
+
156
+ def _connect(self) -> sqlite3.Connection:
157
+ self.db_path.parent.mkdir(parents=True, exist_ok=True)
158
+ conn = sqlite3.connect(str(self.db_path))
159
+ conn.row_factory = sqlite3.Row
160
+ return conn
161
+
162
+ @contextmanager
163
+ def open(self) -> Generator[sqlite3.Connection, None, None]:
164
+ conn = self._connect()
165
+ try:
166
+ yield conn
167
+ finally:
168
+ conn.close()
169
+
170
+ # -- schema --------------------------------------------------------------
171
+
172
+ def init_db(self, conn: sqlite3.Connection) -> None:
173
+ conn.executescript(
174
+ """
175
+ CREATE TABLE IF NOT EXISTS meta (
176
+ key TEXT PRIMARY KEY,
177
+ value TEXT NOT NULL
178
+ );
179
+ CREATE TABLE IF NOT EXISTS files_state (
180
+ path TEXT PRIMARY KEY,
181
+ mtime REAL NOT NULL,
182
+ size INTEGER NOT NULL,
183
+ hash TEXT NOT NULL,
184
+ tokens INTEGER NOT NULL DEFAULT 0
185
+ );
186
+ CREATE TABLE IF NOT EXISTS chunks (
187
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
188
+ path TEXT NOT NULL,
189
+ start_line INTEGER NOT NULL,
190
+ end_line INTEGER NOT NULL,
191
+ content TEXT NOT NULL,
192
+ embedding BLOB NOT NULL,
193
+ UNIQUE(path, start_line, end_line)
194
+ );
195
+ CREATE INDEX IF NOT EXISTS idx_chunks_path ON chunks(path);
196
+ CREATE TABLE IF NOT EXISTS metrics (
197
+ id INTEGER PRIMARY KEY CHECK (id = 1),
198
+ total_native_tokens_est INTEGER NOT NULL DEFAULT 0,
199
+ total_search_queries INTEGER NOT NULL DEFAULT 0,
200
+ total_search_result_tokens_est INTEGER NOT NULL DEFAULT 0,
201
+ total_query_tokens_est INTEGER NOT NULL DEFAULT 0,
202
+ updated_at REAL NOT NULL
203
+ );
204
+ INSERT OR IGNORE INTO metrics (id, updated_at) VALUES (1, 0);
205
+ CREATE TABLE IF NOT EXISTS search_log (
206
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
207
+ ts REAL NOT NULL,
208
+ query TEXT NOT NULL,
209
+ limit_val INTEGER NOT NULL,
210
+ path_prefix TEXT,
211
+ hits INTEGER NOT NULL,
212
+ query_tokens INTEGER NOT NULL,
213
+ result_tokens INTEGER NOT NULL,
214
+ native_tokens_full_repo INTEGER NOT NULL,
215
+ duration_ms REAL NOT NULL
216
+ );
217
+ """
218
+ )
219
+ # Migrate: add tokens column if missing (upgrade from v0.1)
220
+ cols = {r[1] for r in conn.execute("PRAGMA table_info(files_state)").fetchall()}
221
+ if "tokens" not in cols:
222
+ conn.execute("ALTER TABLE files_state ADD COLUMN tokens INTEGER NOT NULL DEFAULT 0")
223
+
224
+ # FTS5 setup
225
+ if self._fts_available is None:
226
+ self._init_fts(conn)
227
+
228
+ conn.commit()
229
+
230
+ def _init_fts(self, conn: sqlite3.Connection) -> None:
231
+ """Try to create FTS5 virtual table; populate from existing chunks if needed."""
232
+ try:
233
+ conn.execute(
234
+ "CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts "
235
+ "USING fts5(content, tokenize='porter unicode61')"
236
+ )
237
+ self._fts_available = True
238
+ # Backfill FTS from existing chunks if FTS is empty but chunks exist
239
+ chunk_count = conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0]
240
+ if chunk_count > 0:
241
+ fts_count = conn.execute("SELECT COUNT(*) FROM chunks_fts").fetchone()[0]
242
+ if fts_count == 0:
243
+ rows = conn.execute("SELECT id, content FROM chunks").fetchall()
244
+ for r in rows:
245
+ conn.execute(
246
+ "INSERT INTO chunks_fts(rowid, content) VALUES(?, ?)",
247
+ (r[0], r[1]),
248
+ )
249
+ except Exception:
250
+ self._fts_available = False
251
+
252
+ # -- meta helpers --------------------------------------------------------
253
+
254
+ def _set_meta(self, conn: sqlite3.Connection, key: str, value: str) -> None:
255
+ conn.execute(
256
+ "INSERT INTO meta(key, value) VALUES(?, ?) "
257
+ "ON CONFLICT(key) DO UPDATE SET value = excluded.value",
258
+ (key, value),
259
+ )
260
+
261
+ def _get_meta(self, conn: sqlite3.Connection, key: str) -> str | None:
262
+ row = conn.execute("SELECT value FROM meta WHERE key = ?", (key,)).fetchone()
263
+ return str(row[0]) if row else None
264
+
265
+ # -- embedding -----------------------------------------------------------
266
+
267
def _get_embedder(self) -> TextEmbedding:
    """Return the embedding model, creating it on first use.

    Raises:
        RuntimeError: If the optional ``fastembed`` import failed.
    """
    if TextEmbedding is None:
        raise RuntimeError("fastembed is not installed")
    embedder = self._embedder
    if embedder is None:
        # Built once and reused for every later call.
        embedder = TextEmbedding(model_name=self.embed_model)
        self._embedder = embedder
    return embedder
273
+
274
def _embed_batch(self, texts: list[str]) -> list[np.ndarray]:
    """Embed *texts* and return one L2-normalised float32 vector per text.

    A zero vector is returned unchanged rather than divided by zero.
    """

    def to_unit(raw) -> np.ndarray:
        vec = np.asarray(raw, dtype=np.float32)
        length = np.linalg.norm(vec)
        return vec / length if length > 0 else vec

    embedder = self._get_embedder()
    return [to_unit(raw) for raw in embedder.embed(texts)]
284
+
285
+ def _blob_from_vec(self, v: np.ndarray) -> bytes:
286
+ return v.astype(np.float32).tobytes()
287
+
288
+ def _vec_from_blob(self, blob: bytes) -> np.ndarray:
289
+ return np.frombuffer(blob, dtype=np.float32).copy()
290
+
291
+ # -- tokenizer -----------------------------------------------------------
292
+
293
+ def count_tokens(self, text: str) -> int:
294
+ return len(self._enc.encode(text))
295
+
296
+ # -- file iteration ------------------------------------------------------
297
+
298
def _iter_source_files(self) -> list[Path]:
    """Return the sorted list of indexable files under ``self.root``.

    Skipped directory names are pruned during the walk. A file qualifies
    when it is named ``Dockerfile`` or its lower-cased suffix is a known
    extension, it can be stat'ed, and it is not larger than
    ``MAX_FILE_SIZE_BYTES``.
    """
    found: list[Path] = []
    for dirpath, dirnames, filenames in os.walk(self.root, topdown=True):
        # In-place assignment is what makes os.walk skip these directories.
        dirnames[:] = [d for d in dirnames if d not in DEFAULT_SKIP_DIR_NAMES]
        folder = Path(dirpath)
        for name in filenames:
            candidate = folder / name
            wanted = name == "Dockerfile" or candidate.suffix.lower() in DEFAULT_EXTENSIONS
            if not wanted:
                continue
            try:
                size = candidate.stat().st_size
            except OSError:
                # Unreadable or vanished entry: ignore it.
                continue
            if size <= MAX_FILE_SIZE_BYTES:
                found.append(candidate)
    found.sort()
    return found
314
+
315
+ # -- chunking ------------------------------------------------------------
316
+
317
+ def _chunk_file(self, _rel_path: str, text: str) -> list[tuple[int, int, str]]:
318
+ lines = text.splitlines(keepends=True)
319
+ if not lines:
320
+ return []
321
+ chunks: list[tuple[int, int, str]] = []
322
+ i = 0
323
+ n = len(lines)
324
+ max_l = self.max_chunk_lines
325
+ ov = self.overlap_lines
326
+ while i < n:
327
+ end = min(i + max_l, n)
328
+ block = "".join(lines[i:end])
329
+ chunks.append((i + 1, end, block))
330
+ if end >= n:
331
+ break
332
+ i = max(end - ov, i + 1)
333
+ return chunks
334
+
335
+ # -- chunk CRUD (with FTS sync) ------------------------------------------
336
+
337
+ def _delete_path_chunks(self, conn: sqlite3.Connection, path: str) -> None:
338
+ """Delete all chunks (+ FTS entries) for a given path."""
339
+ if self._fts_available:
340
+ ids = [
341
+ r[0]
342
+ for r in conn.execute(
343
+ "SELECT id FROM chunks WHERE path = ?", (path,)
344
+ ).fetchall()
345
+ ]
346
+ if ids:
347
+ placeholders = ",".join("?" * len(ids))
348
+ conn.execute(
349
+ f"DELETE FROM chunks_fts WHERE rowid IN ({placeholders})", ids
350
+ )
351
+ conn.execute("DELETE FROM chunks WHERE path = ?", (path,))
352
+
353
+ def _insert_chunk(
354
+ self,
355
+ conn: sqlite3.Connection,
356
+ path: str,
357
+ sl: int,
358
+ el: int,
359
+ content: str,
360
+ vec: np.ndarray,
361
+ ) -> None:
362
+ cur = conn.execute(
363
+ "INSERT INTO chunks(path, start_line, end_line, content, embedding) "
364
+ "VALUES(?,?,?,?,?)",
365
+ (path, sl, el, content, self._blob_from_vec(vec)),
366
+ )
367
+ if self._fts_available:
368
+ conn.execute(
369
+ "INSERT INTO chunks_fts(rowid, content) VALUES(?, ?)",
370
+ (cur.lastrowid, content),
371
+ )
372
+
373
+ # -- vector cache --------------------------------------------------------
374
+
375
+ def _invalidate_vec_cache(self) -> None:
376
+ self._index_version += 1
377
+
378
def _load_vec_cache(self, conn: sqlite3.Connection) -> None:
    """Rebuild the in-memory embedding matrix if it is out of date.

    Does nothing when the cached version equals the current index version.
    Otherwise all chunk embeddings are read and stacked into one float32
    matrix whose width is taken from the first row.
    """
    current_version = self._index_version
    if self._vec_cache.version == current_version:
        return

    rows = conn.execute("SELECT id, path, embedding FROM chunks").fetchall()
    if not rows:
        self._vec_cache = _VecCache(version=current_version)
        return

    dim = len(np.frombuffer(rows[0][2], dtype=np.float32))
    matrix = np.empty((len(rows), dim), dtype=np.float32)
    chunk_ids: list[int] = []
    chunk_paths: list[str] = []
    for index, (chunk_id, chunk_path, blob) in enumerate(rows):
        chunk_ids.append(chunk_id)
        chunk_paths.append(str(chunk_path))
        matrix[index] = np.frombuffer(blob, dtype=np.float32)

    self._vec_cache = _VecCache(
        chunk_ids=chunk_ids,
        paths=chunk_paths,
        matrix=matrix,
        version=current_version,
    )
394
+
395
+ # -- index full ----------------------------------------------------------
396
+
397
def index_full(self, conn: sqlite3.Connection) -> dict[str, int | float | str]:
    """Full rebuild: clear everything and re-index all source files.

    Existing chunks, file state and FTS rows are deleted and committed
    first; then every source file is read, chunked, embedded in batches of
    ``EMBED_BATCH_SIZE`` and stored. Finally the native token total and the
    ``embed_model`` / ``last_full_index`` meta entries are written and the
    vector cache is invalidated.

    A file that disappears or becomes unreadable during the run is skipped
    (its ``stat()`` is now inside the same ``try`` as the read, as in
    ``sync_incremental``) instead of aborting the rebuild.

    Returns:
        ``files_indexed`` (files actually read and recorded), ``chunks``,
        ``native_tokens_est`` and elapsed ``seconds``.
    """
    self.init_db(conn)
    conn.execute("DELETE FROM chunks")
    conn.execute("DELETE FROM files_state")
    if self._fts_available:
        conn.execute("DELETE FROM chunks_fts")
    conn.commit()

    files = self._iter_source_files()
    files_indexed = 0
    inserted = 0
    native_tokens = 0
    batch_texts: list[str] = []
    batch_rows: list[tuple[str, int, int, str]] = []
    t0 = time.time()

    def flush() -> None:
        """Embed the pending chunks and insert them."""
        nonlocal inserted
        if not batch_texts:
            return
        vecs = self._embed_batch(batch_texts)
        for (path, sl, el, content), v in zip(batch_rows, vecs, strict=True):
            self._insert_chunk(conn, path, sl, el, content, v)
            inserted += 1
        batch_texts.clear()
        batch_rows.clear()

    for fp in files:
        rel = _normalize_path(fp, self.root)
        try:
            raw = fp.read_bytes()
            st = fp.stat()
        except OSError:
            # Deleted or unreadable since the directory walk: skip it.
            continue
        text = raw.decode("utf-8", errors="replace")
        files_indexed += 1
        h = _sha256_text(text)
        tok = self.count_tokens(text)
        native_tokens += tok
        conn.execute(
            "INSERT INTO files_state(path, mtime, size, hash, tokens) VALUES(?,?,?,?,?) "
            "ON CONFLICT(path) DO UPDATE SET mtime=excluded.mtime, size=excluded.size, "
            "hash=excluded.hash, tokens=excluded.tokens",
            (rel, st.st_mtime, st.st_size, h, tok),
        )
        for sl, el, content in self._chunk_file(rel, text):
            batch_rows.append((rel, sl, el, content))
            # Only the first 8000 characters are embedded; the full chunk
            # text is still stored.
            batch_texts.append(content[:8000])
            if len(batch_texts) >= EMBED_BATCH_SIZE:
                flush()

    flush()
    conn.execute(
        "UPDATE metrics SET total_native_tokens_est = ?, updated_at = ? WHERE id = 1",
        (native_tokens, time.time()),
    )
    self._set_meta(conn, "embed_model", self.embed_model)
    self._set_meta(conn, "last_full_index", str(time.time()))
    conn.commit()
    self._invalidate_vec_cache()
    return {
        "files_indexed": files_indexed,
        "chunks": inserted,
        "native_tokens_est": native_tokens,
        "seconds": round(time.time() - t0, 2),
    }
462
+
463
+ # -- incremental sync ----------------------------------------------------
464
+
465
def sync_incremental(
    self, conn: sqlite3.Connection
) -> dict[str, int | float | str | list[str]]:
    """Bring the index up to date with the files currently on disk.

    Files no longer on disk are dropped; files whose mtime or size changed
    are re-hashed and, if their content differs, re-chunked and
    re-embedded. The native token total is adjusted incrementally. The
    vector cache is invalidated on every call.

    Returns:
        ``removed_paths``, ``reindexed_files``, ``paths_reindexed`` (first
        50 only) and ``native_tokens_est``.
    """
    self.init_db(conn)
    on_disk = {_normalize_path(p, self.root): p for p in self._iter_source_files()}
    state_rows = conn.execute(
        "SELECT path, mtime, size, hash, tokens FROM files_state"
    ).fetchall()
    # path -> (mtime, size, hash, tokens) as recorded by the last index run
    known = {
        str(row["path"]): (
            float(row["mtime"]),
            int(row["size"]),
            str(row["hash"]),
            int(row["tokens"]),
        )
        for row in state_rows
    }

    metrics_row = conn.execute(
        "SELECT total_native_tokens_est FROM metrics WHERE id = 1"
    ).fetchone()
    native_tokens = int(metrics_row[0] or 0)

    # Files that disappeared from disk.
    removed = [path for path in known if path not in on_disk]
    for path in removed:
        self._delete_path_chunks(conn, path)
        native_tokens -= known[path][3]
        conn.execute("DELETE FROM files_state WHERE path = ?", (path,))

    to_reindex: list[str] = []
    pending: list[tuple[str, int, int, str]] = []

    def flush() -> None:
        """Embed the pending chunks and insert them."""
        if not pending:
            return
        # Only the first 8000 characters of each chunk are embedded.
        vecs = self._embed_batch([content[:8000] for _, _, _, content in pending])
        for (path, sl, el, content), v in zip(pending, vecs, strict=True):
            self._insert_chunk(conn, path, sl, el, content, v)
        pending.clear()

    for rel, fp in sorted(on_disk.items()):
        try:
            st = fp.stat()
        except OSError:
            continue
        prev = known.get(rel)
        # Cheap check first: same mtime and size means no read is needed.
        if prev is not None and prev[0] == st.st_mtime and prev[1] == st.st_size:
            continue
        try:
            text = fp.read_bytes().decode("utf-8", errors="replace")
        except OSError:
            continue
        digest = _sha256_text(text)
        if prev is not None and prev[2] == digest:
            # Touched but content identical: refresh the stat fields only.
            conn.execute(
                "UPDATE files_state SET mtime = ?, size = ? WHERE path = ?",
                (st.st_mtime, st.st_size, rel),
            )
            continue

        to_reindex.append(rel)
        tok = self.count_tokens(text)
        if prev is not None:
            native_tokens -= prev[3]
        native_tokens += tok

        self._delete_path_chunks(conn, rel)
        conn.execute(
            "INSERT INTO files_state(path, mtime, size, hash, tokens) VALUES(?,?,?,?,?) "
            "ON CONFLICT(path) DO UPDATE SET mtime=excluded.mtime, size=excluded.size, "
            "hash=excluded.hash, tokens=excluded.tokens",
            (rel, st.st_mtime, st.st_size, digest, tok),
        )
        for sl, el, content in self._chunk_file(rel, text):
            pending.append((rel, sl, el, content))
            if len(pending) >= EMBED_BATCH_SIZE:
                flush()

    flush()
    conn.execute(
        "UPDATE metrics SET total_native_tokens_est = ?, updated_at = ? WHERE id = 1",
        (native_tokens, time.time()),
    )
    conn.commit()
    self._invalidate_vec_cache()
    return {
        "removed_paths": len(removed),
        "reindexed_files": len(to_reindex),
        "paths_reindexed": to_reindex[:50],
        "native_tokens_est": native_tokens,
    }
562
+
563
+ # -- needs sync (fast stat-only check) -----------------------------------
564
+
565
def needs_sync(self, conn: sqlite3.Connection) -> bool:
    """Report whether the index is stale, using only ``stat()`` results.

    Returns ``True`` as soon as a recorded file is missing, cannot be
    stat'ed, or has a different mtime or size, and also when a file on disk
    has no record yet. No file is read or hashed.
    """
    self.init_db(conn)
    on_disk = {_normalize_path(p, self.root): p for p in self._iter_source_files()}

    recorded: set[str] = set()
    for row in conn.execute("SELECT path, mtime, size FROM files_state").fetchall():
        path = str(row["path"])
        recorded.add(path)
        fp = on_disk.get(path)
        if fp is None:
            return True  # recorded file no longer on disk
        try:
            st = fp.stat()
        except OSError:
            return True
        unchanged = float(row["mtime"]) == st.st_mtime and int(row["size"]) == st.st_size
        if not unchanged:
            return True

    # Anything left on disk without a record is a new file.
    return bool(on_disk.keys() - recorded)
586
+
587
+ # -- search --------------------------------------------------------------
588
+
589
+ def _semantic_search(
590
+ self,
591
+ conn: sqlite3.Connection,
592
+ query_vec: np.ndarray,
593
+ limit: int,
594
+ path_prefix: str | None = None,
595
+ ) -> list[tuple[int, float]]:
596
+ """Vectorized cosine search via cached numpy matrix."""
597
+ self._load_vec_cache(conn)
598
+ cache = self._vec_cache
599
+ if cache.matrix is None or len(cache.chunk_ids) == 0:
600
+ return []
601
+
602
+ scores = cache.matrix @ query_vec
603
+
604
+ if path_prefix:
605
+ mask = np.array(
606
+ [p.startswith(path_prefix) for p in cache.paths], dtype=bool
607
+ )
608
+ scores = np.where(mask, scores, -np.inf)
609
+
610
+ k = min(limit, len(cache.chunk_ids))
611
+ if k < len(cache.chunk_ids):
612
+ indices = np.argpartition(scores, -k)[-k:]
613
+ else:
614
+ indices = np.arange(len(cache.chunk_ids))
615
+ indices = indices[np.argsort(scores[indices])[::-1]]
616
+ return [
617
+ (cache.chunk_ids[i], float(scores[i]))
618
+ for i in indices
619
+ if scores[i] > -np.inf
620
+ ]
621
+
622
def _keyword_search(
    self,
    conn: sqlite3.Connection,
    query: str,
    limit: int,
    path_prefix: str | None = None,
) -> list[tuple[int, float]]:
    """FTS5 BM25 keyword search.

    Returns up to *limit* ``(chunk_id, score)`` pairs, most relevant first,
    with scores negated so that higher means better. Returns an empty list
    when FTS is unavailable, the query has no usable term, or SQLite
    rejects the statement.

    The optional *path_prefix* is matched with ``instr(path, ?) = 1``: an
    exact, case-sensitive prefix test, the same rule as the ``startswith``
    filter in ``_semantic_search``. The earlier ``LIKE ? || '%'`` treated
    ``_`` and ``%`` inside the prefix as wildcards and ignored ASCII case.
    """
    if not self._fts_available:
        return []
    fts_q = _fts_sanitize(query)
    if not fts_q:
        return []
    try:
        if path_prefix:
            rows = conn.execute(
                "SELECT rowid, bm25(chunks_fts) AS score FROM chunks_fts "
                "WHERE chunks_fts MATCH ? "
                "AND rowid IN (SELECT id FROM chunks WHERE instr(path, ?) = 1) "
                "ORDER BY score LIMIT ?",
                (fts_q, path_prefix, limit),
            ).fetchall()
        else:
            rows = conn.execute(
                "SELECT rowid, bm25(chunks_fts) AS score FROM chunks_fts "
                "WHERE chunks_fts MATCH ? ORDER BY score LIMIT ?",
                (fts_q, limit),
            ).fetchall()
    except sqlite3.OperationalError:
        return []
    # bm25() returns negative scores (lower = more relevant)
    return [(int(r[0]), -float(r[1])) for r in rows]
654
+
655
def search(
    self,
    conn: sqlite3.Connection,
    query: str,
    *,
    limit: int = 12,
    path_prefix: str | None = None,
) -> tuple[list[SearchHit], dict[str, object]]:
    """Run a hybrid query and record its token accounting.

    Semantic and keyword rankings are merged with Reciprocal Rank Fusion,
    the best ``limit`` chunks are loaded, and the ``metrics`` and
    ``search_log`` tables are updated before committing.

    Returns:
        ``(hits, usage)`` where ``usage`` holds the token statistics and the
        duration of this query.
    """
    self.init_db(conn)
    started = time.time()
    query_vec = self._embed_batch([query])[0]

    # Each retriever is asked for more candidates than will be returned.
    fetch_limit = limit * 3
    sem_results = self._semantic_search(conn, query_vec, fetch_limit, path_prefix)
    kw_results = self._keyword_search(conn, query, fetch_limit, path_prefix)

    # Reciprocal Rank Fusion: each list contributes 1 / (RRF_K + 1-based rank).
    fused: dict[int, float] = {}
    for ranking in (sem_results, kw_results):
        for position, (chunk_id, _raw_score) in enumerate(ranking, start=1):
            fused[chunk_id] = fused.get(chunk_id, 0.0) + 1.0 / (RRF_K + position)

    best_ids = sorted(fused, key=fused.get, reverse=True)[:limit]

    hits: list[SearchHit] = []
    for chunk_id in best_ids:
        row = conn.execute(
            "SELECT path, start_line, end_line, content FROM chunks WHERE id = ?",
            (chunk_id,),
        ).fetchone()
        if not row:
            # A ranked id with no row in ``chunks`` is skipped.
            continue
        content = str(row["content"])
        snippet = content[:SNIPPET_MAX_CHARS]
        if len(content) > SNIPPET_MAX_CHARS:
            snippet += "…"
        hit = SearchHit(
            path=str(row["path"]),
            start_line=int(row["start_line"]),
            end_line=int(row["end_line"]),
            score=fused[chunk_id],
            snippet=snippet,
        )
        hits.append(hit)

    q_tokens = self.count_tokens(query)
    r_tokens = self.count_tokens("\n\n".join(h.snippet for h in hits))
    duration_ms = round((time.time() - started) * 1000, 1)
    native_row = conn.execute(
        "SELECT total_native_tokens_est FROM metrics WHERE id = 1"
    ).fetchone()
    native = int(native_row[0] or 0)

    conn.execute(
        """
        UPDATE metrics SET
            total_search_queries = total_search_queries + 1,
            total_search_result_tokens_est = total_search_result_tokens_est + ?,
            total_query_tokens_est = total_query_tokens_est + ?,
            updated_at = ?
        WHERE id = 1
        """,
        (r_tokens, q_tokens, time.time()),
    )
    now = time.time()
    # Log entries older than 24 hours are pruned on every search.
    conn.execute("DELETE FROM search_log WHERE ts < ?", (now - 86400,))
    log_values = (
        now, query, limit, path_prefix, len(hits),
        q_tokens, r_tokens, native, duration_ms,
    )
    conn.execute(
        "INSERT INTO search_log(ts, query, limit_val, path_prefix, hits, "
        "query_tokens, result_tokens, native_tokens_full_repo, duration_ms) "
        "VALUES(?,?,?,?,?,?,?,?,?)",
        log_values,
    )
    conn.commit()

    spent = q_tokens + r_tokens
    usage = {
        "query_tokens": q_tokens,
        "result_tokens": r_tokens,
        "total_tokens": spent,
        "native_tokens_full_repo": native,
        # Without a native estimate there is nothing to compare against.
        "saved_tokens": native - spent if native else None,
        "duration_ms": duration_ms,
    }
    return hits, usage
744
+
745
+ # -- list / detail -------------------------------------------------------
746
+
747
def list_files(self, conn: sqlite3.Connection) -> list[dict[str, object]]:
    """Return one entry per indexed file with its token and chunk counts.

    Files come from ``files_state``; chunk counts come from a LEFT JOIN on
    ``chunks``, so a file without chunks is reported with ``chunks == 0``.
    Entries are ordered by path.
    """
    self.init_db(conn)
    cursor = conn.execute(
        "SELECT fs.path, fs.tokens, COUNT(c.id) AS chunks "
        "FROM files_state fs "
        "LEFT JOIN chunks c ON c.path = fs.path "
        "GROUP BY fs.path ORDER BY fs.path"
    )
    listing: list[dict[str, object]] = []
    for record in cursor.fetchall():
        entry = {
            "path": str(record["path"]),
            "tokens": int(record["tokens"]),
            "chunks": int(record["chunks"]),
        }
        listing.append(entry)
    return listing
760
+
761
def get_file_chunks(self, conn: sqlite3.Connection, path: str) -> list[dict[str, object]]:
    """Fetch every stored chunk of ``path`` in ascending ``start_line`` order.

    Chunk content is returned in full (no snippet truncation). An unknown
    path yields an empty list.
    """
    self.init_db(conn)
    cursor = conn.execute(
        "SELECT start_line, end_line, content FROM chunks WHERE path = ? "
        "ORDER BY start_line",
        (path,),
    )
    chunks: list[dict[str, object]] = []
    for record in cursor.fetchall():
        chunks.append(
            {
                "start_line": int(record["start_line"]),
                "end_line": int(record["end_line"]),
                "content": str(record["content"]),
            }
        )
    return chunks
777
+
778
+ # -- stats ---------------------------------------------------------------
779
+
780
def stats(self, conn: sqlite3.Connection) -> dict[str, object]:
    """Summarise the index size and the cumulative search token accounting.

    Counts rows in ``chunks`` and ``files_state`` and reads the running totals
    from the ``metrics`` row with ``id = 1``. The average is 0 when no search
    has run yet; the search-vs-native ratio is ``None`` when the native
    estimate is 0.
    """
    self.init_db(conn)
    chunk_total = int(conn.execute("SELECT COUNT(*) FROM chunks").fetchone()[0])
    file_total = int(conn.execute("SELECT COUNT(*) FROM files_state").fetchone()[0])
    metrics_row = conn.execute(
        "SELECT total_native_tokens_est, total_search_queries, "
        "total_search_result_tokens_est, total_query_tokens_est "
        "FROM metrics WHERE id = 1"
    ).fetchone()
    # NULL counters are treated as 0.
    native, query_count, result_tokens, query_tokens = (
        int(value or 0) for value in metrics_row
    )

    spent = query_tokens + result_tokens
    if query_count:
        avg_per_query = spent / query_count
    else:
        avg_per_query = 0
    if native:
        ratio_out = round(spent / native, 6)
    else:
        ratio_out = None

    stored_model = self._get_meta(conn, "embed_model")
    return {
        "files_indexed": file_total,
        "chunks": chunk_total,
        "fts5_enabled": bool(self._fts_available),
        "embed_model": stored_model or self.embed_model,
        "index_db": str(self.db_path),
        "root": str(self.root),
        "native_tokens_est_full_repo": native,
        "search_queries": query_count,
        "search_total_query_tokens_est": query_tokens,
        "search_total_result_tokens_est": result_tokens,
        "search_avg_tokens_per_query_est": round(avg_per_query, 2),
        "search_vs_native_ratio_est": ratio_out,
    }
809
+
810
def search_logs(
    self, conn: sqlite3.Connection, last_n: int = 50
) -> list[dict[str, object]]:
    """Return the most recent ``search_log`` rows, newest first.

    Args:
        conn: Connection to the index database.
        last_n: Maximum number of log rows to return.

    Returns:
        One dict per logged search. ``saved_tokens`` is the native full-repo
        estimate minus the query and result tokens, or ``None`` when no native
        estimate was recorded -- the same convention as the ``usage`` dict
        produced by ``search``.
    """
    self.init_db(conn)
    rows = conn.execute(
        "SELECT ts, query, limit_val, path_prefix, hits, "
        "query_tokens, result_tokens, native_tokens_full_repo, duration_ms "
        "FROM search_log ORDER BY id DESC LIMIT ?",
        (last_n,),
    ).fetchall()

    entries: list[dict[str, object]] = []
    for r in rows:
        native = r["native_tokens_full_repo"]
        if native:
            # NULL token counters count as 0 so the subtraction cannot raise.
            saved = native - (r["query_tokens"] or 0) - (r["result_tokens"] or 0)
        else:
            # No estimate to compare against: a negative "saving" would be
            # misleading, so report it as unknown.
            saved = None
        entries.append(
            {
                "ts": r["ts"],
                "query": r["query"],
                "limit": r["limit_val"],
                "path_prefix": r["path_prefix"],
                "hits": r["hits"],
                "query_tokens": r["query_tokens"],
                "result_tokens": r["result_tokens"],
                "native_tokens_full_repo": native,
                "saved_tokens": saved,
                "duration_ms": r["duration_ms"],
            }
        )
    return entries
@@ -0,0 +1,210 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import shutil
6
+ import sys
7
+ from pathlib import Path
8
+
9
+ from mcp.server.fastmcp import FastMCP
10
+
11
+ from semantic_code_index_mcp.indexer import CodeIndexer
12
+
13
# Server instance; the functions decorated with @mcp.tool() below register
# themselves on it.
mcp = FastMCP(
    "semantic-code-index",
    instructions=" ".join(
        (
            "Semantic code search over a local index (SQLite + local embeddings).",
            "Use semantic_search to find code — it auto-syncs changed files before searching.",
            "Call index_project once per workspace for initial setup.",
            "Use list_indexed_files to see what's indexed, get_file_chunks for full content.",
        )
    ),
)

# One CodeIndexer per resolved root path, reused across tool calls.
_indexer_cache: dict[str, CodeIndexer] = {}
24
+
25
+
26
def _default_root() -> Path:
    """Pick the workspace root when a tool call does not name one.

    ``SEMANTIC_CODE_ROOT`` wins over ``WORKSPACE_ROOT``; an unset or empty
    variable is skipped. Without either, the current working directory is
    used.
    """
    for variable in ("SEMANTIC_CODE_ROOT", "WORKSPACE_ROOT"):
        configured = os.environ.get(variable)
        if configured:
            return Path(configured).expanduser().resolve()
    return Path.cwd()
31
+
32
+
33
def _get_indexer(root: str | None = None) -> CodeIndexer:
    """Return the cached ``CodeIndexer`` for ``root``, creating it on first use.

    ``root`` is expanded and resolved; when it is missing or empty the result
    of ``_default_root()`` is used. The resolved path string is the cache key.
    """
    base = _default_root() if not root else Path(root).expanduser().resolve()
    cache_key = str(base)
    try:
        return _indexer_cache[cache_key]
    except KeyError:
        indexer = _indexer_cache[cache_key] = CodeIndexer(base)
        return indexer
39
+
40
+
41
@mcp.tool()
def index_project(root_path: str | None = None) -> str:
    """Full re-index: drop the old index and re-embed the entire codebase under root_path (or SEMANTIC_CODE_ROOT / cwd)."""
    indexer = _get_indexer(root_path)
    with indexer.open() as db:
        summary = indexer.index_full(db)
    # Pretty-printed JSON of whatever index_full() reports.
    return json.dumps(summary, ensure_ascii=False, indent=2)
48
+
49
+
50
@mcp.tool()
def sync_index(root_path: str | None = None) -> str:
    """Incremental sync: new/modified/deleted files -> update chunks + embeddings; unchanged files are left untouched."""
    indexer = _get_indexer(root_path)
    with indexer.open() as db:
        summary = indexer.sync_incremental(db)
    # Pretty-printed JSON of whatever sync_incremental() reports.
    return json.dumps(summary, ensure_ascii=False, indent=2)
57
+
58
+
59
@mcp.tool()
def semantic_search(
    query: str,
    limit: int = 12,
    path_prefix: str | None = None,
    auto_sync: bool = True,
    root_path: str | None = None,
) -> str:
    """Find relevant code snippets (hybrid: semantic vector + keyword BM25).
    auto_sync=true automatically syncs changed files before searching.
    path_prefix filters by relative path prefix."""
    indexer = _get_indexer(root_path)
    with indexer.open() as db:
        if auto_sync and indexer.needs_sync(db):
            indexer.sync_incremental(db)
        # The requested limit is clamped to the range 1..50.
        bounded_limit = min(max(limit, 1), 50)
        hits, usage = indexer.search(
            db, query, limit=bounded_limit, path_prefix=path_prefix
        )

    results = []
    for hit in hits:
        results.append(
            {
                "path": hit.path,
                "lines": [hit.start_line, hit.end_line],
                "score": round(hit.score, 6),
                "snippet": hit.snippet,
            }
        )
    return json.dumps({"results": results, "usage": usage}, ensure_ascii=False)
90
+
91
+
92
@mcp.tool()
def list_indexed_files(root_path: str | None = None) -> str:
    """List the indexed files with the token count and number of chunks for each file."""
    indexer = _get_indexer(root_path)
    with indexer.open() as db:
        files = indexer.list_files(db)
    return json.dumps(files, ensure_ascii=False)
99
+
100
+
101
@mcp.tool()
def get_file_chunks(path: str, root_path: str | None = None) -> str:
    """Return every indexed chunk of a file (full content, not truncated). Use after semantic_search to view the full context."""
    indexer = _get_indexer(root_path)
    with indexer.open() as db:
        chunks = indexer.get_file_chunks(db, path)
    return json.dumps(chunks, ensure_ascii=False)
108
+
109
+
110
@mcp.tool()
def token_usage_stats(root_path: str | None = None) -> str:
    """Cumulative comparison: estimated tokens for reading the whole indexed repo vs. query + result tokens from semantic_search."""
    indexer = _get_indexer(root_path)
    with indexer.open() as db:
        report = indexer.stats(db)
    return json.dumps(report, ensure_ascii=False)
117
+
118
+
119
@mcp.tool()
def search_log(last_n: int = 50, root_path: str | None = None) -> str:
    """Show the log of recent searches: query, tokens used, tokens saved, duration."""
    indexer = _get_indexer(root_path)
    with indexer.open() as db:
        entries = indexer.search_logs(db, last_n)
    return json.dumps(entries, ensure_ascii=False)
126
+
127
+
128
def _find_server_bin() -> str:
    """Locate the ``semantic-code-index-mcp`` executable.

    A match on ``PATH`` is returned as a resolved absolute path. Otherwise the
    script is assumed to sit next to the running Python interpreter; that
    fallback path is returned without checking that it exists.
    """
    located = shutil.which("semantic-code-index-mcp")
    if not located:
        interpreter_dir = Path(sys.executable).parent
        return str(interpreter_dir / "semantic-code-index-mcp")
    return str(Path(located).resolve())
133
+
134
+
135
def _merge_mcp_entry(path: Path, server_bin: str, root: Path) -> None:
    """Add or replace the ``semantic-code-index`` server in an MCP config file.

    Other keys and other servers already present in the file are kept. The
    file and its parent directories are created when missing.

    Args:
        path: JSON config file to create or update.
        server_bin: Command that launches the MCP server.
        root: Project root exported to the server as ``SEMANTIC_CODE_ROOT``.
    """
    config: dict = {}
    if path.exists():
        try:
            loaded = json.loads(path.read_text(encoding="utf-8"))
        except (ValueError, OSError) as exc:
            # ValueError covers both malformed JSON and undecodable bytes.
            # Stay best-effort, but say that existing content is replaced
            # rather than dropping it silently.
            print(f"! Could not read {path} ({exc}); replacing it", file=sys.stderr)
        else:
            if isinstance(loaded, dict):
                config = loaded
            else:
                print(
                    f"! {path} does not contain a JSON object; replacing it",
                    file=sys.stderr,
                )

    servers = config.get("mcpServers")
    if not isinstance(servers, dict):
        # Missing, null or otherwise malformed section: start a fresh one.
        servers = config["mcpServers"] = {}
    servers["semantic-code-index"] = {
        "command": server_bin,
        "env": {"SEMANTIC_CODE_ROOT": str(root)},
    }

    path.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False can emit non-ASCII characters, so pin the encoding
    # instead of relying on the platform default.
    path.write_text(
        json.dumps(config, indent=2, ensure_ascii=False) + "\n", encoding="utf-8"
    )
    print(f"✓ Wrote {path}")
152
+
153
+
154
def _ensure_gitignore(gitignore: Path, entry: str) -> None:
    """Make sure ``entry`` is listed in the given ``.gitignore`` file.

    The file is created when it does not exist and is left untouched when the
    entry is already present.

    Args:
        gitignore: Path of the ignore file.
        entry: Ignore pattern to add, e.g. ``".mcp.json"``.
    """
    lines: list[str] = []
    if gitignore.exists():
        # Read as UTF-8 instead of the locale-dependent default.
        lines = gitignore.read_text(encoding="utf-8").splitlines()
    # Compare stripped lines so an existing entry with stray trailing
    # whitespace is not added a second time.
    if any(line.strip() == entry for line in lines):
        return
    lines.append(entry)
    gitignore.write_text("\n".join(lines) + "\n", encoding="utf-8")
161
+
162
+
163
# Markdown rule text (front matter + body) that install() writes verbatim to
# <root>/.claude/rules/semantic-search.md. It is emitted at runtime, so the
# wording below must not be edited casually.
_SEMANTIC_SEARCH_RULE = """\
---
description: Prefer semantic_search MCP tool over Grep/Glob for code exploration
globs: "**/*"
---

# Semantic Code Search

This project has a semantic code index (semantic-code-index MCP server).

When you need to explore, understand, or find code:
1. **Use `semantic_search` first** — it finds conceptually related code, not just exact string matches.
2. Only fall back to Grep/Glob when you need exact string/pattern matching (e.g. finding a specific variable name or import).
3. `semantic_search` auto-syncs changed files before searching — no need to manually sync.
4. Run `index_project` once if the project has never been indexed.

When spawning sub-agents (Agent tool) for code exploration or research, always include this instruction in the prompt:
> Use the `mcp__semantic-code-index__semantic_search` tool to search code. It finds conceptually related code via semantic vector search. Prefer it over Grep/Glob.
"""
182
+
183
+
184
def install(target: str | None = None) -> None:
    """Wire the MCP server into a project directory.

    Writes the server entry to ``.claude/mcp.json`` and ``.mcp.json``, adds
    ``.mcp.json`` and ``.claude/`` to ``.gitignore``, writes the rule file
    ``.claude/rules/semantic-search.md``, and prints the server binary, root
    and database path that will be used.

    Args:
        target: Project directory; defaults to the current working directory.
    """
    root = Path(target).expanduser().resolve() if target else Path.cwd()
    server_bin = _find_server_bin()

    _merge_mcp_entry(root / ".claude" / "mcp.json", server_bin, root)

    _merge_mcp_entry(root / ".mcp.json", server_bin, root)
    # The generated configs hold machine-specific absolute paths, so both
    # locations are added to .gitignore.
    _ensure_gitignore(root / ".gitignore", ".mcp.json")
    _ensure_gitignore(root / ".gitignore", ".claude/")

    rule_file = root / ".claude" / "rules" / "semantic-search.md"
    rule_file.parent.mkdir(parents=True, exist_ok=True)
    # The rule text contains non-ASCII characters (em dashes); write it as
    # UTF-8 so the result does not depend on the platform's locale encoding.
    rule_file.write_text(_SEMANTIC_SEARCH_RULE, encoding="utf-8")
    print(f"✓ Wrote {rule_file}")

    print(f" server: {server_bin}")
    print(f" root: {root}")
    db = CodeIndexer._default_db_path(root)
    print(f" db: {db}")
203
+
204
+
205
def main() -> None:
    """Command-line entry point.

    ``install [target]`` runs the project installer; any other invocation
    starts the MCP server.
    """
    cli_args = sys.argv[1:]
    if cli_args and cli_args[0] == "install":
        install(cli_args[1] if len(cli_args) > 1 else None)
        return
    mcp.run()