mcp-code-index 0.1.0__py3-none-any.whl

code_index/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """SQLite-backed code index for Claude Code, exposed via MCP."""
+
+ __version__ = "0.1.0"
code_index/chunker.py ADDED
@@ -0,0 +1,224 @@
+ """Per-symbol chunking with identifier expansion for retrieval."""
+
+ from __future__ import annotations
+
+ import re
+ from collections import defaultdict
+ from dataclasses import dataclass, field
+ from pathlib import Path
+
+ from .parser import ParseResult, Symbol
+
+ # Defaults; tuned to keep individual chunks under ~2KB of code.
+ MAX_SYMBOL_LINES = 80
+ WINDOW_LINES = 40
+ OVERLAP_LINES = 10
+ MAX_FILE_LINES_NO_SYMBOLS = 800
+
+ _CAMEL_BOUNDARY = re.compile(
+     r"(?<!^)(?=[A-Z][a-z])|(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|(?<=[a-zA-Z])(?=[0-9])|(?<=[0-9])(?=[a-zA-Z])"
+ )
+ _IDENT = re.compile(r"\b[A-Za-z_][A-Za-z0-9_]*\b")
+
+
+ @dataclass
+ class Chunk:
+     symbol_idx: int | None  # index into ParseResult.symbols, or None for file-level
+     start_line: int  # 1-based, inclusive
+     end_line: int  # 1-based, inclusive
+     content: str  # raw code, returned to agents
+     embedded_text: str = field(default="", repr=False)  # fed to embedder
+
+
+ def expand_identifier(ident: str) -> list[str]:
+     """Split a single identifier into lowercased word pieces.
+
+     >>> expand_identifier("getUserAuthToken")
+     ['get', 'user', 'auth', 'token']
+     >>> expand_identifier("OAUTH_REDIRECT_URI")
+     ['oauth', 'redirect', 'uri']
+     >>> expand_identifier("get_user_v2")
+     ['get', 'user', 'v', '2']
+     """
+     parts = re.split(r"[_\-.]+", ident)
+     out: list[str] = []
+     for piece in parts:
+         if not piece:
+             continue
+         spaced = _CAMEL_BOUNDARY.sub(" ", piece)
+         for word in spaced.split():
+             out.append(word.lower())
+     return out
+
+
+ def expand_identifiers(text: str, *, max_words: int = 200) -> str:
+     """Return de-duplicated word forms of every identifier in `text`."""
+     seen: set[str] = set()
+     out: list[str] = []
+     for match in _IDENT.finditer(text):
+         for word in expand_identifier(match.group(0)):
+             if len(word) < 2 or word in seen:
+                 continue
+             seen.add(word)
+             out.append(word)
+             if len(out) >= max_words:
+                 return " ".join(out)
+     return " ".join(out)
+
+
+ def chunk_file(
+     path: Path | str,
+     source: bytes,
+     parse_result: ParseResult,
+     *,
+     max_symbol_lines: int = MAX_SYMBOL_LINES,
+     window_lines: int = WINDOW_LINES,
+     overlap_lines: int = OVERLAP_LINES,
+ ) -> list[Chunk]:
+     """Produce chunks for one file. Returns empty list if file is empty."""
+     text = source.decode("utf-8", errors="replace")
+     if not text.strip():
+         return []
+     lines = text.splitlines()
+
+     if not parse_result.symbols:
+         return _file_window_chunks(
+             path, lines, parse_result.lang, window_lines, overlap_lines
+         )
+
+     children_by_parent: dict[int, list[int]] = defaultdict(list)
+     for i, sym in enumerate(parse_result.symbols):
+         if sym.parent_idx is not None:
+             children_by_parent[sym.parent_idx].append(i)
+
+     chunks: list[Chunk] = []
+     for i, sym in enumerate(parse_result.symbols):
+         children = children_by_parent.get(i, [])
+         if children:
+             first_child_start = min(parse_result.symbols[c].start_line for c in children)
+             header_end = min(first_child_start - 1, sym.end_line)
+             if header_end >= sym.start_line:
+                 chunks.append(
+                     _make_chunk(
+                         path, i, sym, sym.start_line, header_end, lines
+                     )
+                 )
+         else:
+             length = sym.end_line - sym.start_line + 1
+             if length > max_symbol_lines:
+                 chunks.extend(
+                     _windowed_symbol_chunks(
+                         path, i, sym, lines, window_lines, overlap_lines
+                     )
+                 )
+             else:
+                 chunks.append(
+                     _make_chunk(path, i, sym, sym.start_line, sym.end_line, lines)
+                 )
+     return chunks
+
+
+ def _make_chunk(
+     path: Path | str,
+     symbol_idx: int,
+     sym: Symbol,
+     start: int,
+     end: int,
+     lines: list[str],
+ ) -> Chunk:
+     content = "\n".join(lines[start - 1:end])
+     embedded = build_embedded_text(path, sym, content)
+     return Chunk(
+         symbol_idx=symbol_idx,
+         start_line=start,
+         end_line=end,
+         content=content,
+         embedded_text=embedded,
+     )
+
+
+ def _windowed_symbol_chunks(
+     path: Path | str,
+     symbol_idx: int,
+     sym: Symbol,
+     lines: list[str],
+     window_lines: int,
+     overlap_lines: int,
+ ) -> list[Chunk]:
+     out: list[Chunk] = []
+     step = max(1, window_lines - overlap_lines)
+     cursor = sym.start_line
+     while cursor <= sym.end_line:
+         end = min(cursor + window_lines - 1, sym.end_line)
+         content = "\n".join(lines[cursor - 1:end])
+         out.append(
+             Chunk(
+                 symbol_idx=symbol_idx,
+                 start_line=cursor,
+                 end_line=end,
+                 content=content,
+                 embedded_text=build_embedded_text(path, sym, content),
+             )
+         )
+         if end >= sym.end_line:
+             break
+         cursor += step
+     return out
+
+
+ def _file_window_chunks(
+     path: Path | str,
+     lines: list[str],
+     lang: str,
+     window_lines: int,
+     overlap_lines: int,
+ ) -> list[Chunk]:
+     out: list[Chunk] = []
+     total = len(lines)
+     if total == 0:
+         return out
+     capped = min(total, MAX_FILE_LINES_NO_SYMBOLS)
+     step = max(1, window_lines - overlap_lines)
+     for start_zero in range(0, capped, step):
+         end_zero = min(start_zero + window_lines, capped)
+         content = "\n".join(lines[start_zero:end_zero])
+         embedded = "\n".join(
+             [
+                 str(path),
+                 f"language: {lang}",
+                 expand_identifiers(content),
+                 content,
+             ]
+         )
+         out.append(
+             Chunk(
+                 symbol_idx=None,
+                 start_line=start_zero + 1,
+                 end_line=end_zero,
+                 content=content,
+                 embedded_text=embedded,
+             )
+         )
+         if end_zero >= capped:
+             break
+     return out
+
+
+ def build_embedded_text(path: Path | str, sym: Symbol, raw_code: str) -> str:
+     """Compose the text that gets sent to the embedder.
+
+     Per the spec: file path, signature, docstring, expanded identifiers, raw code.
+     The DB still stores raw_code in `chunks.content` — this is only for the embedder.
+     """
+     parts: list[str] = [str(path)]
+     if sym.qualified_name:
+         parts.append(sym.qualified_name)
+     if sym.signature:
+         parts.append(sym.signature)
+     if sym.docstring:
+         parts.append(sym.docstring)
+     expanded = expand_identifiers(raw_code)
+     if expanded:
+         parts.append(expanded)
+     parts.append(raw_code)
+     return "\n".join(parts)
code_index/cli.py ADDED
@@ -0,0 +1,144 @@
+ """Command-line entry point: init, reindex, watch, stats."""
+
+ from __future__ import annotations
+
+ import logging
+ import sys
+ from pathlib import Path
+
+ import click
+
+ from . import db as dbm
+ from .embedder import make_embedder
+ from .indexer import Indexer
+
+ log = logging.getLogger(__name__)
+
+
+ @click.group()
+ @click.option("--root", type=click.Path(exists=True, file_okay=False, path_type=Path),
+               default=Path.cwd, show_default="cwd",
+               help="Repo root to operate on.")
+ @click.option("-v", "--verbose", is_flag=True, help="Enable debug logging.")
+ @click.pass_context
+ def cli(ctx: click.Context, root: Path, verbose: bool) -> None:
+     """SQLite-backed code index for Claude Code."""
+     logging.basicConfig(
+         level=logging.DEBUG if verbose else logging.INFO,
+         format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+     )
+     ctx.ensure_object(dict)
+     ctx.obj["root"] = root.resolve()
+
+
+ @cli.command()
+ @click.pass_context
+ def init(ctx: click.Context) -> None:
+     """Build the index from scratch (or refresh changed files)."""
+     root = ctx.obj["root"]
+     embedder = make_embedder()
+     click.echo(f"Indexing {root} (embedder={embedder.model_name}, dim={embedder.dim})")
+     indexer = Indexer(root=root, embedder=embedder)
+     try:
+         stats = indexer.reindex_all()
+     finally:
+         indexer.close()
+     click.echo(
+         f" files seen: {stats.files_seen}\n"
+         f" files indexed: {stats.files_indexed}\n"
+         f" files skipped: {stats.files_skipped}\n"
+         f" chunks added: {stats.chunks_written}\n"
+         f" elapsed: {stats.elapsed_ms} ms"
+     )
+
+
+ @cli.command()
+ @click.option("--file", "file_path", type=click.Path(path_type=Path),
+               help="Reindex one file (used by the PostToolUse hook).")
+ @click.option("--all", "all_files", is_flag=True, help="Reindex everything that changed.")
+ @click.pass_context
+ def reindex(ctx: click.Context, file_path: Path | None, all_files: bool) -> None:
+     """Reindex one file or the whole repo."""
+     root = ctx.obj["root"]
+     embedder = make_embedder()
+     indexer = Indexer(root=root, embedder=embedder)
+     try:
+         if file_path:
+             target = file_path.resolve()
+             result = indexer.reindex_file(target)
+             if result is None:
+                 click.echo(f"skip {target} (not indexable)", err=True)
+                 return
+             verb = "indexed" if result.indexed else "unchanged"
+             click.echo(
+                 f"{verb}: {target} ({result.chunk_count} chunks, "
+                 f"{result.symbol_count} symbols, {result.elapsed_ms} ms)"
+             )
+             return
+         if all_files:
+             stats = indexer.reindex_all()
+             click.echo(
+                 f"reindexed: {stats.files_indexed} indexed, {stats.files_skipped} unchanged, "
+                 f"{stats.chunks_written} chunks, {stats.elapsed_ms} ms"
+             )
+             return
+         click.echo("Specify --file PATH or --all", err=True)
+         sys.exit(2)
+     finally:
+         indexer.close()
+
+
+ @cli.command()
+ @click.pass_context
+ def watch(ctx: click.Context) -> None:
+     """Run a foreground file watcher. Reindexes files on edit."""
+     from .watcher import run_watcher
+     run_watcher(ctx.obj["root"])
+
+
+ @cli.command()
+ @click.pass_context
+ def stats(ctx: click.Context) -> None:
+     """Print index summary."""
+     conn = dbm.connect(read_only=True)
+     try:
+         meta = {
+             row["key"]: row["value"]
+             for row in conn.execute("SELECT key, value FROM meta")
+         }
+         f = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone()["c"]
+         s = conn.execute("SELECT COUNT(*) AS c FROM symbols").fetchone()["c"]
+         c = conn.execute("SELECT COUNT(*) AS c FROM chunks").fetchone()["c"]
+         e = conn.execute("SELECT COUNT(*) AS c FROM edges").fetchone()["c"]
+         i = conn.execute("SELECT COUNT(*) AS c FROM file_imports").fetchone()["c"]
+         last = conn.execute(
+             "SELECT path, indexed_at FROM files ORDER BY indexed_at DESC LIMIT 1"
+         ).fetchone()
+     finally:
+         conn.close()
+
+     click.echo(f"db: {dbm.db_path()}")
+     click.echo(f"embed_model: {meta.get('embed_model', '?')}")
+     click.echo(f"embed_dim: {meta.get('embed_dim', '?')}")
+     click.echo(f"files: {f}")
+     click.echo(f"symbols: {s}")
+     click.echo(f"chunks: {c}")
+     click.echo(f"edges: {e}")
+     click.echo(f"file_imports: {i}")
+     if last:
+         click.echo(f"last update: {last['path']} (epoch {last['indexed_at']})")
+
+
+ @cli.command()
+ def serve() -> None:
+     """Start the MCP server on stdio (same as `code-index-mcp`)."""
+     from .mcp_server import main as mcp_main
+     mcp_main()
+
+
+ def main() -> None:
+     cli(obj={})
+
+
+ if __name__ == "__main__":
+     main()
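
The commands above can be smoke-tested in-process with click's test runner. A sketch, assuming the wheel is installed and that make_embedder() can construct its default backend in the current environment; the file path passed to reindex is just an example.

from click.testing import CliRunner

from code_index.cli import cli

runner = CliRunner()

# Full build, equivalent to `code-index --root . init` on the command line.
result = runner.invoke(cli, ["--root", ".", "init"])
print(result.exit_code, result.output)

# Single-file refresh, as the PostToolUse hook would invoke it.
result = runner.invoke(cli, ["--root", ".", "reindex", "--file", "code_index/chunker.py"])
print(result.exit_code, result.output)
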
code_index/db.py ADDED
@@ -0,0 +1,198 @@
+ """SQLite schema, connection management, sqlite-vec loading."""
+
+ from __future__ import annotations
+
+ import os
+ import sqlite3
+ import struct
+ from contextlib import contextmanager
+ from pathlib import Path
+ from typing import Iterator
+
+ import sqlite_vec
+
+ DEFAULT_DB_PATH = ".claude/index.db"
+
+
+ def db_path() -> Path:
+     """Resolve the configured database path."""
+     raw = os.environ.get("CODE_INDEX_DB", DEFAULT_DB_PATH)
+     return Path(raw).expanduser().resolve()
+
+
+ def serialize_vector(vec: list[float]) -> bytes:
24
+ """Pack a list of floats as little-endian float32 bytes for sqlite-vec."""
25
+ return struct.pack(f"{len(vec)}f", *vec)
26
+
27
+
28
+ def connect(path: Path | None = None, *, read_only: bool = False) -> sqlite3.Connection:
29
+ """Open a connection with sqlite-vec loaded and pragmas applied."""
30
+ target = path or db_path()
31
+ target.parent.mkdir(parents=True, exist_ok=True)
32
+
33
+ if read_only and target.exists():
34
+ uri = f"file:{target}?mode=ro"
35
+ conn = sqlite3.connect(uri, uri=True, check_same_thread=False)
36
+ else:
37
+ conn = sqlite3.connect(str(target), check_same_thread=False)
38
+
39
+ conn.row_factory = sqlite3.Row
40
+ if not hasattr(conn, "enable_load_extension"):
41
+ raise RuntimeError(
42
+ "Your Python's sqlite3 module was built without loadable extension support. "
43
+ "Use a Python built with --enable-loadable-sqlite-extensions "
44
+ "(e.g. python.org installer, pyenv with PYTHON_CONFIGURE_OPTS, or Python 3.13)."
45
+ )
46
+ conn.enable_load_extension(True)
47
+ sqlite_vec.load(conn)
48
+ conn.enable_load_extension(False)
49
+
50
+ if not read_only:
51
+ conn.execute("PRAGMA journal_mode = WAL")
52
+ conn.execute("PRAGMA synchronous = NORMAL")
53
+ conn.execute("PRAGMA foreign_keys = ON")
54
+ conn.execute("PRAGMA temp_store = MEMORY")
55
+ conn.execute("PRAGMA mmap_size = 268435456") # 256MB
56
+ return conn
57
+
58
+
59
+ def init_schema(conn: sqlite3.Connection, embed_dim: int) -> None:
60
+ """Create all tables, indexes, and virtual tables. Idempotent."""
61
+ cur = conn.cursor()
62
+ cur.executescript(SCHEMA_SQL)
63
+
64
+ # Vector virtual table dimension is baked in at creation time.
65
+ existing = cur.execute(
66
+ "SELECT name FROM sqlite_master WHERE type='table' AND name='chunks_vec'"
67
+ ).fetchone()
68
+ if existing is None:
69
+ cur.execute(
70
+ f"CREATE VIRTUAL TABLE chunks_vec USING vec0("
71
+ f"chunk_id INTEGER PRIMARY KEY, embedding FLOAT[{embed_dim}])"
72
+ )
73
+
74
+ # Persist the embedding dim so callers can detect mismatches.
75
+ cur.execute(
76
+ "INSERT INTO meta(key, value) VALUES('embed_dim', ?) "
77
+ "ON CONFLICT(key) DO UPDATE SET value=excluded.value",
78
+ (str(embed_dim),),
79
+ )
80
+ conn.commit()
81
+
82
+
83
+ def get_meta(conn: sqlite3.Connection, key: str) -> str | None:
84
+ row = conn.execute("SELECT value FROM meta WHERE key = ?", (key,)).fetchone()
85
+ return row["value"] if row else None
86
+
87
+
88
+ def set_meta(conn: sqlite3.Connection, key: str, value: str) -> None:
89
+ conn.execute(
90
+ "INSERT INTO meta(key, value) VALUES(?, ?) "
91
+ "ON CONFLICT(key) DO UPDATE SET value=excluded.value",
92
+ (key, value),
93
+ )
94
+
95
+
96
+ @contextmanager
97
+ def transaction(conn: sqlite3.Connection) -> Iterator[sqlite3.Connection]:
98
+ """Single-statement transaction wrapper. Commits on success, rolls back on error."""
99
+ try:
100
+ yield conn
101
+ conn.commit()
102
+ except Exception:
103
+ conn.rollback()
104
+ raise
105
+
106
+
107
+ SCHEMA_SQL = """
108
+ CREATE TABLE IF NOT EXISTS meta (
109
+ key TEXT PRIMARY KEY,
110
+ value TEXT NOT NULL
111
+ );
112
+
113
+ CREATE TABLE IF NOT EXISTS files (
114
+ id INTEGER PRIMARY KEY,
115
+ path TEXT UNIQUE NOT NULL,
116
+ hash TEXT NOT NULL,
117
+ lang TEXT,
118
+ mtime INTEGER,
119
+ indexed_at INTEGER NOT NULL DEFAULT (strftime('%s', 'now'))
120
+ );
121
+
122
+ CREATE INDEX IF NOT EXISTS idx_files_path ON files(path);
123
+
124
+ CREATE TABLE IF NOT EXISTS symbols (
125
+ id INTEGER PRIMARY KEY,
126
+ file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE,
127
+ name TEXT NOT NULL,
128
+ qualified_name TEXT,
129
+ kind TEXT NOT NULL,
130
+ parent_id INTEGER REFERENCES symbols(id) ON DELETE CASCADE,
131
+ start_line INTEGER NOT NULL,
132
+ end_line INTEGER NOT NULL,
133
+ signature TEXT,
134
+ docstring TEXT
135
+ );
136
+
137
+ CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name);
138
+ CREATE INDEX IF NOT EXISTS idx_symbols_qname ON symbols(qualified_name);
139
+ CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id);
140
+ CREATE INDEX IF NOT EXISTS idx_symbols_kind ON symbols(kind);
141
+
142
+ CREATE TABLE IF NOT EXISTS unresolved_refs (
143
+ id INTEGER PRIMARY KEY,
144
+ src_symbol INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
145
+ target_name TEXT NOT NULL,
146
+ kind TEXT NOT NULL
147
+ );
148
+
149
+ CREATE INDEX IF NOT EXISTS idx_unresolved_target ON unresolved_refs(target_name);
150
+
151
+ CREATE TABLE IF NOT EXISTS edges (
152
+ src_symbol INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
153
+ dst_symbol INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
154
+ kind TEXT NOT NULL,
155
+ PRIMARY KEY (src_symbol, dst_symbol, kind)
156
+ );
157
+
158
+ CREATE INDEX IF NOT EXISTS idx_edges_dst ON edges(dst_symbol, kind);
159
+ CREATE INDEX IF NOT EXISTS idx_edges_src ON edges(src_symbol, kind);
160
+
161
+ CREATE TABLE IF NOT EXISTS file_imports (
162
+ src_file INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE,
163
+ dst_file INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE,
164
+ PRIMARY KEY (src_file, dst_file)
165
+ );
166
+
167
+ CREATE INDEX IF NOT EXISTS idx_fimp_src ON file_imports(src_file);
168
+ CREATE INDEX IF NOT EXISTS idx_fimp_dst ON file_imports(dst_file);
169
+
170
+ CREATE TABLE IF NOT EXISTS chunks (
171
+ id INTEGER PRIMARY KEY,
172
+ file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE,
173
+ symbol_id INTEGER REFERENCES symbols(id) ON DELETE CASCADE,
174
+ start_line INTEGER NOT NULL,
175
+ end_line INTEGER NOT NULL,
176
+ content TEXT NOT NULL
177
+ );
178
+
179
+ CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file_id);
180
+ CREATE INDEX IF NOT EXISTS idx_chunks_symbol ON chunks(symbol_id);
181
+
182
+ CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
183
+ content, content='chunks', content_rowid='id', tokenize='trigram'
184
+ );
185
+
186
+ CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN
187
+ INSERT INTO chunks_fts(rowid, content) VALUES (new.id, new.content);
188
+ END;
189
+
190
+ CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN
191
+ INSERT INTO chunks_fts(chunks_fts, rowid, content) VALUES('delete', old.id, old.content);
192
+ END;
193
+
194
+ CREATE TRIGGER IF NOT EXISTS chunks_au AFTER UPDATE ON chunks BEGIN
195
+ INSERT INTO chunks_fts(chunks_fts, rowid, content) VALUES('delete', old.id, old.content);
196
+ INSERT INTO chunks_fts(rowid, content) VALUES (new.id, new.content);
197
+ END;
198
+ """