mcp-code-index 0.1.0__py3-none-any.whl

code_index/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """SQLite-backed code index for Claude Code, exposed via MCP."""
+
+ __version__ = "0.1.0"
code_index/chunker.py ADDED
@@ -0,0 +1,224 @@
+ """Per-symbol chunking with identifier expansion for retrieval."""
+
+ from __future__ import annotations
+
+ import re
+ from collections import defaultdict
+ from dataclasses import dataclass, field
+ from pathlib import Path
+
+ from .parser import ParseResult, Symbol
+
+ # Defaults; tuned to keep individual chunks under ~2KB of code.
+ MAX_SYMBOL_LINES = 80
+ WINDOW_LINES = 40
+ OVERLAP_LINES = 10
+ MAX_FILE_LINES_NO_SYMBOLS = 800
+
+ _CAMEL_BOUNDARY = re.compile(
+     r"(?<!^)(?=[A-Z][a-z])|(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|(?<=[a-zA-Z])(?=[0-9])|(?<=[0-9])(?=[a-zA-Z])"
+ )
+ _IDENT = re.compile(r"\b[A-Za-z_][A-Za-z0-9_]*\b")
+
+
+ @dataclass
+ class Chunk:
+     symbol_idx: int | None  # index into ParseResult.symbols, or None for file-level
+     start_line: int  # 1-based, inclusive
+     end_line: int  # 1-based, inclusive
+     content: str  # raw code, returned to agents
+     embedded_text: str = field(default="", repr=False)  # fed to embedder
+
+
+ def expand_identifier(ident: str) -> list[str]:
+     """Split a single identifier into lowercased word pieces.
+
+     >>> expand_identifier("getUserAuthToken")
+     ['get', 'user', 'auth', 'token']
+     >>> expand_identifier("OAUTH_REDIRECT_URI")
+     ['oauth', 'redirect', 'uri']
+     >>> expand_identifier("get_user_v2")
+     ['get', 'user', 'v', '2']
+     """
+     parts = re.split(r"[_\-.]+", ident)
+     out: list[str] = []
+     for piece in parts:
+         if not piece:
+             continue
+         spaced = _CAMEL_BOUNDARY.sub(" ", piece)
+         for word in spaced.split():
+             out.append(word.lower())
+     return out
+
+
+ def expand_identifiers(text: str, *, max_words: int = 200) -> str:
+     """Return de-duplicated word forms of every identifier in `text`."""
+     seen: set[str] = set()
+     out: list[str] = []
+     for match in _IDENT.finditer(text):
+         for word in expand_identifier(match.group(0)):
+             if len(word) < 2 or word in seen:
+                 continue
+             seen.add(word)
+             out.append(word)
+             if len(out) >= max_words:
+                 return " ".join(out)
+     return " ".join(out)
+
+
+ def chunk_file(
+     path: Path | str,
+     source: bytes,
+     parse_result: ParseResult,
+     *,
+     max_symbol_lines: int = MAX_SYMBOL_LINES,
+     window_lines: int = WINDOW_LINES,
+     overlap_lines: int = OVERLAP_LINES,
+ ) -> list[Chunk]:
+     """Produce chunks for one file. Returns empty list if file is empty."""
+     text = source.decode("utf-8", errors="replace")
+     if not text.strip():
+         return []
+     lines = text.splitlines()
+
+     if not parse_result.symbols:
+         return _file_window_chunks(
+             path, lines, parse_result.lang, window_lines, overlap_lines
+         )
+
+     children_by_parent: dict[int, list[int]] = defaultdict(list)
+     for i, sym in enumerate(parse_result.symbols):
+         if sym.parent_idx is not None:
+             children_by_parent[sym.parent_idx].append(i)
+
+     chunks: list[Chunk] = []
+     for i, sym in enumerate(parse_result.symbols):
+         children = children_by_parent.get(i, [])
+         if children:
+             first_child_start = min(parse_result.symbols[c].start_line for c in children)
+             header_end = min(first_child_start - 1, sym.end_line)
+             if header_end >= sym.start_line:
+                 chunks.append(
+                     _make_chunk(
+                         path, i, sym, sym.start_line, header_end, lines
+                     )
+                 )
+         else:
+             length = sym.end_line - sym.start_line + 1
+             if length > max_symbol_lines:
+                 chunks.extend(
+                     _windowed_symbol_chunks(
+                         path, i, sym, lines, window_lines, overlap_lines
+                     )
+                 )
+             else:
+                 chunks.append(
+                     _make_chunk(path, i, sym, sym.start_line, sym.end_line, lines)
+                 )
+     return chunks
+
+
+ def _make_chunk(
+     path: Path | str,
+     symbol_idx: int,
+     sym: Symbol,
+     start: int,
+     end: int,
+     lines: list[str],
+ ) -> Chunk:
+     content = "\n".join(lines[start - 1:end])
+     embedded = build_embedded_text(path, sym, content)
+     return Chunk(
+         symbol_idx=symbol_idx,
+         start_line=start,
+         end_line=end,
+         content=content,
+         embedded_text=embedded,
+     )
+
+
+ def _windowed_symbol_chunks(
+     path: Path | str,
+     symbol_idx: int,
+     sym: Symbol,
+     lines: list[str],
+     window_lines: int,
+     overlap_lines: int,
+ ) -> list[Chunk]:
+     out: list[Chunk] = []
+     step = max(1, window_lines - overlap_lines)
+     cursor = sym.start_line
+     while cursor <= sym.end_line:
+         end = min(cursor + window_lines - 1, sym.end_line)
+         content = "\n".join(lines[cursor - 1:end])
+         out.append(
+             Chunk(
+                 symbol_idx=symbol_idx,
+                 start_line=cursor,
+                 end_line=end,
+                 content=content,
+                 embedded_text=build_embedded_text(path, sym, content),
+             )
+         )
+         if end >= sym.end_line:
+             break
+         cursor += step
+     return out
+
+
+ def _file_window_chunks(
+     path: Path | str,
+     lines: list[str],
+     lang: str,
+     window_lines: int,
+     overlap_lines: int,
+ ) -> list[Chunk]:
+     out: list[Chunk] = []
+     total = len(lines)
+     if total == 0:
+         return out
+     capped = min(total, MAX_FILE_LINES_NO_SYMBOLS)
+     step = max(1, window_lines - overlap_lines)
+     for start_zero in range(0, capped, step):
+         end_zero = min(start_zero + window_lines, capped)
+         content = "\n".join(lines[start_zero:end_zero])
+         embedded = "\n".join(
+             [
+                 str(path),
+                 f"language: {lang}",
+                 expand_identifiers(content),
+                 content,
+             ]
+         )
+         out.append(
+             Chunk(
+                 symbol_idx=None,
+                 start_line=start_zero + 1,
+                 end_line=end_zero,
+                 content=content,
+                 embedded_text=embedded,
+             )
+         )
+         if end_zero >= capped:
+             break
+     return out
+
+
+ def build_embedded_text(path: Path | str, sym: Symbol, raw_code: str) -> str:
+     """Compose the text that gets sent to the embedder.
+
+     Per the spec: file path, signature, docstring, expanded identifiers, raw code.
+     The DB still stores raw_code in `chunks.content` — this is only for the embedder.
+     """
+     parts: list[str] = [str(path)]
+     if sym.qualified_name:
+         parts.append(sym.qualified_name)
+     if sym.signature:
+         parts.append(sym.signature)
+     if sym.docstring:
+         parts.append(sym.docstring)
+     expanded = expand_identifiers(raw_code)
+     if expanded:
+         parts.append(expanded)
+     parts.append(raw_code)
+     return "\n".join(parts)
code_index/cli.py ADDED
@@ -0,0 +1,144 @@
+ """Command-line entry point: init, reindex, watch, stats."""
+
+ from __future__ import annotations
+
+ import logging
+ import sys
+ from pathlib import Path
+
+ import click
+
+ from . import db as dbm
+ from .embedder import make_embedder
+ from .indexer import Indexer
+
+ log = logging.getLogger(__name__)
+
+
+ @click.group()
+ @click.option("--root", type=click.Path(exists=True, file_okay=False, path_type=Path),
+               default=Path.cwd, show_default="cwd",
+               help="Repo root to operate on.")
+ @click.option("-v", "--verbose", is_flag=True, help="Enable debug logging.")
+ @click.pass_context
+ def cli(ctx: click.Context, root: Path, verbose: bool) -> None:
+     """SQLite-backed code index for Claude Code."""
+     logging.basicConfig(
+         level=logging.DEBUG if verbose else logging.INFO,
+         format="%(asctime)s %(levelname)s %(name)s: %(message)s",
+     )
+     ctx.ensure_object(dict)
+     ctx.obj["root"] = root.resolve()
+
+
+ @cli.command()
+ @click.pass_context
+ def init(ctx: click.Context) -> None:
+     """Build the index from scratch (or refresh changed files)."""
+     root = ctx.obj["root"]
+     embedder = make_embedder()
+     click.echo(f"Indexing {root} (embedder={embedder.model_name}, dim={embedder.dim})")
+     indexer = Indexer(root=root, embedder=embedder)
+     try:
+         stats = indexer.reindex_all()
+     finally:
+         indexer.close()
+     click.echo(
+         f" files seen: {stats.files_seen}\n"
+         f" files indexed: {stats.files_indexed}\n"
+         f" files skipped: {stats.files_skipped}\n"
+         f" chunks added: {stats.chunks_written}\n"
+         f" elapsed: {stats.elapsed_ms} ms"
+     )
+
+
+ @cli.command()
+ @click.option("--file", "file_path", type=click.Path(path_type=Path),
+               help="Reindex one file (used by the PostToolUse hook).")
+ @click.option("--all", "all_files", is_flag=True, help="Reindex everything that changed.")
+ @click.pass_context
+ def reindex(ctx: click.Context, file_path: Path | None, all_files: bool) -> None:
+     """Reindex one file or the whole repo."""
+     root = ctx.obj["root"]
+     embedder = make_embedder()
+     indexer = Indexer(root=root, embedder=embedder)
+     try:
+         if file_path:
+             target = file_path.resolve()
+             result = indexer.reindex_file(target)
+             if result is None:
+                 click.echo(f"skip {target} (not indexable)", err=True)
+                 return
+             verb = "indexed" if result.indexed else "unchanged"
+             click.echo(
+                 f"{verb}: {target} ({result.chunk_count} chunks, "
+                 f"{result.symbol_count} symbols, {result.elapsed_ms} ms)"
+             )
+             return
+         if all_files:
+             stats = indexer.reindex_all()
+             click.echo(
+                 f"reindexed: {stats.files_indexed} indexed, {stats.files_skipped} unchanged, "
+                 f"{stats.chunks_written} chunks, {stats.elapsed_ms} ms"
+             )
+             return
+         click.echo("Specify --file PATH or --all", err=True)
+         sys.exit(2)
+     finally:
+         indexer.close()
+
+
+ @cli.command()
+ @click.pass_context
+ def watch(ctx: click.Context) -> None:
+     """Run a foreground file watcher. Reindexes files on edit."""
+     from .watcher import run_watcher
+     run_watcher(ctx.obj["root"])
+
+
+ @cli.command()
+ @click.pass_context
+ def stats(ctx: click.Context) -> None:
+     """Print index summary."""
+     conn = dbm.connect(read_only=True)
+     try:
+         meta = {
+             row["key"]: row["value"]
+             for row in conn.execute("SELECT key, value FROM meta")
+         }
+         f = conn.execute("SELECT COUNT(*) AS c FROM files").fetchone()["c"]
+         s = conn.execute("SELECT COUNT(*) AS c FROM symbols").fetchone()["c"]
+         c = conn.execute("SELECT COUNT(*) AS c FROM chunks").fetchone()["c"]
+         e = conn.execute("SELECT COUNT(*) AS c FROM edges").fetchone()["c"]
+         i = conn.execute("SELECT COUNT(*) AS c FROM file_imports").fetchone()["c"]
+         last = conn.execute(
+             "SELECT path, indexed_at FROM files ORDER BY indexed_at DESC LIMIT 1"
+         ).fetchone()
+     finally:
+         conn.close()
+
+     click.echo(f"db: {dbm.db_path()}")
+     click.echo(f"embed_model: {meta.get('embed_model', '?')}")
+     click.echo(f"embed_dim: {meta.get('embed_dim', '?')}")
+     click.echo(f"files: {f}")
+     click.echo(f"symbols: {s}")
+     click.echo(f"chunks: {c}")
+     click.echo(f"edges: {e}")
+     click.echo(f"file_imports: {i}")
+     if last:
+         click.echo(f"last update: {last['path']} (epoch {last['indexed_at']})")
+
+
+ @cli.command()
+ def serve() -> None:
+     """Start the MCP server on stdio (same as `code-index-mcp`)."""
+     from .mcp_server import main as mcp_main
+     mcp_main()
+
+
+ def main() -> None:
+     cli(obj={})
+
+
+ if __name__ == "__main__":
+     main()
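
The commands above can be smoke-tested in-process with click's test runner. A sketch, assuming the wheel is installed and that make_embedder() can construct its default backend in the current environment; the file path passed to reindex is just an example.

from click.testing import CliRunner

from code_index.cli import cli

runner = CliRunner()

# Full build, equivalent to `code-index --root . init` on the command line.
result = runner.invoke(cli, ["--root", ".", "init"])
print(result.exit_code, result.output)

# Single-file refresh, as the PostToolUse hook would invoke it.
result = runner.invoke(cli, ["--root", ".", "reindex", "--file", "code_index/chunker.py"])
print(result.exit_code, result.output)
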
code_index/db.py ADDED
@@ -0,0 +1,198 @@
+ """SQLite schema, connection management, sqlite-vec loading."""
+
+ from __future__ import annotations
+
+ import os
+ import sqlite3
+ import struct
+ from contextlib import contextmanager
+ from pathlib import Path
+ from typing import Iterator
+
+ import sqlite_vec
+
+ DEFAULT_DB_PATH = ".claude/index.db"
+
+
+ def db_path() -> Path:
+     """Resolve the configured database path."""
+     raw = os.environ.get("CODE_INDEX_DB", DEFAULT_DB_PATH)
+     return Path(raw).expanduser().resolve()
+
+
+ def serialize_vector(vec: list[float]) -> bytes:
24
+ """Pack a list of floats as little-endian float32 bytes for sqlite-vec."""
25
+ return struct.pack(f"{len(vec)}f", *vec)
26
+
27
+
28
+ def connect(path: Path | None = None, *, read_only: bool = False) -> sqlite3.Connection:
29
+ """Open a connection with sqlite-vec loaded and pragmas applied."""
30
+ target = path or db_path()
31
+ target.parent.mkdir(parents=True, exist_ok=True)
32
+
33
+ if read_only and target.exists():
34
+ uri = f"file:{target}?mode=ro"
35
+ conn = sqlite3.connect(uri, uri=True, check_same_thread=False)
36
+ else:
37
+ conn = sqlite3.connect(str(target), check_same_thread=False)
38
+
39
+ conn.row_factory = sqlite3.Row
40
+ if not hasattr(conn, "enable_load_extension"):
41
+ raise RuntimeError(
42
+ "Your Python's sqlite3 module was built without loadable extension support. "
43
+ "Use a Python built with --enable-loadable-sqlite-extensions "
44
+ "(e.g. python.org installer, pyenv with PYTHON_CONFIGURE_OPTS, or Python 3.13)."
45
+ )
46
+ conn.enable_load_extension(True)
47
+ sqlite_vec.load(conn)
48
+ conn.enable_load_extension(False)
49
+
50
+ if not read_only:
51
+ conn.execute("PRAGMA journal_mode = WAL")
52
+ conn.execute("PRAGMA synchronous = NORMAL")
53
+ conn.execute("PRAGMA foreign_keys = ON")
54
+ conn.execute("PRAGMA temp_store = MEMORY")
55
+ conn.execute("PRAGMA mmap_size = 268435456") # 256MB
56
+ return conn
57
+
58
+
59
+ def init_schema(conn: sqlite3.Connection, embed_dim: int) -> None:
60
+ """Create all tables, indexes, and virtual tables. Idempotent."""
61
+ cur = conn.cursor()
62
+ cur.executescript(SCHEMA_SQL)
63
+
64
+ # Vector virtual table dimension is baked in at creation time.
65
+ existing = cur.execute(
66
+ "SELECT name FROM sqlite_master WHERE type='table' AND name='chunks_vec'"
67
+ ).fetchone()
68
+ if existing is None:
69
+ cur.execute(
70
+ f"CREATE VIRTUAL TABLE chunks_vec USING vec0("
71
+ f"chunk_id INTEGER PRIMARY KEY, embedding FLOAT[{embed_dim}])"
72
+ )
73
+
74
+ # Persist the embedding dim so callers can detect mismatches.
75
+ cur.execute(
76
+ "INSERT INTO meta(key, value) VALUES('embed_dim', ?) "
77
+ "ON CONFLICT(key) DO UPDATE SET value=excluded.value",
78
+ (str(embed_dim),),
79
+ )
80
+ conn.commit()
81
+
82
+
83
+ def get_meta(conn: sqlite3.Connection, key: str) -> str | None:
84
+ row = conn.execute("SELECT value FROM meta WHERE key = ?", (key,)).fetchone()
85
+ return row["value"] if row else None
86
+
87
+
88
+ def set_meta(conn: sqlite3.Connection, key: str, value: str) -> None:
89
+ conn.execute(
90
+ "INSERT INTO meta(key, value) VALUES(?, ?) "
91
+ "ON CONFLICT(key) DO UPDATE SET value=excluded.value",
92
+ (key, value),
93
+ )
94
+
95
+
96
+ @contextmanager
97
+ def transaction(conn: sqlite3.Connection) -> Iterator[sqlite3.Connection]:
98
+ """Single-statement transaction wrapper. Commits on success, rolls back on error."""
99
+ try:
100
+ yield conn
101
+ conn.commit()
102
+ except Exception:
103
+ conn.rollback()
104
+ raise
105
+
106
+
107
+ SCHEMA_SQL = """
108
+ CREATE TABLE IF NOT EXISTS meta (
109
+ key TEXT PRIMARY KEY,
110
+ value TEXT NOT NULL
111
+ );
112
+
113
+ CREATE TABLE IF NOT EXISTS files (
114
+ id INTEGER PRIMARY KEY,
115
+ path TEXT UNIQUE NOT NULL,
116
+ hash TEXT NOT NULL,
117
+ lang TEXT,
118
+ mtime INTEGER,
119
+ indexed_at INTEGER NOT NULL DEFAULT (strftime('%s', 'now'))
120
+ );
121
+
122
+ CREATE INDEX IF NOT EXISTS idx_files_path ON files(path);
123
+
124
+ CREATE TABLE IF NOT EXISTS symbols (
125
+ id INTEGER PRIMARY KEY,
126
+ file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE,
127
+ name TEXT NOT NULL,
128
+ qualified_name TEXT,
129
+ kind TEXT NOT NULL,
130
+ parent_id INTEGER REFERENCES symbols(id) ON DELETE CASCADE,
131
+ start_line INTEGER NOT NULL,
132
+ end_line INTEGER NOT NULL,
133
+ signature TEXT,
134
+ docstring TEXT
135
+ );
136
+
137
+ CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name);
138
+ CREATE INDEX IF NOT EXISTS idx_symbols_qname ON symbols(qualified_name);
139
+ CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_id);
140
+ CREATE INDEX IF NOT EXISTS idx_symbols_kind ON symbols(kind);
141
+
142
+ CREATE TABLE IF NOT EXISTS unresolved_refs (
143
+ id INTEGER PRIMARY KEY,
144
+ src_symbol INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
145
+ target_name TEXT NOT NULL,
146
+ kind TEXT NOT NULL
147
+ );
148
+
149
+ CREATE INDEX IF NOT EXISTS idx_unresolved_target ON unresolved_refs(target_name);
150
+
151
+ CREATE TABLE IF NOT EXISTS edges (
152
+ src_symbol INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
153
+ dst_symbol INTEGER NOT NULL REFERENCES symbols(id) ON DELETE CASCADE,
154
+ kind TEXT NOT NULL,
155
+ PRIMARY KEY (src_symbol, dst_symbol, kind)
156
+ );
157
+
158
+ CREATE INDEX IF NOT EXISTS idx_edges_dst ON edges(dst_symbol, kind);
159
+ CREATE INDEX IF NOT EXISTS idx_edges_src ON edges(src_symbol, kind);
160
+
161
+ CREATE TABLE IF NOT EXISTS file_imports (
162
+ src_file INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE,
163
+ dst_file INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE,
164
+ PRIMARY KEY (src_file, dst_file)
165
+ );
166
+
167
+ CREATE INDEX IF NOT EXISTS idx_fimp_src ON file_imports(src_file);
168
+ CREATE INDEX IF NOT EXISTS idx_fimp_dst ON file_imports(dst_file);
169
+
170
+ CREATE TABLE IF NOT EXISTS chunks (
171
+ id INTEGER PRIMARY KEY,
172
+ file_id INTEGER NOT NULL REFERENCES files(id) ON DELETE CASCADE,
173
+ symbol_id INTEGER REFERENCES symbols(id) ON DELETE CASCADE,
174
+ start_line INTEGER NOT NULL,
175
+ end_line INTEGER NOT NULL,
176
+ content TEXT NOT NULL
177
+ );
178
+
179
+ CREATE INDEX IF NOT EXISTS idx_chunks_file ON chunks(file_id);
180
+ CREATE INDEX IF NOT EXISTS idx_chunks_symbol ON chunks(symbol_id);
181
+
182
+ CREATE VIRTUAL TABLE IF NOT EXISTS chunks_fts USING fts5(
183
+ content, content='chunks', content_rowid='id', tokenize='trigram'
184
+ );
185
+
186
+ CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN
187
+ INSERT INTO chunks_fts(rowid, content) VALUES (new.id, new.content);
188
+ END;
189
+
190
+ CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN
191
+ INSERT INTO chunks_fts(chunks_fts, rowid, content) VALUES('delete', old.id, old.content);
192
+ END;
193
+
194
+ CREATE TRIGGER IF NOT EXISTS chunks_au AFTER UPDATE ON chunks BEGIN
195
+ INSERT INTO chunks_fts(chunks_fts, rowid, content) VALUES('delete', old.id, old.content);
196
+ INSERT INTO chunks_fts(rowid, content) VALUES (new.id, new.content);
197
+ END;
198
+ """