PyPI - codebase-index - Versions diffs - 1.6.0__py3-none-any.whl - Mend

codebase-index 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

codebase_index/__init__.py +7 -0
codebase_index/__main__.py +3 -0
codebase_index/cli.py +916 -0
codebase_index/config.py +110 -0
codebase_index/discovery/__init__.py +10 -0
codebase_index/discovery/classify.py +151 -0
codebase_index/discovery/ignore.py +58 -0
codebase_index/discovery/walker.py +75 -0
codebase_index/doctor.py +138 -0
codebase_index/embeddings/__init__.py +2 -0
codebase_index/embeddings/backend.py +67 -0
codebase_index/embeddings/external.py +56 -0
codebase_index/embeddings/local.py +41 -0
codebase_index/embeddings/noop.py +15 -0
codebase_index/graph/__init__.py +8 -0
codebase_index/graph/analysis.py +468 -0
codebase_index/graph/builder.py +160 -0
codebase_index/graph/expand.py +136 -0
codebase_index/graph/export.py +381 -0
codebase_index/graph/navigate.py +201 -0
codebase_index/indexer/__init__.py +8 -0
codebase_index/indexer/doc_chunks.py +202 -0
codebase_index/indexer/freshness.py +109 -0
codebase_index/indexer/pipeline.py +423 -0
codebase_index/mcp/__init__.py +2 -0
codebase_index/mcp/server.py +354 -0
codebase_index/models.py +145 -0
codebase_index/output/__init__.py +6 -0
codebase_index/output/json.py +13 -0
codebase_index/output/markdown.py +316 -0
codebase_index/output/redact.py +31 -0
codebase_index/parsers/__init__.py +9 -0
codebase_index/parsers/base.py +47 -0
codebase_index/parsers/languages.py +290 -0
codebase_index/parsers/line_chunker.py +39 -0
codebase_index/parsers/symbol_chunks.py +62 -0
codebase_index/parsers/treesitter.py +439 -0
codebase_index/retrieval/__init__.py +9 -0
codebase_index/retrieval/budget.py +82 -0
codebase_index/retrieval/fusion.py +62 -0
codebase_index/retrieval/intent.py +56 -0
codebase_index/retrieval/pipeline.py +207 -0
codebase_index/retrieval/rerank.py +69 -0
codebase_index/retrieval/searchers.py +291 -0
codebase_index/retrieval/skeleton.py +251 -0
codebase_index/retrieval/types.py +79 -0
codebase_index/scaffold.py +399 -0
codebase_index/service.py +158 -0
codebase_index/skill_template/SKILL.md +198 -0
codebase_index/skill_template/examples/hooks/settings.json +16 -0
codebase_index/skill_template/scripts/cbx +25 -0
codebase_index/skill_template/scripts/cbx.ps1 +25 -0
codebase_index/skill_update.py +150 -0
codebase_index/storage/__init__.py +8 -0
codebase_index/storage/db.py +116 -0
codebase_index/storage/repo.py +701 -0
codebase_index/storage/schema.sql +125 -0
codebase_index/watch/__init__.py +5 -0
codebase_index/watch/watcher.py +93 -0
codebase_index-1.6.0.dist-info/METADATA +748 -0
codebase_index-1.6.0.dist-info/RECORD +64 -0
codebase_index-1.6.0.dist-info/WHEEL +4 -0
codebase_index-1.6.0.dist-info/entry_points.txt +4 -0
codebase_index-1.6.0.dist-info/licenses/LICENSE +21 -0

codebase_index/mcp/server.py ADDED Viewed

@@ -0,0 +1,354 @@
+"""MCP server exposing codebase-index retrieval as tools for Claude.
+Wraps the same retrieval/ layer the CLI uses — no subprocess overhead.
+Launch via: codebase-index mcp  (or codebase-index-mcp as a standalone entry point)
+MCP client config example (.claude/settings.json):
+  {
+    "mcpServers": {
+      "codebase-index": {
+        "command": "codebase-index",
+        "args": ["mcp"],
+        "cwd": "/path/to/your/project"
+      }
+    }
+  }
+"""
+from __future__ import annotations
+import inspect
+import json
+import os
+import sys
+from pathlib import Path
+from typing import TYPE_CHECKING, Optional
+from .. import __version__
+if TYPE_CHECKING:
+    from ..config import Config
+try:
+    from mcp.server.fastmcp import FastMCP
+except ImportError as exc:  # pragma: no cover
+    raise ImportError(
+        "MCP server needs the optional extra: pip install codebase-index[mcp]"
+    ) from exc
+# Module-level FastMCP server instance. The `@_tool()`-decorated functions below
+# register on it, and `run()` serves it over stdio. The `instructions` text is
+# passed to FastMCP as-is (presumably surfaced to MCP clients — confirm in the
+# mcp SDK docs).
+mcp = FastMCP(
+    "codebase-index",
+    instructions=(
+        "Local codebase index. Use search_code for general queries, find_symbol for exact "
+        "symbol lookups, find_refs to find callers/usages, impact_of for blast-radius analysis, "
+        "explain_code for architecture/how-it-works questions, and architecture_overview to map "
+        "the codebase's modules, god nodes, and surprising connections before diving in."
+    ),
+)
+# Contract version for every structured tool payload. Bump on a breaking change
+# (field removal / type change); additive fields keep the same version. Every tool
+# return — including errors — is wrapped by `_emit`, so clients can branch on
+# `schema_version` and `tool` without sniffing the shape. See docs/MCP.md.
+# `_emit` writes this value under the "schema_version" key of each response.
+MCP_SCHEMA_VERSION = 1
+def _emit(tool: str, payload: dict) -> str:
+    """Serialize a tool payload inside the stable MCP envelope.
+    `schema_version` and `tool` lead; the payload follows. A payload key never
+    shadows the envelope (payloads do not carry these keys), but the explicit
+    order makes the contract self-describing in the raw JSON.
+    """
+    return json.dumps({"schema_version": MCP_SCHEMA_VERSION, "tool": tool, **payload})
+# Tools return JSON *strings* (unstructured text). Newer FastMCP otherwise
+# auto-builds a structured-output schema from the `-> str` return annotation,
+# which crashes on some mcp/pydantic combinations (mcp>=1.27 + pydantic 2.10).
+# Force unstructured output where the kwarg exists; older mcp (>=1.0) lacks it.
+_SUPPORTS_STRUCTURED_OUTPUT = "structured_output" in inspect.signature(mcp.tool).parameters
+def _tool():
+    if _SUPPORTS_STRUCTURED_OUTPUT:
+        return mcp.tool(structured_output=False)
+    return mcp.tool()
+def _resolve_db() -> tuple[Path, "Config"]:
+    """Return (db_path, config). Respects CBX_DB_PATH and CBX_ROOT env vars."""
+    from ..service import resolve_db
+    root_env = os.environ.get("CBX_ROOT")
+    return resolve_db(Path(root_env) if root_env else None)
+def _search_backend(cfg: "Config"):
+    # stdout carries the JSON-RPC stream — warnings must go to stderr.
+    from ..service import search_backend
+    return search_backend(cfg, warn=lambda m: print(m, file=sys.stderr))
+def _no_index_payload() -> dict:
+    """Return the error payload tools emit when the index database file is missing.
+    A fresh dict is built on every call, holding a single "error" message.
+    """
+    return {"error": "No index found. Run `codebase-index index` in your project first."}
+@_tool()
+def healthcheck() -> str:
+    """Report package, root, and index health for MCP clients."""
+    db_path, cfg = _resolve_db()
+    payload: dict[str, object] = {
+        "package_version": __version__,
+        "root": str(cfg.root),
+        "index": {"exists": db_path.exists(), "path": str(db_path)},
+    }
+    if db_path.exists():
+        from ..indexer.freshness import compute_freshness
+        from ..storage.db import Database
+        with Database(db_path) as db:
+            payload["index"] = {
+                "exists": True,
+                "path": str(db_path),
+                **compute_freshness(db.conn, Path(cfg.root), cfg).model_dump(),
+            }
+    return _emit("healthcheck", payload)
+@_tool()
+def search_code(
+    query: str,
+    mode: str = "hybrid",
+    limit: int = 10,
+    token_budget: int = 1500,
+    offset: int = 0,
+    raw: bool = False,
+) -> str:
+    """Hybrid search over the codebase index.
+    Returns ranked results with file paths, line ranges, symbol names, and
+    recommended_reads — the exact ranges to open next.
+    When the response includes a ``pagination`` key, pass ``next_offset`` as
+    ``offset`` in the next call to retrieve the following page of results.
+    Args:
+        query: Natural-language or keyword search query.
+        mode: Search mode — "hybrid" (default), "fts" (full-text), or "symbol".
+        limit: Maximum number of results to return per page.
+        token_budget: Token budget for the response payload.
+        offset: Result offset for pagination. Pass ``next_offset`` from a
+                previous response to fetch the next page.
+        raw: If true, return full raw snippets instead of skeletons.
+    """
+    db_path, cfg = _resolve_db()
+    if not db_path.exists():
+        return _emit("search_code", _no_index_payload())
+    from ..service import search_payload
+    payload = search_payload(
+        db_path, cfg, query, mode=mode, limit=limit, offset=offset,
+        token_budget=token_budget, no_fallback=False, backend=_search_backend(cfg),
+        raw=raw,
+    )
+    return _emit("search_code", payload)
+@_tool()
+def find_symbol(
+    name: str,
+    kind: Optional[str] = None,
+    exact: bool = False,
+) -> str:
+    """Locate a symbol definition by name (function, class, method, etc.).
+    Returns file path, line range, and signature for each match.
+    Args:
+        name: Symbol name to look up (e.g. "parse_file", "Database", "MyClass.method").
+        kind: Optional filter — "function", "class", "method", "struct", etc.
+        exact: If True, only exact name matches are returned (no prefix/fuzzy).
+    """
+    db_path, _ = _resolve_db()
+    if not db_path.exists():
+        return _emit("find_symbol", _no_index_payload())
+    from ..retrieval.searchers import symbol_lookup
+    from ..storage.db import Database
+    with Database(db_path) as db:
+        resp = symbol_lookup(db.conn, name, kind=kind, exact=exact)
+    return _emit("find_symbol", resp.model_dump())
+@_tool()
+def find_refs(
+    symbol: str,
+    kind: str = "all",
+) -> str:
+    """Find all references and callers of a symbol.
+    Returns call sites with file path and line number.
+    Args:
+        symbol: Symbol name whose references to find.
+        kind: "callers" for call edges only, "all" for any reference type.
+    """
+    db_path, _ = _resolve_db()
+    if not db_path.exists():
+        return _emit("find_refs", _no_index_payload())
+    from ..retrieval.searchers import refs_lookup
+    from ..storage.db import Database
+    with Database(db_path) as db:
+        resp = refs_lookup(db.conn, symbol, kind=kind)
+    return _emit("find_refs", resp.model_dump())
+@_tool()
+def impact_of(
+    target: str,
+    depth: int = 2,
+    direction: str = "up",
+) -> str:
+    """Blast-radius analysis: what is affected if `target` changes.
+    Walks the dependency/call graph and returns affected files and symbols.
+    Args:
+        target: File path (relative) or symbol name to analyse.
+        depth: How many graph hops to follow (default 2).
+        direction: "up" (what depends on target), "down" (what target depends on), or "both".
+    """
+    db_path, _ = _resolve_db()
+    if not db_path.exists():
+        return _emit("impact_of", _no_index_payload())
+    from ..graph.expand import impact_lookup
+    from ..storage.db import Database
+    with Database(db_path) as db:
+        resp = impact_lookup(db.conn, target, depth=depth, direction=direction)
+    return _emit("impact_of", resp.model_dump())
+@_tool()
+def explain_code(
+    query: str,
+    token_budget: int = 2200,
+    offset: int = 0,
+    raw: bool = False,
+) -> str:
+    """Intent-aware retrieval for architecture / how-does-X-work questions.
+    Uses a higher token budget and how-it-works intent weights compared to search_code.
+    Supports the same pagination protocol as search_code.
+    Args:
+        query: Question about the codebase (e.g. "how does the retrieval pipeline work").
+        token_budget: Token budget for the response payload.
+        offset: Result offset for pagination. Pass ``next_offset`` from a
+                previous response to fetch the next page.
+        raw: If true, return full raw snippets instead of skeletons.
+    """
+    db_path, cfg = _resolve_db()
+    if not db_path.exists():
+        return _emit("explain_code", _no_index_payload())
+    from ..service import normalize_explain_query, search_payload
+    payload = search_payload(
+        db_path, cfg, normalize_explain_query(query), mode="hybrid", limit=10,
+        offset=offset, token_budget=token_budget, no_fallback=False,
+        backend=_search_backend(cfg), raw=raw,
+    )
+    return _emit("explain_code", payload)
+@_tool()
+def architecture_overview() -> str:
+    """High-level map of the codebase from the cached graph analytics.
+    Returns the detected modules (communities), god nodes (most-connected
+    symbols/files), surprising cross-module connections, and suggested starting
+    questions. Use this to orient before diving into specifics. Rebuild the index
+    if it reports ``available: false``.
+    """
+    db_path, cfg = _resolve_db()
+    if not db_path.exists():
+        return _emit("architecture_overview", _no_index_payload())
+    from ..service import architecture_payload
+    payload = architecture_payload(db_path, cfg)
+    return _emit("architecture_overview", payload)
+@_tool()
+def path_between(source: str, target: str) -> str:
+    """Shortest dependency/call path between two symbols or files.
+    Answers "how is X connected to Y" — returns the chain of nodes and the edge
+    types (with confidence) linking them. Useful for tracing how a request reaches
+    the database, or how two modules touch.
+    Args:
+        source: File path (relative) or symbol name to start from.
+        target: File path (relative) or symbol name to reach.
+    """
+    db_path, _ = _resolve_db()
+    if not db_path.exists():
+        return _emit("path_between", _no_index_payload())
+    from ..graph.navigate import path_payload
+    from ..storage.db import Database
+    with Database(db_path) as db:
+        payload = path_payload(db.conn, source, target)
+    return _emit("path_between", payload)
+@_tool()
+def describe_symbol(symbol: str) -> str:
+    """Node card for a symbol: definition(s), callers, callees, centrality, module.
+    A compact "what is this and how does it sit in the graph" view — the in/out
+    degree, its module, whether it's a god node, and its direct callers/callees.
+    Args:
+        symbol: Symbol name to describe (e.g. "Database", "build_index").
+    """
+    db_path, _ = _resolve_db()
+    if not db_path.exists():
+        return _emit("describe_symbol", _no_index_payload())
+    from ..graph.navigate import describe_payload
+    from ..storage.db import Database
+    with Database(db_path) as db:
+        payload = describe_payload(db.conn, symbol)
+    return _emit("describe_symbol", payload)
+@_tool()
+def index_stats() -> str:
+    """Return index freshness, file count, symbol count, and per-language coverage."""
+    db_path, _ = _resolve_db()
+    if not db_path.exists():
+        return _emit("index_stats", {"exists": False, "error": "No index found."})
+    from ..service import stats_payload
+    from ..storage.db import Database
+    with Database(db_path) as db:
+        payload = stats_payload(db.conn)
+    return _emit("index_stats", payload)
+def run() -> None:
+    """Entry point for the standalone `codebase-index-mcp` script.
+    Starts the module-level `mcp` server on the stdio transport.
+    """
+    mcp.run(transport="stdio")

codebase_index/models.py ADDED Viewed

@@ -0,0 +1,145 @@
+"""Shared result models (pydantic). The same shapes feed both JSON and Markdown renderers.
+Mirrors the payload documented in docs/RETRIEVAL.md §8.
+"""
+from __future__ import annotations
+from typing import Iterable, Literal, Optional
+from pydantic import BaseModel, Field
+# Closed set of query-intent labels; the type of `SearchResponse.intent`.
+Intent = Literal[
+    "locate_impl", "how_it_works", "impact", "find_refs",
+    "data_flow", "debug_error", "architecture", "keyword",
+]
+# Coarse confidence label; the type of `SearchResponse.confidence`.
+Confidence = Literal["high", "medium", "low"]
+# Freshness summary of the index. Embedded as the `index` field of the
+# Search/Symbol/Refs/Impact response models in this module.
+class IndexFreshness(BaseModel):
+    exists: bool
+    stale: bool
+    files_changed_since_build: int = 0
+    built_at: Optional[str] = None  # presumably a build timestamp; format not shown here — TODO confirm
+    head_commit: Optional[str] = None  # presumably the VCS HEAD recorded at build time — TODO confirm
+# A line span within one file; element type of `SearchResponse.recommended_reads`.
+# NOTE(review): whether `line_end` is inclusive is not established here — confirm.
+class ReadRange(BaseModel):
+    path: str
+    line_start: int
+    line_end: int
+# One ranked hit; element type of `SearchResponse.results`.
+class Result(BaseModel):
+    rank: int
+    path: str
+    line_start: int
+    line_end: int
+    # A literal `[]` default is safe on a pydantic model: pydantic copies mutable
+    # defaults per instance, so instances do not share one list.
+    symbols: list[str] = []
+    score: float
+    reason: str
+    snippet: Optional[str] = None  # optional; defaults to None when not supplied
+# Top-level search response: the query, its intent and confidence labels, index
+# freshness, ranked results, suggested read ranges and fallback suggestions.
+class SearchResponse(BaseModel):
+    query: str
+    intent: Intent
+    index: IndexFreshness
+    confidence: Confidence
+    results: list[Result] = []
+    recommended_reads: list[ReadRange] = []
+    # Maps a string key to a list of strings; the key vocabulary is not defined
+    # in this module — TODO confirm against the producer.
+    fallback_suggestions: dict[str, list[str]] = {}
+# One symbol definition; element type of `SymbolResponse.symbols`.
+class SymbolDef(BaseModel):
+    name: str
+    qualified: Optional[str] = None  # presumably the qualified name, when known — TODO confirm
+    kind: str
+    path: str
+    line_start: int
+    line_end: int
+    signature: Optional[str] = None
+# Response for a symbol lookup: the query, index freshness, and matching definitions.
+class SymbolResponse(BaseModel):
+    query: str
+    index: IndexFreshness
+    symbols: list[SymbolDef] = []
+class GraphCoverage(BaseModel):
+    """Honesty signal for graph-derived answers (refs/impact).
+    Dependency edges (imports / inheritance) are only extracted for the fully
+    supported (Tier-A) languages. A symbol or file in a Tier-B language (generic
+    tree-sitter walk) yields symbols and best-effort call sites but no
+    import/extends/implements edges, so refs/impact can undercount. When
+    ``partial`` is true an *empty or short* result does not prove there are no
+    references — it may just be unanalyzed; confirm with Grep.
+    """
+    partial: bool = False
+    languages: list[str] = []
+    reason: Optional[str] = None
+    @classmethod
+    def for_paths(cls, paths: Iterable[str]) -> "GraphCoverage":
+        from .discovery.classify import detect_language, parser_for
+        from .parsers.languages import spec_for
+        tier_b = sorted(
+            {
+                lang
+                for p in paths
+                if (lang := detect_language(p)) is not None
+                and parser_for(lang) == "treesitter"
+                and spec_for(lang) is None
+            }
+        )
+        if not tier_b:
+            return cls()
+        return cls(
+            partial=True,
+            languages=tier_b,
+            reason=(
+                "Import/inheritance edges are not extracted for "
+                f"{', '.join(tier_b)} (best-effort symbols only). An empty or short "
+                "result is inconclusive — confirm with a Grep over the codebase."
+            ),
+        )
+# One reference site; element type of `RefsResponse.sites`.
+class RefSite(BaseModel):
+    path: str
+    line: int
+    kind: str
+    # Audit trail (see edges.confidence): 'extracted' = exact match, 'inferred' =
+    # heuristic, 'ambiguous' = unresolved/non-unique. Defaults keep older callers valid.
+    confidence: str = "extracted"
+# Response for a references lookup: the query, index freshness, reference sites,
+# and a coverage signal that defaults to a fresh non-partial `GraphCoverage`.
+class RefsResponse(BaseModel):
+    query: str
+    index: IndexFreshness
+    sites: list[RefSite] = []
+    coverage: GraphCoverage = Field(default_factory=GraphCoverage)
+# One node reached during impact analysis; element type of `ImpactResponse.nodes`.
+class ImpactNode(BaseModel):
+    kind: str                       # 'file' | 'symbol'
+    path: str
+    name: Optional[str] = None      # symbol name (None for file nodes)
+    line_start: Optional[int] = None
+    distance: int                   # BFS hops from the target (1 = direct)
+    via_edge: Optional[str] = None  # edge_type that linked it (import|call|extends|...)
+    via_confidence: Optional[str] = None  # confidence of the linking edge (audit trail)
+# Response for an impact analysis: the target, the traversal direction and depth,
+# index freshness, reached nodes, affected files, and a coverage signal that
+# defaults to a fresh non-partial `GraphCoverage`.
+class ImpactResponse(BaseModel):
+    target: str
+    direction: str                  # 'up' | 'down' | 'both'
+    depth: int
+    index: IndexFreshness
+    nodes: list[ImpactNode] = []
+    files: list[str] = []           # distinct affected files, ranked
+    coverage: GraphCoverage = Field(default_factory=GraphCoverage)

codebase_index/output/__init__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Result rendering. Both renderers consume models.SearchResponse so output stays consistent.
+markdown.py : compact Markdown for Claude — tight results table + fenced snippets +
+              recommended_reads list + fallback suggestions. Optimized for low token count.
+json.py     : machine-readable JSON (what the skill parses with --json).
+"""

codebase_index/output/json.py ADDED Viewed

@@ -0,0 +1,13 @@
+"""Machine-readable JSON renderer for pydantic response models and dict payloads."""
+from __future__ import annotations
+import json
+from pydantic import BaseModel
+def render(resp: BaseModel | dict) -> str:
+    if isinstance(resp, dict):
+        return json.dumps(resp, indent=2, ensure_ascii=False)
+    return resp.model_dump_json(indent=2)