codebase-index 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. codebase_index/__init__.py +7 -0
  2. codebase_index/__main__.py +3 -0
  3. codebase_index/cli.py +916 -0
  4. codebase_index/config.py +110 -0
  5. codebase_index/discovery/__init__.py +10 -0
  6. codebase_index/discovery/classify.py +151 -0
  7. codebase_index/discovery/ignore.py +58 -0
  8. codebase_index/discovery/walker.py +75 -0
  9. codebase_index/doctor.py +138 -0
  10. codebase_index/embeddings/__init__.py +2 -0
  11. codebase_index/embeddings/backend.py +67 -0
  12. codebase_index/embeddings/external.py +56 -0
  13. codebase_index/embeddings/local.py +41 -0
  14. codebase_index/embeddings/noop.py +15 -0
  15. codebase_index/graph/__init__.py +8 -0
  16. codebase_index/graph/analysis.py +468 -0
  17. codebase_index/graph/builder.py +160 -0
  18. codebase_index/graph/expand.py +136 -0
  19. codebase_index/graph/export.py +381 -0
  20. codebase_index/graph/navigate.py +201 -0
  21. codebase_index/indexer/__init__.py +8 -0
  22. codebase_index/indexer/doc_chunks.py +202 -0
  23. codebase_index/indexer/freshness.py +109 -0
  24. codebase_index/indexer/pipeline.py +423 -0
  25. codebase_index/mcp/__init__.py +2 -0
  26. codebase_index/mcp/server.py +354 -0
  27. codebase_index/models.py +145 -0
  28. codebase_index/output/__init__.py +6 -0
  29. codebase_index/output/json.py +13 -0
  30. codebase_index/output/markdown.py +316 -0
  31. codebase_index/output/redact.py +31 -0
  32. codebase_index/parsers/__init__.py +9 -0
  33. codebase_index/parsers/base.py +47 -0
  34. codebase_index/parsers/languages.py +290 -0
  35. codebase_index/parsers/line_chunker.py +39 -0
  36. codebase_index/parsers/symbol_chunks.py +62 -0
  37. codebase_index/parsers/treesitter.py +439 -0
  38. codebase_index/retrieval/__init__.py +9 -0
  39. codebase_index/retrieval/budget.py +82 -0
  40. codebase_index/retrieval/fusion.py +62 -0
  41. codebase_index/retrieval/intent.py +56 -0
  42. codebase_index/retrieval/pipeline.py +207 -0
  43. codebase_index/retrieval/rerank.py +69 -0
  44. codebase_index/retrieval/searchers.py +291 -0
  45. codebase_index/retrieval/skeleton.py +251 -0
  46. codebase_index/retrieval/types.py +79 -0
  47. codebase_index/scaffold.py +399 -0
  48. codebase_index/service.py +158 -0
  49. codebase_index/skill_template/SKILL.md +198 -0
  50. codebase_index/skill_template/examples/hooks/settings.json +16 -0
  51. codebase_index/skill_template/scripts/cbx +25 -0
  52. codebase_index/skill_template/scripts/cbx.ps1 +25 -0
  53. codebase_index/skill_update.py +150 -0
  54. codebase_index/storage/__init__.py +8 -0
  55. codebase_index/storage/db.py +116 -0
  56. codebase_index/storage/repo.py +701 -0
  57. codebase_index/storage/schema.sql +125 -0
  58. codebase_index/watch/__init__.py +5 -0
  59. codebase_index/watch/watcher.py +93 -0
  60. codebase_index-1.6.0.dist-info/METADATA +748 -0
  61. codebase_index-1.6.0.dist-info/RECORD +64 -0
  62. codebase_index-1.6.0.dist-info/WHEEL +4 -0
  63. codebase_index-1.6.0.dist-info/entry_points.txt +4 -0
  64. codebase_index-1.6.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,354 @@
1
+ """MCP server exposing codebase-index retrieval as tools for Claude.
2
+
3
+ Wraps the same retrieval/ layer the CLI uses — no subprocess overhead.
4
+ Launch via: codebase-index mcp (or codebase-index-mcp as a standalone entry point)
5
+
6
+ MCP client config example (.claude/settings.json):
7
+ {
8
+ "mcpServers": {
9
+ "codebase-index": {
10
+ "command": "codebase-index",
11
+ "args": ["mcp"],
12
+ "cwd": "/path/to/your/project"
13
+ }
14
+ }
15
+ }
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import inspect
21
+ import json
22
+ import os
23
+ import sys
24
+ from pathlib import Path
25
+ from typing import TYPE_CHECKING, Optional
26
+
27
+ from .. import __version__
28
+
29
+ if TYPE_CHECKING:
30
+ from ..config import Config
31
+
32
+ try:
33
+ from mcp.server.fastmcp import FastMCP
34
+ except ImportError as exc: # pragma: no cover
35
+ raise ImportError(
36
+ "MCP server needs the optional extra: pip install codebase-index[mcp]"
37
+ ) from exc
38
+
39
+ mcp = FastMCP(
40
+ "codebase-index",
41
+ instructions=(
42
+ "Local codebase index. Use search_code for general queries, find_symbol for exact "
43
+ "symbol lookups, find_refs to find callers/usages, impact_of for blast-radius analysis, "
44
+ "explain_code for architecture/how-it-works questions, and architecture_overview to map "
45
+ "the codebase's modules, god nodes, and surprising connections before diving in."
46
+ ),
47
+ )
48
+
49
+ # Contract version for every structured tool payload. Bump on a breaking change
50
+ # (field removal / type change); additive fields keep the same version. Every tool
51
+ # return — including errors — is wrapped by `_emit`, so clients can branch on
52
+ # `schema_version` and `tool` without sniffing the shape. See docs/MCP.md.
53
+ MCP_SCHEMA_VERSION = 1
54
+
55
+
56
+ def _emit(tool: str, payload: dict) -> str:
57
+ """Serialize a tool payload inside the stable MCP envelope.
58
+
59
+ `schema_version` and `tool` lead; the payload follows. A payload key never
60
+ shadows the envelope (payloads do not carry these keys), but the explicit
61
+ order makes the contract self-describing in the raw JSON.
62
+ """
63
+ return json.dumps({"schema_version": MCP_SCHEMA_VERSION, "tool": tool, **payload})
64
+
65
+
66
+ # Tools return JSON *strings* (unstructured text). Newer FastMCP otherwise
67
+ # auto-builds a structured-output schema from the `-> str` return annotation,
68
+ # which crashes on some mcp/pydantic combinations (mcp>=1.27 + pydantic 2.10).
69
+ # Force unstructured output where the kwarg exists; older mcp (>=1.0) lacks it.
70
+ _SUPPORTS_STRUCTURED_OUTPUT = "structured_output" in inspect.signature(mcp.tool).parameters
71
+
72
+
73
def _tool():
    """Return the `mcp.tool` decorator used for every tool in this module.

    Passes `structured_output=False` when the installed `mcp.tool` accepts that
    keyword (see `_SUPPORTS_STRUCTURED_OUTPUT`); otherwise calls it bare.
    """
    options = {"structured_output": False} if _SUPPORTS_STRUCTURED_OUTPUT else {}
    return mcp.tool(**options)
77
+
78
+
79
def _resolve_db() -> tuple[Path, "Config"]:
    """Return `(db_path, config)` as resolved by `service.resolve_db`.

    The `CBX_ROOT` environment variable, when set and non-empty, is passed on
    as the project root; otherwise `None` is passed.
    NOTE(review): `CBX_DB_PATH` is not read here — presumably handled inside
    `resolve_db`; confirm there.
    """
    from ..service import resolve_db

    configured_root = os.environ.get("CBX_ROOT")
    if configured_root:
        return resolve_db(Path(configured_root))
    return resolve_db(None)
85
+
86
+
87
def _search_backend(cfg: "Config"):
    """Build the search backend for `cfg`, sending its warnings to stderr."""
    from ..service import search_backend

    def _warn_to_stderr(message):
        # stdout carries the JSON-RPC stream — warnings must go to stderr.
        print(message, file=sys.stderr)

    return search_backend(cfg, warn=_warn_to_stderr)
92
+
93
+
94
+ def _no_index_payload() -> dict:
95
+ return {"error": "No index found. Run `codebase-index index` in your project first."}
96
+
97
+
98
@_tool()
def healthcheck() -> str:
    """Report package, root, and index health for MCP clients."""
    # NOTE(review): the docstring above is presumably published as the tool
    # description, so its text is kept verbatim.
    db_path, cfg = _resolve_db()
    root_text = str(cfg.root)
    index_path = str(db_path)

    # Minimal index info; replaced below with freshness details when the
    # index file is present.
    index_info: dict[str, object] = {"exists": db_path.exists(), "path": index_path}
    if db_path.exists():
        from ..indexer.freshness import compute_freshness
        from ..storage.db import Database

        with Database(db_path) as database:
            freshness = compute_freshness(database.conn, Path(cfg.root), cfg)
            # Freshness fields are spread last, so a dumped `exists` field
            # overrides the literal True.
            index_info = {
                "exists": True,
                "path": index_path,
                **freshness.model_dump(),
            }

    report: dict[str, object] = {
        "package_version": __version__,
        "root": root_text,
        "index": index_info,
    }
    return _emit("healthcheck", report)
118
+
119
+
120
@_tool()
def search_code(
    query: str,
    mode: str = "hybrid",
    limit: int = 10,
    token_budget: int = 1500,
    offset: int = 0,
    raw: bool = False,
) -> str:
    """Hybrid search over the codebase index.

    Returns ranked results with file paths, line ranges, symbol names, and
    recommended_reads — the exact ranges to open next.

    When the response includes a ``pagination`` key, pass ``next_offset`` as
    ``offset`` in the next call to retrieve the following page of results.

    Args:
        query: Natural-language or keyword search query.
        mode: Search mode — "hybrid" (default), "fts" (full-text), or "symbol".
        limit: Maximum number of results to return per page.
        token_budget: Token budget for the response payload.
        offset: Result offset for pagination. Pass ``next_offset`` from a
            previous response to fetch the next page.
        raw: If true, return full raw snippets instead of skeletons.
    """
    # NOTE(review): the docstring above is presumably published as the tool
    # description, so its text is kept verbatim.
    index_path, cfg = _resolve_db()
    if index_path.exists():
        from ..service import search_payload

        backend = _search_backend(cfg)
        body = search_payload(
            index_path,
            cfg,
            query,
            mode=mode,
            limit=limit,
            offset=offset,
            token_budget=token_budget,
            no_fallback=False,
            backend=backend,
            raw=raw,
        )
    else:
        # No index on disk: answer with the shared error payload.
        body = _no_index_payload()
    return _emit("search_code", body)
158
+
159
+
160
@_tool()
def find_symbol(
    name: str,
    kind: Optional[str] = None,
    exact: bool = False,
) -> str:
    """Locate a symbol definition by name (function, class, method, etc.).

    Returns file path, line range, and signature for each match.

    Args:
        name: Symbol name to look up (e.g. "parse_file", "Database", "MyClass.method").
        kind: Optional filter — "function", "class", "method", "struct", etc.
        exact: If True, only exact name matches are returned (no prefix/fuzzy).
    """
    # NOTE(review): the docstring above is presumably published as the tool
    # description, so its text is kept verbatim.
    index_path, _cfg = _resolve_db()
    if index_path.exists():
        from ..retrieval.searchers import symbol_lookup
        from ..storage.db import Database

        with Database(index_path) as database:
            matches = symbol_lookup(database.conn, name, kind=kind, exact=exact)
        # Dumped after the database context has exited, as before.
        body = matches.model_dump()
    else:
        body = _no_index_payload()
    return _emit("find_symbol", body)
185
+
186
+
187
@_tool()
def find_refs(
    symbol: str,
    kind: str = "all",
) -> str:
    """Find all references and callers of a symbol.

    Returns call sites with file path and line number.

    Args:
        symbol: Symbol name whose references to find.
        kind: "callers" for call edges only, "all" for any reference type.
    """
    # NOTE(review): the docstring above is presumably published as the tool
    # description, so its text is kept verbatim.
    index_path, _cfg = _resolve_db()
    if index_path.exists():
        from ..retrieval.searchers import refs_lookup
        from ..storage.db import Database

        with Database(index_path) as database:
            references = refs_lookup(database.conn, symbol, kind=kind)
        # Dumped after the database context has exited, as before.
        body = references.model_dump()
    else:
        body = _no_index_payload()
    return _emit("find_refs", body)
210
+
211
+
212
@_tool()
def impact_of(
    target: str,
    depth: int = 2,
    direction: str = "up",
) -> str:
    """Blast-radius analysis: what is affected if `target` changes.

    Walks the dependency/call graph and returns affected files and symbols.

    Args:
        target: File path (relative) or symbol name to analyse.
        depth: How many graph hops to follow (default 2).
        direction: "up" (what depends on target), "down" (what target depends on), or "both".
    """
    # NOTE(review): the docstring above is presumably published as the tool
    # description, so its text is kept verbatim.
    index_path, _cfg = _resolve_db()
    if index_path.exists():
        from ..graph.expand import impact_lookup
        from ..storage.db import Database

        with Database(index_path) as database:
            impact = impact_lookup(
                database.conn, target, depth=depth, direction=direction
            )
        # Dumped after the database context has exited, as before.
        body = impact.model_dump()
    else:
        body = _no_index_payload()
    return _emit("impact_of", body)
237
+
238
+
239
@_tool()
def explain_code(
    query: str,
    token_budget: int = 2200,
    offset: int = 0,
    raw: bool = False,
) -> str:
    """Intent-aware retrieval for architecture / how-does-X-work questions.

    Uses a higher token budget and how-it-works intent weights compared to search_code.
    Supports the same pagination protocol as search_code.

    Args:
        query: Question about the codebase (e.g. "how does the retrieval pipeline work").
        token_budget: Token budget for the response payload.
        offset: Result offset for pagination. Pass ``next_offset`` from a
            previous response to fetch the next page.
        raw: If true, return full raw snippets instead of skeletons.
    """
    # NOTE(review): the docstring above is presumably published as the tool
    # description, so its text is kept verbatim.
    index_path, cfg = _resolve_db()
    if index_path.exists():
        from ..service import normalize_explain_query, search_payload

        # The query is normalized first, then the backend is built — the same
        # evaluation order as passing both inline.
        normalized = normalize_explain_query(query)
        backend = _search_backend(cfg)
        body = search_payload(
            index_path,
            cfg,
            normalized,
            mode="hybrid",
            limit=10,
            offset=offset,
            token_budget=token_budget,
            no_fallback=False,
            backend=backend,
            raw=raw,
        )
    else:
        body = _no_index_payload()
    return _emit("explain_code", body)
270
+
271
+
272
@_tool()
def architecture_overview() -> str:
    """High-level map of the codebase from the cached graph analytics.

    Returns the detected modules (communities), god nodes (most-connected
    symbols/files), surprising cross-module connections, and suggested starting
    questions. Use this to orient before diving into specifics. Rebuild the index
    if it reports ``available: false``.
    """
    # NOTE(review): the docstring above is presumably published as the tool
    # description, so its text is kept verbatim.
    index_path, cfg = _resolve_db()
    if index_path.exists():
        from ..service import architecture_payload

        body = architecture_payload(index_path, cfg)
    else:
        body = _no_index_payload()
    return _emit("architecture_overview", body)
289
+
290
+
291
@_tool()
def path_between(source: str, target: str) -> str:
    """Shortest dependency/call path between two symbols or files.

    Answers "how is X connected to Y" — returns the chain of nodes and the edge
    types (with confidence) linking them. Useful for tracing how a request reaches
    the database, or how two modules touch.

    Args:
        source: File path (relative) or symbol name to start from.
        target: File path (relative) or symbol name to reach.
    """
    # NOTE(review): the docstring above is presumably published as the tool
    # description, so its text is kept verbatim.
    index_path, _cfg = _resolve_db()
    if index_path.exists():
        from ..graph.navigate import path_payload
        from ..storage.db import Database

        with Database(index_path) as database:
            body = path_payload(database.conn, source, target)
    else:
        body = _no_index_payload()
    return _emit("path_between", body)
313
+
314
+
315
@_tool()
def describe_symbol(symbol: str) -> str:
    """Node card for a symbol: definition(s), callers, callees, centrality, module.

    A compact "what is this and how does it sit in the graph" view — the in/out
    degree, its module, whether it's a god node, and its direct callers/callees.

    Args:
        symbol: Symbol name to describe (e.g. "Database", "build_index").
    """
    # NOTE(review): the docstring above is presumably published as the tool
    # description, so its text is kept verbatim.
    index_path, _cfg = _resolve_db()
    if index_path.exists():
        from ..graph.navigate import describe_payload
        from ..storage.db import Database

        with Database(index_path) as database:
            body = describe_payload(database.conn, symbol)
    else:
        body = _no_index_payload()
    return _emit("describe_symbol", body)
335
+
336
+
337
@_tool()
def index_stats() -> str:
    """Return index freshness, file count, symbol count, and per-language coverage."""
    # NOTE(review): the docstring above is presumably published as the tool
    # description, so its text is kept verbatim.
    db_path, _ = _resolve_db()
    if not db_path.exists():
        # Keep this tool's `exists: False` flag, but reuse the shared error
        # payload so the message matches every other tool and tells the user
        # how to build the index. The key set is unchanged.
        return _emit("index_stats", {"exists": False, **_no_index_payload()})

    from ..service import stats_payload
    from ..storage.db import Database

    with Database(db_path) as db:
        payload = stats_payload(db.conn)
    return _emit("index_stats", payload)
350
+
351
+
352
def run() -> None:
    """Entry point for the standalone `codebase-index-mcp` script."""
    # Serve the module-level FastMCP instance over stdio.
    transport = "stdio"
    mcp.run(transport=transport)
@@ -0,0 +1,145 @@
1
+ """Shared result models (pydantic). The same shapes feed both JSON and Markdown renderers.
2
+
3
+ Mirrors the payload documented in docs/RETRIEVAL.md §8.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Iterable, Literal, Optional
9
+
10
+ from pydantic import BaseModel, Field
11
+
12
+ Intent = Literal[
13
+ "locate_impl", "how_it_works", "impact", "find_refs",
14
+ "data_flow", "debug_error", "architecture", "keyword",
15
+ ]
16
+ Confidence = Literal["high", "medium", "low"]
17
+
18
+
19
class IndexFreshness(BaseModel):
    # Freshness summary of the index; the response models in this module each
    # carry one as their `index` field.
    # NOTE(review): the field meanings below are read off the names — confirm
    # against the code that computes freshness.
    exists: bool  # required
    stale: bool  # required
    files_changed_since_build: int = 0
    built_at: Optional[str] = None  # presumably a timestamp string; format not shown here
    head_commit: Optional[str] = None  # presumably a VCS commit id — TODO confirm
25
+
26
+
27
class ReadRange(BaseModel):
    # A span of lines in one file; used as the element type of
    # `SearchResponse.recommended_reads`.
    # NOTE(review): whether line numbers are 1-based/inclusive is not shown
    # here — confirm with the producer.
    path: str
    line_start: int
    line_end: int
31
+
32
+
33
class Result(BaseModel):
    # One ranked search hit; element type of `SearchResponse.results`.
    rank: int
    path: str
    line_start: int
    line_end: int
    symbols: list[str] = []  # symbol names attached to the hit; empty by default
    score: float
    reason: str
    snippet: Optional[str] = None  # optional text for the hit; None when absent
42
+
43
+
44
class SearchResponse(BaseModel):
    # Full search answer: the query, its classified intent, index freshness,
    # a confidence level, and the ranked results with follow-up hints.
    query: str
    intent: Intent  # one of the literals in the module-level `Intent` alias
    index: IndexFreshness
    confidence: Confidence  # "high" | "medium" | "low"
    results: list[Result] = []
    recommended_reads: list[ReadRange] = []
    # Maps a string key to a list of suggestion strings.
    # NOTE(review): the key vocabulary is not visible here.
    fallback_suggestions: dict[str, list[str]] = {}
52
+
53
+
54
class SymbolDef(BaseModel):
    # One symbol definition; element type of `SymbolResponse.symbols`.
    name: str
    qualified: Optional[str] = None  # presumably the qualified name — TODO confirm
    kind: str
    path: str
    line_start: int
    line_end: int
    signature: Optional[str] = None
62
+
63
+
64
class SymbolResponse(BaseModel):
    # Answer to a symbol lookup: the query, index freshness, and the matching
    # definitions (empty by default).
    query: str
    index: IndexFreshness
    symbols: list[SymbolDef] = []
68
+
69
+
70
class GraphCoverage(BaseModel):
    """Honesty signal for graph-derived answers (refs/impact).

    Dependency edges (imports / inheritance) are only extracted for the fully
    supported (Tier-A) languages. A symbol or file in a Tier-B language (generic
    tree-sitter walk) yields symbols and best-effort call sites but no
    import/extends/implements edges, so refs/impact can undercount. When
    ``partial`` is true an *empty or short* result does not prove there are no
    references — it may just be unanalyzed; confirm with Grep.
    """

    partial: bool = False
    languages: list[str] = []
    reason: Optional[str] = None

    @classmethod
    def for_paths(cls, paths: Iterable[str]) -> "GraphCoverage":
        # Builds the coverage signal for a set of file paths. A language is
        # collected when it is detected for a path, its parser is "treesitter",
        # and `spec_for` has no spec for it.
        from .discovery.classify import detect_language, parser_for
        from .parsers.languages import spec_for

        unanalyzed: set[str] = set()
        for candidate in paths:
            lang = detect_language(candidate)
            if lang is None:
                continue
            if parser_for(lang) != "treesitter":
                continue
            if spec_for(lang) is None:
                unanalyzed.add(lang)

        # Sorted so the language list and the message are deterministic.
        tier_b = sorted(unanalyzed)
        if tier_b:
            listed = ", ".join(tier_b)
            return cls(
                partial=True,
                languages=tier_b,
                reason=(
                    "Import/inheritance edges are not extracted for "
                    f"{listed} (best-effort symbols only). An empty or short "
                    "result is inconclusive — confirm with a Grep over the codebase."
                ),
            )
        # Nothing collected: the all-defaults instance (partial=False).
        return cls()
110
+
111
+
112
class RefSite(BaseModel):
    # One reference location; element type of `RefsResponse.sites`.
    path: str
    line: int
    kind: str
    # Audit trail (see edges.confidence): 'extracted' = exact match, 'inferred' =
    # heuristic, 'ambiguous' = unresolved/non-unique. Defaults keep older callers valid.
    confidence: str = "extracted"
119
+
120
+
121
class RefsResponse(BaseModel):
    # Answer to a references lookup: the query, index freshness, the reference
    # sites, and a coverage signal.
    query: str
    index: IndexFreshness
    sites: list[RefSite] = []
    # Defaults to an all-defaults GraphCoverage (partial=False).
    coverage: GraphCoverage = Field(default_factory=GraphCoverage)
126
+
127
+
128
class ImpactNode(BaseModel):
    # One node reached by an impact walk; element type of `ImpactResponse.nodes`.
    kind: str  # 'file' | 'symbol'
    path: str
    name: Optional[str] = None  # symbol name (None for file nodes)
    line_start: Optional[int] = None
    distance: int  # BFS hops from the target (1 = direct)
    via_edge: Optional[str] = None  # edge_type that linked it (import|call|extends|...)
    via_confidence: Optional[str] = None  # confidence of the linking edge (audit trail)
136
+
137
+
138
class ImpactResponse(BaseModel):
    # Answer to an impact query: the request parameters echoed back, index
    # freshness, the reached nodes, the affected files, and a coverage signal.
    target: str
    direction: str  # 'up' | 'down' | 'both'
    depth: int
    index: IndexFreshness
    nodes: list[ImpactNode] = []
    files: list[str] = []  # distinct affected files, ranked
    # Defaults to an all-defaults GraphCoverage (partial=False).
    coverage: GraphCoverage = Field(default_factory=GraphCoverage)
@@ -0,0 +1,6 @@
1
+ """Result rendering. Both renderers consume models.SearchResponse so output stays consistent.
2
+
3
+ markdown.py : compact Markdown for Claude — tight results table + fenced snippets +
4
+ recommended_reads list + fallback suggestions. Optimized for low token count.
5
+ json.py : machine-readable JSON (what the skill parses with --json).
6
+ """
@@ -0,0 +1,13 @@
1
+ """Machine-readable JSON renderer for pydantic response models and dict payloads."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+
7
+ from pydantic import BaseModel
8
+
9
+
10
def render(resp: BaseModel | dict) -> str:
    """Render a response as 2-space-indented JSON text.

    A plain dict goes through `json.dumps` with non-ASCII characters left
    unescaped; anything else is asked to serialize itself through
    `model_dump_json`.
    """
    if not isinstance(resp, dict):
        return resp.model_dump_json(indent=2)
    return json.dumps(resp, indent=2, ensure_ascii=False)