code-context-mcp 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_context/__init__.py +3 -0
- code_context/_background.py +93 -0
- code_context/_composition.py +425 -0
- code_context/_watcher.py +89 -0
- code_context/adapters/__init__.py +0 -0
- code_context/adapters/driven/__init__.py +0 -0
- code_context/adapters/driven/chunker_dispatcher.py +43 -0
- code_context/adapters/driven/chunker_line.py +54 -0
- code_context/adapters/driven/chunker_treesitter.py +215 -0
- code_context/adapters/driven/chunker_treesitter_queries.py +111 -0
- code_context/adapters/driven/code_source_fs.py +122 -0
- code_context/adapters/driven/embeddings_local.py +111 -0
- code_context/adapters/driven/embeddings_openai.py +58 -0
- code_context/adapters/driven/git_source_cli.py +211 -0
- code_context/adapters/driven/introspector_fs.py +224 -0
- code_context/adapters/driven/keyword_index_sqlite.py +206 -0
- code_context/adapters/driven/reranker_crossencoder.py +61 -0
- code_context/adapters/driven/symbol_index_sqlite.py +264 -0
- code_context/adapters/driven/vector_store_numpy.py +119 -0
- code_context/adapters/driving/__init__.py +0 -0
- code_context/adapters/driving/mcp_server.py +365 -0
- code_context/cli.py +161 -0
- code_context/config.py +114 -0
- code_context/domain/__init__.py +0 -0
- code_context/domain/index_bus.py +52 -0
- code_context/domain/models.py +140 -0
- code_context/domain/ports.py +205 -0
- code_context/domain/use_cases/__init__.py +0 -0
- code_context/domain/use_cases/explain_diff.py +98 -0
- code_context/domain/use_cases/find_definition.py +30 -0
- code_context/domain/use_cases/find_references.py +22 -0
- code_context/domain/use_cases/get_file_tree.py +36 -0
- code_context/domain/use_cases/get_summary.py +24 -0
- code_context/domain/use_cases/indexer.py +336 -0
- code_context/domain/use_cases/recent_changes.py +36 -0
- code_context/domain/use_cases/search_repo.py +131 -0
- code_context/server.py +151 -0
- code_context_mcp-1.0.0.dist-info/METADATA +181 -0
- code_context_mcp-1.0.0.dist-info/RECORD +43 -0
- code_context_mcp-1.0.0.dist-info/WHEEL +5 -0
- code_context_mcp-1.0.0.dist-info/entry_points.txt +3 -0
- code_context_mcp-1.0.0.dist-info/licenses/LICENSE +21 -0
- code_context_mcp-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,365 @@
|
|
|
1
|
+
"""MCP driving adapter: registers the 7 contract tools on an mcp Server."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
from datetime import datetime
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from mcp.server import Server
|
|
13
|
+
from mcp.types import TextContent, Tool
|
|
14
|
+
|
|
15
|
+
from code_context.domain.use_cases.explain_diff import ExplainDiffUseCase
|
|
16
|
+
from code_context.domain.use_cases.find_definition import FindDefinitionUseCase
|
|
17
|
+
from code_context.domain.use_cases.find_references import FindReferencesUseCase
|
|
18
|
+
from code_context.domain.use_cases.get_file_tree import GetFileTreeUseCase
|
|
19
|
+
from code_context.domain.use_cases.get_summary import GetSummaryUseCase
|
|
20
|
+
from code_context.domain.use_cases.recent_changes import RecentChangesUseCase
|
|
21
|
+
from code_context.domain.use_cases.search_repo import SearchRepoUseCase
|
|
22
|
+
|
|
23
|
+
log = logging.getLogger(__name__)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def register(
    server: Server,
    *,
    search_repo: SearchRepoUseCase,
    recent_changes: RecentChangesUseCase,
    get_summary: GetSummaryUseCase,
    find_definition: FindDefinitionUseCase,
    find_references: FindReferencesUseCase,
    get_file_tree: GetFileTreeUseCase,
    explain_diff: ExplainDiffUseCase,
) -> None:
    """Register the 7 contract tools on the given mcp Server instance.

    Every use case is injected by the composition root; this adapter only
    declares tool metadata and translates MCP tool calls into use-case
    invocations (serialized to JSON text by the module-level handlers).
    """

    # Tool declarations: name + natural-language description + JSON input
    # schema.  The descriptions deliberately steer the calling model away
    # from shell/grep equivalents and toward the indexed tools.
    @server.list_tools()
    async def list_tools() -> list[Tool]:
        return [
            Tool(
                name="search_repo",
                description=(
                    "Semantic search over the indexed codebase. Use this INSTEAD of Grep "
                    "when the query is conceptual (e.g. 'where do we validate input', "
                    "'how is caching implemented', 'authentication flow'). Returns ranked "
                    "code fragments with file path, line range, snippet, score and a "
                    "one-line `why` excerpt. For exact-string lookup, Grep is still better."
                ),
                inputSchema={
                    "type": "object",
                    "properties": {
                        "query": {"type": "string"},
                        "top_k": {"type": "integer", "default": 5},
                        "scope": {
                            "type": "string",
                            "description": (
                                "Optional repo-relative path prefix to constrain results."
                            ),
                        },
                    },
                    "required": ["query"],
                },
            ),
            Tool(
                name="recent_changes",
                description=(
                    "Recent git commits with structured fields (sha, ISO date, author, "
                    "paths, summary). Use INSTEAD of `git log` shell calls — the output "
                    "is already parsed and filterable by `since` and `paths`. Defaults "
                    "to the last 7 days when `since` is omitted."
                ),
                inputSchema={
                    "type": "object",
                    "properties": {
                        "since": {
                            "type": "string",
                            "description": "ISO 8601 cutoff; defaults to 7 days ago.",
                        },
                        "paths": {"type": "array", "items": {"type": "string"}},
                        "max": {"type": "integer", "default": 20},
                    },
                },
            ),
            Tool(
                name="get_summary",
                description=(
                    "Structured snapshot of the project or a module: name, purpose "
                    "(README first paragraph), stack (Python/Node/Rust/Go/Java), "
                    "entry_points, key_modules, stats (files, loc, languages). Useful "
                    "at session start for orientation; prefer it over reading "
                    "README/CLAUDE.md when you need machine-readable fields."
                ),
                inputSchema={
                    "type": "object",
                    "properties": {
                        "scope": {"type": "string", "enum": ["project", "module"]},
                        "path": {
                            "type": "string",
                            "description": "Required when scope='module'; repo-relative path.",
                        },
                    },
                },
            ),
            Tool(
                name="find_definition",
                description=(
                    "Locate the definition site of a named symbol (function, class, "
                    "method, type, struct, enum, interface, record). Use this INSTEAD of "
                    'shelling out to grep when the user asks "where is X defined?" — '
                    "returns SymbolDef[] with path, line range, kind, and language. "
                    "Faster and more accurate than grepping for `def X` / `class X` / "
                    "`function X` / etc., because it consults a tree-sitter-indexed "
                    "symbol table built at reindex time, not the raw text."
                ),
                inputSchema={
                    "type": "object",
                    "properties": {
                        "name": {
                            "type": "string",
                            "description": "Exact identifier to locate.",
                        },
                        "language": {
                            "type": "string",
                            "enum": [
                                "python",
                                "javascript",
                                "typescript",
                                "go",
                                "rust",
                                "csharp",
                            ],
                            "description": ("Optional language hint for same-name disambiguation."),
                        },
                        "max": {"type": "integer", "default": 5},
                    },
                    "required": ["name"],
                },
            ),
            Tool(
                name="find_references",
                description=(
                    "List every textual occurrence of a named symbol in the indexed "
                    'corpus. Use INSTEAD of `grep -n "X"` when the user asks "who '
                    'calls X?" or "where is X used?". Returns SymbolRef[] with path, '
                    "line, snippet. Word-boundary matched, so 'log' won't return "
                    "'logger' or 'log_format'."
                ),
                inputSchema={
                    "type": "object",
                    "properties": {
                        "name": {
                            "type": "string",
                            "description": "Exact identifier to find references for.",
                        },
                        "max": {"type": "integer", "default": 50},
                    },
                    "required": ["name"],
                },
            ),
            Tool(
                name="get_file_tree",
                description=(
                    "Repo-relative directory tree, gitignore-aware. Use INSTEAD of "
                    "shelling out to `Bash: ls -R` or `Bash: tree` when the user "
                    "asks for the project structure or for orientation in an "
                    "unfamiliar module. Returns a hierarchical FileTreeNode with "
                    "files (with byte sizes) and directories (with recursive "
                    "children, capped at max_depth). Honors .gitignore; skips "
                    "hidden files unless include_hidden=true."
                ),
                inputSchema={
                    "type": "object",
                    "properties": {
                        "path": {
                            "type": "string",
                            "description": (
                                "Optional repo-relative subdirectory; defaults to root."
                            ),
                        },
                        "max_depth": {
                            "type": "integer",
                            "default": 4,
                            "description": "Cap on tree depth.",
                        },
                        "include_hidden": {
                            "type": "boolean",
                            "default": False,
                            "description": "Include dot-files / dot-directories.",
                        },
                    },
                },
            ),
            Tool(
                name="explain_diff",
                description=(
                    "AST-aligned chunks affected by the diff at `ref`. Use INSTEAD "
                    'of `Bash: git show <sha>` when the user asks "what does this '
                    'commit do?" or "what changed in HEAD~3?". The chunker resolves '
                    "which whole functions / classes were touched, not just raw line "
                    "additions — much easier for an LLM to reason about. Returns "
                    "DiffChunk[] with path, lines, snippet, kind, and change "
                    '("added"|"modified"|"deleted").'
                ),
                inputSchema={
                    "type": "object",
                    "properties": {
                        "ref": {
                            "type": "string",
                            "description": (
                                "Git ref: full SHA, short SHA, HEAD, HEAD~N, branch name."
                            ),
                        },
                        "max_chunks": {"type": "integer", "default": 50},
                    },
                    "required": ["ref"],
                },
            ),
        ]

    # Dispatch: each _handle_* helper is synchronous, so it is run in a
    # worker thread via asyncio.to_thread to keep the event loop responsive.
    @server.call_tool()
    async def call_tool(name: str, arguments: dict[str, Any]) -> list[TextContent]:
        if name == "search_repo":
            return await asyncio.to_thread(_handle_search, search_repo, arguments)
        if name == "recent_changes":
            return await asyncio.to_thread(_handle_recent, recent_changes, arguments)
        if name == "get_summary":
            return await asyncio.to_thread(_handle_summary, get_summary, arguments)
        if name == "find_definition":
            return await asyncio.to_thread(_handle_find_definition, find_definition, arguments)
        if name == "find_references":
            return await asyncio.to_thread(_handle_find_references, find_references, arguments)
        if name == "get_file_tree":
            return await asyncio.to_thread(_handle_file_tree, get_file_tree, arguments)
        if name == "explain_diff":
            return await asyncio.to_thread(_handle_explain_diff, explain_diff, arguments)
        raise ValueError(f"unknown tool: {name}")
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _handle_search(uc: SearchRepoUseCase, args: dict[str, Any]) -> list[TextContent]:
    """Run the semantic-search use case and serialize the hits as JSON text."""
    hits = uc.run(
        query=args["query"],
        top_k=int(args.get("top_k", 5)),
        scope=args.get("scope"),
    )
    rows: list[dict[str, Any]] = []
    for hit in hits:
        rows.append(
            {
                "path": hit.path,
                "lines": list(hit.lines),
                "snippet": hit.snippet,
                "score": hit.score,
                "why": hit.why,
            }
        )
    return [TextContent(type="text", text=_to_json(rows))]
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _handle_recent(uc: RecentChangesUseCase, args: dict[str, Any]) -> list[TextContent]:
    """Run the recent-changes use case and serialize the commits as JSON text."""
    raw_since = args.get("since")
    cutoff = datetime.fromisoformat(raw_since) if raw_since else None
    commits = uc.run(
        since=cutoff,
        paths=args.get("paths"),
        max_count=int(args.get("max", 20)),
    )
    rows: list[dict[str, Any]] = []
    for commit in commits:
        rows.append(
            {
                "sha": commit.sha,
                "date": commit.date.isoformat(),
                "author": commit.author,
                "paths": commit.paths,
                "summary": commit.summary,
            }
        )
    return [TextContent(type="text", text=_to_json(rows))]
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _handle_summary(uc: GetSummaryUseCase, args: dict[str, Any]) -> list[TextContent]:
    """Run the summary use case and serialize the snapshot as JSON text."""
    raw_path = args.get("path")
    summary = uc.run(
        scope=args.get("scope", "project"),
        path=Path(raw_path) if raw_path else None,
    )
    # Field order here defines the JSON key order.
    fields = ("name", "purpose", "stack", "entry_points", "key_modules", "stats")
    payload = {field: getattr(summary, field) for field in fields}
    return [TextContent(type="text", text=_to_json(payload))]
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def _handle_find_definition(uc: FindDefinitionUseCase, args: dict[str, Any]) -> list[TextContent]:
    """Run the find-definition use case and serialize SymbolDef rows as JSON."""
    matches = uc.run(
        name=args["name"],
        language=args.get("language"),
        max_count=int(args.get("max", 5)),
    )
    rows: list[dict[str, Any]] = []
    for sym in matches:
        rows.append(
            {
                "name": sym.name,
                "path": sym.path,
                "lines": list(sym.lines),
                "kind": sym.kind,
                "language": sym.language,
            }
        )
    return [TextContent(type="text", text=_to_json(rows))]
|
|
314
|
+
|
|
315
|
+
|
|
316
|
+
def _handle_find_references(uc: FindReferencesUseCase, args: dict[str, Any]) -> list[TextContent]:
    """Run the find-references use case and serialize SymbolRef rows as JSON."""
    occurrences = uc.run(name=args["name"], max_count=int(args.get("max", 50)))
    rows: list[dict[str, Any]] = []
    for ref in occurrences:
        rows.append({"path": ref.path, "line": ref.line, "snippet": ref.snippet})
    return [TextContent(type="text", text=_to_json(rows))]
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def _handle_file_tree(uc: GetFileTreeUseCase, args: dict[str, Any]) -> list[TextContent]:
    """Run the file-tree use case and serialize the tree as JSON text."""
    root = uc.run(
        path=args.get("path"),
        max_depth=int(args.get("max_depth", 4)),
        include_hidden=bool(args.get("include_hidden", False)),
    )
    return [TextContent(type="text", text=_to_json(_serialize_tree_node(root)))]
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def _serialize_tree_node(node) -> dict[str, Any]:
|
|
336
|
+
"""Recursively flatten a FileTreeNode tuple to plain JSON dicts."""
|
|
337
|
+
out: dict[str, Any] = {
|
|
338
|
+
"path": node.path,
|
|
339
|
+
"kind": node.kind,
|
|
340
|
+
"size": node.size,
|
|
341
|
+
"children": [_serialize_tree_node(c) for c in node.children],
|
|
342
|
+
}
|
|
343
|
+
return out
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def _handle_explain_diff(uc: ExplainDiffUseCase, args: dict[str, Any]) -> list[TextContent]:
    """Run the explain-diff use case and serialize DiffChunk rows as JSON."""
    touched = uc.run(ref=args["ref"], max_chunks=int(args.get("max_chunks", 50)))
    rows: list[dict[str, Any]] = []
    for chunk in touched:
        rows.append(
            {
                "path": chunk.path,
                "lines": list(chunk.lines),
                "snippet": chunk.snippet,
                "kind": chunk.kind,
                "change": chunk.change,
            }
        )
    return [TextContent(type="text", text=_to_json(rows))]
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
def _to_json(obj: Any) -> str:
|
|
365
|
+
return json.dumps(obj, ensure_ascii=False, indent=2)
|
code_context/cli.py
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
"""code-context CLI: reindex, status, query, clear."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import shutil
|
|
9
|
+
import sys
|
|
10
|
+
|
|
11
|
+
from code_context._composition import (
|
|
12
|
+
build_indexer_and_store,
|
|
13
|
+
build_use_cases,
|
|
14
|
+
safe_reindex,
|
|
15
|
+
setup_logging,
|
|
16
|
+
)
|
|
17
|
+
from code_context.config import load_config
|
|
18
|
+
|
|
19
|
+
log = logging.getLogger("code_context")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _cmd_reindex(args: argparse.Namespace) -> int:
    """Reindex the repo: full when --force is given, otherwise per dirty_set."""
    cfg = load_config()
    setup_logging(cfg)
    indexer = build_indexer_and_store(cfg)[0]
    if args.force:
        log.info("reindexing %s (forced full)", cfg.repo_root)
        target = safe_reindex(cfg, indexer)
        print(f"reindexed (full, forced) -> {target}")
    else:
        stale = indexer.dirty_set()
        log.info("reindexing %s (%s)", cfg.repo_root, stale.reason)
        target = safe_reindex(cfg, indexer, stale=stale)
        mode = "full" if stale.full_reindex_required else "incremental"
        print(f"reindexed ({mode}: {stale.reason}) -> {target}")
    return 0
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _cmd_status(args: argparse.Namespace) -> int:
    """Print index health: location, metadata fields, and staleness verdict."""
    cfg = load_config()
    setup_logging(cfg)
    indexer = build_indexer_and_store(cfg)[0]
    index_dir = indexer.current_index_dir()
    print(f"repo_root: {cfg.repo_root}")
    print(f"cache_dir: {cfg.repo_cache_subdir()}")
    if index_dir is None:
        print("status: no index yet")
        return 0
    meta_file = index_dir / "metadata.json"
    if not meta_file.exists():
        print("status: index dir present but metadata missing")
        return 1
    meta = json.loads(meta_file.read_text())
    print(f"index_dir: {index_dir}")
    # Metadata lines: (label, value) pairs printed in a fixed order.
    for label, value in (
        ("head_sha", meta.get("head_sha")),
        ("indexed_at", meta.get("indexed_at")),
        ("n_chunks", meta.get("n_chunks")),
        ("n_files", meta.get("n_files")),
        ("model", meta.get("embeddings_model")),
        ("chunker", meta.get("chunker_version")),
        ("keyword", meta.get("keyword_version", "<not indexed — pre-v0.4.0>")),
        ("symbol", meta.get("symbol_version", "<not indexed — pre-v0.5.0>")),
    ):
        print(f"{label}: {value}")
    stale = indexer.dirty_set()
    print(f"dirty: {len(stale.dirty_files)}")
    print(f"deleted: {len(stale.deleted_files)}")
    print(f"full_reindex_required: {stale.full_reindex_required}")
    print(f"reason: {stale.reason}")
    return 0
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _cmd_query(args: argparse.Namespace) -> int:
    """Search the index from the CLI, warning when legs are missing or stale."""
    cfg = load_config()
    setup_logging(cfg)
    indexer, store, embeddings, keyword_index, symbol_index = build_indexer_and_store(cfg)
    current = indexer.current_index_dir()
    if current is None:
        print("error: no index. run `code-context reindex` first.", file=sys.stderr)
        return 1
    if indexer.is_stale():
        print(
            "warning: index is stale (HEAD/files/model/chunker changed since last reindex). "
            "Results may be out of date. Run `code-context reindex` to refresh.",
            file=sys.stderr,
        )
    store.load(current)
    # Keyword and symbol legs are optional: older indexes predate them, so a
    # missing file degrades gracefully with a warning instead of failing.
    try:
        keyword_index.load(current)
    except FileNotFoundError:
        log.warning(
            "keyword index missing in %s — search will fall back to vector-only. "
            "Run `code-context reindex` to backfill the keyword leg.",
            current,
        )
    try:
        symbol_index.load(current)
    except FileNotFoundError:
        log.warning(
            "symbol index missing in %s — find_definition/find_references unavailable. "
            "Run `code-context reindex` to backfill the symbol leg.",
            current,
        )
    use_cases = build_use_cases(
        cfg,
        indexer,
        store,
        embeddings,
        keyword_index,
        symbol_index,
    )
    search = use_cases[0]
    hits = search.run(query=args.text, top_k=args.k or cfg.top_k_default)
    for hit in hits:
        start, end = hit.lines[0], hit.lines[1]
        print(f"{hit.score:.3f} {hit.path}:{start}-{end} ({hit.why})")
    return 0
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def _cmd_clear(args: argparse.Namespace) -> int:
    """Delete this repo's cache directory after an explicit --yes confirmation."""
    cfg = load_config()
    setup_logging(cfg)
    cache = cfg.repo_cache_subdir()
    if not cache.exists():
        print("nothing to clear")
        return 0
    if not args.yes:
        # Refuse to delete without the explicit flag; exit non-zero.
        print(f"this will delete {cache}. pass --yes to confirm.", file=sys.stderr)
        return 1
    shutil.rmtree(cache)
    print(f"cleared {cache}")
    return 0
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def main() -> int:
    """CLI entry point: parse the sub-command and dispatch to its handler."""
    parser = argparse.ArgumentParser(prog="code-context", description="code-context CLI")
    sub = parser.add_subparsers(dest="cmd", required=True)

    reindex_p = sub.add_parser(
        "reindex",
        help="Reindex now (incremental by default; --force for full)",
    )
    reindex_p.add_argument(
        "--force",
        action="store_true",
        help="Force a full reindex regardless of dirty_set verdict.",
    )
    reindex_p.set_defaults(func=_cmd_reindex)

    status_p = sub.add_parser("status", help="Show index health")
    status_p.set_defaults(func=_cmd_status)

    query_p = sub.add_parser("query", help="Run a search query without MCP")
    query_p.add_argument("text")
    query_p.add_argument("-k", type=int, default=None, help="Override top_k")
    query_p.set_defaults(func=_cmd_query)

    clear_p = sub.add_parser("clear", help="Delete the cache for this repo")
    clear_p.add_argument("--yes", action="store_true", help="Confirm deletion")
    clear_p.set_defaults(func=_cmd_clear)

    parsed = parser.parse_args()
    # Each handler returns its process exit code.
    return int(parsed.func(parsed))
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
# Script entry: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|
code_context/config.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Configuration: env vars + defaults, frozen dataclass."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import os
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
import platformdirs
|
|
11
|
+
|
|
12
|
+
# File extensions eligible for indexing when CC_INCLUDE_EXTENSIONS is not
# set (see load_config): common code languages plus docs/config formats.
_DEFAULT_EXTENSIONS = [
    ".py",
    ".js",
    ".ts",
    ".jsx",
    ".tsx",
    ".go",
    ".rs",
    ".cs",
    ".java",
    ".c",
    ".cpp",
    ".h",
    ".hpp",
    ".md",
    ".yaml",
    ".yml",
    ".json",
]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass(frozen=True, slots=True)
class Config:
    """Immutable runtime configuration, resolved once by load_config()."""

    repo_root: Path  # repository being indexed; load_config resolves it to an absolute path
    embeddings_provider: str  # "local" or "openai"
    embeddings_model: str | None  # model name; provider-specific default applied in load_config
    openai_api_key: str | None  # taken from OPENAI_API_KEY; None when unset
    include_extensions: list[str]  # extensions (with leading dot) eligible for indexing
    max_file_bytes: int  # size cap in bytes (default 1048576 via CC_MAX_FILE_BYTES)
    cache_dir: Path  # base cache dir; per-repo subdir via repo_cache_subdir()
    log_level: str  # logging level name, e.g. "INFO"
    top_k_default: int  # default number of search results
    chunk_lines: int  # chunk size in lines (CC_CHUNK_LINES)
    chunk_overlap: int  # overlap in lines between chunks (CC_CHUNK_OVERLAP)
    chunker_strategy: str  # "treesitter" (default) or "line"
    keyword_strategy: str  # "sqlite" (default) or "none"
    rerank: bool  # enable reranking (CC_RERANK); off by default
    rerank_model: str | None  # reranker model override; None when CC_RERANK_MODEL unset
    symbol_index_strategy: str  # "sqlite" (default) or "none"
    trust_remote_code: bool  # Off by default. Required for some HF models that ship custom Python.
    # Sprint 7 — background reindex thread (default ON). Coalesce window
    # for trigger storms.
    bg_reindex: bool = True
    bg_idle_seconds: float = 1.0
    # Sprint 7 — optional file-system watcher (off by default; needs
    # the [watch] extra installed).
    watch: bool = False
    watch_debounce_ms: int = 1000

    def repo_cache_subdir(self) -> Path:
        """Cache subdir specific to this repo (hashed for collision safety)."""
        # Hash the resolved absolute path so distinct repos with the same
        # basename never share a cache directory; 16 hex chars is plenty.
        h = hashlib.sha256(str(self.repo_root.resolve()).encode("utf-8")).hexdigest()[:16]
        return self.cache_dir / h
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def load_config(default_repo_root: Path | None = None) -> Config:
|
|
68
|
+
repo_root = Path(os.environ.get("CC_REPO_ROOT") or default_repo_root or Path.cwd())
|
|
69
|
+
embeddings = os.environ.get("CC_EMBEDDINGS", "local")
|
|
70
|
+
|
|
71
|
+
default_model = "all-MiniLM-L6-v2" if embeddings == "local" else "text-embedding-3-small"
|
|
72
|
+
model = os.environ.get("CC_EMBEDDINGS_MODEL", default_model)
|
|
73
|
+
|
|
74
|
+
cache_override = os.environ.get("CC_CACHE_DIR")
|
|
75
|
+
cache_dir = (
|
|
76
|
+
Path(cache_override)
|
|
77
|
+
if cache_override
|
|
78
|
+
else Path(platformdirs.user_cache_dir("code-context"))
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
exts_raw = os.environ.get("CC_INCLUDE_EXTENSIONS")
|
|
82
|
+
if exts_raw:
|
|
83
|
+
exts = [
|
|
84
|
+
e.strip() if e.startswith(".") else f".{e.strip()}"
|
|
85
|
+
for e in exts_raw.split(",")
|
|
86
|
+
if e.strip()
|
|
87
|
+
]
|
|
88
|
+
else:
|
|
89
|
+
exts = list(_DEFAULT_EXTENSIONS)
|
|
90
|
+
|
|
91
|
+
return Config(
|
|
92
|
+
repo_root=repo_root.resolve(),
|
|
93
|
+
embeddings_provider=embeddings,
|
|
94
|
+
embeddings_model=model,
|
|
95
|
+
openai_api_key=os.environ.get("OPENAI_API_KEY"),
|
|
96
|
+
include_extensions=exts,
|
|
97
|
+
max_file_bytes=int(os.environ.get("CC_MAX_FILE_BYTES", "1048576")),
|
|
98
|
+
cache_dir=cache_dir,
|
|
99
|
+
log_level=os.environ.get("CC_LOG_LEVEL", "INFO"),
|
|
100
|
+
top_k_default=int(os.environ.get("CC_TOP_K_DEFAULT", "5")),
|
|
101
|
+
chunk_lines=int(os.environ.get("CC_CHUNK_LINES", "50")),
|
|
102
|
+
chunk_overlap=int(os.environ.get("CC_CHUNK_OVERLAP", "10")),
|
|
103
|
+
chunker_strategy=os.environ.get("CC_CHUNKER", "treesitter"),
|
|
104
|
+
keyword_strategy=os.environ.get("CC_KEYWORD_INDEX", "sqlite"),
|
|
105
|
+
rerank=os.environ.get("CC_RERANK", "off").lower() in ("on", "true", "1"),
|
|
106
|
+
rerank_model=os.environ.get("CC_RERANK_MODEL"),
|
|
107
|
+
symbol_index_strategy=os.environ.get("CC_SYMBOL_INDEX", "sqlite"),
|
|
108
|
+
trust_remote_code=os.environ.get("CC_TRUST_REMOTE_CODE", "off").lower()
|
|
109
|
+
in ("on", "true", "1"),
|
|
110
|
+
bg_reindex=os.environ.get("CC_BG_REINDEX", "on").lower() in ("on", "true", "1"),
|
|
111
|
+
bg_idle_seconds=float(os.environ.get("CC_BG_IDLE_SECONDS", "1.0")),
|
|
112
|
+
watch=os.environ.get("CC_WATCH", "off").lower() in ("on", "true", "1"),
|
|
113
|
+
watch_debounce_ms=int(os.environ.get("CC_WATCH_DEBOUNCE_MS", "1000")),
|
|
114
|
+
)
|
|
File without changes
|