code-context-mcp 1.0.0 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_context/__init__.py +3 -0
- code_context/_background.py +93 -0
- code_context/_composition.py +425 -0
- code_context/_watcher.py +89 -0
- code_context/adapters/__init__.py +0 -0
- code_context/adapters/driven/__init__.py +0 -0
- code_context/adapters/driven/chunker_dispatcher.py +43 -0
- code_context/adapters/driven/chunker_line.py +54 -0
- code_context/adapters/driven/chunker_treesitter.py +215 -0
- code_context/adapters/driven/chunker_treesitter_queries.py +111 -0
- code_context/adapters/driven/code_source_fs.py +122 -0
- code_context/adapters/driven/embeddings_local.py +111 -0
- code_context/adapters/driven/embeddings_openai.py +58 -0
- code_context/adapters/driven/git_source_cli.py +211 -0
- code_context/adapters/driven/introspector_fs.py +224 -0
- code_context/adapters/driven/keyword_index_sqlite.py +206 -0
- code_context/adapters/driven/reranker_crossencoder.py +61 -0
- code_context/adapters/driven/symbol_index_sqlite.py +264 -0
- code_context/adapters/driven/vector_store_numpy.py +119 -0
- code_context/adapters/driving/__init__.py +0 -0
- code_context/adapters/driving/mcp_server.py +365 -0
- code_context/cli.py +161 -0
- code_context/config.py +114 -0
- code_context/domain/__init__.py +0 -0
- code_context/domain/index_bus.py +52 -0
- code_context/domain/models.py +140 -0
- code_context/domain/ports.py +205 -0
- code_context/domain/use_cases/__init__.py +0 -0
- code_context/domain/use_cases/explain_diff.py +98 -0
- code_context/domain/use_cases/find_definition.py +30 -0
- code_context/domain/use_cases/find_references.py +22 -0
- code_context/domain/use_cases/get_file_tree.py +36 -0
- code_context/domain/use_cases/get_summary.py +24 -0
- code_context/domain/use_cases/indexer.py +336 -0
- code_context/domain/use_cases/recent_changes.py +36 -0
- code_context/domain/use_cases/search_repo.py +131 -0
- code_context/server.py +151 -0
- code_context_mcp-1.0.0.dist-info/METADATA +181 -0
- code_context_mcp-1.0.0.dist-info/RECORD +43 -0
- code_context_mcp-1.0.0.dist-info/WHEEL +5 -0
- code_context_mcp-1.0.0.dist-info/entry_points.txt +3 -0
- code_context_mcp-1.0.0.dist-info/licenses/LICENSE +21 -0
- code_context_mcp-1.0.0.dist-info/top_level.txt +1 -0
code_context/domain/use_cases/indexer.py
ADDED

@@ -0,0 +1,336 @@
"""IndexerUseCase — orchestrates the seven ports for full + incremental reindex."""

from __future__ import annotations

import hashlib
import json
import logging
from dataclasses import dataclass
from datetime import UTC, datetime
from pathlib import Path

from code_context.domain.models import IndexEntry, StaleSet, SymbolDef
from code_context.domain.ports import (
    Chunker,
    CodeSource,
    EmbeddingsProvider,
    GitSource,
    KeywordIndex,
    SymbolIndex,
    VectorStore,
)

log = logging.getLogger(__name__)

_BATCH_SIZE = 64
_CURRENT_FILE = "current.json"
# v1: original schema (no file_hashes).
# v2: Sprint 6 — adds file_hashes for incremental reindex.
_VERSION = 2


@dataclass
class IndexerUseCase:
    cache_dir: Path
    repo_root: Path
    embeddings: EmbeddingsProvider
    vector_store: VectorStore
    keyword_index: KeywordIndex
    symbol_index: SymbolIndex
    chunker: Chunker
    code_source: CodeSource
    git_source: GitSource
    include_extensions: list[str]
    max_file_bytes: int = 1_048_576

    # ---------- public ----------

    def dirty_set(self) -> StaleSet:
        """Verdict that drives Sprint 6's incremental reindex.

        Returns a StaleSet whose `full_reindex_required` is True for any
        of these blow-it-all-away conditions: no current index, no git
        repo, metadata schema older than v2 (i.e. file_hashes absent),
        or any global version (embeddings model id, chunker version,
        keyword/symbol index version) changed since the last index.
        Otherwise compares the per-file content SHA of every
        currently-indexable file against `metadata.file_hashes`;
        mismatches go to `dirty_files`, vanished entries go to
        `deleted_files`. Both empty + flag False = "no work" steady state.
        """
        active = self._current_metadata()
        if active is None:
            return StaleSet(full_reindex_required=True, reason="no current index")
        if not self.git_source.is_repo(self.repo_root):
            return StaleSet(full_reindex_required=True, reason="not a git repo")
        if active.get("version", 1) < _VERSION:
            return StaleSet(
                full_reindex_required=True,
                reason="metadata schema upgrade (v1 → v2)",
            )
        if active.get("embeddings_model") != self.embeddings.model_id:
            return StaleSet(full_reindex_required=True, reason="embeddings_model changed")
        if active.get("chunker_version") != self.chunker.version:
            return StaleSet(full_reindex_required=True, reason="chunker_version changed")
        if active.get("keyword_version") != self.keyword_index.version:
            return StaleSet(full_reindex_required=True, reason="keyword_version changed")
        if active.get("symbol_version") != self.symbol_index.version:
            return StaleSet(full_reindex_required=True, reason="symbol_version changed")

        prior_hashes: dict[str, str] = active.get("file_hashes") or {}
        files = self.code_source.list_files(
            self.repo_root, self.include_extensions, self.max_file_bytes
        )
        current_paths_rel: set[str] = set()
        dirty: list[Path] = []
        for f in files:
            rel = f.relative_to(self.repo_root).as_posix()
            current_paths_rel.add(rel)
            try:
                content = self.code_source.read(f)
            except (OSError, UnicodeDecodeError):
                # Unreadable now — skip; if it was indexed before, the next
                # full reindex picks it up. Don't mark as dirty (avoids a
                # poison-pill loop where a permanently-broken file forces
                # repeated incremental runs).
                continue
            sha = hashlib.sha256(content.encode("utf-8")).hexdigest()
            if prior_hashes.get(rel) != sha:
                dirty.append(f)

        deleted = tuple(p for p in prior_hashes if p not in current_paths_rel)

        return StaleSet(
            full_reindex_required=False,
            reason=f"{len(dirty)} dirty, {len(deleted)} deleted",
            dirty_files=tuple(dirty),
            deleted_files=deleted,
        )

    def is_stale(self) -> bool:
        """Thin wrapper kept so existing CLI / composition callers work.

        Returns True when dirty_set's verdict is anything other than
        the steady-state "no work". Sprint 6 retired the head_sha
        global invalidator: changing HEAD without modifying any indexed
        file no longer triggers a reindex (per-file SHA tracks content
        truth, not commit position).
        """
        s = self.dirty_set()
        return s.full_reindex_required or bool(s.dirty_files) or bool(s.deleted_files)

    def run_incremental(self, stale: StaleSet) -> Path:
        """Re-embed dirty files; purge deleted files; persist a new index dir.

        Caller (composition root) is responsible for the atomic swap of
        current.json after this returns — same contract as run().

        When `stale.full_reindex_required` is True, falls back to
        `self.run()` (the file lists are advisory in that mode).
        Otherwise:
        1. Loads the active index into the three stores. Mutations stay
           in-memory (the SQLite adapters' load() copies disk → :memory:
           specifically so this step is safe).
        2. Drops every row whose path is in `stale.deleted_files`.
        3. For each path in `stale.dirty_files`: drops its old rows from
           every store, then re-chunks + re-embeds + re-extracts symbols
           from the current content.
        4. Persists every store to a fresh index dir.
        5. Stamps metadata: file_hashes copied forward from the prior
           run, with deletes removed and dirties updated. n_files derives
           from len(file_hashes) so the count stays honest.
        """
        if stale.full_reindex_required:
            return self.run()

        active = self.current_index_dir()
        prior = self._current_metadata()
        if active is None or prior is None:
            return self.run()

        log.info("indexer-incremental: %s", stale.reason)

        self.vector_store.load(active)
        self.keyword_index.load(active)
        self.symbol_index.load(active)

        for path in stale.deleted_files:
            self.vector_store.delete_by_path(path)
            self.keyword_index.delete_by_path(path)
            self.symbol_index.delete_by_path(path)

        new_file_hashes: dict[str, str] = dict(prior.get("file_hashes") or {})
        for path in stale.deleted_files:
            new_file_hashes.pop(path, None)

        new_chunks: list = []
        new_defs: list[SymbolDef] = []
        for f in stale.dirty_files:
            rel = f.relative_to(self.repo_root).as_posix()
            self.vector_store.delete_by_path(rel)
            self.keyword_index.delete_by_path(rel)
            self.symbol_index.delete_by_path(rel)
            try:
                content = self.code_source.read(f)
            except (OSError, UnicodeDecodeError) as exc:
                log.warning("indexer-incremental: skipping %s (%s)", rel, exc)
                new_file_hashes.pop(rel, None)
                continue
            new_file_hashes[rel] = hashlib.sha256(content.encode("utf-8")).hexdigest()
            for chunk in self.chunker.chunk(content, rel):
                new_chunks.append(chunk)
            extractor = getattr(self.chunker, "extract_definitions", None)
            if extractor is not None:
                try:
                    new_defs.extend(extractor(content, rel))
                except Exception as exc:  # noqa: BLE001 - same policy as run()
                    log.warning(
                        "indexer-incremental: symbol extract failed for %s (%s)",
                        rel,
                        exc,
                    )

        new_entries: list[IndexEntry] = []
        for i in range(0, len(new_chunks), _BATCH_SIZE):
            batch = new_chunks[i : i + _BATCH_SIZE]
            vectors = self.embeddings.embed([c.snippet for c in batch])
            for chunk, vec in zip(batch, vectors, strict=True):
                new_entries.append(IndexEntry(chunk=chunk, vector=vec))

        self.vector_store.add(new_entries)
        self.keyword_index.add(new_entries)
        self.symbol_index.add_definitions(new_defs)
        ref_rows = [(c.path, c.line_start, c.snippet) for c in new_chunks]
        self.symbol_index.add_references(ref_rows)

        head = self.git_source.head_sha(self.repo_root) or "no-git"
        new_dir_name = f"index-{head[:12]}-{datetime.now(UTC).strftime('%Y%m%dT%H%M%S%f')}"
        new_dir = self.cache_dir / new_dir_name
        new_dir.mkdir(parents=True, exist_ok=True)

        self.vector_store.persist(new_dir)
        self.keyword_index.persist(new_dir)
        self.symbol_index.persist(new_dir)

        meta = {
            "version": _VERSION,
            "head_sha": head,
            "indexed_at": datetime.now(UTC).isoformat(),
            "embeddings_model": self.embeddings.model_id,
            "embeddings_dimension": self.embeddings.dimension,
            "chunker_version": self.chunker.version,
            "keyword_version": self.keyword_index.version,
            "symbol_version": self.symbol_index.version,
            # n_chunks here only counts what changed in this run; the
            # store's true total is opaque from the use case's vantage
            # point. Sprint 7 can wire a richer accounting if needed.
            "n_chunks_added": len(new_entries),
            "n_files": len(new_file_hashes),
            "file_hashes": new_file_hashes,
        }
        (new_dir / "metadata.json").write_text(json.dumps(meta, indent=2))

        return new_dir

    def run(self) -> Path:
        """Full reindex. Returns the new index directory path.

        Caller (composition root) is responsible for the atomic swap of
        current.json after this returns.
        """
        files = self.code_source.list_files(
            self.repo_root, self.include_extensions, self.max_file_bytes
        )
        log.info("indexer: reindexing %d files", len(files))

        all_entries: list[IndexEntry] = []
        all_defs: list[SymbolDef] = []
        # Collect chunks first so we can batch-embed.
        chunks_with_paths: list = []
        # Per-file SHA stamped into metadata so dirty_set() has a baseline
        # for the next run. Computed inline so we don't re-read every file.
        file_hashes: dict[str, str] = {}
        for f in files:
            try:
                content = self.code_source.read(f)
            except (OSError, UnicodeDecodeError) as exc:
                log.warning("indexer: skipping %s (%s)", f, exc)
                continue
            rel = f.relative_to(self.repo_root).as_posix()
            file_hashes[rel] = hashlib.sha256(content.encode("utf-8")).hexdigest()
            for chunk in self.chunker.chunk(content, rel):
                chunks_with_paths.append(chunk)
            # Symbol extraction — only chunkers that expose it (TreeSitterChunker).
            extractor = getattr(self.chunker, "extract_definitions", None)
            if extractor is not None:
                try:
                    all_defs.extend(extractor(content, rel))
                except Exception as exc:  # noqa: BLE001 - extractor failure must not abort indexing
                    log.warning("indexer: symbol extract failed for %s (%s)", rel, exc)

        # Batch-embed.
        for i in range(0, len(chunks_with_paths), _BATCH_SIZE):
            batch = chunks_with_paths[i : i + _BATCH_SIZE]
            vectors = self.embeddings.embed([c.snippet for c in batch])
            for chunk, vec in zip(batch, vectors, strict=True):
                all_entries.append(IndexEntry(chunk=chunk, vector=vec))

        # Reset and add.
        head = self.git_source.head_sha(self.repo_root) or "no-git"
        new_dir_name = f"index-{head[:12]}-{datetime.now(UTC).strftime('%Y%m%dT%H%M%S%f')}"
        new_dir = self.cache_dir / new_dir_name
        new_dir.mkdir(parents=True, exist_ok=True)

        self.vector_store.add(all_entries)
        self.vector_store.persist(new_dir)

        self.keyword_index.add(all_entries)
        self.keyword_index.persist(new_dir)

        self.symbol_index.add_definitions(all_defs)
        # Feed chunk snippets to the references FTS5 table so find_references
        # has rows to match against (definitions alone are not enough — a
        # symbol's call sites live in the chunk text, not in the defs table).
        ref_rows = [(c.path, c.line_start, c.snippet) for c in chunks_with_paths]
        self.symbol_index.add_references(ref_rows)
        self.symbol_index.persist(new_dir)

        meta = {
            "version": _VERSION,
            "head_sha": head,
            "indexed_at": datetime.now(UTC).isoformat(),
            "embeddings_model": self.embeddings.model_id,
            "embeddings_dimension": self.embeddings.dimension,
            "chunker_version": self.chunker.version,
            "keyword_version": self.keyword_index.version,
            "symbol_version": self.symbol_index.version,
            "n_chunks": len(all_entries),
            "n_files": len(file_hashes),
            "file_hashes": file_hashes,
        }
        (new_dir / "metadata.json").write_text(json.dumps(meta, indent=2))

        return new_dir

    def current_index_dir(self) -> Path | None:
        current = self._read_current()
        if current is None:
            return None
        return self.cache_dir / current["active"]

    # ---------- internal ----------

    def _read_current(self) -> dict | None:
        cur = self.cache_dir / _CURRENT_FILE
        if not cur.exists():
            return None
        return json.loads(cur.read_text())

    def _current_metadata(self) -> dict | None:
        cur = self._read_current()
        if cur is None:
            return None
        meta_path = self.cache_dir / cur["active"] / "metadata.json"
        if not meta_path.exists():
            return None
        return json.loads(meta_path.read_text())
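The docstrings above push the current.json swap out to the caller. Below is a minimal driver sketch of that contract, reusing names that server.py later imports from code_context._composition; the wiring is illustrative only, not the shipped background loop.

from code_context._composition import atomic_swap_current, build_indexer_and_store
from code_context.config import load_config

cfg = load_config()
indexer, store, embeddings, keyword_index, symbol_index = build_indexer_and_store(cfg)

stale = indexer.dirty_set()
if stale.full_reindex_required or stale.dirty_files or stale.deleted_files:
    # run_incremental() falls back to run() on its own when
    # full_reindex_required is set, so one call covers both paths.
    new_dir = indexer.run_incremental(stale)
    # The use case never touches current.json; the atomic swap is the
    # composition root's job.
    atomic_swap_current(cfg, new_dir)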
code_context/domain/use_cases/recent_changes.py
ADDED

@@ -0,0 +1,36 @@
"""RecentChangesUseCase — direct delegation to GitSource with no-repo fallback."""

from __future__ import annotations

import logging
from dataclasses import dataclass
from datetime import UTC, datetime, timedelta
from pathlib import Path

from code_context.domain.models import Change
from code_context.domain.ports import GitSource

log = logging.getLogger(__name__)

_DEFAULT_LOOKBACK_DAYS = 7


@dataclass
class RecentChangesUseCase:
    git_source: GitSource
    repo_root: Path

    def run(
        self,
        since: datetime | None = None,
        paths: list[str] | None = None,
        max_count: int = 20,
    ) -> list[Change]:
        if not self.git_source.is_repo(self.repo_root):
            log.warning("recent_changes: %s is not a git repo; returning []", self.repo_root)
            return []
        if since is None:
            since = datetime.now(UTC) - timedelta(days=_DEFAULT_LOOKBACK_DAYS)
        return self.git_source.commits(
            self.repo_root, since=since, paths=paths, max_count=max_count
        )
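A usage sketch follows; the wiring is hypothetical, with git_source standing in for any GitSource adapter such as the CLI-backed one in adapters/driven/git_source_cli.py.

from datetime import UTC, datetime, timedelta
from pathlib import Path

use_case = RecentChangesUseCase(git_source=git_source, repo_root=Path("/repo"))
# Explicit window: commits touching src/ from the last 48 hours, newest 10.
changes = use_case.run(
    since=datetime.now(UTC) - timedelta(hours=48),
    paths=["src/"],
    max_count=10,
)
# With since=None the use case falls back to its 7-day default lookback.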
code_context/domain/use_cases/search_repo.py
ADDED

@@ -0,0 +1,131 @@
"""SearchRepoUseCase — hybrid retrieval pipeline.

Vector and keyword hits are fused via Reciprocal Rank Fusion (RRF). If a
reranker is supplied, it re-scores the fused top-N. Returns top_k
SearchResults with the fused or reranked score.

Sprint 7: optional `bus` + `reload_callback` give the use case a
"stale-aware" mode. On each `.run()` call, if the bus' generation has
advanced since the last reload, the callback fires (typically
re-loading the vector / keyword / symbol stores from `current.json`'s
new active dir) before serving the query. Implemented as a single
int compare in the hot path; legacy callers (no bus, no callback)
incur zero overhead.
"""

from __future__ import annotations

import re
from collections.abc import Callable
from dataclasses import dataclass, field

from code_context.domain.index_bus import IndexUpdateBus
from code_context.domain.models import IndexEntry, SearchResult
from code_context.domain.ports import EmbeddingsProvider, KeywordIndex, Reranker, VectorStore

_STRUCTURAL_RE = re.compile(
    r"^\s*(def |class |function |func |fn |export |const |interface |type |struct )"
)
_WHY_MAX_LEN = 80
# Bumped from 2 in v0.1.x — RRF benefits from a wider pool because
# entries unique to one ranker still feed the fusion.
_OVER_FETCH_MULTIPLIER = 3
# Canonical Reciprocal Rank Fusion constant from the original paper.
_RRF_K = 60


@dataclass
class SearchRepoUseCase:
    embeddings: EmbeddingsProvider
    vector_store: VectorStore
    keyword_index: KeywordIndex
    reranker: Reranker | None = None
    bus: IndexUpdateBus | None = None
    reload_callback: Callable[[], None] | None = None
    # Initialized to -1 so the very first call (bus.generation == 0)
    # also triggers a reload — covers the cold-start case where the
    # bg indexer hasn't yet published a swap but the active index dir
    # might already be on disk and unloaded.
    _last_seen_generation: int = field(default=-1, init=False, repr=False)

    def run(
        self,
        query: str,
        top_k: int = 5,
        scope: str | None = None,
    ) -> list[SearchResult]:
        self._reload_if_swapped()
        pool = top_k * _OVER_FETCH_MULTIPLIER
        # 1. vector
        query_vec = self.embeddings.embed([query])[0]
        v_hits = self.vector_store.search(query_vec, k=pool)
        # 2. keyword
        k_hits = self.keyword_index.search(query, k=pool)
        # 3. fuse via RRF
        fused = _rrf_fuse(v_hits, k_hits, k_constant=_RRF_K)
        if scope:
            fused = [(entry, score) for entry, score in fused if entry.chunk.path.startswith(scope)]
        # 4. optional rerank on the top of the fused pool
        if self.reranker is not None and fused:
            rerank_pool = fused[:pool]  # re-score the whole over-fetched pool
            fused = self.reranker.rerank(query, rerank_pool, k=top_k)
        else:
            fused = fused[:top_k]
        return [self._to_result(e, s) for e, s in fused]

    def _reload_if_swapped(self) -> None:
        """Refresh in-memory store handles if the bg indexer published a
        new index dir since our last call. No-op for legacy callers
        (bus is None). Reload exceptions propagate up — better to fail
        loud than silently serve stale results — and the failed reload
        does NOT update `_last_seen_generation`, so the next call retries.
        """
        if self.bus is None or self.reload_callback is None:
            return
        gen = self.bus.generation
        if gen == self._last_seen_generation:
            return
        self.reload_callback()
        # Only mark as seen AFTER a successful reload, so a transient
        # failure (e.g. disk hiccup) gets retried on the next query.
        self._last_seen_generation = gen

    @staticmethod
    def _to_result(entry: IndexEntry, score: float) -> SearchResult:
        return SearchResult(
            path=entry.chunk.path,
            lines=(entry.chunk.line_start, entry.chunk.line_end),
            snippet=entry.chunk.snippet,
            score=float(score),
            why=_compute_why(entry.chunk.snippet),
        )


def _rrf_fuse(
    a: list[tuple[IndexEntry, float]],
    b: list[tuple[IndexEntry, float]],
    k_constant: int = 60,
) -> list[tuple[IndexEntry, float]]:
    """Reciprocal Rank Fusion. Identifies entries by chunk.path + line range."""
    scores: dict[tuple[str, int, int], float] = {}
    entry_by_key: dict[tuple[str, int, int], IndexEntry] = {}
    for hits in (a, b):
        for rank, (entry, _) in enumerate(hits):
            c = entry.chunk
            key = (c.path, c.line_start, c.line_end)
            scores[key] = scores.get(key, 0.0) + 1.0 / (k_constant + rank + 1)
            entry_by_key.setdefault(key, entry)
    items = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
    return [(entry_by_key[key], score) for key, score in items]


def _compute_why(snippet: str) -> str:
    """Pick a one-line description from the snippet."""
    for line in snippet.splitlines():
        if _STRUCTURAL_RE.match(line):
            return line.strip()[:_WHY_MAX_LEN]
    for line in snippet.splitlines():
        stripped = line.strip()
        if stripped:
            return stripped[:_WHY_MAX_LEN]
    return ""
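To make the fusion concrete, here is the arithmetic _rrf_fuse applies, stripped of the domain types (ranks are 0-based, matching the enumerate above):

K = 60  # _RRF_K
# A chunk both rankers returned: rank 0 in the vector list, rank 2 in
# the keyword list.
both = 1 / (K + 0 + 1) + 1 / (K + 2 + 1)  # ~0.0164 + ~0.0159 = ~0.0323
# A chunk only one ranker returned, at rank 1.
single = 1 / (K + 1 + 1)  # ~0.0161
assert both > single  # agreement across rankers beats a single good rank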
code_context/server.py
ADDED

@@ -0,0 +1,151 @@
"""code-context-server entry: composition root + MCP stdio runner.

Sprint 7 changes the startup shape:

- **Foreground**: build the runtime, fast-load whatever index exists
  on disk (no synchronous reindex), register MCP tools, run stdio.
  Total time on a previously-indexed repo: ~1 s (model load + npy +
  2× sqlite-to-memory). On a cache-cold repo: <100 ms (the foreground
  has nothing to load yet; first queries return empty until bg
  finishes).
- **Background**: a BackgroundIndexer daemon thread runs dirty_set +
  run_incremental (or full reindex) and publishes swap events to the
  IndexUpdateBus. SearchRepoUseCase reloads its store handles on the
  next query after each swap, transparently.

The user pays the cold-reindex cost only on first install (or after
a model upgrade); ongoing edit cycles are sub-10 s and run while
Claude is asking other questions.
"""

from __future__ import annotations

import asyncio
import logging
import sys

from mcp.server import Server
from mcp.server.stdio import stdio_server

from code_context._background import BackgroundIndexer
from code_context._composition import (
    atomic_swap_current,
    build_indexer_and_store,
    build_use_cases,
    ensure_index,
    fast_load_existing_index,
    make_reload_callback,
    setup_logging,
)
from code_context._watcher import RepoWatcher
from code_context.adapters.driving.mcp_server import register
from code_context.config import Config, load_config
from code_context.domain.index_bus import IndexUpdateBus

log = logging.getLogger("code_context")


async def _run_server(cfg: Config) -> None:
    indexer, store, embeddings, keyword_index, symbol_index = build_indexer_and_store(cfg)
    bus = IndexUpdateBus()

    # Foreground: load whatever index exists right now. No reindex. If the
    # cache is empty, queries return [] until the bg thread finishes the
    # first reindex; SearchRepoUseCase's bus-driven reload makes that
    # transition transparent.
    loaded = fast_load_existing_index(indexer, store, keyword_index, symbol_index)
    if loaded:
        log.info("loaded existing index from %s", indexer.current_index_dir())
    elif not cfg.bg_reindex:
        # Background reindex disabled (CC_BG_REINDEX=off) AND no index on
        # disk. Fall back to the v0.7-style synchronous reindex so the
        # server is functional after startup.
        log.info("no existing index and bg_reindex=off; running synchronous reindex")
        ensure_index(cfg, indexer, store, keyword_index, symbol_index)
    else:
        log.info(
            "no existing index — first queries will return [] until the "
            "background reindex finishes (~%d s on a typical repo)",
            60,
        )

    reload_cb = make_reload_callback(indexer, store, keyword_index, symbol_index)
    search, recent, summary, find_def, find_ref, file_tree, explain_diff = build_use_cases(
        cfg,
        indexer,
        store,
        embeddings,
        keyword_index,
        symbol_index,
        bus=bus,
        reload_callback=reload_cb,
    )

    bg = None
    if cfg.bg_reindex:
        bg = BackgroundIndexer(
            indexer=indexer,
            swap=lambda new_dir: atomic_swap_current(cfg, new_dir),
            bus=bus,
            idle_seconds=cfg.bg_idle_seconds,
        )
        bg.start()
        bg.trigger()  # kick off initial dirty_set + (full or incremental) reindex
        log.info("background indexer started (idle=%.2fs)", cfg.bg_idle_seconds)

    watcher = None
    if cfg.watch and bg is not None:
        watcher = RepoWatcher(
            root=cfg.repo_root,
            on_change=bg.trigger,
            debounce_ms=cfg.watch_debounce_ms,
        )
        watcher.start()
        log.info(
            "repo watcher armed (CC_WATCH=on, debounce=%dms)",
            cfg.watch_debounce_ms,
        )
    elif cfg.watch and bg is None:
        log.warning(
            "CC_WATCH=on requires CC_BG_REINDEX=on; watcher not started "
            "(without the bg thread there's nothing to trigger)"
        )

    server = Server("code-context")
    register(
        server,
        search_repo=search,
        recent_changes=recent,
        get_summary=summary,
        find_definition=find_def,
        find_references=find_ref,
        get_file_tree=file_tree,
        explain_diff=explain_diff,
    )

    try:
        async with stdio_server() as (read_stream, write_stream):
            await server.run(read_stream, write_stream, server.create_initialization_options())
    finally:
        if watcher is not None:
            log.info("stopping repo watcher")
            watcher.stop()
        if bg is not None:
            log.info("stopping background indexer")
            bg.stop(timeout=10.0)


def main() -> int:
    cfg = load_config()
    setup_logging(cfg)
    log.info("starting code-context-server (repo=%s)", cfg.repo_root)
    try:
        asyncio.run(_run_server(cfg))
        return 0
    except KeyboardInterrupt:
        log.info("server interrupted; exiting")
        return 130


if __name__ == "__main__":
    sys.exit(main())
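The bus that links BackgroundIndexer to SearchRepoUseCase only needs a monotonically increasing counter. A minimal sketch of that contract follows; the shipped class lives in code_context/domain/index_bus.py and may differ, so this illustrates only the compare-and-reload protocol both sides rely on.

import threading


class GenerationBusSketch:
    """Hypothetical stand-in for IndexUpdateBus."""

    def __init__(self) -> None:
        self._lock = threading.Lock()
        self._generation = 0

    @property
    def generation(self) -> int:
        # Read by SearchRepoUseCase._reload_if_swapped as a single int
        # compare in the query hot path.
        return self._generation

    def publish(self) -> None:
        # Called by the background thread after atomic_swap_current, so the
        # next query sees generation != _last_seen_generation and fires its
        # reload_callback.
        with self._lock:
            self._generation += 1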