code-context-mcp 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_context/__init__.py +3 -0
- code_context/_background.py +93 -0
- code_context/_composition.py +425 -0
- code_context/_watcher.py +89 -0
- code_context/adapters/__init__.py +0 -0
- code_context/adapters/driven/__init__.py +0 -0
- code_context/adapters/driven/chunker_dispatcher.py +43 -0
- code_context/adapters/driven/chunker_line.py +54 -0
- code_context/adapters/driven/chunker_treesitter.py +215 -0
- code_context/adapters/driven/chunker_treesitter_queries.py +111 -0
- code_context/adapters/driven/code_source_fs.py +122 -0
- code_context/adapters/driven/embeddings_local.py +111 -0
- code_context/adapters/driven/embeddings_openai.py +58 -0
- code_context/adapters/driven/git_source_cli.py +211 -0
- code_context/adapters/driven/introspector_fs.py +224 -0
- code_context/adapters/driven/keyword_index_sqlite.py +206 -0
- code_context/adapters/driven/reranker_crossencoder.py +61 -0
- code_context/adapters/driven/symbol_index_sqlite.py +264 -0
- code_context/adapters/driven/vector_store_numpy.py +119 -0
- code_context/adapters/driving/__init__.py +0 -0
- code_context/adapters/driving/mcp_server.py +365 -0
- code_context/cli.py +161 -0
- code_context/config.py +114 -0
- code_context/domain/__init__.py +0 -0
- code_context/domain/index_bus.py +52 -0
- code_context/domain/models.py +140 -0
- code_context/domain/ports.py +205 -0
- code_context/domain/use_cases/__init__.py +0 -0
- code_context/domain/use_cases/explain_diff.py +98 -0
- code_context/domain/use_cases/find_definition.py +30 -0
- code_context/domain/use_cases/find_references.py +22 -0
- code_context/domain/use_cases/get_file_tree.py +36 -0
- code_context/domain/use_cases/get_summary.py +24 -0
- code_context/domain/use_cases/indexer.py +336 -0
- code_context/domain/use_cases/recent_changes.py +36 -0
- code_context/domain/use_cases/search_repo.py +131 -0
- code_context/server.py +151 -0
- code_context_mcp-1.0.0.dist-info/METADATA +181 -0
- code_context_mcp-1.0.0.dist-info/RECORD +43 -0
- code_context_mcp-1.0.0.dist-info/WHEEL +5 -0
- code_context_mcp-1.0.0.dist-info/entry_points.txt +3 -0
- code_context_mcp-1.0.0.dist-info/licenses/LICENSE +21 -0
- code_context_mcp-1.0.0.dist-info/top_level.txt +1 -0
code_context/_background.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""BackgroundIndexer — runs reindex on a worker thread, posts to the bus.
|
|
2
|
+
|
|
3
|
+
Single-threaded coordinator. External code calls `.trigger()` to ask
|
|
4
|
+
for a reindex; the thread coalesces multiple triggers into one job
|
|
5
|
+
(an `Event` is set/cleared, not a queue), so a 5-event burst from a
|
|
6
|
+
file watcher saving in rapid succession produces ONE reindex, not
|
|
7
|
+
five. On completion, the configured `swap` callback runs first
|
|
8
|
+
(typically `_atomic_swap_current` from the composition root) and
|
|
9
|
+
then `bus.publish_swap(new_dir)` notifies any subscriber.
|
|
10
|
+
|
|
11
|
+
Errors in the indexer are caught and logged at ERROR level; the
|
|
12
|
+
worker keeps running so the next trigger has a chance. This matches
|
|
13
|
+
the philosophy of "background reindex must never crash the MCP
|
|
14
|
+
server."
|
|
15
|
+
|
|
16
|
+
The thread is daemonic so it doesn't block process exit if `.stop()`
|
|
17
|
+
is missed (e.g., a hard SIGINT before the main loop's finally
|
|
18
|
+
block). `.stop()` itself sets a flag and joins with a 5 s timeout
|
|
19
|
+
by default; longer for the ~1 s default `idle_seconds`.
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import logging
|
|
25
|
+
import threading
|
|
26
|
+
from collections.abc import Callable
|
|
27
|
+
from pathlib import Path
|
|
28
|
+
from typing import Any
|
|
29
|
+
|
|
30
|
+
from code_context.domain.index_bus import IndexUpdateBus
|
|
31
|
+
|
|
32
|
+
log = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class BackgroundIndexer(threading.Thread):
    """Daemon worker that coalesces reindex triggers into single jobs.

    ``trigger()`` sets a sticky event; the loop consumes it, runs one
    reindex, then idles for ``idle_seconds`` so a burst of triggers (e.g.
    an editor saving several files) collapses into one job. On success the
    ``swap`` callback runs first, then the bus broadcasts the new index
    directory. Failures are logged and the loop keeps running so the next
    trigger still has a chance.
    """

    def __init__(
        self,
        *,
        indexer: Any,  # IndexerUseCase, untyped to avoid circular import
        swap: Callable[[Path], None],
        bus: IndexUpdateBus,
        idle_seconds: float = 1.0,
    ) -> None:
        super().__init__(name="code-context-bg-indexer", daemon=True)
        self._indexer = indexer
        self._swap = swap
        self._bus = bus
        self._idle = idle_seconds
        self._wake = threading.Event()
        self._stop_event = threading.Event()

    def trigger(self) -> None:
        """Request a reindex; repeated calls within one window coalesce.

        The wake event stays set until the worker consumes it, so N rapid
        triggers produce exactly one job.
        """
        self._wake.set()

    def stop(self, timeout: float = 5.0) -> None:
        """Signal the worker to exit and join up to `timeout` seconds."""
        self._stop_event.set()
        # Also set the wake event so a worker parked in wait() exits promptly.
        self._wake.set()
        self.join(timeout=timeout)

    def run(self) -> None:
        while True:
            if self._stop_event.is_set():
                return
            self._wake.wait()
            self._wake.clear()
            # Re-check after waking: stop() sets both events.
            if self._stop_event.is_set():
                return
            try:
                self._reindex_once()
            except Exception:  # noqa: BLE001 - bg failure must not kill the thread
                log.exception("background reindex failed; will retry on next trigger")
            # Idle window lets rapid triggers coalesce; waiting on the stop
            # event (not sleep) means stop() can break out immediately.
            self._stop_event.wait(self._idle)

    def _reindex_once(self) -> None:
        stale = self._indexer.dirty_set()
        has_work = (
            stale.full_reindex_required or stale.dirty_files or stale.deleted_files
        )
        if not has_work:
            return
        new_dir = (
            self._indexer.run()
            if stale.full_reindex_required
            else self._indexer.run_incremental(stale)
        )
        # Swap callback first (updates current.json), then notify subscribers.
        self._swap(new_dir)
        self._bus.publish_swap(str(new_dir))
        log.info("background reindex complete (%s) -> %s", stale.reason, new_dir)
|
code_context/_composition.py
ADDED
@@ -0,0 +1,425 @@
|
|
|
1
|
+
"""Composition helpers shared by server.py and cli.py."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
from collections.abc import Callable
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from code_context.adapters.driven.chunker_dispatcher import ChunkerDispatcher
|
|
13
|
+
from code_context.adapters.driven.chunker_line import LineChunker
|
|
14
|
+
from code_context.adapters.driven.chunker_treesitter import TreeSitterChunker
|
|
15
|
+
from code_context.adapters.driven.code_source_fs import FilesystemSource
|
|
16
|
+
from code_context.adapters.driven.embeddings_local import LocalST
|
|
17
|
+
from code_context.adapters.driven.git_source_cli import GitCliSource
|
|
18
|
+
from code_context.adapters.driven.introspector_fs import FilesystemIntrospector
|
|
19
|
+
from code_context.adapters.driven.keyword_index_sqlite import SqliteFTS5Index
|
|
20
|
+
from code_context.adapters.driven.reranker_crossencoder import CrossEncoderReranker
|
|
21
|
+
from code_context.adapters.driven.symbol_index_sqlite import SymbolIndexSqlite
|
|
22
|
+
from code_context.adapters.driven.vector_store_numpy import NumPyParquetStore
|
|
23
|
+
from code_context.config import Config
|
|
24
|
+
from code_context.domain.index_bus import IndexUpdateBus
|
|
25
|
+
from code_context.domain.models import StaleSet
|
|
26
|
+
from code_context.domain.ports import (
|
|
27
|
+
Chunker,
|
|
28
|
+
EmbeddingsProvider,
|
|
29
|
+
KeywordIndex,
|
|
30
|
+
Reranker,
|
|
31
|
+
SymbolIndex,
|
|
32
|
+
)
|
|
33
|
+
from code_context.domain.use_cases.explain_diff import ExplainDiffUseCase
|
|
34
|
+
from code_context.domain.use_cases.find_definition import FindDefinitionUseCase
|
|
35
|
+
from code_context.domain.use_cases.find_references import FindReferencesUseCase
|
|
36
|
+
from code_context.domain.use_cases.get_file_tree import GetFileTreeUseCase
|
|
37
|
+
from code_context.domain.use_cases.get_summary import GetSummaryUseCase
|
|
38
|
+
from code_context.domain.use_cases.indexer import IndexerUseCase
|
|
39
|
+
from code_context.domain.use_cases.recent_changes import RecentChangesUseCase
|
|
40
|
+
from code_context.domain.use_cases.search_repo import SearchRepoUseCase
|
|
41
|
+
|
|
42
|
+
log = logging.getLogger("code_context")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class _NullKeywordIndex:
    """Disabled keyword index, selected via CC_KEYWORD_INDEX=none.

    Satisfies the KeywordIndex Protocol while doing nothing: writes are
    discarded and `search` always yields an empty list, so the hybrid
    pipeline degrades to vector-only retrieval with no special cases in
    SearchRepoUseCase.
    """

    @property
    def version(self) -> str:
        return "null-v1"

    def add(self, entries) -> None:
        return None

    def search(self, query: str, k: int):
        # Nothing indexed, so nothing ever matches.
        return []

    def delete_by_path(self, path: str) -> int:
        # Zero rows deleted: there is no backing storage.
        return 0

    def persist(self, path) -> None:
        return None

    def load(self, path) -> None:
        return None
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
class _NullSymbolIndex:
    """Disabled symbol index, selected via CC_SYMBOL_INDEX=none.

    Satisfies the SymbolIndex Protocol while doing nothing:
    find_definition/find_references always return []. Lets users switch
    the symbol pipeline off without breaking composition (e.g., on
    platforms where SQLite FTS5 misbehaves).
    """

    @property
    def version(self) -> str:
        return "null-symbol-v1"

    def add_definitions(self, defs) -> None:
        return None

    def add_references(self, refs) -> None:
        return None

    def find_definition(self, name, language=None, max_count=5):
        # Nothing stored, nothing found.
        return []

    def find_references(self, name, max_count=50):
        return []

    def delete_by_path(self, path: str) -> int:
        return 0

    def persist(self, path) -> None:
        return None

    def load(self, path) -> None:
        return None
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def build_embeddings(cfg: Config) -> EmbeddingsProvider:
    """Select the embeddings backend from ``cfg.embeddings_provider``.

    Anything other than "openai" uses the local sentence-transformers
    backend. "openai" requires an API key and exits the process with an
    error when it is missing. The OpenAI adapter is imported lazily so
    its dependency is only paid when actually selected.
    """
    if cfg.embeddings_provider != "openai":
        return LocalST(
            model_name=cfg.embeddings_model or "all-MiniLM-L6-v2",
            trust_remote_code=cfg.trust_remote_code,
        )
    if not cfg.openai_api_key:
        log.error("CC_EMBEDDINGS=openai but OPENAI_API_KEY is unset")
        sys.exit(1)
    from code_context.adapters.driven.embeddings_openai import OpenAIProvider

    return OpenAIProvider(
        model=cfg.embeddings_model or "text-embedding-3-small",
        api_key=cfg.openai_api_key,
    )
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def build_chunker(cfg: Config) -> Chunker:
    """Pick the chunking strategy from ``cfg.chunker_strategy``.

    "treesitter" (default in v0.2.0+): ChunkerDispatcher — tree-sitter
    for supported languages, line chunking for everything else and on
    parse errors. "line": legacy LineChunker only. Any other value is
    logged as an error and degrades to LineChunker so the composition
    root never crashes on bad config.
    """
    fallback = LineChunker(chunk_lines=cfg.chunk_lines, overlap=cfg.chunk_overlap)
    strategy = cfg.chunker_strategy
    if strategy == "treesitter":
        return ChunkerDispatcher(treesitter=TreeSitterChunker(), line=fallback)
    if strategy != "line":
        log.error("unknown CC_CHUNKER=%r; falling back to line", strategy)
    return fallback
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def build_keyword_index(cfg: Config) -> KeywordIndex:
    """Map ``cfg.keyword_strategy`` to an index adapter.

    "none" → no-op index; "sqlite" → SQLite FTS5. Any other value logs
    an error and degrades to the sqlite default.
    """
    strategy = cfg.keyword_strategy
    if strategy == "none":
        return _NullKeywordIndex()
    if strategy != "sqlite":
        log.error(
            "unknown CC_KEYWORD_INDEX=%r; falling back to sqlite",
            strategy,
        )
    return SqliteFTS5Index()
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def build_symbol_index(cfg: Config) -> SymbolIndex:
    """Map ``cfg.symbol_index_strategy`` to an index adapter.

    "none" → no-op index; "sqlite" → SQLite-backed index. Any other value
    logs an error and degrades to the sqlite default.
    """
    strategy = cfg.symbol_index_strategy
    if strategy == "none":
        return _NullSymbolIndex()
    if strategy != "sqlite":
        log.error(
            "unknown CC_SYMBOL_INDEX=%r; falling back to sqlite",
            strategy,
        )
    return SymbolIndexSqlite()
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def build_reranker(cfg: Config) -> Reranker | None:
    """Return a cross-encoder reranker, or None when reranking is disabled."""
    if cfg.rerank:
        return CrossEncoderReranker(
            model_name=cfg.rerank_model or "cross-encoder/ms-marco-MiniLM-L-6-v2",
        )
    return None
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def build_indexer_and_store(
    cfg: Config,
) -> tuple[
    IndexerUseCase,
    NumPyParquetStore,
    EmbeddingsProvider,
    KeywordIndex,
    SymbolIndex,
]:
    """Wire the indexing side of the app and return the shared adapters.

    Ensures the per-repo cache directory exists, builds every driven
    adapter the indexer needs, and returns the pieces the caller also
    hands to the query-side use cases (store / embeddings / indexes are
    shared between both sides).
    """
    cache_dir = cfg.repo_cache_subdir()
    cache_dir.mkdir(parents=True, exist_ok=True)

    store = NumPyParquetStore()
    keyword_index = build_keyword_index(cfg)
    symbol_index = build_symbol_index(cfg)
    embeddings = build_embeddings(cfg)
    indexer = IndexerUseCase(
        cache_dir=cache_dir,
        repo_root=cfg.repo_root,
        embeddings=embeddings,
        vector_store=store,
        keyword_index=keyword_index,
        symbol_index=symbol_index,
        chunker=build_chunker(cfg),
        code_source=FilesystemSource(),
        git_source=GitCliSource(),
        include_extensions=cfg.include_extensions,
        max_file_bytes=cfg.max_file_bytes,
    )
    return indexer, store, embeddings, keyword_index, symbol_index
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def build_use_cases(
    cfg: Config,
    indexer: IndexerUseCase,
    store: NumPyParquetStore,
    embeddings: EmbeddingsProvider,
    keyword_index: KeywordIndex,
    symbol_index: SymbolIndex,
    bus: IndexUpdateBus | None = None,
    reload_callback: Callable[[], None] | None = None,
) -> tuple[
    SearchRepoUseCase,
    RecentChangesUseCase,
    GetSummaryUseCase,
    FindDefinitionUseCase,
    FindReferencesUseCase,
    GetFileTreeUseCase,
    ExplainDiffUseCase,
]:
    """Assemble the seven query-side use cases around the shared adapters.

    NOTE(review): ``indexer`` is accepted but not used in this body —
    presumably kept for signature stability with callers; confirm before
    removing.
    """
    repo_root = cfg.repo_root
    git_source = GitCliSource()
    fs_source = FilesystemSource()
    search = SearchRepoUseCase(
        embeddings=embeddings,
        vector_store=store,
        keyword_index=keyword_index,
        reranker=build_reranker(cfg),
        bus=bus,
        reload_callback=reload_callback,
    )
    return (
        search,
        RecentChangesUseCase(git_source=git_source, repo_root=repo_root),
        GetSummaryUseCase(introspector=FilesystemIntrospector(), repo_root=repo_root),
        FindDefinitionUseCase(symbol_index=symbol_index),
        FindReferencesUseCase(symbol_index=symbol_index),
        GetFileTreeUseCase(code_source=fs_source, repo_root=repo_root),
        ExplainDiffUseCase(
            chunker=build_chunker(cfg),
            code_source=fs_source,
            git_source=git_source,
            repo_root=repo_root,
        ),
    )
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def make_reload_callback(
    indexer: IndexerUseCase,
    store: NumPyParquetStore,
    keyword_index: KeywordIndex,
    symbol_index: SymbolIndex,
) -> Callable[[], None]:
    """Build the closure that SearchRepoUseCase fires on bus drift.

    The closure reloads all 3 stores (vector, keyword, symbol) from
    whatever current.json says is the active index dir. It is a no-op
    when there is no current index yet (cold start, before the bg indexer
    publishes its first swap) and returns None so the use case's
    reload-on-tick path remains side-effects-only.

    Fix: ``store.load`` now sits inside the same FileNotFoundError guard
    as the keyword/symbol loads. The persist/swap race described below
    applies to the vector store's files too, so a missing file is retried
    on the next bus tick instead of propagating out of the callback.
    """

    def _reload() -> None:
        active = indexer.current_index_dir()
        if active is None or not active.exists():
            return
        try:
            store.load(active)
            keyword_index.load(active)
            symbol_index.load(active)
        except FileNotFoundError:
            # Reindex was published but one of the stores' files isn't
            # there yet (race between persist + swap); next bus tick
            # will reload again.
            log.warning(
                "reload: index files missing in %s; will retry next swap",
                active,
            )

    return _reload
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def fast_load_existing_index(
    indexer: IndexerUseCase,
    store: NumPyParquetStore,
    keyword_index: KeywordIndex,
    symbol_index: SymbolIndex,
) -> bool:
    """Sprint 7: load whatever's already on disk WITHOUT triggering a reindex.

    Returns True when the vector, keyword and symbol stores all loaded
    from the active index directory; False when the cache is empty or
    partial — the caller should then fall back to `ensure_index`
    (synchronous reindex) or rely on the bg indexer to populate fresh.
    """
    active = indexer.current_index_dir()
    if active is None or not active.exists():
        return False
    try:
        # Load order matters only for consistency with the other loaders:
        # vector store first, then keyword, then symbol.
        for component in (store, keyword_index, symbol_index):
            component.load(active)
    except FileNotFoundError:
        # Partial cache (e.g. pre-Sprint-3/4 layout) — signal the caller.
        return False
    return True
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def atomic_swap_current(cfg: Config, new_dir: Path) -> None:
    """Atomically repoint current.json at ``new_dir.name``.

    The bg indexer's swap callback. Mirrors the inline swap in
    safe_reindex(); split out so the BackgroundIndexer can use it directly
    without re-acquiring the file lock — when invoked via safe_reindex the
    lock is already held, so only the thin write-temp-then-rename step is
    needed here.
    """
    target = cfg.repo_cache_subdir() / "current.json"
    scratch = target.with_suffix(".json.tmp")
    payload = {"active": new_dir.name, "version": 1}
    scratch.write_text(json.dumps(payload))
    # os.replace is atomic on both POSIX and Windows.
    os.replace(scratch, target)
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def _lock_path(cfg: Config) -> Path:
    # Ensure the cache dir exists before handing out a lock file inside it.
    cache = cfg.repo_cache_subdir()
    cache.mkdir(parents=True, exist_ok=True)
    return cache / ".lock"
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def safe_reindex(
    cfg: Config,
    indexer: IndexerUseCase,
    stale: StaleSet | None = None,
) -> Path:
    """Run reindex (full or incremental) protected by a cross-platform file lock.

    Acquires the lock or blocks for up to 5 min. Returns the path of the
    new index dir AND atomically swaps current.json to point at it.

    If `stale` is omitted or has `full_reindex_required=True`, runs the
    legacy full `indexer.run()`. Otherwise dispatches to
    `indexer.run_incremental(stale)` so only `stale.dirty_files` get
    re-embedded — the Sprint 6 win that turns a 1-2 minute edit-cycle
    reindex into <10s on a typical repo.

    Raises:
        RuntimeError: when the lock cannot be acquired within 5 minutes.
    """
    # filelock is a runtime dependency; imported here to keep module import light.
    from filelock import FileLock, Timeout

    lock = FileLock(str(_lock_path(cfg)), timeout=300)
    try:
        with lock:
            log.info("acquired reindex lock at %s", _lock_path(cfg))
            if stale is not None and not stale.full_reindex_required:
                new_dir = indexer.run_incremental(stale)
            else:
                new_dir = indexer.run()
            # Consistency fix: delegate to atomic_swap_current instead of
            # duplicating the write-temp-then-rename inline, so the
            # current.json format can never drift between the two paths.
            atomic_swap_current(cfg, new_dir)
            return new_dir
    except Timeout as exc:
        raise RuntimeError(
            f"could not acquire reindex lock at {_lock_path(cfg)} after 5 min; "
            "is another reindex running? if not, delete the .lock file and retry."
        ) from exc
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def ensure_index(
    cfg: Config,
    indexer: IndexerUseCase,
    store: NumPyParquetStore,
    keyword_index: KeywordIndex,
    symbol_index: SymbolIndex,
) -> None:
    """Ensure the on-disk index is fresh, reusing it if possible.

    Sprint 6 routing: ask the indexer for a `dirty_set()` once, then:
    - StaleSet says no work → load the active index, return.
    - StaleSet says full reindex required → full `indexer.run()`.
    - Otherwise → `indexer.run_incremental(stale)`; only the
      `dirty_files` pay the embedding cost.

    Pre-Sprint-3 caches without keyword.sqlite and pre-Sprint-4 ones
    without symbols.sqlite still self-heal: load() raises FileNotFound,
    which triggers the full backfill reindex below. Fix: the vector-store
    load now sits inside the same guard, so a missing parquet file also
    backfills instead of crashing startup.
    """

    def _load_all(index_dir: Path) -> None:
        # One place for "load the 3 stores" so the no-work, backfill and
        # post-reindex paths cannot drift apart.
        store.load(index_dir)
        keyword_index.load(index_dir)
        symbol_index.load(index_dir)

    stale = indexer.dirty_set()
    no_work = not stale.full_reindex_required and not stale.dirty_files and not stale.deleted_files
    if no_work:
        current = indexer.current_index_dir()
        if current is not None:
            log.info("loading existing index from %s", current)
            try:
                _load_all(current)
            except FileNotFoundError:
                log.info(
                    "keyword or symbol index missing in %s; reindexing to backfill",
                    current,
                )
                _load_all(safe_reindex(cfg, indexer))  # full reindex
            return
    log.info(
        "ensure_index: %s — running %s reindex",
        stale.reason,
        "full" if stale.full_reindex_required else "incremental",
    )
    _load_all(safe_reindex(cfg, indexer, stale=stale))
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def setup_logging(cfg: Config) -> None:
    """Configure root logging to stderr at ``cfg.log_level`` with timestamps."""
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
        stream=sys.stderr,
        level=cfg.log_level,
    )
|
code_context/_watcher.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""RepoWatcher — debounced file-system watcher that triggers reindex.
|
|
2
|
+
|
|
3
|
+
Lazily imports `watchdog` (it's an optional `[watch]` extra). Listens
|
|
4
|
+
for created/modified/deleted/moved events under `cfg.repo_root`,
|
|
5
|
+
debounces them with a configurable delay, and calls `on_change()`
|
|
6
|
+
once per quiet window.
|
|
7
|
+
|
|
8
|
+
If `watchdog` isn't installed, `start()` logs a warning and becomes a
|
|
9
|
+
no-op so users who set `CC_WATCH=on` without the extra get a clear
|
|
10
|
+
signal instead of a hard crash.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
import threading
|
|
17
|
+
from collections.abc import Callable
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
log = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class RepoWatcher:
    """Debounced filesystem watcher that fires ``on_change`` once per quiet window.

    ``watchdog`` is imported lazily inside start() (it is an optional
    `[watch]` extra); without it installed start() logs a warning and the
    watcher is a no-op, so CC_WATCH=on never hard-crashes.
    """

    def __init__(
        self,
        root: Path,
        on_change: Callable[[], None],
        debounce_ms: int = 1000,
    ) -> None:
        self._root = root
        self._on_change = on_change
        # Clamp to at least 1 ms, then convert to the seconds Timer expects.
        self._debounce_s = max(debounce_ms, 1) / 1000.0
        self._timer_lock = threading.Lock()
        self._timer: threading.Timer | None = None
        self._observer = None  # watchdog Observer; created lazily in start()
        self._stopped = False

    def start(self) -> None:
        """Begin watching; no-op (with a warning) when watchdog is missing."""
        try:
            from watchdog.events import FileSystemEventHandler
            from watchdog.observers import Observer
        except ImportError as exc:
            log.warning(
                "watchdog not installed; CC_WATCH=on is a no-op (%s). "
                "Install code-context[watch] to enable live reindex on save.",
                exc,
            )
            return

        watcher = self  # closure ref for the inner handler

        class _Handler(FileSystemEventHandler):
            def on_any_event(self, _event) -> None:
                watcher._on_event()

        observer = Observer()
        observer.schedule(_Handler(), str(self._root), recursive=True)
        observer.start()
        self._observer = observer
        log.info("repo watcher started for %s (debounce=%.2fs)", self._root, self._debounce_s)

    def stop(self) -> None:
        """Stop the observer and cancel any pending debounce timer."""
        self._stopped = True
        observer, self._observer = self._observer, None
        if observer is not None:
            observer.stop()
            observer.join(timeout=2.0)
        with self._timer_lock:
            pending, self._timer = self._timer, None
            if pending is not None:
                pending.cancel()

    def _on_event(self) -> None:
        """(Re)arm the debounce timer; every event pushes the deadline out."""
        if self._stopped:
            return
        with self._timer_lock:
            if self._timer is not None:
                self._timer.cancel()
            timer = threading.Timer(self._debounce_s, self._fire)
            timer.daemon = True
            self._timer = timer
            timer.start()

    def _fire(self) -> None:
        if self._stopped:
            return
        try:
            self._on_change()
        except Exception:  # noqa: BLE001 - watcher must survive callback bugs
            log.exception("RepoWatcher on_change callback failed; will keep watching")
|
|
File without changes
|
|
File without changes
|
code_context/adapters/driven/chunker_dispatcher.py
ADDED
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""ChunkerDispatcher — routes chunking by file extension.
|
|
2
|
+
|
|
3
|
+
Tree-sitter languages → TreeSitterChunker. Everything else → LineChunker.
|
|
4
|
+
If TreeSitterChunker returns [] (unsupported or parse error), LineChunker
|
|
5
|
+
takes over so we don't lose the file from the index.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from code_context.domain.models import Chunk, SymbolDef
|
|
14
|
+
from code_context.domain.ports import Chunker
|
|
15
|
+
|
|
16
|
+
# File suffixes routed to the tree-sitter chunker; everything else is
# line-chunked. Comparison is done on the lower-cased suffix at dispatch time.
_TREESITTER_EXTS = {".py", ".js", ".jsx", ".ts", ".tsx", ".go", ".rs", ".cs"}


@dataclass
class ChunkerDispatcher:
    """Composite chunker: tree-sitter for known languages, line fallback.

    An empty tree-sitter result (unsupported dialect or parse failure)
    also falls through to the line chunker so no file drops out of the
    index.
    """

    treesitter: Chunker
    line: Chunker

    @property
    def version(self) -> str:
        # Embed both sub-chunker versions so changing either invalidates
        # previously cached chunks.
        return f"dispatcher({self.treesitter.version}|{self.line.version})-v1"

    def chunk(self, content: str, path: str) -> list[Chunk]:
        suffix = Path(path).suffix.lower()
        preferred = (
            self.treesitter.chunk(content, path) if suffix in _TREESITTER_EXTS else []
        )
        return preferred or self.line.chunk(content, path)

    def extract_definitions(self, content: str, path: str) -> list[SymbolDef]:
        """Delegate symbol extraction to the tree-sitter sub-chunker if it has it."""
        extractor = getattr(self.treesitter, "extract_definitions", None)
        return [] if extractor is None else extractor(content, path)
|