code_context_mcp-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. code_context/__init__.py +3 -0
  2. code_context/_background.py +93 -0
  3. code_context/_composition.py +425 -0
  4. code_context/_watcher.py +89 -0
  5. code_context/adapters/__init__.py +0 -0
  6. code_context/adapters/driven/__init__.py +0 -0
  7. code_context/adapters/driven/chunker_dispatcher.py +43 -0
  8. code_context/adapters/driven/chunker_line.py +54 -0
  9. code_context/adapters/driven/chunker_treesitter.py +215 -0
  10. code_context/adapters/driven/chunker_treesitter_queries.py +111 -0
  11. code_context/adapters/driven/code_source_fs.py +122 -0
  12. code_context/adapters/driven/embeddings_local.py +111 -0
  13. code_context/adapters/driven/embeddings_openai.py +58 -0
  14. code_context/adapters/driven/git_source_cli.py +211 -0
  15. code_context/adapters/driven/introspector_fs.py +224 -0
  16. code_context/adapters/driven/keyword_index_sqlite.py +206 -0
  17. code_context/adapters/driven/reranker_crossencoder.py +61 -0
  18. code_context/adapters/driven/symbol_index_sqlite.py +264 -0
  19. code_context/adapters/driven/vector_store_numpy.py +119 -0
  20. code_context/adapters/driving/__init__.py +0 -0
  21. code_context/adapters/driving/mcp_server.py +365 -0
  22. code_context/cli.py +161 -0
  23. code_context/config.py +114 -0
  24. code_context/domain/__init__.py +0 -0
  25. code_context/domain/index_bus.py +52 -0
  26. code_context/domain/models.py +140 -0
  27. code_context/domain/ports.py +205 -0
  28. code_context/domain/use_cases/__init__.py +0 -0
  29. code_context/domain/use_cases/explain_diff.py +98 -0
  30. code_context/domain/use_cases/find_definition.py +30 -0
  31. code_context/domain/use_cases/find_references.py +22 -0
  32. code_context/domain/use_cases/get_file_tree.py +36 -0
  33. code_context/domain/use_cases/get_summary.py +24 -0
  34. code_context/domain/use_cases/indexer.py +336 -0
  35. code_context/domain/use_cases/recent_changes.py +36 -0
  36. code_context/domain/use_cases/search_repo.py +131 -0
  37. code_context/server.py +151 -0
  38. code_context_mcp-1.0.0.dist-info/METADATA +181 -0
  39. code_context_mcp-1.0.0.dist-info/RECORD +43 -0
  40. code_context_mcp-1.0.0.dist-info/WHEEL +5 -0
  41. code_context_mcp-1.0.0.dist-info/entry_points.txt +3 -0
  42. code_context_mcp-1.0.0.dist-info/licenses/LICENSE +21 -0
  43. code_context_mcp-1.0.0.dist-info/top_level.txt +1 -0
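
The layout is a ports-and-adapters (hexagonal) split: domain/ports.py declares the interfaces, adapters/driven/ holds the infrastructure implementations, adapters/driving/mcp_server.py exposes them over MCP, and _composition.py is the composition root. As a hedged illustration only, the protocol below is hypothetical and not the actual contents of ports.py; only the module paths come from the listing above:

from typing import Protocol

from code_context.domain.models import SymbolDef


class SymbolIndex(Protocol):
    # Hypothetical port, sketched from the adapter shown below.
    def add_definitions(self, defs) -> None: ...
    def find_definition(
        self, name: str, language: str | None = None, max_count: int = 5
    ) -> list[SymbolDef]: ...


def build_symbol_index() -> SymbolIndex:
    # The composition root picks the concrete driven adapter.
    from code_context.adapters.driven.symbol_index_sqlite import SymbolIndexSqlite

    return SymbolIndexSqlite()
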
code_context/adapters/driven/symbol_index_sqlite.py
@@ -0,0 +1,264 @@
+"""SymbolIndexSqlite — SQLite-backed adapter for SymbolIndex.
+
+Stores symbol definitions in an indexed table, references in an FTS5 table
+that's a peer of (but distinct from) Sprint 3's keyword chunks table. This
+adapter persists to its own file (`symbols.sqlite`) for isolation; if the
+composition root harmonises file sharing in a future task, only this
+constant changes.
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+import sqlite3
+from collections.abc import Iterable
+from pathlib import Path
+
+from code_context.domain.models import SymbolDef, SymbolRef
+
+log = logging.getLogger(__name__)
+
+_FILE = "symbols.sqlite"
+_DEFS_TABLE = "symbol_defs"
+_REFS_TABLE = "symbol_refs_fts"
+
+# FTS5 query sanitisation — same logic as keyword_index_sqlite.py.
+# Strip punctuation (FTS5 parses `.`, `-`, `:`, etc. as syntax even
+# though the unicode61 tokenizer accepts them in indexed text), and
+# strip the boolean operators.
+_FTS_KEEP_RE = re.compile(r"[^\w\s]", flags=re.UNICODE)
+_FTS_BOOLEAN_RE = re.compile(r"\b(AND|OR|NOT|NEAR)\b", re.IGNORECASE)
+
+
+class SymbolIndexSqlite:
+    """Default SymbolIndex adapter — definitions + references via SQLite + FTS5."""
+
+    @property
+    def version(self) -> str:
+        return f"symbols-sqlite-{sqlite3.sqlite_version}-v1"
+
+    def __init__(self) -> None:
+        self._conn: sqlite3.Connection | None = None
+        self._db_path: Path | None = None
+        self._open_inmem()
+
+    # ---------- public ----------
+
+    def add_definitions(self, defs: Iterable[SymbolDef]) -> None:
+        assert self._conn is not None
+        rows = [(d.name, d.path, d.lines[0], d.lines[1], d.kind, d.language) for d in defs]
+        if not rows:
+            return
+        self._conn.executemany(
+            f"INSERT INTO {_DEFS_TABLE} "
+            "(name, path, line_start, line_end, kind, language) "
+            "VALUES (?, ?, ?, ?, ?, ?)",
+            rows,
+        )
+        self._conn.commit()
+
+    def add_references(self, refs: Iterable[tuple[str, int, str]]) -> None:
+        """Bulk-insert reference rows into the FTS5 references table.
+
+        Each row is (path, line, snippet). Snippet is FTS5-indexed; path and
+        line are UNINDEXED. IndexerUseCase feeds chunk snippets here so that
+        find_references has rows to MATCH against later.
+        """
+        assert self._conn is not None
+        rows = list(refs)
+        if not rows:
+            return
+        self._conn.executemany(
+            f"INSERT INTO {_REFS_TABLE} (path, line, snippet) VALUES (?, ?, ?)",
+            rows,
+        )
+        self._conn.commit()
+
+    def delete_by_path(self, path: str) -> int:
+        """Remove every row whose path == `path` from BOTH symbol_defs
+        and symbol_refs_fts. Returns the total rowcount across the two
+        tables. Used by Sprint 6 incremental reindex."""
+        assert self._conn is not None
+        defs_cur = self._conn.execute(f"DELETE FROM {_DEFS_TABLE} WHERE path = ?", (path,))
+        refs_cur = self._conn.execute(f"DELETE FROM {_REFS_TABLE} WHERE path = ?", (path,))
+        self._conn.commit()
+        return defs_cur.rowcount + refs_cur.rowcount
+
+    def find_definition(
+        self,
+        name: str,
+        language: str | None = None,
+        max_count: int = 5,
+    ) -> list[SymbolDef]:
+        assert self._conn is not None
+        if language:
+            cur = self._conn.execute(
+                f"SELECT name, path, line_start, line_end, kind, language "
+                f"FROM {_DEFS_TABLE} WHERE name = ? AND language = ? LIMIT ?",
+                (name, language, max_count),
+            )
+        else:
+            cur = self._conn.execute(
+                f"SELECT name, path, line_start, line_end, kind, language "
+                f"FROM {_DEFS_TABLE} WHERE name = ? LIMIT ?",
+                (name, max_count),
+            )
+        return [
+            SymbolDef(
+                name=row[0],
+                path=row[1],
+                lines=(row[2], row[3]),
+                kind=row[4],
+                language=row[5],
+            )
+            for row in cur.fetchall()
+        ]
+
+    def find_references(self, name: str, max_count: int = 50) -> list[SymbolRef]:
+        """FTS5 MATCH for the symbol, then expand each chunk to per-line hits.
+
+        FTS5 stores chunk-level rows (path, chunk_start_line, full_chunk_snippet);
+        we want one SymbolRef per LINE that contains the symbol — that's the
+        contract from tool-protocol.md ("snippet: the matching line, trimmed").
+        Two reasons we do it this way:
+
+        1. **Contract**: SymbolRef.snippet is "the matching line, trimmed", not
+           "the chunk that contains the matching line". Returning chunks blew
+           past Claude Code's MCP-tool token budget on the very first smoke
+           test (a single find_references call returned ~100KB of output).
+        2. **Word boundary**: FTS5's unicode61 tokenizer treats `log` and
+           `logger` as different tokens, so MATCH 'log' won't match 'logger'.
+           But it WILL match `log_format` (split on underscore). The
+           per-line `\\bname\\b` filter catches that and skips lines where
+           `name` only appears as part of a longer identifier.
+        """
+        assert self._conn is not None
+        sanitised = _sanitise(name)
+        if not sanitised:
+            return []
+        try:
+            cur = self._conn.execute(
+                f"SELECT path, line, snippet FROM {_REFS_TABLE} "
+                f"WHERE {_REFS_TABLE} MATCH ? LIMIT ?",
+                (sanitised, max_count * 4),  # over-fetch; per-line expand trims.
+            )
+        except sqlite3.OperationalError as exc:
+            log.warning("symbol refs query failed (%s) for %r → []", exc, name)
+            return []
+        word_re = re.compile(rf"\b{re.escape(name)}\b")
+        out: list[SymbolRef] = []
+        seen: set[tuple[str, int]] = set()
+        for path, chunk_start_line, chunk_snippet in cur.fetchall():
+            for offset, line_text in enumerate(chunk_snippet.splitlines() or [chunk_snippet]):
+                if not word_re.search(line_text):
+                    continue
+                actual_line = int(chunk_start_line) + offset
+                key = (path, actual_line)
+                if key in seen:
+                    continue  # Same line emitted by overlapping chunks.
+                seen.add(key)
+                trimmed = line_text.strip()[:200]
+                out.append(SymbolRef(path=path, line=actual_line, snippet=trimmed))
+                if len(out) >= max_count:
+                    return out
+        return out
+
+    def persist(self, path: Path) -> None:
+        assert self._conn is not None
+        path.mkdir(parents=True, exist_ok=True)
+        target = path / _FILE
+        # Commit any open implicit transaction first — backup() blocks on
+        # uncommitted writes in the source connection (Windows specifically).
+        self._conn.commit()
+        disk = sqlite3.connect(target, check_same_thread=False)
+        try:
+            self._conn.backup(disk)
+        finally:
+            # sqlite3.Connection's context manager only commits, doesn't close.
+            # Explicit close so Windows releases the file lock for tmp_path
+            # cleanup. Mirrors the same fix in keyword_index_sqlite.py.
+            disk.close()
+        self._db_path = target
+
+    def load(self, path: Path) -> None:
+        """Restore the symbol index from `<path>/symbols.sqlite` into a
+        fresh in-memory connection. Mirrors keyword_index_sqlite.load —
+        Sprint 6 needs mutations after load to stay in RAM so they don't
+        corrupt the active on-disk index AND a subsequent persist() to
+        the same dir doesn't deadlock on SQLite's backup-to-itself."""
+        target = path / _FILE
+        if not target.exists():
+            raise FileNotFoundError(f"symbol index missing at {target}")
+        if self._conn is not None:
+            self._conn.close()
+        # check_same_thread=False — see _open_inmem rationale.
+        self._conn = sqlite3.connect(":memory:", check_same_thread=False)
+        disk = sqlite3.connect(target, check_same_thread=False)
+        try:
+            disk.backup(self._conn)
+        finally:
+            disk.close()
+        self._db_path = target
+
+    # ---------- test helpers ----------
+
+    def populate_references_for_test(self, rows: list[tuple[str, int, str]]) -> None:
+        """Inject rows into the references FTS5 table for unit testing.
+
+        Bypasses the IndexerUseCase pipeline that normally feeds this table
+        from the chunker output. Production callers should NOT use this; it's
+        exposed because writing through the public API requires running the
+        whole pipeline.
+        """
+        assert self._conn is not None
+        self._conn.executemany(
+            f"INSERT INTO {_REFS_TABLE} (path, line, snippet) VALUES (?, ?, ?)",
+            rows,
+        )
+        self._conn.commit()
+
+    # ---------- internal ----------
+
+    def _open_inmem(self) -> None:
+        # check_same_thread=False: the MCP server runs query handlers via
+        # asyncio.to_thread, which uses a thread pool. Without this flag, a
+        # connection opened on the main thread cannot be used from worker
+        # threads (sqlite3.ProgrammingError). SQLite's library is built in
+        # serialized threading mode by default, so a single connection is
+        # safe across threads as long as we don't have concurrent writes —
+        # which we don't (writes happen at indexer.run() time, queries are
+        # read-only). Mirrors the same fix in keyword_index_sqlite.py.
+        self._conn = sqlite3.connect(":memory:", check_same_thread=False)
+        self._init_schema()
+
+    def _init_schema(self) -> None:
+        assert self._conn is not None
+        self._conn.executescript(
+            f"""
+            CREATE TABLE IF NOT EXISTS {_DEFS_TABLE} (
+                name TEXT NOT NULL,
+                path TEXT NOT NULL,
+                line_start INTEGER NOT NULL,
+                line_end INTEGER NOT NULL,
+                kind TEXT NOT NULL,
+                language TEXT NOT NULL
+            );
+            CREATE INDEX IF NOT EXISTS idx_{_DEFS_TABLE}_name ON {_DEFS_TABLE}(name);
+            CREATE INDEX IF NOT EXISTS idx_{_DEFS_TABLE}_name_lang ON {_DEFS_TABLE}(name, language);
+
+            CREATE VIRTUAL TABLE IF NOT EXISTS {_REFS_TABLE} USING fts5(
+                path UNINDEXED, line UNINDEXED, snippet,
+                tokenize='unicode61 remove_diacritics 2'
+            );
+            """
+        )
+
+
+def _sanitise(query: str) -> str:
+    """Strip FTS5 syntax so user input is bare tokens only. See
+    keyword_index_sqlite._sanitise for the rationale (Sprint 8 fix
+    for the punctuation-crashes-FTS5-parser bug)."""
+    cleaned = _FTS_KEEP_RE.sub(" ", query)
+    cleaned = _FTS_BOOLEAN_RE.sub(" ", cleaned)
+    return " ".join(cleaned.split())
code_context/adapters/driven/vector_store_numpy.py
@@ -0,0 +1,119 @@
+"""NumPyParquetStore — brute-force cosine on a NumPy array."""
+
+from __future__ import annotations
+
+from collections.abc import Iterable
+from pathlib import Path
+
+import numpy as np
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+from code_context.domain.models import Chunk, IndexEntry
+
+
+class NumPyParquetStore:
+    """In-memory vectors + chunk metadata, persistable to disk."""
+
+    _VECTORS_FILE = "vectors.npy"
+    _CHUNKS_FILE = "chunks.parquet"
+
+    def __init__(self) -> None:
+        self._vectors: np.ndarray | None = None  # (n, d) float32
+        self._chunks: list[Chunk] = []
+
+    def add(self, entries: Iterable[IndexEntry]) -> None:
+        new_vecs: list[np.ndarray] = []
+        for entry in entries:
+            new_vecs.append(entry.vector)
+            self._chunks.append(entry.chunk)
+        if not new_vecs:
+            return
+        stacked = np.stack(new_vecs).astype(np.float32, copy=False)
+        if self._vectors is None:
+            self._vectors = stacked
+        else:
+            self._vectors = np.concatenate([self._vectors, stacked], axis=0)
+
+    def delete_by_path(self, path: str) -> int:
+        """Remove every chunk whose path == `path`. Returns the row count
+        removed (0 if nothing matched). Rebuilds `_vectors` via boolean
+        masking; if the deletion empties the store, `_vectors` resets to
+        None so subsequent `search` short-circuits on the empty-store
+        branch (matches the post-`__init__` invariant)."""
+        if self._vectors is None or not self._chunks:
+            return 0
+        keep = [c.path != path for c in self._chunks]
+        n_removed = sum(1 for k in keep if not k)
+        if n_removed == 0:
+            return 0
+        self._vectors = self._vectors[keep]
+        self._chunks = [c for c, k in zip(self._chunks, keep, strict=True) if k]
+        if self._vectors.shape[0] == 0:
+            self._vectors = None
+        return n_removed
+
+    def search(self, query: np.ndarray, k: int) -> list[tuple[IndexEntry, float]]:
+        if self._vectors is None or self._vectors.shape[0] == 0:
+            return []
+        q = query.astype(np.float32, copy=False)
+        # Normalize query and corpus.
+        q_norm = q / (np.linalg.norm(q) or 1.0)
+        v_norms = np.linalg.norm(self._vectors, axis=1, keepdims=True)
+        v_norms[v_norms == 0] = 1.0
+        normalized = self._vectors / v_norms
+        scores = normalized @ q_norm  # (n,)
+        k = min(k, scores.shape[0])
+        # argpartition + sort just the top-k for performance.
+        if k <= 0:
+            return []
+        top_idx = np.argpartition(-scores, kth=k - 1)[:k]
+        top_idx = top_idx[np.argsort(-scores[top_idx])]
+        return [
+            (IndexEntry(chunk=self._chunks[i], vector=self._vectors[i]), float(scores[i]))
+            for i in top_idx
+        ]
+
+    def persist(self, path: Path) -> None:
+        path.mkdir(parents=True, exist_ok=True)
+        if self._vectors is None:
+            np.save(path / self._VECTORS_FILE, np.empty((0, 1), dtype=np.float32))
+        else:
+            np.save(path / self._VECTORS_FILE, self._vectors)
+        table = pa.table(
+            {
+                "path": [c.path for c in self._chunks],
+                "line_start": [c.line_start for c in self._chunks],
+                "line_end": [c.line_end for c in self._chunks],
+                "content_hash": [c.content_hash for c in self._chunks],
+                "snippet": [c.snippet for c in self._chunks],
+            }
+        )
+        pq.write_table(table, path / self._CHUNKS_FILE)
+
+    def load(self, path: Path) -> None:
+        vectors_path = path / self._VECTORS_FILE
+        chunks_path = path / self._CHUNKS_FILE
+        if not vectors_path.exists() or not chunks_path.exists():
+            raise FileNotFoundError(f"index files missing in {path}")
+        self._vectors = np.load(vectors_path).astype(np.float32, copy=False)
+        if self._vectors.shape == (0, 1):
+            self._vectors = None
+        table = pq.read_table(chunks_path)
+        self._chunks = [
+            Chunk(
+                path=p,
+                line_start=ls,
+                line_end=le,
+                content_hash=ch,
+                snippet=sn,
+            )
+            for p, ls, le, ch, sn in zip(
+                table["path"].to_pylist(),
+                table["line_start"].to_pylist(),
+                table["line_end"].to_pylist(),
+                table["content_hash"].to_pylist(),
+                table["snippet"].to_pylist(),
+                strict=True,
+            )
+        ]
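
And a matching sketch for the vector store (again not part of the package; the Chunk/IndexEntry field names are taken from this file's own constructor calls, and the vectors/paths are invented):

from pathlib import Path

import numpy as np

from code_context.adapters.driven.vector_store_numpy import NumPyParquetStore
from code_context.domain.models import Chunk, IndexEntry

store = NumPyParquetStore()
chunk = Chunk(path="pkg/io.py", line_start=1, line_end=12,
              content_hash="abc123", snippet="def parse(raw): ...")
store.add([IndexEntry(chunk=chunk, vector=np.array([0.1, 0.9, 0.0]))])

# Brute-force cosine over all rows; scores fall in [-1, 1], best match first.
for entry, score in store.search(np.array([0.0, 1.0, 0.0]), k=5):
    print(entry.chunk.path, f"{score:.3f}")      # pkg/io.py 0.994

store.persist(Path("/tmp/idx"))                  # vectors.npy + chunks.parquet
assert store.delete_by_path("pkg/io.py") == 1    # store empties; _vectors -> None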