code-context-mcp 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_context/__init__.py +3 -0
- code_context/_background.py +93 -0
- code_context/_composition.py +425 -0
- code_context/_watcher.py +89 -0
- code_context/adapters/__init__.py +0 -0
- code_context/adapters/driven/__init__.py +0 -0
- code_context/adapters/driven/chunker_dispatcher.py +43 -0
- code_context/adapters/driven/chunker_line.py +54 -0
- code_context/adapters/driven/chunker_treesitter.py +215 -0
- code_context/adapters/driven/chunker_treesitter_queries.py +111 -0
- code_context/adapters/driven/code_source_fs.py +122 -0
- code_context/adapters/driven/embeddings_local.py +111 -0
- code_context/adapters/driven/embeddings_openai.py +58 -0
- code_context/adapters/driven/git_source_cli.py +211 -0
- code_context/adapters/driven/introspector_fs.py +224 -0
- code_context/adapters/driven/keyword_index_sqlite.py +206 -0
- code_context/adapters/driven/reranker_crossencoder.py +61 -0
- code_context/adapters/driven/symbol_index_sqlite.py +264 -0
- code_context/adapters/driven/vector_store_numpy.py +119 -0
- code_context/adapters/driving/__init__.py +0 -0
- code_context/adapters/driving/mcp_server.py +365 -0
- code_context/cli.py +161 -0
- code_context/config.py +114 -0
- code_context/domain/__init__.py +0 -0
- code_context/domain/index_bus.py +52 -0
- code_context/domain/models.py +140 -0
- code_context/domain/ports.py +205 -0
- code_context/domain/use_cases/__init__.py +0 -0
- code_context/domain/use_cases/explain_diff.py +98 -0
- code_context/domain/use_cases/find_definition.py +30 -0
- code_context/domain/use_cases/find_references.py +22 -0
- code_context/domain/use_cases/get_file_tree.py +36 -0
- code_context/domain/use_cases/get_summary.py +24 -0
- code_context/domain/use_cases/indexer.py +336 -0
- code_context/domain/use_cases/recent_changes.py +36 -0
- code_context/domain/use_cases/search_repo.py +131 -0
- code_context/server.py +151 -0
- code_context_mcp-1.0.0.dist-info/METADATA +181 -0
- code_context_mcp-1.0.0.dist-info/RECORD +43 -0
- code_context_mcp-1.0.0.dist-info/WHEEL +5 -0
- code_context_mcp-1.0.0.dist-info/entry_points.txt +3 -0
- code_context_mcp-1.0.0.dist-info/licenses/LICENSE +21 -0
- code_context_mcp-1.0.0.dist-info/top_level.txt +1 -0
code_context/adapters/driven/git_source_cli.py
@@ -0,0 +1,211 @@
"""GitCliSource — subprocess to `git` with ASCII unit-separator parsing."""

from __future__ import annotations

import logging
import re
import subprocess
from datetime import datetime
from pathlib import Path

from code_context.domain.models import Change, DiffFile

log = logging.getLogger(__name__)

_FS = "\x1f"  # ASCII unit separator
_PRETTY = f"%H{_FS}%aI{_FS}%an{_FS}%s"


class GitCliSource:
    def is_repo(self, root: Path) -> bool:
        return (root / ".git").exists()

    def head_sha(self, root: Path) -> str:
        if not self.is_repo(root):
            return ""
        try:
            out = subprocess.run(
                ["git", "rev-parse", "HEAD"],
                cwd=str(root),
                capture_output=True,
                text=True,
                encoding="utf-8",
                errors="replace",
                check=True,
            )
            return (out.stdout or "").strip()
        except subprocess.CalledProcessError as exc:
            log.warning("git rev-parse HEAD failed: %s", exc)
            return ""

    def commits(
        self,
        root: Path,
        since: datetime | None = None,
        paths: list[str] | None = None,
        max_count: int = 20,
    ) -> list[Change]:
        if not self.is_repo(root):
            return []

        cmd = ["git", "log", f"--pretty=format:{_PRETTY}", "--name-only", f"-{max_count}"]
        if since is not None:
            cmd.append(f"--since={since.isoformat()}")
        if paths:
            cmd.append("--")
            cmd.extend(paths)

        try:
            res = subprocess.run(
                cmd,
                cwd=str(root),
                capture_output=True,
                text=True,
                encoding="utf-8",
                errors="replace",
                check=True,
            )
        except subprocess.CalledProcessError as exc:
            log.warning("git log failed: %s", exc)
            return []

        return _parse(res.stdout or "")

    def diff_files(self, root: Path, ref: str) -> list[DiffFile]:
        """Run `git diff` for a single commit and parse hunks per file.

        Strategy: `git diff <ref>^! --unified=0 --no-color` gives a unified
        diff with zero context lines. Each hunk header line is:
            @@ -<old_start>,<old_count> +<new_start>,<new_count> @@
        We parse those into (new_start, new_start + new_count - 1) pairs.

        For ref == HEAD, the worktree diff (uncommitted changes) is excluded;
        we always show the committed diff. To diff the worktree, the caller
        would pass an explicit ref like "HEAD" with a different strategy —
        out of scope for v0.7.0.
        """
        if not self.is_repo(root):
            return []

        # ^! syntax means "this commit's changes vs its parent". Equivalent to
        # `git diff <ref>~1 <ref>` for non-merge commits. For the initial
        # commit, ^! is invalid; fall back to `git diff --root <ref>`.
        #
        # Critical Windows note: text=True alone uses Python's default
        # locale encoding (cp1252 on Windows), which CANNOT decode many
        # bytes that legitimately appear in git diff output (binary chunks,
        # mixed-encoding source files). When the reader thread fails to
        # decode, `res.stdout` becomes None even though the subprocess
        # exited successfully. We force UTF-8 + errors="replace" to ensure
        # we always get a string back, and we defensively guard against
        # None in case future git versions change the behavior again.
        try:
            res = subprocess.run(
                ["git", "diff", f"{ref}^!", "--unified=0", "--no-color"],
                cwd=str(root),
                capture_output=True,
                text=True,
                encoding="utf-8",
                errors="replace",
                check=True,
            )
            diff_text = res.stdout
        except subprocess.CalledProcessError:
            # Probably the initial commit. Try --root.
            try:
                res = subprocess.run(
                    ["git", "diff", "--root", "--unified=0", "--no-color", ref],
                    cwd=str(root),
                    capture_output=True,
                    text=True,
                    encoding="utf-8",
                    errors="replace",
                    check=True,
                )
                diff_text = res.stdout
            except subprocess.CalledProcessError as exc:
                log.warning("git diff failed for ref %r: %s", ref, exc)
                return []

        if diff_text is None:
            log.warning("git diff returned None stdout for ref %r — returning []", ref)
            return []
        return _parse_diff(diff_text)


def _parse(stdout: str) -> list[Change]:
    """Parse the formatted output into Change objects.

    Each commit is:
        <sha>\\x1f<iso_date>\\x1f<author>\\x1f<subject>\\n
        <path1>\\n
        <path2>\\n
        ...
        \\n (blank separator)
    """
    commits: list[Change] = []
    blocks = [b for b in stdout.split("\n\n") if b.strip()]
    for block in blocks:
        lines = block.splitlines()
        if not lines:
            continue
        header = lines[0]
        parts = header.split(_FS)
        if len(parts) < 4:
            continue
        sha, iso_date, author, summary = parts[0], parts[1], parts[2], parts[3]
        path_lines = [p.strip() for p in lines[1:] if p.strip()]
        try:
            date = datetime.fromisoformat(iso_date)
        except ValueError:
            continue
        commits.append(
            Change(
                sha=sha,
                date=date,
                author=author,
                paths=path_lines,
                summary=summary,
            )
        )
    return commits


def _parse_diff(diff_text: str) -> list[DiffFile]:
    """Parse a unified diff into a list of (path, hunks) pairs.

    Hunk headers look like:
        @@ -<old>,<oc> +<new>,<nc> @@

    File headers look like:
        diff --git a/<path> b/<path>
        +++ b/<path>

    We use the +++ header for the "new file" path; a/<path> would point
    at the old name in renames.
    """
    files_to_hunks: dict[str, list[tuple[int, int]]] = {}
    current_path: str | None = None
    hunk_re = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")
    plus_path_re = re.compile(r"^\+\+\+ b/(.+)$")
    null_path_re = re.compile(r"^\+\+\+ /dev/null")

    for line in diff_text.splitlines():
        m = plus_path_re.match(line)
        if m:
            current_path = m.group(1)
            files_to_hunks.setdefault(current_path, [])
            continue
        if null_path_re.match(line):
            current_path = None  # File deletion — no new-file hunks.
            continue
        m = hunk_re.match(line)
        if m and current_path:
            new_start = int(m.group(1))
            new_count = int(m.group(2)) if m.group(2) else 1
            # new_count == 0 means pure deletion — use the surrounding line
            # as a single-line range.
            end_line = new_start if new_count == 0 else new_start + new_count - 1
            files_to_hunks[current_path].append((new_start, end_line))

    return [DiffFile(path=p, hunks=tuple(h)) for p, h in files_to_hunks.items()]
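To make the hunk arithmetic concrete, here is a minimal sketch of what `_parse_diff` returns for a hypothetical zero-context diff. The path and line numbers are invented, and it assumes the module is importable under the path shown in the file list above.

from code_context.adapters.driven.git_source_cli import _parse_diff

sample = (
    "diff --git a/app.py b/app.py\n"
    "--- a/app.py\n"
    "+++ b/app.py\n"
    "@@ -10,2 +10,3 @@\n"  # new_start=10, new_count=3 -> range (10, 12)
    "@@ -40,1 +41,0 @@\n"  # new_count=0, pure deletion -> range (41, 41)
)
for df in _parse_diff(sample):
    print(df.path, df.hunks)  # app.py ((10, 12), (41, 41))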
code_context/adapters/driven/introspector_fs.py
@@ -0,0 +1,224 @@
"""FilesystemIntrospector — extracts a ProjectSummary from filesystem heuristics."""

from __future__ import annotations

import contextlib
import json
import tomllib
from collections import Counter
from pathlib import Path

import pathspec

from code_context.domain.models import ProjectSummary

# Universally-noisy directories that mean "compiled output / vendored deps /
# editor scratch", not source. Skipped even if .gitignore is missing —
# every language ecosystem has at least one of these and they bloat
# stats by 10-1000x (e.g. Sprint 5 smoke against WinServiceScheduler
# reported 2179 files / 6.5M LOC because bin/obj/.dll were walked).
_DENYLIST_DIRS = frozenset(
    {
        ".git",
        ".hg",
        ".svn",
        ".venv",
        "venv",
        "node_modules",
        "__pycache__",
        ".pytest_cache",
        ".mypy_cache",
        ".ruff_cache",
        ".tox",
        "dist",
        "build",
        "bin",
        "obj",
        "out",
        "publish",
        "target",
        "coverage",
        ".idea",
        ".vscode",
        ".vs",
    }
)


class FilesystemIntrospector:
    def summary(
        self, root: Path, scope: str = "project", path: Path | None = None
    ) -> ProjectSummary:
        target = path if (scope == "module" and path is not None) else root
        gitignore = self._load_gitignore(root)
        name = self._project_name(target)
        purpose = self._readme_first_paragraph(target)
        stack = self._detect_stack(target)
        key_modules = self._key_modules(target, root, gitignore)
        stats = self._stats(target, root, gitignore)
        entry_points = self._entry_points(target)
        return ProjectSummary(
            name=name,
            purpose=purpose,
            stack=stack,
            entry_points=entry_points,
            key_modules=key_modules,
            stats=stats,
        )

    @staticmethod
    def _load_gitignore(root: Path) -> pathspec.PathSpec:
        """Return a pathspec covering .gitignore + .git/ + the denylist.

        Mirrors FilesystemSource._load_gitignore (Sprint 1). Adds a
        baseline `.git/` line so even repos without a .gitignore skip
        version-control internals; denylist dirs are appended as
        gitignore-style patterns so the same matcher handles both.
        """
        lines = [".git/", *(f"{d}/" for d in sorted(_DENYLIST_DIRS))]
        gi = root / ".gitignore"
        if gi.exists():
            with contextlib.suppress(OSError):
                lines.extend(gi.read_text(encoding="utf-8", errors="replace").splitlines())
        return pathspec.PathSpec.from_lines("gitignore", lines)

    @staticmethod
    def _project_name(root: Path) -> str:
        py = root / "pyproject.toml"
        if py.exists():
            try:
                data = tomllib.loads(py.read_text())
                name = data.get("project", {}).get("name")
                if isinstance(name, str):
                    return name
            except (tomllib.TOMLDecodeError, OSError):
                pass
        pkg = root / "package.json"
        if pkg.exists():
            try:
                data = json.loads(pkg.read_text())
                if isinstance(data.get("name"), str):
                    return data["name"]
            except (json.JSONDecodeError, OSError):
                pass
        return root.name

    @staticmethod
    def _readme_first_paragraph(root: Path) -> str:
        for candidate in ("README.md", "readme.md", "README.rst", "README"):
            f = root / candidate
            if f.exists():
                text = f.read_text(encoding="utf-8", errors="replace")
                # Find the first non-heading non-blank paragraph.
                for chunk in text.split("\n\n"):
                    stripped = chunk.strip()
                    if not stripped:
                        continue
                    if stripped.startswith("#"):
                        continue
                    return stripped
        return ""

    @staticmethod
    def _detect_stack(root: Path) -> list[str]:
        stack: list[str] = []
        if (root / "pyproject.toml").exists() or (root / "setup.py").exists():
            stack.append("Python")
        if (root / "package.json").exists():
            stack.append("Node")
        if (root / "Cargo.toml").exists():
            stack.append("Rust")
        if (root / "go.mod").exists():
            stack.append("Go")
        if (root / "pom.xml").exists() or (root / "build.gradle").exists():
            stack.append("Java")
        return stack

    @staticmethod
    def _entry_points(root: Path) -> list[str]:
        candidates = [
            "src/main.py",
            "src/index.js",
            "src/index.ts",
            "src/main.go",
            "src/main.rs",
            "main.py",
            "index.js",
            "main.go",
        ]
        return [c for c in candidates if (root / c).exists()]

    @staticmethod
    def _key_modules(
        target: Path,
        root: Path,
        gitignore: pathspec.PathSpec,
    ) -> list[dict[str, str]]:
        out: list[dict[str, str]] = []
        try:
            entries = sorted(target.iterdir())
        except OSError:
            return out
        for child in entries:
            if not child.is_dir():
                continue
            name = child.name
            if name.startswith(".") or name in _DENYLIST_DIRS:
                continue
            try:
                rel_dir = child.resolve().relative_to(root.resolve()).as_posix()
            except ValueError:
                rel_dir = name  # target is outside root; don't gitignore-filter
            # gitignore patterns expect dir entries with a trailing slash.
            if gitignore.match_file(rel_dir + "/") or gitignore.match_file(rel_dir):
                continue
            out.append({"path": name, "purpose": ""})
        return out

    @staticmethod
    def _stats(
        target: Path,
        root: Path,
        gitignore: pathspec.PathSpec,
    ) -> dict[str, object]:
        files = 0
        loc = 0
        langs: Counter[str] = Counter()
        root_resolved = root
        with contextlib.suppress(OSError):
            root_resolved = root.resolve()
        for f in target.rglob("*"):
            if not f.is_file():
                continue
            # Filter against the denylist anywhere in the path so a nested
            # `bin/`/`node_modules/` is excluded even if .gitignore is silent.
            try:
                rel_target = f.relative_to(target).parts
            except ValueError:
                continue
            if any(part in _DENYLIST_DIRS for part in rel_target):
                continue
            if any(part.startswith(".") for part in rel_target):
                continue
            # Cross-check against .gitignore (which is anchored at the repo
            # root, so use the path relative to root, not target).
            try:
                rel_root = f.resolve().relative_to(root_resolved).as_posix()
            except ValueError:
                rel_root = "/".join(rel_target)
            if gitignore.match_file(rel_root):
                continue
            files += 1
            try:
                content = f.read_text(encoding="utf-8", errors="replace")
                loc += content.count("\n")
            except OSError:
                continue
            ext = f.suffix.lstrip(".")
            if ext:
                langs[ext] += 1
        return {
            "files": files,
            "loc": loc,
            "languages": [ext for ext, _ in langs.most_common(10)],
        }
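A quick usage sketch of the introspector, assuming the adapter is importable under the path in the file list; the repository paths are placeholders.

from pathlib import Path

from code_context.adapters.driven.introspector_fs import FilesystemIntrospector

intro = FilesystemIntrospector()
# Project scope: walk the whole repo, honoring .gitignore plus the denylist.
summary = intro.summary(Path("/path/to/repo"))
# Module scope: heuristics run against the subdirectory, while .gitignore
# filtering stays anchored at the repo root.
module = intro.summary(
    Path("/path/to/repo"), scope="module", path=Path("/path/to/repo/src/core")
)
print(summary.name, summary.stack, summary.stats["files"], summary.stats["loc"])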
code_context/adapters/driven/keyword_index_sqlite.py
@@ -0,0 +1,206 @@
"""SqliteFTS5Index — BM25 keyword index using SQLite's FTS5 module.

Each chunk is stored as a row in an FTS5 virtual table. SQLite's BM25
ranking is exposed as a function in FTS5; we use it directly in the
ORDER BY. The vector field is NOT stored here — only metadata + snippet
text — so this index is much smaller than the vector store on disk.
"""

from __future__ import annotations

import logging
import re
import sqlite3
from collections.abc import Iterable
from pathlib import Path

import numpy as np

from code_context.domain.models import Chunk, IndexEntry

log = logging.getLogger(__name__)

_FILE = "keyword.sqlite"
_FTS_TABLE = "chunks_fts"

# FTS5 has a small set of reserved tokens (AND/OR/NOT/NEAR) AND treats
# punctuation in queries as syntax (a `.` is a column separator, a `-`
# starts an exclusion clause, `:` marks a column-qualified term). The
# default unicode61 tokenizer handles punctuation INSIDE indexed text fine,
# but in the QUERY the parser sees punctuation before tokenization. Strip
# everything that isn't a word char / whitespace; the resulting token
# list still matches the indexed tokens because the tokenizer would
# have split them at the same boundaries on the way in.
_FTS_KEEP_RE = re.compile(r"[^\w\s]", flags=re.UNICODE)
_FTS_BOOLEAN_RE = re.compile(r"\b(AND|OR|NOT|NEAR)\b", re.IGNORECASE)


class SqliteFTS5Index:
    @property
    def version(self) -> str:
        return f"sqlite-fts5-{sqlite3.sqlite_version}-v1"

    def __init__(self) -> None:
        self._conn: sqlite3.Connection | None = None
        self._db_path: Path | None = None
        self._open_inmem()

    def _open_inmem(self) -> None:
        # check_same_thread=False: the MCP server runs query handlers via
        # asyncio.to_thread, which uses a thread pool. Without this flag, a
        # connection opened on the main thread cannot be used from worker
        # threads (sqlite3.ProgrammingError). SQLite's library is built in
        # serialized threading mode by default, so a single connection is
        # safe across threads as long as we don't have concurrent writes —
        # which we don't (writes happen at indexer.run() time, queries are
        # read-only).
        self._conn = sqlite3.connect(":memory:", check_same_thread=False)
        self._init_schema()

    def _init_schema(self) -> None:
        assert self._conn is not None
        self._conn.executescript(
            f"""
            CREATE VIRTUAL TABLE IF NOT EXISTS {_FTS_TABLE} USING fts5(
                path, line_start UNINDEXED, line_end UNINDEXED,
                content_hash UNINDEXED, snippet,
                tokenize='unicode61 remove_diacritics 2'
            );
            -- vector storage is intentionally absent — vectors live in NumPyParquetStore.
            """
        )

    def add(self, entries: Iterable[IndexEntry]) -> None:
        assert self._conn is not None
        rows = []
        for e in entries:
            c = e.chunk
            rows.append((c.path, c.line_start, c.line_end, c.content_hash, c.snippet))
        if not rows:
            return
        self._conn.executemany(
            f"INSERT INTO {_FTS_TABLE} (path, line_start, line_end, content_hash, snippet) "
            "VALUES (?, ?, ?, ?, ?)",
            rows,
        )
        self._conn.commit()

    def delete_by_path(self, path: str) -> int:
        """Remove every row whose path == `path` from the FTS5 table.

        Returns the rowcount. Used by Sprint 6 incremental reindex.
        """
        assert self._conn is not None
        cur = self._conn.execute(f"DELETE FROM {_FTS_TABLE} WHERE path = ?", (path,))
        self._conn.commit()
        return cur.rowcount

    def search(self, query: str, k: int) -> list[tuple[IndexEntry, float]]:
        assert self._conn is not None
        sanitised = _sanitise(query)
        if not sanitised.strip():
            return []
        try:
            cur = self._conn.execute(
                f"""
                SELECT path, line_start, line_end, content_hash, snippet,
                       bm25({_FTS_TABLE}) AS score
                FROM {_FTS_TABLE}
                WHERE {_FTS_TABLE} MATCH ?
                ORDER BY score
                LIMIT ?;
                """,
                (sanitised, k),
            )
        except sqlite3.OperationalError as exc:
            log.warning("fts5 query failed (%s) for %r -> returning []", exc, query)
            return []
        return [
            (
                IndexEntry(
                    chunk=Chunk(
                        path=row[0],
                        line_start=row[1],
                        line_end=row[2],
                        content_hash=row[3],
                        snippet=row[4],
                    ),
                    vector=np.zeros(0, dtype=np.float32),  # Vector unused on this path.
                ),
                # bm25() returns negative scores; flip the sign for "higher is better".
                -float(row[5]),
            )
            for row in cur.fetchall()
        ]

    def persist(self, path: Path) -> None:
        assert self._conn is not None
        path.mkdir(parents=True, exist_ok=True)
        target = path / _FILE
        # Commit any open implicit transaction first — backup() blocks on
        # uncommitted writes in the source connection.
        self._conn.commit()
        # Back up the in-memory DB to disk. sqlite3.Connection's context
        # manager only commits on exit; it does NOT close. We close explicitly
        # so Windows releases the file lock (otherwise tmp_path cleanup hangs).
        # The backup target is only used inside this method, so there are no
        # thread-safety concerns.
        disk = sqlite3.connect(target, check_same_thread=False)
        try:
            self._conn.backup(disk)
        finally:
            disk.close()
        self._db_path = target

    def load(self, path: Path) -> None:
        """Restore the index from `<path>/keyword.sqlite` into a fresh
        in-memory connection.

        Pre-Sprint-6 versions opened the on-disk file directly — fast,
        zero RAM, but mutations (Sprint 6's incremental reindex calls
        delete_by_path / add after load) wrote directly to the active
        index file, breaking atomicity, AND a subsequent persist(same_dir)
        deadlocked on SQLite's backup-to-itself constraint. The fix is
        to load disk→memory: subsequent mutations stay in RAM and a
        later persist() does the standard memory→fresh-disk backup. RAM
        cost on the WinServiceScheduler smoke is ~5 MB; trivial.
        """
        target = path / _FILE
        if not target.exists():
            raise FileNotFoundError(f"keyword index missing at {target}")
        if self._conn is not None:
            self._conn.close()
        # check_same_thread=False — see the _open_inmem rationale.
        self._conn = sqlite3.connect(":memory:", check_same_thread=False)
        disk = sqlite3.connect(target, check_same_thread=False)
        try:
            disk.backup(self._conn)
        finally:
            disk.close()
        self._db_path = target


def _sanitise(query: str) -> str:
    """Strip FTS5 syntax so user input never reaches the query parser
    as anything other than bare whitespace-separated tokens.

    Caught by Sprint 8's eval suite: 3/35 queries with periods or
    hyphens silently returned [] from the sanitiser-as-was — `.`,
    `-`, `:` are FTS5 query syntax even though they're tokenized
    away in indexed text by unicode61.

    Steps:
    1. Drop every non-word, non-whitespace char.
    2. Drop the boolean operators (AND/OR/NOT/NEAR) so e.g.
       "tracking changes and merges" doesn't accidentally parse as
       `tracking changes AND merges`.
    3. Collapse whitespace.

    The result is space-joined; FTS5 combines bare tokens with an
    implicit AND. We deliberately keep AND semantics: short queries
    (1-3 tokens) get tight, high-precision matches; long
    natural-language queries (5+ tokens) effectively return [] from
    the keyword leg, leaving the vector leg to drive the result.
    Sprint 8's eval confirmed that ORing tokens makes long-query
    BM25 too noisy and hurts NDCG@10 by ~0.13.
    """
    cleaned = _FTS_KEEP_RE.sub(" ", query)
    cleaned = _FTS_BOOLEAN_RE.sub(" ", cleaned)
    return " ".join(cleaned.split())
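Worked by hand from the two regexes above, a few hypothetical queries and the sanitised strings the FTS5 parser actually receives:

from code_context.adapters.driven.keyword_index_sqlite import _sanitise

# Step 1 drops punctuation FTS5 would read as syntax; `_` is a word char and survives.
assert _sanitise("vector_store.persist()") == "vector_store persist"
# Step 2 drops bare boolean operators so they cannot change query semantics.
assert _sanitise("re-rank results NOT cached") == "re rank results cached"
# Fully-symbolic queries sanitise to "", and search() then returns [] early.
assert _sanitise("-->") == ""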
code_context/adapters/driven/reranker_crossencoder.py
@@ -0,0 +1,61 @@
"""CrossEncoderReranker — re-scores candidates using a sentence-transformers CrossEncoder.

Lazy-loads the model on first use; constructing the adapter doesn't
trigger torch loading. An empty candidate list short-circuits and never
loads the model.
"""

from __future__ import annotations

import logging
from typing import Any

from code_context.domain.models import IndexEntry

log = logging.getLogger(__name__)


def _load_model(model_name: str) -> Any:  # pragma: no cover - integration-tested
    from sentence_transformers import CrossEncoder

    log.info("loading cross-encoder model: %s", model_name)
    return CrossEncoder(model_name)


def _lib_version() -> str:
    try:
        from importlib.metadata import PackageNotFoundError, version

        return version("sentence-transformers")
    except PackageNotFoundError:  # pragma: no cover
        return "unknown"


class CrossEncoderReranker:
    def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2") -> None:
        self.model_name = model_name
        self._model: Any = None

    @property
    def version(self) -> str:
        return "crossencoder-v1"

    @property
    def model_id(self) -> str:
        return f"crossencoder:{self.model_name}@v{_lib_version()}"

    def rerank(
        self,
        query: str,
        candidates: list[tuple[IndexEntry, float]],
        k: int,
    ) -> list[tuple[IndexEntry, float]]:
        if not candidates:
            return []
        if self._model is None:
            self._model = _load_model(self.model_name)
        pairs = [(query, e.chunk.snippet[:2048]) for e, _ in candidates]
        scores = self._model.predict(pairs)
        scored = [(c[0], float(s)) for c, s in zip(candidates, scores, strict=True)]
        scored.sort(key=lambda x: x[1], reverse=True)
        return scored[:k]
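A wiring sketch for the reranker. The query and candidate tuple are fabricated to match the shape SqliteFTS5Index.search returns; running it for real requires the sentence-transformers dependency and downloads the model on the first rerank call.

import numpy as np

from code_context.adapters.driven.reranker_crossencoder import CrossEncoderReranker
from code_context.domain.models import Chunk, IndexEntry

# One fabricated candidate, shaped like the tuples the search legs return.
chunk = Chunk(
    path="retry.py",
    line_start=1,
    line_end=12,
    content_hash="0" * 8,  # placeholder hash
    snippet="def with_retry(cmd):\n    ...",
)
candidates = [(IndexEntry(chunk=chunk, vector=np.zeros(0, dtype=np.float32)), 0.42)]

reranker = CrossEncoderReranker()  # no torch import yet; the model loads lazily
top = reranker.rerank("where do we retry failed git calls?", candidates, k=1)
for entry, score in top:
    print(f"{score:+.3f}  {entry.chunk.path}:{entry.chunk.line_start}-{entry.chunk.line_end}")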