code-context-mcp 1.0.0 (code_context_mcp-1.0.0-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. code_context/__init__.py +3 -0
  2. code_context/_background.py +93 -0
  3. code_context/_composition.py +425 -0
  4. code_context/_watcher.py +89 -0
  5. code_context/adapters/__init__.py +0 -0
  6. code_context/adapters/driven/__init__.py +0 -0
  7. code_context/adapters/driven/chunker_dispatcher.py +43 -0
  8. code_context/adapters/driven/chunker_line.py +54 -0
  9. code_context/adapters/driven/chunker_treesitter.py +215 -0
  10. code_context/adapters/driven/chunker_treesitter_queries.py +111 -0
  11. code_context/adapters/driven/code_source_fs.py +122 -0
  12. code_context/adapters/driven/embeddings_local.py +111 -0
  13. code_context/adapters/driven/embeddings_openai.py +58 -0
  14. code_context/adapters/driven/git_source_cli.py +211 -0
  15. code_context/adapters/driven/introspector_fs.py +224 -0
  16. code_context/adapters/driven/keyword_index_sqlite.py +206 -0
  17. code_context/adapters/driven/reranker_crossencoder.py +61 -0
  18. code_context/adapters/driven/symbol_index_sqlite.py +264 -0
  19. code_context/adapters/driven/vector_store_numpy.py +119 -0
  20. code_context/adapters/driving/__init__.py +0 -0
  21. code_context/adapters/driving/mcp_server.py +365 -0
  22. code_context/cli.py +161 -0
  23. code_context/config.py +114 -0
  24. code_context/domain/__init__.py +0 -0
  25. code_context/domain/index_bus.py +52 -0
  26. code_context/domain/models.py +140 -0
  27. code_context/domain/ports.py +205 -0
  28. code_context/domain/use_cases/__init__.py +0 -0
  29. code_context/domain/use_cases/explain_diff.py +98 -0
  30. code_context/domain/use_cases/find_definition.py +30 -0
  31. code_context/domain/use_cases/find_references.py +22 -0
  32. code_context/domain/use_cases/get_file_tree.py +36 -0
  33. code_context/domain/use_cases/get_summary.py +24 -0
  34. code_context/domain/use_cases/indexer.py +336 -0
  35. code_context/domain/use_cases/recent_changes.py +36 -0
  36. code_context/domain/use_cases/search_repo.py +131 -0
  37. code_context/server.py +151 -0
  38. code_context_mcp-1.0.0.dist-info/METADATA +181 -0
  39. code_context_mcp-1.0.0.dist-info/RECORD +43 -0
  40. code_context_mcp-1.0.0.dist-info/WHEEL +5 -0
  41. code_context_mcp-1.0.0.dist-info/entry_points.txt +3 -0
  42. code_context_mcp-1.0.0.dist-info/licenses/LICENSE +21 -0
  43. code_context_mcp-1.0.0.dist-info/top_level.txt +1 -0
code_context/adapters/driven/git_source_cli.py
@@ -0,0 +1,211 @@
+ """GitCliSource — subprocess to `git` with ASCII unit-separator parsing."""
+
+ from __future__ import annotations
+
+ import logging
+ import re
+ import subprocess
+ from datetime import datetime
+ from pathlib import Path
+
+ from code_context.domain.models import Change, DiffFile
+
+ log = logging.getLogger(__name__)
+
+ _FS = "\x1f"  # ASCII unit separator
+ _PRETTY = f"%H{_FS}%aI{_FS}%an{_FS}%s"
+
+
+ class GitCliSource:
+     def is_repo(self, root: Path) -> bool:
+         return (root / ".git").exists()
+
+     def head_sha(self, root: Path) -> str:
+         if not self.is_repo(root):
+             return ""
+         try:
+             out = subprocess.run(
+                 ["git", "rev-parse", "HEAD"],
+                 cwd=str(root),
+                 capture_output=True,
+                 text=True,
+                 encoding="utf-8",
+                 errors="replace",
+                 check=True,
+             )
+             return (out.stdout or "").strip()
+         except subprocess.CalledProcessError as exc:
+             log.warning("git rev-parse HEAD failed: %s", exc)
+             return ""
+
+     def commits(
+         self,
+         root: Path,
+         since: datetime | None = None,
+         paths: list[str] | None = None,
+         max_count: int = 20,
+     ) -> list[Change]:
+         if not self.is_repo(root):
+             return []
+
+         cmd = ["git", "log", f"--pretty=format:{_PRETTY}", "--name-only", f"-{max_count}"]
+         if since is not None:
+             cmd.append(f"--since={since.isoformat()}")
+         if paths:
+             cmd.append("--")
+             cmd.extend(paths)
+
+         try:
+             res = subprocess.run(
+                 cmd,
+                 cwd=str(root),
+                 capture_output=True,
+                 text=True,
+                 encoding="utf-8",
+                 errors="replace",
+                 check=True,
+             )
+         except subprocess.CalledProcessError as exc:
+             log.warning("git log failed: %s", exc)
+             return []
+
+         return _parse(res.stdout or "")
+
+     def diff_files(self, root: Path, ref: str) -> list[DiffFile]:
+         """Use git diff-tree + numstat-like parsing to get hunks per file.
+
+         Strategy: `git diff <ref>^! --unified=0 --no-color` gives a unified
+         diff with zero context lines. Each hunk header line is:
+             @@ -<old_start>,<old_count> +<new_start>,<new_count> @@
+         We parse those into (new_start, new_start + new_count - 1) pairs.
+
+         For ref == HEAD, the worktree diff (uncommitted changes) is excluded;
+         we always show the committed diff. To diff worktree, the caller would
+         pass an explicit ref like "HEAD" with a different strategy — out of
+         scope for v0.7.0.
+         """
+         if not self.is_repo(root):
+             return []
+
+         # ^! syntax means "this commit's changes vs its parent". Equivalent to
+         # `git diff <ref>~1 <ref>` for non-merge commits. For the initial
+         # commit, ^! is invalid; fall back to `git diff --root <ref>`.
+         #
+         # Critical Windows note: text=True alone uses Python's default
+         # locale encoding (cp1252 on Windows), which CANNOT decode many
+         # bytes that legitimately appear in git diff output (binary chunks,
+         # mixed-encoding source files). When the reader thread fails to
+         # decode, `res.stdout` becomes None even though the subprocess
+         # exited successfully. We force UTF-8 + errors="replace" to ensure
+         # we always get a string back, and we defensively guard against
+         # None in case future git versions change the behavior again.
+         try:
+             res = subprocess.run(
+                 ["git", "diff", f"{ref}^!", "--unified=0", "--no-color"],
+                 cwd=str(root),
+                 capture_output=True,
+                 text=True,
+                 encoding="utf-8",
+                 errors="replace",
+                 check=True,
+             )
+             diff_text = res.stdout
+         except subprocess.CalledProcessError:
+             # Probably the initial commit. Try --root.
+             try:
+                 res = subprocess.run(
+                     ["git", "diff", "--root", "--unified=0", "--no-color", ref],
+                     cwd=str(root),
+                     capture_output=True,
+                     text=True,
+                     encoding="utf-8",
+                     errors="replace",
+                     check=True,
+                 )
+                 diff_text = res.stdout
+             except subprocess.CalledProcessError as exc:
+                 log.warning("git diff failed for ref %r: %s", ref, exc)
+                 return []
+
+         if diff_text is None:
+             log.warning("git diff returned None stdout for ref %r — empty []", ref)
+             return []
+         return _parse_diff(diff_text)
+
+
+ def _parse(stdout: str) -> list[Change]:
+     """Parse the formatted output into Change objects.
+
+     Each commit is:
+         <sha>\\x1f<iso_date>\\x1f<author>\\x1f<subject>\\n
+         <path1>\\n
+         <path2>\\n
+         ...
+         \\n (blank separator)
+     """
+     commits: list[Change] = []
+     blocks = [b for b in stdout.split("\n\n") if b.strip()]
+     for block in blocks:
+         lines = block.splitlines()
+         if not lines:
+             continue
+         header = lines[0]
+         parts = header.split(_FS)
+         if len(parts) < 4:
+             continue
+         sha, iso_date, author, summary = parts[0], parts[1], parts[2], parts[3]
+         path_lines = [p.strip() for p in lines[1:] if p.strip()]
+         try:
+             date = datetime.fromisoformat(iso_date)
+         except ValueError:
+             continue
+         commits.append(
+             Change(
+                 sha=sha,
+                 date=date,
+                 author=author,
+                 paths=path_lines,
+                 summary=summary,
+             )
+         )
+     return commits
+
+
+ def _parse_diff(diff_text: str) -> list[DiffFile]:
+     """Parse a unified diff into a list of (path, hunks) pairs.
+
+     Hunk headers look like:
+         @@ -<old>,<oc> +<new>,<nc> @@
+
+     File headers look like:
+         diff --git a/<path> b/<path>
+         +++ b/<path>
+
+     We use the +++ header for the "new file" path; a/<path> would point
+     at the old name in renames.
+     """
+     files_to_hunks: dict[str, list[tuple[int, int]]] = {}
+     current_path: str | None = None
+     hunk_re = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")
+     plus_path_re = re.compile(r"^\+\+\+ b/(.+)$")
+     null_path_re = re.compile(r"^\+\+\+ /dev/null")
+
+     for line in diff_text.splitlines():
+         m = plus_path_re.match(line)
+         if m:
+             current_path = m.group(1)
+             files_to_hunks.setdefault(current_path, [])
+             continue
+         if null_path_re.match(line):
+             current_path = None  # File deletion — no new-file hunks.
+             continue
+         m = hunk_re.match(line)
+         if m and current_path:
+             new_start = int(m.group(1))
+             new_count = int(m.group(2)) if m.group(2) else 1
+             # new_count == 0 means pure deletion — use the surrounding line
+             # as a single-line range.
+             end_line = new_start if new_count == 0 else new_start + new_count - 1
+             files_to_hunks[current_path].append((new_start, end_line))
+
+     return [DiffFile(path=p, hunks=tuple(h)) for p, h in files_to_hunks.items()]
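
Editorial note, not part of the wheel: the diff_files docstring above describes turning zero-context hunk headers into (new_start, new_end) ranges. A minimal standalone sketch of that arithmetic, reusing the same regex as _parse_diff against a made-up sample diff:

import re

# Hypothetical zero-context diff, as produced by `git diff <ref>^! --unified=0`.
sample_diff = """\
+++ b/src/app.py
@@ -10,2 +12,3 @@
@@ -40,0 +55 @@
"""

hunk_re = re.compile(r"^@@ -\d+(?:,\d+)? \+(\d+)(?:,(\d+))? @@")
for line in sample_diff.splitlines():
    m = hunk_re.match(line)
    if m:
        new_start = int(m.group(1))
        new_count = int(m.group(2)) if m.group(2) else 1
        # A missing count defaults to 1; a count of 0 is a pure deletion.
        end_line = new_start if new_count == 0 else new_start + new_count - 1
        print((new_start, end_line))  # -> (12, 14), then (55, 55)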
code_context/adapters/driven/introspector_fs.py
@@ -0,0 +1,224 @@
+ """FilesystemIntrospector — extracts a ProjectSummary from filesystem heuristics."""
+
+ from __future__ import annotations
+
+ import contextlib
+ import json
+ import tomllib
+ from collections import Counter
+ from pathlib import Path
+
+ import pathspec
+
+ from code_context.domain.models import ProjectSummary
+
+ # Universally-noisy directories that mean "compiled output / vendored deps /
+ # editor scratch", not source. Skipped even if .gitignore is missing —
+ # every language ecosystem has at least one of these and they bloat
+ # stats by 10-1000x (e.g. Sprint 5 smoke against WinServiceScheduler
+ # reported 2179 files / 6.5M LOC because bin/obj/.dll were walked).
+ _DENYLIST_DIRS = frozenset(
+     {
+         ".git",
+         ".hg",
+         ".svn",
+         ".venv",
+         "venv",
+         "node_modules",
+         "__pycache__",
+         ".pytest_cache",
+         ".mypy_cache",
+         ".ruff_cache",
+         ".tox",
+         "dist",
+         "build",
+         "bin",
+         "obj",
+         "out",
+         "publish",
+         "target",
+         "coverage",
+         ".idea",
+         ".vscode",
+         ".vs",
+     }
+ )
+
+
+ class FilesystemIntrospector:
+     def summary(
+         self, root: Path, scope: str = "project", path: Path | None = None
+     ) -> ProjectSummary:
+         target = path if (scope == "module" and path is not None) else root
+         gitignore = self._load_gitignore(root)
+         name = self._project_name(target)
+         purpose = self._readme_first_paragraph(target)
+         stack = self._detect_stack(target)
+         key_modules = self._key_modules(target, root, gitignore)
+         stats = self._stats(target, root, gitignore)
+         entry_points = self._entry_points(target)
+         return ProjectSummary(
+             name=name,
+             purpose=purpose,
+             stack=stack,
+             entry_points=entry_points,
+             key_modules=key_modules,
+             stats=stats,
+         )
+
+     @staticmethod
+     def _load_gitignore(root: Path) -> pathspec.PathSpec:
+         """Return a pathspec covering .gitignore + .git/ + the denylist.
+
+         Mirrors FilesystemSource._load_gitignore (Sprint 1). Adds a
+         baseline `.git/` line so even repos without a .gitignore skip
+         version-control internals; denylist dirs are appended as
+         gitignore-style patterns so the same matcher handles both.
+         """
+         lines = [".git/", *(f"{d}/" for d in sorted(_DENYLIST_DIRS))]
+         gi = root / ".gitignore"
+         if gi.exists():
+             with contextlib.suppress(OSError):
+                 lines.extend(gi.read_text(encoding="utf-8", errors="replace").splitlines())
+         return pathspec.PathSpec.from_lines("gitignore", lines)
+
+     @staticmethod
+     def _project_name(root: Path) -> str:
+         py = root / "pyproject.toml"
+         if py.exists():
+             try:
+                 data = tomllib.loads(py.read_text())
+                 name = data.get("project", {}).get("name")
+                 if isinstance(name, str):
+                     return name
+             except (tomllib.TOMLDecodeError, OSError):
+                 pass
+         pkg = root / "package.json"
+         if pkg.exists():
+             try:
+                 data = json.loads(pkg.read_text())
+                 if isinstance(data.get("name"), str):
+                     return data["name"]
+             except (json.JSONDecodeError, OSError):
+                 pass
+         return root.name
+
+     @staticmethod
+     def _readme_first_paragraph(root: Path) -> str:
+         for candidate in ("README.md", "readme.md", "README.rst", "README"):
+             f = root / candidate
+             if f.exists():
+                 text = f.read_text(encoding="utf-8", errors="replace")
+                 # Find the first non-heading non-blank paragraph.
+                 for chunk in text.split("\n\n"):
+                     stripped = chunk.strip()
+                     if not stripped:
+                         continue
+                     if stripped.startswith("#"):
+                         continue
+                     return stripped
+         return ""
+
+     @staticmethod
+     def _detect_stack(root: Path) -> list[str]:
+         stack: list[str] = []
+         if (root / "pyproject.toml").exists() or (root / "setup.py").exists():
+             stack.append("Python")
+         if (root / "package.json").exists():
+             stack.append("Node")
+         if (root / "Cargo.toml").exists():
+             stack.append("Rust")
+         if (root / "go.mod").exists():
+             stack.append("Go")
+         if (root / "pom.xml").exists() or (root / "build.gradle").exists():
+             stack.append("Java")
+         return stack
+
+     @staticmethod
+     def _entry_points(root: Path) -> list[str]:
+         candidates = [
+             "src/main.py",
+             "src/index.js",
+             "src/index.ts",
+             "src/main.go",
+             "src/main.rs",
+             "main.py",
+             "index.js",
+             "main.go",
+         ]
+         return [c for c in candidates if (root / c).exists()]
+
+     @staticmethod
+     def _key_modules(
+         target: Path,
+         root: Path,
+         gitignore: pathspec.PathSpec,
+     ) -> list[dict[str, str]]:
+         out: list[dict[str, str]] = []
+         try:
+             entries = sorted(target.iterdir())
+         except OSError:
+             return out
+         for child in entries:
+             if not child.is_dir():
+                 continue
+             name = child.name
+             if name.startswith(".") or name in _DENYLIST_DIRS:
+                 continue
+             try:
+                 rel_dir = child.resolve().relative_to(root.resolve()).as_posix()
+             except ValueError:
+                 rel_dir = name  # target is outside root; don't gitignore-filter
+             # gitignore patterns expect dir entries with trailing slash.
+             if gitignore.match_file(rel_dir + "/") or gitignore.match_file(rel_dir):
+                 continue
+             out.append({"path": name, "purpose": ""})
+         return out
+
+     @staticmethod
+     def _stats(
+         target: Path,
+         root: Path,
+         gitignore: pathspec.PathSpec,
+     ) -> dict[str, object]:
+         files = 0
+         loc = 0
+         langs: Counter[str] = Counter()
+         root_resolved = root
+         with contextlib.suppress(OSError):
+             root_resolved = root.resolve()
+         for f in target.rglob("*"):
+             if not f.is_file():
+                 continue
+             # Filter against the denylist anywhere in the path so a nested
+             # `bin/`/`node_modules/` is excluded even if .gitignore is silent.
+             try:
+                 rel_target = f.relative_to(target).parts
+             except ValueError:
+                 continue
+             if any(part in _DENYLIST_DIRS for part in rel_target):
+                 continue
+             if any(part.startswith(".") for part in rel_target):
+                 continue
+             # Cross-check against .gitignore (which is anchored at repo root,
+             # so use the path relative to root, not target).
+             try:
+                 rel_root = f.resolve().relative_to(root_resolved).as_posix()
+             except ValueError:
+                 rel_root = "/".join(rel_target)
+             if gitignore.match_file(rel_root):
+                 continue
+             files += 1
+             try:
+                 content = f.read_text(encoding="utf-8", errors="replace")
+                 loc += content.count("\n")
+             except OSError:
+                 continue
+             ext = f.suffix.lstrip(".")
+             if ext:
+                 langs[ext] += 1
+         return {
+             "files": files,
+             "loc": loc,
+             "languages": [ext for ext, _ in langs.most_common(10)],
+         }
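
Editorial note, not part of the wheel: _load_gitignore above folds the denylist and any repo .gitignore into a single pathspec matcher. A minimal sketch of that matching, assuming the "gitignore" pattern style the code uses; the sample paths and the "*.log" line are made up:

import pathspec

denylist = ["bin", "node_modules", "obj"]
lines = [".git/", *(f"{d}/" for d in sorted(denylist)), "*.log"]  # "*.log" stands in for a .gitignore entry
spec = pathspec.PathSpec.from_lines("gitignore", lines)

print(spec.match_file("node_modules/left-pad/index.js"))  # True: denylist directory
print(spec.match_file("logs/build.log"))                  # True: .gitignore pattern
print(spec.match_file("src/app.py"))                      # False: would be counted by _stats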
code_context/adapters/driven/keyword_index_sqlite.py
@@ -0,0 +1,206 @@
+ """SqliteFTS5Index — BM25 keyword index using SQLite's FTS5 module.
+
+ Each chunk is stored as a row in an FTS5 virtual table. SQLite's BM25
+ ranking is exposed as a function in FTS5; we use it directly in the
+ ORDER BY. The vector field is NOT stored here — only metadata + snippet
+ text — so this index is much smaller than the vector store on disk.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ import re
+ import sqlite3
+ from collections.abc import Iterable
+ from pathlib import Path
+
+ import numpy as np
+
+ from code_context.domain.models import Chunk, IndexEntry
+
+ log = logging.getLogger(__name__)
+
+ _FILE = "keyword.sqlite"
+ _FTS_TABLE = "chunks_fts"
+
+ # FTS5 has a small set of reserved tokens (AND/OR/NOT/NEAR) AND treats
+ # punctuation in queries as syntax (a `.` is a column separator, a `-`
+ # starts an exclusion clause, `:` is column-qualified term). The default
+ # unicode61 tokenizer handles punctuation INSIDE indexed text fine, but
+ # in the QUERY the parser sees punctuation before tokenization. Strip
+ # everything that isn't a word char / whitespace; the resulting token
+ # list still matches the indexed tokens because the tokenizer would
+ # have split them at the same boundaries on the way in.
+ _FTS_KEEP_RE = re.compile(r"[^\w\s]", flags=re.UNICODE)
+ _FTS_BOOLEAN_RE = re.compile(r"\b(AND|OR|NOT|NEAR)\b", re.IGNORECASE)
+
+
+ class SqliteFTS5Index:
+     @property
+     def version(self) -> str:
+         return f"sqlite-fts5-{sqlite3.sqlite_version}-v1"
+
+     def __init__(self) -> None:
+         self._conn: sqlite3.Connection | None = None
+         self._db_path: Path | None = None
+         self._open_inmem()
+
+     def _open_inmem(self) -> None:
+         # check_same_thread=False: the MCP server runs query handlers via
+         # asyncio.to_thread, which uses a thread pool. Without this flag, a
+         # connection opened on the main thread cannot be used from worker
+         # threads (sqlite3.ProgrammingError). SQLite's library is built in
+         # serialized threading mode by default, so a single connection is
+         # safe across threads as long as we don't have concurrent writes —
+         # which we don't (writes happen at indexer.run() time, queries are
+         # read-only).
+         self._conn = sqlite3.connect(":memory:", check_same_thread=False)
+         self._init_schema()
+
+     def _init_schema(self) -> None:
+         assert self._conn is not None
+         self._conn.executescript(
+             f"""
+             CREATE VIRTUAL TABLE IF NOT EXISTS {_FTS_TABLE} USING fts5(
+                 path, line_start UNINDEXED, line_end UNINDEXED,
+                 content_hash UNINDEXED, snippet,
+                 tokenize='unicode61 remove_diacritics 2'
+             );
+             -- vector storage is intentionally absent — vectors live in NumPyParquetStore.
+             """
+         )
+
+     def add(self, entries: Iterable[IndexEntry]) -> None:
+         assert self._conn is not None
+         rows = []
+         for e in entries:
+             c = e.chunk
+             rows.append((c.path, c.line_start, c.line_end, c.content_hash, c.snippet))
+         if not rows:
+             return
+         self._conn.executemany(
+             f"INSERT INTO {_FTS_TABLE} (path, line_start, line_end, content_hash, snippet) "
+             "VALUES (?, ?, ?, ?, ?)",
+             rows,
+         )
+         self._conn.commit()
+
+     def delete_by_path(self, path: str) -> int:
+         """Remove every row whose path == `path` from the FTS5 table.
+         Returns the rowcount. Used by Sprint 6 incremental reindex."""
+         assert self._conn is not None
+         cur = self._conn.execute(f"DELETE FROM {_FTS_TABLE} WHERE path = ?", (path,))
+         self._conn.commit()
+         return cur.rowcount
+
+     def search(self, query: str, k: int) -> list[tuple[IndexEntry, float]]:
+         assert self._conn is not None
+         sanitised = _sanitise(query)
+         if not sanitised.strip():
+             return []
+         try:
+             cur = self._conn.execute(
+                 f"""
+                 SELECT path, line_start, line_end, content_hash, snippet,
+                        bm25({_FTS_TABLE}) AS score
+                 FROM {_FTS_TABLE}
+                 WHERE {_FTS_TABLE} MATCH ?
+                 ORDER BY score
+                 LIMIT ?;
+                 """,
+                 (sanitised, k),
+             )
+         except sqlite3.OperationalError as exc:
+             log.warning("fts5 query failed (%s) for %r -> returning []", exc, query)
+             return []
+         return [
+             (
+                 IndexEntry(
+                     chunk=Chunk(
+                         path=row[0],
+                         line_start=row[1],
+                         line_end=row[2],
+                         content_hash=row[3],
+                         snippet=row[4],
+                     ),
+                     vector=np.zeros(0, dtype=np.float32),  # Vector unused on this path.
+                 ),
+                 # bm25() returns negative scores; flip sign for "higher is better".
+                 -float(row[5]),
+             )
+             for row in cur.fetchall()
+         ]
+
+     def persist(self, path: Path) -> None:
+         assert self._conn is not None
+         path.mkdir(parents=True, exist_ok=True)
+         target = path / _FILE
+         # Commit any open implicit transaction first — backup() blocks on
+         # uncommitted writes in the source connection.
+         self._conn.commit()
+         # Backup the in-memory DB to disk. sqlite3.Connection's context manager
+         # only commits on exit; it does NOT close. We close explicitly so
+         # Windows releases the file lock (otherwise tmp_path cleanup hangs).
+         # Backup target only used inside this method, no thread-safety concerns.
+         disk = sqlite3.connect(target, check_same_thread=False)
+         try:
+             self._conn.backup(disk)
+         finally:
+             disk.close()
+         self._db_path = target
+
+     def load(self, path: Path) -> None:
+         """Restore the index from `<path>/keyword.sqlite` into a fresh
+         in-memory connection.
+
+         Pre-Sprint-6 versions opened the on-disk file directly — fast,
+         zero RAM, but mutations (Sprint 6's incremental reindex calls
+         delete_by_path / add after load) wrote directly to the active
+         index file, breaking atomicity, AND a subsequent persist(same_dir)
+         deadlocked on SQLite's backup-to-itself constraint. The fix is
+         to load disk→memory: subsequent mutations stay in RAM and a
+         later persist() does the standard memory→fresh-disk backup. RAM
+         cost on the WinServiceScheduler smoke is ~5 MB; trivial.
+         """
+         target = path / _FILE
+         if not target.exists():
+             raise FileNotFoundError(f"keyword index missing at {target}")
+         if self._conn is not None:
+             self._conn.close()
+         # check_same_thread=False — see _open_inmem rationale.
+         self._conn = sqlite3.connect(":memory:", check_same_thread=False)
+         disk = sqlite3.connect(target, check_same_thread=False)
+         try:
+             disk.backup(self._conn)
+         finally:
+             disk.close()
+         self._db_path = target
+
+
+ def _sanitise(query: str) -> str:
+     """Strip FTS5 syntax so user input never reaches the query parser
+     as anything other than bare whitespace-separated tokens.
+
+     Caught by Sprint 8's eval suite: 3/35 queries with periods or
+     hyphens silently returned [] from the sanitiser-as-was — `.`,
+     `-`, `:` are FTS5 query syntax even though they're tokenized
+     away in indexed text by unicode61.
+
+     Steps:
+       1. Drop every non-word, non-whitespace char.
+       2. Drop the boolean operators (AND/OR/NOT/NEAR) so e.g.
+          "tracking changes and merges" doesn't accidentally parse as
+          `tracking changes AND merges`.
+       3. Collapse whitespace.
+
+     The result is space-joined; FTS5 combines bare tokens with
+     implicit AND. We deliberately keep AND semantics: short queries
+     (1-3 tokens) get tight, high-precision matches; long
+     natural-language queries (5+ tokens) effectively return [] from
+     the keyword leg, leaving the vector leg to drive the result.
+     Sprint 8 eval confirmed that ORing tokens makes long-query
+     BM25 too noisy and hurts NDCG@10 by ~0.13.
+     """
+     cleaned = _FTS_KEEP_RE.sub(" ", query)
+     cleaned = _FTS_BOOLEAN_RE.sub(" ", cleaned)
+     return " ".join(cleaned.split())
code_context/adapters/driven/reranker_crossencoder.py
@@ -0,0 +1,61 @@
+ """CrossEncoderReranker — re-scores candidates using a sentence-transformers CrossEncoder.
+
+ Lazy-loads the model on first use; constructing the adapter doesn't
+ trigger torch loading. Empty candidate list short-circuits and never
+ loads the model.
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from typing import Any
+
+ from code_context.domain.models import IndexEntry
+
+ log = logging.getLogger(__name__)
+
+
+ def _load_model(model_name: str) -> Any:  # pragma: no cover - integration-tested
+     from sentence_transformers import CrossEncoder
+
+     log.info("loading cross-encoder model: %s", model_name)
+     return CrossEncoder(model_name)
+
+
+ def _lib_version() -> str:
+     try:
+         from importlib.metadata import PackageNotFoundError, version
+
+         return version("sentence-transformers")
+     except PackageNotFoundError:  # pragma: no cover
+         return "unknown"
+
+
+ class CrossEncoderReranker:
+     def __init__(self, model_name: str = "cross-encoder/ms-marco-MiniLM-L-6-v2") -> None:
+         self.model_name = model_name
+         self._model: Any = None
+
+     @property
+     def version(self) -> str:
+         return "crossencoder-v1"
+
+     @property
+     def model_id(self) -> str:
+         return f"crossencoder:{self.model_name}@v{_lib_version()}"
+
+     def rerank(
+         self,
+         query: str,
+         candidates: list[tuple[IndexEntry, float]],
+         k: int,
+     ) -> list[tuple[IndexEntry, float]]:
+         if not candidates:
+             return []
+         if self._model is None:
+             self._model = _load_model(self.model_name)
+         pairs = [(query, e.chunk.snippet[:2048]) for e, _ in candidates]
+         scores = self._model.predict(pairs)
+         scored = [(c[0], float(s)) for c, s in zip(candidates, scores, strict=True)]
+         scored.sort(key=lambda x: x[1], reverse=True)
+         return scored[:k]
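
Editorial note, not part of the wheel: a sketch of how a caller might exercise the lazy loading described in the module docstring. The candidate data is invented, the Chunk/IndexEntry keyword arguments follow the constructor shapes visible in keyword_index_sqlite.py above, and actually running rerank() assumes sentence-transformers is installed:

import numpy as np

from code_context.adapters.driven.reranker_crossencoder import CrossEncoderReranker
from code_context.domain.models import Chunk, IndexEntry

def _entry(path: str, snippet: str) -> IndexEntry:
    return IndexEntry(
        chunk=Chunk(path=path, line_start=1, line_end=10, content_hash="", snippet=snippet),
        vector=np.zeros(0, dtype=np.float32),
    )

candidates = [
    (_entry("a.py", "parse unified diff hunk headers"), 0.41),
    (_entry("b.py", "load the cross-encoder model lazily"), 0.39),
]

reranker = CrossEncoderReranker()  # constructing does not import torch
top = reranker.rerank("how are diff hunks parsed?", candidates, k=1)
# The first rerank() call with a non-empty list loads the model;
# rerank("anything", [], k=5) would return [] without ever loading it.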