code-context-engine 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. code_context_engine-0.4.0.dist-info/METADATA +389 -0
  2. code_context_engine-0.4.0.dist-info/RECORD +63 -0
  3. code_context_engine-0.4.0.dist-info/WHEEL +5 -0
  4. code_context_engine-0.4.0.dist-info/entry_points.txt +4 -0
  5. code_context_engine-0.4.0.dist-info/licenses/LICENSE +21 -0
  6. code_context_engine-0.4.0.dist-info/top_level.txt +1 -0
  7. context_engine/__init__.py +3 -0
  8. context_engine/cli.py +2848 -0
  9. context_engine/cli_style.py +66 -0
  10. context_engine/compression/__init__.py +0 -0
  11. context_engine/compression/compressor.py +144 -0
  12. context_engine/compression/ollama_client.py +33 -0
  13. context_engine/compression/output_rules.py +77 -0
  14. context_engine/compression/prompts.py +9 -0
  15. context_engine/compression/quality.py +37 -0
  16. context_engine/config.py +198 -0
  17. context_engine/dashboard/__init__.py +0 -0
  18. context_engine/dashboard/_page.py +1548 -0
  19. context_engine/dashboard/server.py +429 -0
  20. context_engine/editors.py +265 -0
  21. context_engine/event_bus.py +24 -0
  22. context_engine/indexer/__init__.py +0 -0
  23. context_engine/indexer/chunker.py +147 -0
  24. context_engine/indexer/embedder.py +154 -0
  25. context_engine/indexer/embedding_cache.py +168 -0
  26. context_engine/indexer/git_hooks.py +73 -0
  27. context_engine/indexer/git_indexer.py +136 -0
  28. context_engine/indexer/ignorefile.py +96 -0
  29. context_engine/indexer/manifest.py +78 -0
  30. context_engine/indexer/pipeline.py +624 -0
  31. context_engine/indexer/secrets.py +332 -0
  32. context_engine/indexer/watcher.py +109 -0
  33. context_engine/integration/__init__.py +0 -0
  34. context_engine/integration/bootstrap.py +76 -0
  35. context_engine/integration/git_context.py +132 -0
  36. context_engine/integration/mcp_server.py +1825 -0
  37. context_engine/integration/session_capture.py +306 -0
  38. context_engine/memory/__init__.py +6 -0
  39. context_engine/memory/compressor.py +344 -0
  40. context_engine/memory/db.py +922 -0
  41. context_engine/memory/extractive.py +106 -0
  42. context_engine/memory/grammar.py +419 -0
  43. context_engine/memory/hook_installer.py +258 -0
  44. context_engine/memory/hook_server.py +83 -0
  45. context_engine/memory/hooks.py +327 -0
  46. context_engine/memory/migrate.py +268 -0
  47. context_engine/models.py +96 -0
  48. context_engine/pricing.py +104 -0
  49. context_engine/project_commands.py +296 -0
  50. context_engine/retrieval/__init__.py +0 -0
  51. context_engine/retrieval/confidence.py +47 -0
  52. context_engine/retrieval/query_parser.py +105 -0
  53. context_engine/retrieval/retriever.py +199 -0
  54. context_engine/serve_http.py +208 -0
  55. context_engine/services.py +252 -0
  56. context_engine/storage/__init__.py +0 -0
  57. context_engine/storage/backend.py +39 -0
  58. context_engine/storage/fts_store.py +112 -0
  59. context_engine/storage/graph_store.py +219 -0
  60. context_engine/storage/local_backend.py +109 -0
  61. context_engine/storage/remote_backend.py +117 -0
  62. context_engine/storage/vector_store.py +357 -0
  63. context_engine/utils.py +72 -0
@@ -0,0 +1,296 @@
1
"""Project-specific commands, rules, and preferences loaded at session start.

Supports two levels:
- **Workspace** (optional): parent directory's .cce/commands.yaml — global
  defaults that apply to all projects under it.
- **Project**: the project's own .cce/commands.yaml — extends or overrides
  the workspace config.

Example .cce/commands.yaml:

    rules:
      - NEVER generate down() in migrations — forward-only
      - Use UUID for primary keys
    preferences:
      database: PostgreSQL
      auth: Sanctum
      style: "Clean architecture"
    before_push:
      - composer test
      - phpstan analyse
    before_commit:
      - php-cs-fixer fix --dry-run
    on_start:
      - echo "Deploy freeze until Friday"
    custom:
      deploy: kubectl apply -f k8s/
"""
27
+ import logging
28
+ from pathlib import Path
29
+
30
+ import yaml
31
+
32
+ log = logging.getLogger(__name__)
33
+
34
# The config file lives at <dir>/COMMANDS_DIR/COMMANDS_FILE.
COMMANDS_DIR = ".cce"
COMMANDS_FILE = "commands.yaml"

# Hook names recognised by add_command(); "custom" is accepted here but is
# redirected to add_custom_command() because it is a mapping, not a list.
VALID_HOOKS = {
    "before_push",
    "before_commit",
    "on_start",
    "custom",
}

# Sections that are lists (merged by appending, deduped)
_LIST_SECTIONS = {
    "rules",
    "before_push",
    "before_commit",
    "on_start",
}

# Sections that are dicts (merged by update)
_DICT_SECTIONS = {
    "preferences",
    "custom",
}
42
+
43
+
44
def _commands_path(project_dir: str) -> Path:
    """Return ``<project_dir>/COMMANDS_DIR/COMMANDS_FILE`` as a ``Path``.

    The path is only constructed; nothing is checked or created on disk.
    """
    return Path(project_dir).joinpath(COMMANDS_DIR, COMMANDS_FILE)
46
+
47
+
48
def _load_yaml(path: Path) -> dict:
    """Load a YAML mapping from *path*.

    Returns ``{}`` when the file is missing, unreadable, not valid UTF-8,
    not valid YAML, empty, or when its top-level value is not a mapping.
    Every failure except a missing file is logged as a warning.
    """
    if not path.exists():
        return {}
    try:
        # Read as UTF-8 explicitly: config files are hand-edited and may hold
        # non-ASCII text (the documented example uses an em dash), which the
        # platform default encoding can mis-decode or reject.
        data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    except (yaml.YAMLError, OSError, UnicodeDecodeError) as exc:
        # UnicodeDecodeError is a ValueError, not an OSError, so it has to be
        # listed for the "never raises" contract to hold.
        log.warning("Failed to parse %s: %s", path, exc)
        return {}
    if not isinstance(data, dict):
        log.warning("%s is not a valid YAML mapping", path)
        return {}
    return data
61
+
62
+
63
def _find_workspace_dir(project_dir: str) -> Path | None:
    """Return the closest ancestor of *project_dir* that has a commands file.

    The search starts at the parent of the resolved *project_dir* (the project
    itself is never a candidate) and moves upward one directory at a time.
    It gives up, returning ``None``, on reaching either the filesystem root or
    the parent of the user's home directory; neither of those two directories
    is inspected, while the home directory itself is.
    """
    directory = Path(project_dir).resolve().parent
    search_limit = Path.home().parent
    while True:
        reached_root = directory == directory.parent
        if reached_root or directory == search_limit:
            return None
        if (directory / COMMANDS_DIR / COMMANDS_FILE).exists():
            return directory
        directory = directory.parent
74
+
75
+
76
def _merge_configs(workspace: dict, project: dict) -> dict:
    """Merge a workspace config with a project config; the project wins.

    - List sections (``_LIST_SECTIONS``): workspace items followed by project
      items, with later duplicates dropped. Items are compared by ``str()``
      so unhashable YAML values can be deduplicated too.
    - Dict sections (``_DICT_SECTIONS``): shallow merge, project keys override.
    - Any other section: the project value, unless it is ``None``, in which
      case the workspace value is used.

    A list or dict section whose value has the wrong type on one side is
    treated as empty on that side, and a section that ends up empty is
    omitted. Keys appear in first-seen order (workspace keys, then
    project-only keys), so the result is stable across runs.
    """
    merged: dict = {}
    # dict.fromkeys de-duplicates while keeping insertion order; iterating a
    # set here made the output key order depend on string hash randomisation.
    for key in dict.fromkeys([*workspace, *project]):
        ws_val = workspace.get(key)
        pj_val = project.get(key)
        if key in _LIST_SECTIONS:
            ws_list = ws_val if isinstance(ws_val, list) else []
            pj_list = pj_val if isinstance(pj_val, list) else []
            merged_list = []
            seen_strs: set[str] = set()
            for item in ws_list + pj_list:
                item_key = str(item)
                if item_key not in seen_strs:
                    seen_strs.add(item_key)
                    merged_list.append(item)
            if merged_list:
                merged[key] = merged_list
        elif key in _DICT_SECTIONS:
            ws_dict = ws_val if isinstance(ws_val, dict) else {}
            pj_dict = pj_val if isinstance(pj_val, dict) else {}
            combined = {**ws_dict, **pj_dict}
            if combined:
                merged[key] = combined
        else:
            # Unknown section: project wins, fall back to workspace.
            merged[key] = pj_val if pj_val is not None else ws_val
    return merged
107
+
108
+
109
def load_commands(project_dir: str) -> dict:
    """Return the effective config for *project_dir*.

    The project's own file is always loaded. If an ancestor directory also
    carries a commands file, it is loaded as the workspace config and the two
    are merged with the project taking precedence; otherwise the project
    config is returned as loaded.
    """
    project_cfg = _load_yaml(_commands_path(project_dir))
    ws_dir = _find_workspace_dir(project_dir)
    if ws_dir is not None:
        ws_cfg = _load_yaml(ws_dir / COMMANDS_DIR / COMMANDS_FILE)
        return _merge_configs(ws_cfg, project_cfg)
    return project_cfg
117
+
118
+
119
def load_project_only(project_dir: str) -> dict:
    """Return the project's own config, ignoring any workspace config."""
    config_file = _commands_path(project_dir)
    return _load_yaml(config_file)
122
+
123
+
124
def save_commands(project_dir: str, commands: dict) -> None:
    """Write *commands* to the project's commands file as block-style YAML.

    The containing directory is created if needed. Keys are written in the
    mapping's own order (``sort_keys=False``).
    """
    target = _commands_path(project_dir)
    target.parent.mkdir(parents=True, exist_ok=True)
    serialized = yaml.dump(commands, default_flow_style=False, sort_keys=False)
    target.write_text(serialized)
129
+
130
+
131
def add_command(project_dir: str, hook: str, command: str) -> None:
    """Append *command* to the list stored under *hook* in the project config.

    Nothing is written when the command is already listed. The file is
    created on first save if it does not exist.

    Raises:
        ValueError: if *hook* is not in ``VALID_HOOKS``, if it is ``"custom"``
            (use ``add_custom_command``), or if the existing section for
            *hook* is not a list.
    """
    if hook not in VALID_HOOKS:
        raise ValueError(f"Invalid hook '{hook}'. Valid hooks: {', '.join(sorted(VALID_HOOKS))}")
    if hook == "custom":
        raise ValueError("Use add_custom_command() for custom commands")

    config = load_project_only(project_dir)
    entries = config.setdefault(hook, [])
    if not isinstance(entries, list):
        raise ValueError(f"Hook '{hook}' is not a list in commands.yaml")
    if command not in entries:
        entries.append(command)
        save_commands(project_dir, config)
145
+
146
+
147
def add_rule(project_dir: str, rule: str) -> None:
    """Append *rule* to the project's ``rules`` list unless already present.

    The file is created on first save if it does not exist.

    Raises:
        ValueError: if the existing ``rules`` section is not a list.
    """
    config = load_project_only(project_dir)
    existing = config.setdefault("rules", [])
    if not isinstance(existing, list):
        raise ValueError("'rules' section must be a list in commands.yaml")
    if rule not in existing:
        existing.append(rule)
        save_commands(project_dir, config)
157
+
158
+
159
def set_preference(project_dir: str, key: str, value: str) -> None:
    """Store ``key: value`` under ``preferences``, replacing any old value.

    The config is always saved, even when the value is unchanged.

    Raises:
        ValueError: if the existing ``preferences`` section is not a mapping.
    """
    config = load_project_only(project_dir)
    preferences = config.setdefault("preferences", {})
    if isinstance(preferences, dict):
        preferences[key] = value
        save_commands(project_dir, config)
        return
    raise ValueError("'preferences' section must be a mapping in commands.yaml")
167
+
168
+
169
def add_custom_command(project_dir: str, name: str, command: str) -> None:
    """Store *command* under *name* in ``custom``, replacing any old entry.

    The config is always saved, even when the entry is unchanged.

    Raises:
        ValueError: if the existing ``custom`` section is not a mapping.
    """
    config = load_project_only(project_dir)
    custom_cmds = config.setdefault("custom", {})
    if isinstance(custom_cmds, dict):
        custom_cmds[name] = command
        save_commands(project_dir, config)
        return
    raise ValueError("'custom' section must be a mapping in commands.yaml")
177
+
178
+
179
def remove_command(project_dir: str, hook: str, command: str) -> bool:
    """Remove *command* from the *hook* section of the project config.

    For ``hook == "custom"`` the section is a mapping and *command* is the
    name (key) of the custom command to delete. For every other hook the
    section is a list and *command* is the list item to remove.

    A section left empty by the removal is deleted from the config. The file
    is rewritten only when something was removed.

    Returns:
        ``True`` if an entry was removed, ``False`` if the hook is absent,
        the section has an unexpected type, or the entry was not found.
    """
    commands = load_project_only(project_dir)
    if hook not in commands:
        return False

    section = commands[hook]
    if hook == "custom":
        # Guard the type: a malformed `custom:` (null, list, string) used to
        # raise TypeError here instead of reporting "nothing removed".
        if not isinstance(section, dict) or command not in section:
            return False
        del section[command]
    else:
        if not isinstance(section, list) or command not in section:
            return False
        section.remove(command)

    if not section:
        del commands[hook]
    save_commands(project_dir, commands)
    return True
203
+
204
+
205
def remove_rule(project_dir: str, rule: str) -> bool:
    """Delete *rule* from the project's ``rules`` list.

    The ``rules`` key is dropped when its list becomes empty. The file is
    rewritten only when a rule was removed.

    Returns:
        ``True`` if the rule was removed, ``False`` if ``rules`` is missing,
        is not a list, or does not contain *rule*.
    """
    config = load_project_only(project_dir)
    current = config.get("rules", [])
    if isinstance(current, list) and rule in current:
        current.remove(rule)
        if len(current) == 0:
            config.pop("rules")
        save_commands(project_dir, config)
        return True
    return False
216
+
217
+
218
def remove_preference(project_dir: str, key: str) -> bool:
    """Delete *key* from the project's ``preferences`` mapping.

    The ``preferences`` key is dropped when its mapping becomes empty. The
    file is rewritten only when a preference was removed.

    Returns:
        ``True`` if the preference was removed, ``False`` if ``preferences``
        is missing, is not a mapping, or does not contain *key*.
    """
    config = load_project_only(project_dir)
    preferences = config.get("preferences", {})
    if isinstance(preferences, dict) and key in preferences:
        preferences.pop(key)
        if len(preferences) == 0:
            config.pop("preferences")
        save_commands(project_dir, config)
        return True
    return False
229
+
230
+
231
# (pattern, explanatory comment) pairs appended to .gitignore when missing.
_GITIGNORE_ENTRIES = [
    # CCE local cache and per-machine files
    (".cce/", "CCE local cache (per-machine, not for version control)"),
    (".claude/settings.local.json", "Claude Code local settings written by cce init"),
]


def ensure_gitignore(project_dir: str) -> None:
    """Append any missing CCE patterns to ``<project_dir>/.gitignore``.

    An entry counts as present only when some non-comment line equals it,
    ignoring surrounding whitespace and a leading ``/``. Missing entries are
    appended in one block under a ``# CCE (code-context-engine)`` header,
    each preceded by its explanatory comment. The file is created if it does
    not exist and is left untouched when nothing is missing.
    """
    gitignore = Path(project_dir) / ".gitignore"
    content = gitignore.read_text() if gitignore.exists() else ""

    # Match whole pattern lines. A substring test on the full text treated
    # `.cce/cache`, `!.cce/commands.yaml` or a comment mentioning `.cce/` as
    # if the entry were already there, and silently skipped adding it.
    existing_patterns = set()
    for raw_line in content.splitlines():
        pattern = raw_line.strip()
        if pattern and not pattern.startswith("#"):
            existing_patterns.add(pattern.lstrip("/"))

    additions = [
        f"# {comment}\n{entry}"
        for entry, comment in _GITIGNORE_ENTRIES
        if entry not in existing_patterns
    ]
    if not additions:
        return

    block = "\n\n# CCE (code-context-engine)\n" + "\n".join(additions) + "\n"
    gitignore.write_text(content.rstrip() + block)
253
+
254
+
255
def format_for_prompt(commands: dict, label: str = "Project") -> str:
    """Render a commands config as markdown for the init prompt.

    Up to three sections are produced, in this order, each under a
    ``### <label> ...`` heading: Rules, Preferences, Commands. A section is
    skipped when its data is missing, empty, or of the wrong type. The
    Commands section lists the ``before_push``, ``before_commit`` and
    ``on_start`` hooks followed by any ``custom`` commands.

    Returns:
        The markdown text, or ``""`` when there is nothing to show.
    """
    if not commands:
        return ""

    out: list[str] = []

    rule_items = commands.get("rules", [])
    if isinstance(rule_items, list) and rule_items:
        out.append(f"### {label} Rules")
        out.extend(f"- {rule}" for rule in rule_items)

    preferences = commands.get("preferences", {})
    if isinstance(preferences, dict) and preferences:
        out.append(f"### {label} Preferences")
        out.extend(f"- **{name}:** {value}" for name, value in preferences.items())

    # Hook commands are collected first so the heading is only emitted when
    # at least one hook or custom command exists.
    command_section: list[str] = []
    hook_titles = (
        ("before_push", "Before push"),
        ("before_commit", "Before commit"),
        ("on_start", "On session start"),
    )
    for hook, title in hook_titles:
        hook_cmds = commands.get(hook, [])
        if isinstance(hook_cmds, list) and hook_cmds:
            joined = ", ".join(f"`{c}`" for c in hook_cmds)
            command_section.append(f"- **{title}:** {joined}")

    custom_cmds = commands.get("custom", {})
    if isinstance(custom_cmds, dict) and custom_cmds:
        command_section.append("- **Custom commands:**")
        command_section.extend(
            f" - `{name}`: `{cmd}`" for name, cmd in custom_cmds.items()
        )

    if command_section:
        out.append(f"### {label} Commands")
        out.extend(command_section)

    return "\n".join(out)
File without changes
@@ -0,0 +1,47 @@
1
+ """Confidence scoring for retrieved chunks.
2
+
3
+ The score is a weighted sum of three factors, each normalised to [0, 1]:
4
+
5
+ - vector similarity: 1 - (cosine distance from query embedding).
6
+ - keyword / file-hint match: a lightweight query-parser bonus when the chunk's
7
+ file path or content hits the parsed query intent. Replaces what used to be
8
+ labelled "graph hops" before the graph store was removed.
9
+ - recency: exponential decay based on the chunk's `modified_ts` metadata.
10
+
11
+ The weights live here as module constants so they're easy to find and tune.
12
+ """
13
+ import time
14
+ from context_engine.models import Chunk
15
+
16
# Blend weights applied to the three factors in ConfidenceScorer.score().
_VECTOR_WEIGHT = 0.5
_KEYWORD_WEIGHT = 0.3
_RECENCY_WEIGHT = 0.2
# Keyword distance at (or beyond) which the keyword factor drops to zero.
_MAX_KEYWORD_DISTANCE = 5
_RECENCY_HALF_LIFE = 7 * 24 * 3600  # 1 week


class ConfidenceScorer:
    """Combine vector, keyword and recency signals into one score in [0, 1]."""

    def score(
        self,
        chunk: Chunk,
        vector_distance: float,
        keyword_distance: int,
    ) -> float:
        """Return the weighted confidence for *chunk*.

        Args:
            chunk: the candidate; only its ``metadata`` mapping is read.
            vector_distance: distance from the query embedding; converted to
                a similarity as ``1 - distance`` and floored at 0.
            keyword_distance: 0 gives the keyword factor full credit, values
                of ``_MAX_KEYWORD_DISTANCE`` or more give it none.

        Returns:
            The weighted sum of the three factors, clamped to [0, 1].
        """
        similarity = max(0.0, 1.0 - vector_distance)
        keyword_credit = max(0.0, 1.0 - (keyword_distance / _MAX_KEYWORD_DISTANCE))
        freshness = self._recency_score(chunk)

        blended = _VECTOR_WEIGHT * similarity
        blended = blended + _KEYWORD_WEIGHT * keyword_credit
        blended = blended + _RECENCY_WEIGHT * freshness
        return min(1.0, max(0.0, blended))

    def _recency_score(self, chunk: Chunk) -> float:
        """Return a freshness factor from ``chunk.metadata["modified_ts"]``.

        A missing timestamp yields a neutral 0.5, a timestamp at or after the
        current time yields 1.0, and older timestamps decay exponentially,
        halving every ``_RECENCY_HALF_LIFE`` seconds.
        """
        timestamp = chunk.metadata.get("modified_ts")
        if timestamp is None:
            return 0.5
        age = time.time() - timestamp
        return 1.0 if age <= 0 else 0.5 ** (age / _RECENCY_HALF_LIFE)
@@ -0,0 +1,105 @@
1
+ """Query understanding — intent classification and keyword extraction."""
2
+ import re
3
+ from dataclasses import dataclass, field
4
+ from enum import Enum
5
+
6
+
7
class QueryIntent(Enum):
    """Coarse category a query is classified into by QueryParser."""

    CODE_LOOKUP = "code_lookup"
    DECISION_RECALL = "decision_recall"
    ARCHITECTURE = "architecture"
    GENERAL = "general"


# Regex tables used by QueryParser._classify_intent. They are searched against
# the lower-cased query, in the order decision -> architecture -> code.
_DECISION_PATTERNS = [
    r"what did we decide",
    r"decision about",
    r"why did we",
    r"last session",
    r"previous discussion",
    r"agreed on",
]
_ARCHITECTURE_PATTERNS = [
    r"how is .+ structured",
    r"architecture",
    r"module.+structure",
    r"component.+design",
    r"how does .+ work",
    r"overview of",
    r"explain the .+ system",
]
_CODE_PATTERNS = [
    r"find .+ function",
    r"show me .+ class",
    r"where is .+ defined",
    r"implementation of",
    r"\.py|\.js|\.ts",
    r"function|class|method|def |import ",
]
_FILE_PATH_RE = re.compile(r"[a-zA-Z0-9_./-]+\.[a-zA-Z]{1,10}")
# Natural-language stop words we always strip.
_STOP_WORDS = {
    "the", "a", "an", "is", "are", "was", "were", "do", "does", "did",
    "what", "how", "why", "where", "when", "who", "which",
    "in", "on", "at", "to", "for", "of", "with", "about",
    "me", "my", "we", "our", "it", "its", "i", "you",
    "tell", "give",
}
# Code-flavoured words that look like stop words in prose ("show me get
# functions") but are critical naming prefixes in code. Strip them when the
# intent is conversational, keep them when the intent is code lookup so
# `getUser` / `set_config` / `find_by_id` matches survive keyword extraction.
_CODE_PREFIX_WORDS = {"show", "find", "get", "set", "fetch", "save", "validate", "create", "update", "delete"}


@dataclass
class ParsedQuery:
    """Result of QueryParser.parse()."""

    # The query text exactly as passed in.
    original: str
    # Classification produced by QueryParser._classify_intent.
    intent: QueryIntent
    # De-duplicated keywords, capitalised identifiers first.
    keywords: list[str] = field(default_factory=list)
    # Substrings of the query that look like file paths (match _FILE_PATH_RE).
    file_hints: list[str] = field(default_factory=list)


class QueryParser:
    """Classify a free-text query and pull out keywords and file-path hints."""

    def parse(self, query: str) -> ParsedQuery:
        """Return the intent, keywords and file hints found in *query*.

        Intent is decided on the lower-cased text; keywords and file hints
        are extracted from the original text so their casing is preserved.
        """
        intent = self._classify_intent(query.lower())
        return ParsedQuery(
            original=query,
            intent=intent,
            keywords=self._extract_keywords(query, intent=intent),
            file_hints=_FILE_PATH_RE.findall(query),
        )

    def _classify_intent(self, query: str) -> QueryIntent:
        """Return the first intent whose pattern table matches *query*.

        Tables are tried in priority order (decision recall, architecture,
        code lookup); ``GENERAL`` is returned when none match. *query* is
        expected to be lower-cased already, as ``parse`` does.
        """
        rules = (
            (QueryIntent.DECISION_RECALL, _DECISION_PATTERNS),
            (QueryIntent.ARCHITECTURE, _ARCHITECTURE_PATTERNS),
            (QueryIntent.CODE_LOOKUP, _CODE_PATTERNS),
        )
        for intent, patterns in rules:
            if any(re.search(pattern, query) for pattern in patterns):
                return intent
        return QueryIntent.GENERAL

    def _extract_keywords(
        self, query: str, intent: QueryIntent = QueryIntent.GENERAL
    ) -> list[str]:
        """Return the distinct keywords of *query*, in first-seen order.

        Two token streams are combined: capitalised runs (likely class or
        type names, which may be the tail of a camelCase word) followed by
        identifier-like words longer than two characters. Both streams are
        filtered through the stop-word list, compared case-insensitively.
        """
        # For code-lookup intent, keep prefix words like `get`/`find`/`save`
        # so the user's literal verb survives into FTS keyword scoring.
        stop_words = (
            _STOP_WORDS if intent == QueryIntent.CODE_LOOKUP
            else _STOP_WORDS | _CODE_PREFIX_WORDS
        )
        # Capitalised tokens must pass the stop-word filter too; otherwise a
        # sentence-initial "Where" / "What" / "Is" is emitted as a keyword
        # and matches almost any chunk downstream. No length filter here, so
        # short names such as "DB" are kept.
        identifiers = [
            token
            for token in re.findall(r"[A-Z][a-zA-Z0-9]+", query)
            if token.lower() not in stop_words
        ]
        meaningful = [
            word
            for word in re.findall(r"[a-zA-Z_][a-zA-Z0-9_]*", query)
            if word.lower() not in stop_words and len(word) > 2
        ]
        # dict.fromkeys drops later duplicates while keeping order.
        return list(dict.fromkeys(identifiers + meaningful))
@@ -0,0 +1,199 @@
1
+ """Hybrid retrieval — vector search + FTS BM25 + RRF merging + confidence scoring."""
2
+ import logging
3
+
4
+ from context_engine.models import Chunk
5
+ from context_engine.storage.backend import StorageBackend
6
+ from context_engine.indexer.embedder import Embedder
7
+ from context_engine.retrieval.confidence import ConfidenceScorer
8
+ from context_engine.retrieval.query_parser import QueryIntent, QueryParser
9
+
10
+ log = logging.getLogger(__name__)
11
+
12
# Substrings that mark a path as lower priority (see _apply_path_penalty).
_DEPRIORITISED_PATHS = {"tests/", "test_", "docs/", "spec", "plan"}
_RRF_K = 60
# Confidence weight in the final blend. The remainder goes to RRF, normalised to
# [0,1] by the best score in the candidate set so an exact-match FTS rank-1 hit
# scores the same as a vector rank-1 hit instead of being clamped to ~1.0.
_CONFIDENCE_WEIGHT = 0.5
# When the parsed query looks like a code lookup, give FTS more pull because
# exact-identifier hits are usually what the user wants.
_FTS_BOOST_CODE_LOOKUP = 1.5


class HybridRetriever:
    """Retrieve chunks by fusing vector search and full-text search results."""

    def __init__(self, backend: StorageBackend, embedder: Embedder) -> None:
        self._backend = backend
        self._embedder = embedder
        self._scorer = ConfidenceScorer()
        self._parser = QueryParser()
        # Ensures the "FTS unavailable" warning is logged only once.
        self._fts_warned = False

    async def retrieve(
        self,
        query: str,
        top_k: int = 10,
        confidence_threshold: float = 0.0,
        max_tokens: int | None = None,
    ) -> list[Chunk]:
        """Return the best-matching chunks for *query*, highest score first.

        Steps: run vector and FTS searches (three times ``top_k`` candidates
        each), fuse the two rankings with reciprocal rank fusion, blend the
        normalised RRF score with the confidence score, apply the path
        penalty, drop results below *confidence_threshold*, and keep the top
        *top_k*. If the backend offers ``get_related_file_paths``, a few
        bonus chunks from related files are appended after the ranked ones,
        so the result may be longer than *top_k*.

        Each returned chunk has ``confidence_score`` set as a side effect.

        Args:
            query: free-text query.
            top_k: number of ranked results to keep before bonus chunks.
            confidence_threshold: minimum final score a chunk needs.
            max_tokens: if given, results are packed in order into this
                token budget and chunks that do not fit are skipped.
        """
        parsed = self._parser.parse(query)
        # embed_query returns tuple for LRU cache hashability; vector_store
        # now handles the conversion internally via _to_list().
        query_embedding = self._embedder.embed_query(query)

        # Both searches use the same, never-zero candidate count.
        candidate_k = max(top_k * 3, 1)
        vector_results = await self._backend.vector_search(
            query_embedding=query_embedding,
            top_k=candidate_k,
        )
        fts_ranks = await self._fts_ranks(query, candidate_k)

        # Build vector rankings and chunk map, dropping repeated spans.
        vector_ranks: dict[str, int] = {}
        chunk_map: dict[str, Chunk] = {}
        seen_keys: set[str] = set()
        for rank, chunk in enumerate(vector_results):
            span = self._span_key(chunk)
            if span in seen_keys:
                continue
            seen_keys.add(span)
            vector_ranks[chunk.id] = rank
            chunk_map[chunk.id] = chunk

        # Hydrate FTS-only results.
        fts_only_ids = [id_ for id_ in fts_ranks if id_ not in chunk_map]
        if fts_only_ids:
            try:
                hydrated = await self._backend.get_chunks_by_ids(fts_only_ids)
                for chunk in hydrated:
                    chunk_map[chunk.id] = chunk
                    # Record the span so graph expansion cannot append a
                    # second copy of a chunk that FTS already contributed.
                    seen_keys.add(self._span_key(chunk))
            except Exception as exc:
                log.warning("Failed to hydrate FTS-only chunks: %s", exc)

        # Boost the FTS contribution when the parsed intent is CODE_LOOKUP:
        # exact identifier matches would otherwise be drowned by
        # semantic-similarity hits.
        fts_weight = (
            _FTS_BOOST_CODE_LOOKUP if parsed.intent == QueryIntent.CODE_LOOKUP else 1.0
        )
        rrf_scores = self._rrf_scores(vector_ranks, fts_ranks, fts_weight)

        # Normalise RRF to [0, 1] by the best score in this candidate set so
        # rank differences keep their gradient in the blend.
        max_rrf = max(rrf_scores.values()) if rrf_scores else 0.0

        scored: list[tuple[Chunk, float]] = []
        for id_, rrf_score in rrf_scores.items():
            chunk = chunk_map.get(id_)
            if chunk is None:
                # FTS id that could not be hydrated.
                continue

            # NOTE(review): chunks hydrated from FTS only seem to carry no
            # "_distance", so they default to 0.0 (best similarity) here,
            # while graph expansion defaults to 1.0 — confirm intent.
            distance = chunk.metadata.get("_distance", 0.0)
            # presumably distances lie in [0, 2] (cosine) — TODO confirm.
            normalised_distance = min(max(distance / 2.0, 0.0), 1.0)
            conf_score = self._scorer.score(
                chunk,
                vector_distance=normalised_distance,
                keyword_distance=self._estimate_keyword_distance(chunk, parsed),
            )

            normalised_rrf = (rrf_score / max_rrf) if max_rrf > 0 else 0.0
            final_score = (
                _CONFIDENCE_WEIGHT * conf_score
                + (1.0 - _CONFIDENCE_WEIGHT) * normalised_rrf
            )
            final_score = self._apply_path_penalty(chunk.file_path, final_score)
            chunk.confidence_score = final_score

            if final_score >= confidence_threshold:
                scored.append((chunk, final_score))

        # Stable sort: ties keep candidate order (vector rank, then FTS rank).
        scored.sort(key=lambda pair: pair[1], reverse=True)
        ranked = [chunk for chunk, _ in scored[:top_k]]

        if ranked and hasattr(self._backend, "get_related_file_paths"):
            await self._append_graph_bonus(
                ranked, query_embedding, seen_keys, confidence_threshold
            )

        if max_tokens is None:
            return ranked
        return self._pack_to_budget(ranked, max_tokens)

    async def _fts_ranks(self, query: str, top_k: int) -> dict[str, int]:
        """Return ``{chunk_id: rank}`` from FTS, or ``{}`` if FTS fails.

        Failure is deliberate best-effort: retrieval continues with vector
        results only, and the warning is logged once per instance.
        """
        try:
            fts_results = await self._backend.fts_search(query, top_k=top_k)
            return {id_: rank for rank, (id_, _) in enumerate(fts_results)}
        except Exception:
            if not self._fts_warned:
                log.warning("FTS search unavailable; falling back to vector-only")
                self._fts_warned = True
            return {}

    @staticmethod
    def _span_key(chunk) -> str:
        """Return the ``file:start-end`` key used to detect repeated spans."""
        return f"{chunk.file_path}:{chunk.start_line}-{chunk.end_line}"

    @staticmethod
    def _rrf_scores(
        vector_ranks: dict[str, int],
        fts_ranks: dict[str, int],
        fts_weight: float,
    ) -> dict[str, float]:
        """Return reciprocal-rank-fusion scores for every candidate id.

        Each list contributes ``1 / (_RRF_K + rank)``; the FTS term is
        multiplied by *fts_weight*. Ids are emitted in a deterministic
        order: vector candidates by rank, then FTS-only candidates by rank.
        """
        scores: dict[str, float] = {}
        for id_ in dict.fromkeys([*vector_ranks, *fts_ranks]):
            score = 0.0
            if id_ in vector_ranks:
                score += 1.0 / (_RRF_K + vector_ranks[id_])
            if id_ in fts_ranks:
                score += fts_weight * (1.0 / (_RRF_K + fts_ranks[id_]))
            scores[id_] = score
        return scores

    async def _append_graph_bonus(
        self,
        ranked: list[Chunk],
        query_embedding,
        seen_keys: set[str],
        confidence_threshold: float,
    ) -> None:
        """Append bonus chunks from files related to the top results.

        Looks up files related to the files of the first three ranked
        chunks, takes at most two of them, and for each fetches up to two
        vector matches restricted to that file. Chunks whose span has not
        been seen and whose score meets the threshold are appended to
        *ranked* in place. Any error ends the expansion and is logged at
        debug level; chunks appended before the error are kept.
        """
        try:
            # Ordered de-duplication keeps the lookup order reproducible.
            top_files = list(dict.fromkeys(c.file_path for c in ranked[:3]))
            related_files = await self._backend.get_related_file_paths(top_files)
            qe_list = (
                list(query_embedding)
                if not isinstance(query_embedding, list)
                else query_embedding
            )
            for rel_fp in related_files[:2]:  # max 2 bonus files
                bonus = await self._backend.vector_search(
                    query_embedding=qe_list,
                    top_k=2,
                    filters={"file_path": rel_fp},
                )
                for b in bonus:
                    span = self._span_key(b)
                    if span in seen_keys:
                        continue
                    seen_keys.add(span)
                    # NOTE(review): raw distance is used here, whereas the
                    # main path halves it first — confirm this is intended.
                    dist = b.metadata.get("_distance", 1.0)
                    b.confidence_score = max(0.0, 1.0 - dist) * 0.85
                    if b.confidence_score >= confidence_threshold:
                        ranked.append(b)
        except Exception as exc:
            log.debug("Graph expansion skipped: %s", exc)

    @staticmethod
    def _pack_to_budget(ranked: list[Chunk], max_tokens: int) -> list[Chunk]:
        """Return the chunks of *ranked* that fit into *max_tokens*, in order.

        A chunk is charged ``token_count``. If that does not fit but the
        chunk has ``compressed_content``, it is charged an estimate of
        ``len(compressed_content) / 3.3`` tokens (at least 1) instead.
        Chunks that fit neither way are skipped and later ones still tried.
        """
        packed: list[Chunk] = []
        budget = max_tokens
        for chunk in ranked:
            tokens = chunk.token_count
            if tokens <= budget:
                packed.append(chunk)
                budget -= tokens
            elif chunk.compressed_content:
                compressed_tokens = max(1, int(len(chunk.compressed_content) / 3.3))
                if compressed_tokens <= budget:
                    packed.append(chunk)
                    budget -= compressed_tokens
        return packed

    @staticmethod
    def _apply_path_penalty(file_path: str, score: float) -> float:
        """Return *score* scaled by 0.8 for deprioritised paths.

        A path is deprioritised when its lower-cased form contains any
        marker from ``_DEPRIORITISED_PATHS``. Paths starting with ``git:``
        are never penalised.
        """
        if file_path.startswith("git:"):
            return score
        fp_lower = file_path.lower()
        if any(marker in fp_lower for marker in _DEPRIORITISED_PATHS):
            return score * 0.8
        return score

    def _estimate_keyword_distance(self, chunk, parsed) -> int:
        """Return 0 when *chunk* matches the parsed query, else 2.

        A match is either a file hint contained in ``chunk.file_path``
        (case-sensitive) or a keyword contained in ``chunk.content``
        (case-insensitive).
        """
        if any(hint in chunk.file_path for hint in parsed.file_hints):
            return 0
        if parsed.keywords:
            # Lower-case the content once rather than once per keyword.
            content_lower = chunk.content.lower()
            if any(kw.lower() in content_lower for kw in parsed.keywords):
                return 0
        return 2