code-context-engine 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_context_engine-0.4.0.dist-info/METADATA +389 -0
- code_context_engine-0.4.0.dist-info/RECORD +63 -0
- code_context_engine-0.4.0.dist-info/WHEEL +5 -0
- code_context_engine-0.4.0.dist-info/entry_points.txt +4 -0
- code_context_engine-0.4.0.dist-info/licenses/LICENSE +21 -0
- code_context_engine-0.4.0.dist-info/top_level.txt +1 -0
- context_engine/__init__.py +3 -0
- context_engine/cli.py +2848 -0
- context_engine/cli_style.py +66 -0
- context_engine/compression/__init__.py +0 -0
- context_engine/compression/compressor.py +144 -0
- context_engine/compression/ollama_client.py +33 -0
- context_engine/compression/output_rules.py +77 -0
- context_engine/compression/prompts.py +9 -0
- context_engine/compression/quality.py +37 -0
- context_engine/config.py +198 -0
- context_engine/dashboard/__init__.py +0 -0
- context_engine/dashboard/_page.py +1548 -0
- context_engine/dashboard/server.py +429 -0
- context_engine/editors.py +265 -0
- context_engine/event_bus.py +24 -0
- context_engine/indexer/__init__.py +0 -0
- context_engine/indexer/chunker.py +147 -0
- context_engine/indexer/embedder.py +154 -0
- context_engine/indexer/embedding_cache.py +168 -0
- context_engine/indexer/git_hooks.py +73 -0
- context_engine/indexer/git_indexer.py +136 -0
- context_engine/indexer/ignorefile.py +96 -0
- context_engine/indexer/manifest.py +78 -0
- context_engine/indexer/pipeline.py +624 -0
- context_engine/indexer/secrets.py +332 -0
- context_engine/indexer/watcher.py +109 -0
- context_engine/integration/__init__.py +0 -0
- context_engine/integration/bootstrap.py +76 -0
- context_engine/integration/git_context.py +132 -0
- context_engine/integration/mcp_server.py +1825 -0
- context_engine/integration/session_capture.py +306 -0
- context_engine/memory/__init__.py +6 -0
- context_engine/memory/compressor.py +344 -0
- context_engine/memory/db.py +922 -0
- context_engine/memory/extractive.py +106 -0
- context_engine/memory/grammar.py +419 -0
- context_engine/memory/hook_installer.py +258 -0
- context_engine/memory/hook_server.py +83 -0
- context_engine/memory/hooks.py +327 -0
- context_engine/memory/migrate.py +268 -0
- context_engine/models.py +96 -0
- context_engine/pricing.py +104 -0
- context_engine/project_commands.py +296 -0
- context_engine/retrieval/__init__.py +0 -0
- context_engine/retrieval/confidence.py +47 -0
- context_engine/retrieval/query_parser.py +105 -0
- context_engine/retrieval/retriever.py +199 -0
- context_engine/serve_http.py +208 -0
- context_engine/services.py +252 -0
- context_engine/storage/__init__.py +0 -0
- context_engine/storage/backend.py +39 -0
- context_engine/storage/fts_store.py +112 -0
- context_engine/storage/graph_store.py +219 -0
- context_engine/storage/local_backend.py +109 -0
- context_engine/storage/remote_backend.py +117 -0
- context_engine/storage/vector_store.py +357 -0
- context_engine/utils.py +72 -0
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
"""Project-specific commands, rules, and preferences loaded at session start.
|
|
2
|
+
|
|
3
|
+
Supports two levels:
|
|
4
|
+
- **Workspace** (optional): parent directory's .cce/commands.yaml — global
|
|
5
|
+
defaults that apply to all projects under it.
|
|
6
|
+
- **Project**: the project's own .cce/commands.yaml — extends or overrides
|
|
7
|
+
the workspace config.
|
|
8
|
+
|
|
9
|
+
Example .cce/commands.yaml:
|
|
10
|
+
rules:
|
|
11
|
+
- NEVER generate down() in migrations — forward-only
|
|
12
|
+
- Use UUID for primary keys
|
|
13
|
+
preferences:
|
|
14
|
+
database: PostgreSQL
|
|
15
|
+
auth: Sanctum
|
|
16
|
+
style: "Clean architecture"
|
|
17
|
+
before_push:
|
|
18
|
+
- composer test
|
|
19
|
+
- phpstan analyse
|
|
20
|
+
before_commit:
|
|
21
|
+
- php-cs-fixer fix --dry-run
|
|
22
|
+
on_start:
|
|
23
|
+
- echo "Deploy freeze until Friday"
|
|
24
|
+
custom:
|
|
25
|
+
deploy: kubectl apply -f k8s/
|
|
26
|
+
"""
|
|
27
|
+
import logging
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
|
|
30
|
+
import yaml
|
|
31
|
+
|
|
32
|
+
log = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
# Directory, relative to a project or workspace root, that holds the commands file.
COMMANDS_DIR = ".cce"
# Name of the YAML file inside COMMANDS_DIR.
COMMANDS_FILE = "commands.yaml"

# Hook names accepted by add_command(); "custom" is rejected there in favour
# of add_custom_command().
VALID_HOOKS = {"before_push", "before_commit", "on_start", "custom"}
# Sections that are lists (merged by appending, deduped)
_LIST_SECTIONS = {"rules", "before_push", "before_commit", "on_start"}
# Sections that are dicts (merged by update)
_DICT_SECTIONS = {"preferences", "custom"}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _commands_path(project_dir: str) -> Path:
    """Return the path of the commands file inside *project_dir*'s CCE directory."""
    cce_dir = Path(project_dir).joinpath(COMMANDS_DIR)
    return cce_dir.joinpath(COMMANDS_FILE)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _load_yaml(path: Path) -> dict:
    """Load a YAML mapping from *path*.

    Returns ``{}`` when the file is missing, unreadable, malformed, or when
    its top-level value is not a mapping. Every problem other than a missing
    file is logged as a warning.
    """
    try:
        # Read bytes and let PyYAML decode them (UTF-8 unless a BOM says
        # otherwise) instead of depending on the platform's locale encoding.
        raw = path.read_bytes()
    except (FileNotFoundError, NotADirectoryError):
        # A missing file is the normal "no config" case: stay silent.
        return {}
    except OSError as exc:
        log.warning("Failed to parse %s: %s", path, exc)
        return {}
    try:
        data = yaml.safe_load(raw) or {}
    except (yaml.YAMLError, UnicodeDecodeError) as exc:
        # Undecodable bytes are reported the same way as malformed YAML so
        # the "never raises" contract holds for badly encoded files too.
        log.warning("Failed to parse %s: %s", path, exc)
        return {}
    if not isinstance(data, dict):
        log.warning("%s is not a valid YAML mapping", path)
        return {}
    return data
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _find_workspace_dir(project_dir: str) -> Path | None:
    """Return the closest ancestor of *project_dir* that has a commands file.

    The project directory itself is never considered. The upward walk ends
    before the filesystem root and before the parent of the user's home
    directory, so the home directory is the highest location checked on that
    path. Returns ``None`` when no ancestor qualifies.
    """
    start = Path(project_dir).resolve().parent
    stop_at = Path.home().parent
    for directory in (start, *start.parents):
        # Same stop conditions as walking ``.parent`` by hand: the root is
        # its own parent, and anything at or above home's parent is off-limits.
        if directory == directory.parent or directory == stop_at:
            break
        if (directory / COMMANDS_DIR / COMMANDS_FILE).exists():
            return directory
    return None
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _merge_configs(workspace: dict, project: dict) -> dict:
    """Merge a workspace config with a project config; the project wins.

    * List sections (``_LIST_SECTIONS``): workspace items first, then project
      items, de-duplicated by their ``str()`` form. Non-list values count as
      empty.
    * Dict sections (``_DICT_SECTIONS``): shallow merge where project keys
      override workspace keys. Non-dict values count as empty.
    * Any other section: the project value, unless it is ``None``, in which
      case the workspace value is used.

    Empty list and dict sections are left out of the result. Keys appear in a
    deterministic order: workspace keys first, then project-only keys.
    """
    merged: dict = {}
    # dict.fromkeys keeps first-seen order. A set union would make the order
    # of the merged mapping depend on string hashing.
    for key in dict.fromkeys([*workspace, *project]):
        ws_val = workspace.get(key)
        pj_val = project.get(key)
        if key in _LIST_SECTIONS:
            ws_list = ws_val if isinstance(ws_val, list) else []
            pj_list = pj_val if isinstance(pj_val, list) else []
            # Keyed by str(item) so unhashable items (e.g. nested mappings)
            # can still be de-duplicated; the first occurrence is kept.
            unique: dict[str, object] = {}
            for item in ws_list + pj_list:
                unique.setdefault(str(item), item)
            if unique:
                merged[key] = list(unique.values())
        elif key in _DICT_SECTIONS:
            ws_dict = ws_val if isinstance(ws_val, dict) else {}
            pj_dict = pj_val if isinstance(pj_val, dict) else {}
            combined = {**ws_dict, **pj_dict}
            if combined:
                merged[key] = combined
        else:
            # Unknown section: project wins, fallback to workspace
            merged[key] = pj_val if pj_val is not None else ws_val
    return merged
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def load_commands(project_dir: str) -> dict:
    """Return the effective config for *project_dir*.

    When an ancestor workspace config is found it is merged underneath the
    project's own config; otherwise the project config is returned as is.
    """
    project_cfg = _load_yaml(_commands_path(project_dir))
    workspace_root = _find_workspace_dir(project_dir)
    if workspace_root is not None:
        workspace_cfg = _load_yaml(workspace_root / COMMANDS_DIR / COMMANDS_FILE)
        return _merge_configs(workspace_cfg, project_cfg)
    return project_cfg
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def load_project_only(project_dir: str) -> dict:
    """Return just the project's own config, ignoring any workspace config."""
    project_file = _commands_path(project_dir)
    return _load_yaml(project_file)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def save_commands(project_dir: str, commands: dict) -> None:
    """Write *commands* to the project's commands file, creating its directory if needed."""
    target = _commands_path(project_dir)
    target.parent.mkdir(parents=True, exist_ok=True)
    # Block style and original key order keep the file readable for hand edits.
    serialized = yaml.dump(commands, default_flow_style=False, sort_keys=False)
    target.write_text(serialized)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def add_command(project_dir: str, hook: str, command: str) -> None:
    """Append *command* to a hook's list in the project config.

    The config file is created if it does not exist. Adding a command that is
    already present is a no-op and does not rewrite the file.

    Raises:
        ValueError: if *hook* is not in ``VALID_HOOKS``, if it is ``"custom"``
            (use ``add_custom_command()``), or if the existing section for
            *hook* is neither empty nor a list.
    """
    if hook not in VALID_HOOKS:
        raise ValueError(f"Invalid hook '{hook}'. Valid hooks: {', '.join(sorted(VALID_HOOKS))}")
    if hook == "custom":
        raise ValueError("Use add_custom_command() for custom commands")
    commands = load_project_only(project_dir)
    hook_list = commands.get(hook)
    if hook_list is None:
        # Covers a missing key and a key left empty in the YAML (loaded as
        # None); both mean "no commands yet", not a malformed section.
        hook_list = commands[hook] = []
    elif not isinstance(hook_list, list):
        raise ValueError(f"Hook '{hook}' is not a list in commands.yaml")
    if command in hook_list:
        return
    hook_list.append(command)
    save_commands(project_dir, commands)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def add_rule(project_dir: str, rule: str) -> None:
    """Append *rule* to the project's ``rules`` list.

    The config file is created if it does not exist. Adding a rule that is
    already present is a no-op and does not rewrite the file.

    Raises:
        ValueError: if the existing ``rules`` section is neither empty nor a list.
    """
    commands = load_project_only(project_dir)
    rules = commands.get("rules")
    if rules is None:
        # Covers a missing key and an empty ``rules:`` entry (loaded as None).
        rules = commands["rules"] = []
    elif not isinstance(rules, list):
        raise ValueError("'rules' section must be a list in commands.yaml")
    if rule in rules:
        return
    rules.append(rule)
    save_commands(project_dir, commands)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def set_preference(project_dir: str, key: str, value: str) -> None:
    """Set preference *key* to *value* in the project config and save it.

    An existing value for *key* is overwritten.

    Raises:
        ValueError: if the existing ``preferences`` section is neither empty
            nor a mapping.
    """
    commands = load_project_only(project_dir)
    prefs = commands.get("preferences")
    if prefs is None:
        # Covers a missing key and an empty ``preferences:`` entry (loaded as None).
        prefs = commands["preferences"] = {}
    elif not isinstance(prefs, dict):
        raise ValueError("'preferences' section must be a mapping in commands.yaml")
    prefs[key] = value
    save_commands(project_dir, commands)
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def add_custom_command(project_dir: str, name: str, command: str) -> None:
    """Register *command* under *name* in the ``custom`` section and save.

    An existing command with the same *name* is overwritten.

    Raises:
        ValueError: if the existing ``custom`` section is neither empty nor a
            mapping.
    """
    commands = load_project_only(project_dir)
    custom = commands.get("custom")
    if custom is None:
        # Covers a missing key and an empty ``custom:`` entry (loaded as None).
        custom = commands["custom"] = {}
    elif not isinstance(custom, dict):
        raise ValueError("'custom' section must be a mapping in commands.yaml")
    custom[name] = command
    save_commands(project_dir, commands)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def remove_command(project_dir: str, hook: str, command: str) -> bool:
    """Remove a command from a hook in the project config.

    For ``hook == "custom"``, *command* is the custom command's name (its key
    in the mapping). For any other hook it is the command string stored in
    that hook's list. A section that becomes empty is dropped from the file.

    Returns:
        True if something was removed and the file was rewritten. False if the
        hook is absent, has an unexpected type, or does not contain *command*.
    """
    commands = load_project_only(project_dir)
    section = commands.get(hook)
    if hook == "custom":
        # Type-check the mapping like the list branch does; a null or
        # malformed ``custom`` section must yield False rather than raise.
        if not isinstance(section, dict) or command not in section:
            return False
        del section[command]
    else:
        if not isinstance(section, list) or command not in section:
            return False
        section.remove(command)
    if not section:
        del commands[hook]
    save_commands(project_dir, commands)
    return True
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def remove_rule(project_dir: str, rule: str) -> bool:
    """Delete *rule* from the project's rules; report whether anything changed.

    The ``rules`` section is dropped when its last entry is removed. The file
    is only rewritten when a rule was actually removed.
    """
    commands = load_project_only(project_dir)
    existing = commands.get("rules", [])
    if isinstance(existing, list) and rule in existing:
        existing.remove(rule)
        if len(existing) == 0:
            del commands["rules"]
        save_commands(project_dir, commands)
        return True
    return False
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def remove_preference(project_dir: str, key: str) -> bool:
    """Delete preference *key* from the project config; report whether anything changed.

    The ``preferences`` section is dropped when its last entry is removed. The
    file is only rewritten when a preference was actually removed.
    """
    commands = load_project_only(project_dir)
    existing = commands.get("preferences", {})
    if isinstance(existing, dict) and key in existing:
        del existing[key]
        if len(existing) == 0:
            del commands["preferences"]
        save_commands(project_dir, commands)
        return True
    return False
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
# (pattern, explanatory comment) pairs that ensure_gitignore() appends to a
# project's .gitignore when the pattern is not already there.
_GITIGNORE_ENTRIES = [
    # CCE local cache and per-machine files
    (".cce/", "CCE local cache (per-machine, not for version control)"),
    (".claude/settings.local.json", "Claude Code local settings written by cce init"),
]
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def ensure_gitignore(project_dir: str) -> None:
    """Append missing CCE entries to *project_dir*'s ``.gitignore``.

    The file is created when absent and left untouched when every entry in
    ``_GITIGNORE_ENTRIES`` is already present.
    """
    gitignore = Path(project_dir) / ".gitignore"
    content = gitignore.read_text() if gitignore.exists() else ""

    # Substring test: any existing text containing the pattern (for example
    # ``.cce/*``) counts as "already present", so such rules are not overridden.
    additions = [
        f"# {comment}\n{entry}"
        for entry, comment in _GITIGNORE_ENTRIES
        if entry not in content
    ]
    if not additions:
        return

    existing = content.rstrip()
    # Put a blank line between existing rules and the new block, but do not
    # start a new or empty .gitignore with blank lines.
    separator = "\n\n" if existing else ""
    block = "# CCE (code-context-engine)\n" + "\n".join(additions) + "\n"
    gitignore.write_text(existing + separator + block)
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def format_for_prompt(commands: dict, label: str = "Project") -> str:
    """Render a commands config as markdown for the init prompt.

    Up to three sections are emitted, each under a ``### {label} ...``
    heading: Rules, Preferences and Commands. A section is skipped when its
    data is missing, empty or of the wrong type. Returns ``""`` when there is
    nothing to render.
    """
    if not commands:
        return ""
    out: list[str] = []

    rule_items = commands.get("rules", [])
    if rule_items and isinstance(rule_items, list):
        out.append(f"### {label} Rules")
        out.extend(f"- {rule}" for rule in rule_items)

    preferences = commands.get("preferences", {})
    if preferences and isinstance(preferences, dict):
        out.append(f"### {label} Preferences")
        out.extend(f"- **{name}:** {value}" for name, value in preferences.items())

    # Hook commands and custom commands share a single "Commands" section.
    hook_titles = (
        ("before_push", "Before push"),
        ("before_commit", "Before commit"),
        ("on_start", "On session start"),
    )
    command_section: list[str] = []
    for hook, title in hook_titles:
        entries = commands.get(hook, [])
        if entries and isinstance(entries, list):
            joined = ", ".join(f"`{entry}`" for entry in entries)
            command_section.append(f"- **{title}:** {joined}")

    custom = commands.get("custom", {})
    if custom and isinstance(custom, dict):
        command_section.append("- **Custom commands:**")
        command_section.extend(
            f" - `{name}`: `{cmd}`" for name, cmd in custom.items()
        )

    if command_section:
        out.append(f"### {label} Commands")
        out.extend(command_section)

    return "\n".join(out)
|
|
File without changes
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Confidence scoring for retrieved chunks.
|
|
2
|
+
|
|
3
|
+
The score is a weighted sum of three factors, each normalised to [0, 1]:
|
|
4
|
+
|
|
5
|
+
- vector similarity: 1 - (cosine distance from query embedding).
|
|
6
|
+
- keyword / file-hint match: a lightweight query-parser bonus when the chunk's
|
|
7
|
+
file path or content hits the parsed query intent. Replaces what used to be
|
|
8
|
+
labelled "graph hops" before the graph store was removed.
|
|
9
|
+
- recency: exponential decay based on the chunk's `modified_ts` metadata.
|
|
10
|
+
|
|
11
|
+
The weights live here as module constants so they're easy to find and tune.
|
|
12
|
+
"""
|
|
13
|
+
import time
|
|
14
|
+
from context_engine.models import Chunk
|
|
15
|
+
|
|
16
|
+
# Weights of the three factors combined by ConfidenceScorer.score().
_VECTOR_WEIGHT = 0.5
_KEYWORD_WEIGHT = 0.3
_RECENCY_WEIGHT = 0.2
# A keyword distance at or above this value yields a keyword score of 0.
_MAX_KEYWORD_DISTANCE = 5
# Age in seconds after which the recency score has halved.
_RECENCY_HALF_LIFE = 7 * 24 * 3600  # 1 week
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class ConfidenceScorer:
    """Blend vector, keyword and recency signals into a single score in [0, 1]."""

    def score(
        self,
        chunk: Chunk,
        vector_distance: float,
        keyword_distance: int,
    ) -> float:
        """Return the weighted confidence for *chunk*, clamped to [0, 1].

        Args:
            chunk: Chunk being scored; only its ``metadata`` is read here.
            vector_distance: Distance from the query embedding; the vector
                factor is ``1 - vector_distance``, floored at 0.
            keyword_distance: Keyword/file-hint distance; 0 is a direct hit
                and ``_MAX_KEYWORD_DISTANCE`` or more contributes nothing.
        """
        similarity = max(0.0, 1.0 - vector_distance)
        keyword_match = max(0.0, 1.0 - (keyword_distance / _MAX_KEYWORD_DISTANCE))
        freshness = self._recency_score(chunk)
        weighted = (
            _VECTOR_WEIGHT * similarity
            + _KEYWORD_WEIGHT * keyword_match
            + _RECENCY_WEIGHT * freshness
        )
        return min(1.0, max(0.0, weighted))

    def _recency_score(self, chunk: Chunk) -> float:
        """Return an exponential-decay freshness score from ``modified_ts``.

        Chunks without a timestamp get a neutral 0.5. Timestamps at or after
        the current time score 1.0.
        """
        stamp = chunk.metadata.get("modified_ts")
        if stamp is None:
            return 0.5
        elapsed = time.time() - stamp
        return 1.0 if elapsed <= 0 else 0.5 ** (elapsed / _RECENCY_HALF_LIFE)
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Query understanding — intent classification and keyword extraction."""
|
|
2
|
+
import re
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from enum import Enum
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class QueryIntent(Enum):
    """Intent categories that QueryParser assigns to a query."""

    # Matched by _CODE_PATTERNS.
    CODE_LOOKUP = "code_lookup"
    # Matched by _DECISION_PATTERNS.
    DECISION_RECALL = "decision_recall"
    # Matched by _ARCHITECTURE_PATTERNS.
    ARCHITECTURE = "architecture"
    # Fallback when no pattern list matches.
    GENERAL = "general"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Regex fragments searched (via re.search) in the lower-cased query. The
# lists are tried in the order decision -> architecture -> code, and the
# first list with a match decides the intent.
_DECISION_PATTERNS = [
    r"what did we decide",
    r"decision about",
    r"why did we",
    r"last session",
    r"previous discussion",
    r"agreed on",
]
_ARCHITECTURE_PATTERNS = [
    r"how is .+ structured",
    r"architecture",
    r"module.+structure",
    r"component.+design",
    r"how does .+ work",
    r"overview of",
    r"explain the .+ system",
]
_CODE_PATTERNS = [
    r"find .+ function",
    r"show me .+ class",
    r"where is .+ defined",
    r"implementation of",
    r"\.py|\.js|\.ts",
    r"function|class|method|def |import ",
]
# Matches path-like tokens ending in a 1-10 letter extension; used to collect
# file hints from the raw query.
_FILE_PATH_RE = re.compile(r"[a-zA-Z0-9_./-]+\.[a-zA-Z]{1,10}")
# Natural-language stop words we always strip.
_STOP_WORDS = {
    "the", "a", "an", "is", "are", "was", "were", "do", "does", "did",
    "what", "how", "why", "where", "when", "who", "which",
    "in", "on", "at", "to", "for", "of", "with", "about",
    "me", "my", "we", "our", "it", "its", "i", "you",
    "tell", "give",
}
# Code-flavoured words that look like stop words in prose ("show me get
# functions") but are critical naming prefixes in code. Strip them when the
# intent is conversational, keep them when the intent is code lookup so
# `getUser` / `set_config` / `find_by_id` matches survive keyword extraction.
_CODE_PREFIX_WORDS = {"show", "find", "get", "set", "fetch", "save", "validate", "create", "update", "delete"}
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
@dataclass
class ParsedQuery:
    """A query together with its intent, keywords and file hints."""

    # The query string exactly as passed to QueryParser.parse().
    original: str
    # Intent chosen by QueryParser._classify_intent().
    intent: QueryIntent
    # De-duplicated keywords from QueryParser._extract_keywords().
    keywords: list[str] = field(default_factory=list)
    # Substrings of the query matched by _FILE_PATH_RE.
    file_hints: list[str] = field(default_factory=list)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class QueryParser:
    """Classify a query's intent and extract keywords and file hints from it."""

    def parse(self, query: str) -> ParsedQuery:
        """Parse *query* into a ``ParsedQuery``.

        Intent is classified on the lower-cased query. Keywords and file
        hints are taken from the original text so identifier casing survives.
        """
        intent = self._classify_intent(query.lower())
        return ParsedQuery(
            original=query,
            intent=intent,
            keywords=self._extract_keywords(query, intent=intent),
            file_hints=_FILE_PATH_RE.findall(query),
        )

    def _classify_intent(self, query: str) -> QueryIntent:
        """Return the first intent whose pattern list matches *query*.

        *query* is expected to be lower-cased already. Order matters: decision
        recall is tried first, then architecture, then code lookup, and
        ``GENERAL`` is the fallback.
        """
        ordered = (
            (_DECISION_PATTERNS, QueryIntent.DECISION_RECALL),
            (_ARCHITECTURE_PATTERNS, QueryIntent.ARCHITECTURE),
            (_CODE_PATTERNS, QueryIntent.CODE_LOOKUP),
        )
        for patterns, intent in ordered:
            if any(re.search(pattern, query) for pattern in patterns):
                return intent
        return QueryIntent.GENERAL

    def _extract_keywords(
        self, query: str, intent: QueryIntent = QueryIntent.GENERAL
    ) -> list[str]:
        """Return de-duplicated keywords: capitalised identifiers first, then other words.

        Stop words are removed from both groups. Plain words must also be
        longer than two characters.
        """
        # For code-lookup intent, keep prefix words like `get`/`find`/`save`
        # so the user's literal verb survives into FTS keyword scoring.
        stop_words = (
            _STOP_WORDS if intent == QueryIntent.CODE_LOOKUP
            else _STOP_WORDS | _CODE_PREFIX_WORDS
        )
        # The identifier regex also matches ordinary capitalised words, so a
        # sentence-initial "What" or "How" must pass the stop-word filter too;
        # otherwise it becomes a keyword that matches almost any content.
        identifiers = [
            ident
            for ident in re.findall(r"[A-Z][a-zA-Z0-9]+", query)
            if ident.lower() not in stop_words
        ]
        words = re.findall(r"[a-zA-Z_][a-zA-Z0-9_]*", query)
        meaningful = [
            w for w in words if w.lower() not in stop_words and len(w) > 2
        ]
        # dict.fromkeys removes duplicates and keeps first-seen order.
        return list(dict.fromkeys(identifiers + meaningful))
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
"""Hybrid retrieval — vector search + FTS BM25 + RRF merging + confidence scoring."""
|
|
2
|
+
import logging
|
|
3
|
+
|
|
4
|
+
from context_engine.models import Chunk
|
|
5
|
+
from context_engine.storage.backend import StorageBackend
|
|
6
|
+
from context_engine.indexer.embedder import Embedder
|
|
7
|
+
from context_engine.retrieval.confidence import ConfidenceScorer
|
|
8
|
+
from context_engine.retrieval.query_parser import QueryIntent, QueryParser
|
|
9
|
+
|
|
10
|
+
log = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
# Substrings that mark a file path as lower priority; matched case-insensitively
# anywhere in the path by HybridRetriever._apply_path_penalty().
_DEPRIORITISED_PATHS = {"tests/", "test_", "docs/", "spec", "plan"}
# Rank offset in the reciprocal-rank-fusion term 1 / (_RRF_K + rank).
_RRF_K = 60
# Confidence weight in the final blend. The remainder goes to RRF, normalised to
# [0,1] by the best score in the candidate set so an exact-match FTS rank-1 hit
# scores the same as a vector rank-1 hit instead of being clamped to ~1.0.
_CONFIDENCE_WEIGHT = 0.5
# When the parsed query looks like a code lookup, give FTS more pull because
# exact-identifier hits are usually what the user wants.
_FTS_BOOST_CODE_LOOKUP = 1.5
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class HybridRetriever:
    """Retrieve chunks by fusing vector-search and full-text-search rankings.

    Candidates from both searches are combined with weighted reciprocal-rank
    fusion (RRF) and blended with a ``ConfidenceScorer`` score. The result may
    be extended with chunks from related files and, when a token budget is
    given, is packed into that budget.
    """

    def __init__(self, backend: StorageBackend, embedder: Embedder) -> None:
        self._backend = backend
        self._embedder = embedder
        self._scorer = ConfidenceScorer()
        self._parser = QueryParser()
        # Ensures the "FTS unavailable" warning is logged once per instance.
        self._fts_warned = False

    async def retrieve(
        self,
        query: str,
        top_k: int = 10,
        confidence_threshold: float = 0.0,
        max_tokens: int | None = None,
    ) -> list[Chunk]:
        """Return the best-matching chunks for *query*.

        Args:
            query: Free-text query.
            top_k: Number of ranked chunks kept before graph expansion, which
                may append a few extra chunks.
            confidence_threshold: Minimum score a chunk needs to be returned.
            max_tokens: Optional token budget. When set, chunks are packed
                greedily in rank order and chunks that do not fit are skipped.

        Returns:
            Chunks ordered best-first, with graph-expansion bonus chunks
            appended last. Each returned chunk has ``confidence_score`` set.
        """
        parsed = self._parser.parse(query)
        query_embedding = self._embedder.embed_query(query)

        # Over-fetch from both searches so fusion has enough candidates; the
        # same candidate count is used for vector search and FTS.
        candidate_k = max(top_k * 3, 1)

        # embed_query returns tuple for LRU cache hashability; vector_store
        # now handles the conversion internally via _to_list().
        vector_results = await self._backend.vector_search(
            query_embedding=query_embedding,
            top_k=candidate_k,
        )

        # FTS search with graceful fallback
        fts_ids: dict[str, int] = {}
        try:
            fts_results = await self._backend.fts_search(query, top_k=candidate_k)
            fts_ids = {id_: rank for rank, (id_, _) in enumerate(fts_results)}
        except Exception as exc:
            if not self._fts_warned:
                log.warning("FTS search unavailable; falling back to vector-only")
                self._fts_warned = True
            # Keep the cause available for troubleshooting without repeating
            # the warning on every query.
            log.debug("FTS search failed: %s", exc)

        # Build vector rankings and chunk map
        vector_ranks: dict[str, int] = {}
        chunk_map: dict[str, Chunk] = {}
        seen_keys: set[str] = set()

        for rank, chunk in enumerate(vector_results):
            dedup_key = self._location_key(chunk)
            if dedup_key in seen_keys:
                continue
            seen_keys.add(dedup_key)
            vector_ranks[chunk.id] = rank
            chunk_map[chunk.id] = chunk

        # Hydrate FTS-only results
        fts_only_ids = [id_ for id_ in fts_ids if id_ not in chunk_map]
        if fts_only_ids:
            try:
                hydrated = await self._backend.get_chunks_by_ids(fts_only_ids)
                for chunk in hydrated:
                    # Apply the same location dedup as for vector hits, so the
                    # same span is not returned twice and graph expansion
                    # cannot re-add a chunk that is already a candidate.
                    dedup_key = self._location_key(chunk)
                    if dedup_key in seen_keys:
                        continue
                    seen_keys.add(dedup_key)
                    chunk_map[chunk.id] = chunk
            except Exception as exc:
                log.warning("Failed to hydrate FTS-only chunks: %s", exc)

        # Compute RRF scores. Boost FTS contribution when the parsed intent
        # is CODE_LOOKUP — exact identifier matches are almost always what the
        # user wants and would otherwise be drowned by semantic-similarity hits.
        fts_weight = (
            _FTS_BOOST_CODE_LOOKUP if parsed.intent == QueryIntent.CODE_LOOKUP else 1.0
        )
        all_ids = set(vector_ranks.keys()) | set(fts_ids.keys())
        rrf_scores: dict[str, float] = {}
        for id_ in all_ids:
            score = 0.0
            if id_ in vector_ranks:
                score += 1.0 / (_RRF_K + vector_ranks[id_])
            if id_ in fts_ids:
                score += fts_weight * (1.0 / (_RRF_K + fts_ids[id_]))
            rrf_scores[id_] = score

        # Normalise RRF to [0, 1] by the best score in this candidate set.
        # The previous `min(rrf * _RRF_K, 1.0)` saturated nearly every result to
        # ~1.0, so confidence_score dominated the blend and FTS rank carried
        # almost no signal past the top few. Rank-normalising restores gradient.
        max_rrf = max(rrf_scores.values()) if rrf_scores else 0.0

        # Score with confidence scorer
        scored: list[tuple[Chunk, float]] = []
        for id_, rrf_score in rrf_scores.items():
            chunk = chunk_map.get(id_)
            if chunk is None:
                # Not hydrated, or dropped as a duplicate location.
                continue

            # NOTE(review): chunks without "_distance" metadata default to
            # distance 0.0, i.e. the maximum vector score. Confirm this is
            # intended for FTS-only hits.
            distance = chunk.metadata.get("_distance", 0.0)
            # Presumably maps a cosine distance in [0, 2] onto [0, 1] — TODO confirm.
            normalised_distance = min(max(distance / 2.0, 0.0), 1.0)
            keyword_distance = self._estimate_keyword_distance(chunk, parsed)
            conf_score = self._scorer.score(
                chunk,
                vector_distance=normalised_distance,
                keyword_distance=keyword_distance,
            )

            normalised_rrf = (rrf_score / max_rrf) if max_rrf > 0 else 0.0
            final_score = (
                _CONFIDENCE_WEIGHT * conf_score
                + (1.0 - _CONFIDENCE_WEIGHT) * normalised_rrf
            )
            final_score = self._apply_path_penalty(chunk.file_path, final_score)
            chunk.confidence_score = final_score

            if final_score >= confidence_threshold:
                scored.append((chunk, final_score))

        scored.sort(key=lambda x: x[1], reverse=True)
        ranked = [chunk for chunk, _ in scored[:top_k]]

        # Graph expansion: fetch 1-2 bonus chunks from files reachable via
        # CALLS/IMPORTS edges from the top results.
        if ranked and hasattr(self._backend, "get_related_file_paths"):
            try:
                top_files = list({c.file_path for c in ranked[:3]})
                related_files = await self._backend.get_related_file_paths(top_files)
                qe_list = (
                    list(query_embedding)
                    if not isinstance(query_embedding, list)
                    else query_embedding
                )
                for rel_fp in related_files[:2]:  # max 2 bonus files
                    bonus = await self._backend.vector_search(
                        query_embedding=qe_list,
                        top_k=2,
                        filters={"file_path": rel_fp},
                    )
                    for b in bonus:
                        dedup_key = self._location_key(b)
                        if dedup_key not in seen_keys:
                            seen_keys.add(dedup_key)
                            # NOTE(review): unlike the main path, this distance
                            # is not halved before use — confirm intended.
                            dist = b.metadata.get("_distance", 1.0)
                            b.confidence_score = max(0.0, 1.0 - dist) * 0.85
                            if b.confidence_score >= confidence_threshold:
                                ranked.append(b)
            except Exception as exc:
                # Expansion is best-effort; the ranked results stay usable.
                log.debug("Graph expansion skipped: %s", exc)

        if max_tokens is None:
            return ranked

        # Greedy packing in rank order. A chunk that does not fit in full is
        # still kept when its compressed form, estimated at one token per
        # 3.3 characters, fits the remaining budget.
        packed: list[Chunk] = []
        budget = max_tokens
        for chunk in ranked:
            tokens = chunk.token_count
            if tokens <= budget:
                packed.append(chunk)
                budget -= tokens
            elif chunk.compressed_content:
                compressed_tokens = max(1, int(len(chunk.compressed_content) / 3.3))
                if compressed_tokens <= budget:
                    packed.append(chunk)
                    budget -= compressed_tokens
        return packed

    @staticmethod
    def _location_key(chunk) -> str:
        """Return the ``file:start-end`` key used to detect duplicate spans."""
        return f"{chunk.file_path}:{chunk.start_line}-{chunk.end_line}"

    @staticmethod
    def _apply_path_penalty(file_path: str, score: float) -> float:
        """Scale *score* by 0.8 for de-prioritised paths.

        Paths starting with ``git:`` are never penalised. Otherwise the
        penalty applies when any ``_DEPRIORITISED_PATHS`` marker occurs
        anywhere in the lower-cased path.
        """
        if file_path.startswith("git:"):
            return score
        fp_lower = file_path.lower()
        if any(marker in fp_lower for marker in _DEPRIORITISED_PATHS):
            return score * 0.8
        return score

    def _estimate_keyword_distance(self, chunk, parsed) -> int:
        """Return 0 when the chunk matches a file hint or keyword, else 2.

        File hints are matched case-sensitively against the chunk's path.
        Keywords are matched case-insensitively against its content.
        """
        if any(hint in chunk.file_path for hint in parsed.file_hints):
            return 0
        if parsed.keywords:
            # Lower-case the content once instead of once per keyword.
            content = chunk.content.lower()
            if any(keyword.lower() in content for keyword in parsed.keywords):
                return 0
        return 2
|