codebase-index 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_index/__init__.py +7 -0
- codebase_index/__main__.py +3 -0
- codebase_index/cli.py +916 -0
- codebase_index/config.py +110 -0
- codebase_index/discovery/__init__.py +10 -0
- codebase_index/discovery/classify.py +151 -0
- codebase_index/discovery/ignore.py +58 -0
- codebase_index/discovery/walker.py +75 -0
- codebase_index/doctor.py +138 -0
- codebase_index/embeddings/__init__.py +2 -0
- codebase_index/embeddings/backend.py +67 -0
- codebase_index/embeddings/external.py +56 -0
- codebase_index/embeddings/local.py +41 -0
- codebase_index/embeddings/noop.py +15 -0
- codebase_index/graph/__init__.py +8 -0
- codebase_index/graph/analysis.py +468 -0
- codebase_index/graph/builder.py +160 -0
- codebase_index/graph/expand.py +136 -0
- codebase_index/graph/export.py +381 -0
- codebase_index/graph/navigate.py +201 -0
- codebase_index/indexer/__init__.py +8 -0
- codebase_index/indexer/doc_chunks.py +202 -0
- codebase_index/indexer/freshness.py +109 -0
- codebase_index/indexer/pipeline.py +423 -0
- codebase_index/mcp/__init__.py +2 -0
- codebase_index/mcp/server.py +354 -0
- codebase_index/models.py +145 -0
- codebase_index/output/__init__.py +6 -0
- codebase_index/output/json.py +13 -0
- codebase_index/output/markdown.py +316 -0
- codebase_index/output/redact.py +31 -0
- codebase_index/parsers/__init__.py +9 -0
- codebase_index/parsers/base.py +47 -0
- codebase_index/parsers/languages.py +290 -0
- codebase_index/parsers/line_chunker.py +39 -0
- codebase_index/parsers/symbol_chunks.py +62 -0
- codebase_index/parsers/treesitter.py +439 -0
- codebase_index/retrieval/__init__.py +9 -0
- codebase_index/retrieval/budget.py +82 -0
- codebase_index/retrieval/fusion.py +62 -0
- codebase_index/retrieval/intent.py +56 -0
- codebase_index/retrieval/pipeline.py +207 -0
- codebase_index/retrieval/rerank.py +69 -0
- codebase_index/retrieval/searchers.py +291 -0
- codebase_index/retrieval/skeleton.py +251 -0
- codebase_index/retrieval/types.py +79 -0
- codebase_index/scaffold.py +399 -0
- codebase_index/service.py +158 -0
- codebase_index/skill_template/SKILL.md +198 -0
- codebase_index/skill_template/examples/hooks/settings.json +16 -0
- codebase_index/skill_template/scripts/cbx +25 -0
- codebase_index/skill_template/scripts/cbx.ps1 +25 -0
- codebase_index/skill_update.py +150 -0
- codebase_index/storage/__init__.py +8 -0
- codebase_index/storage/db.py +116 -0
- codebase_index/storage/repo.py +701 -0
- codebase_index/storage/schema.sql +125 -0
- codebase_index/watch/__init__.py +5 -0
- codebase_index/watch/watcher.py +93 -0
- codebase_index-1.6.0.dist-info/METADATA +748 -0
- codebase_index-1.6.0.dist-info/RECORD +64 -0
- codebase_index-1.6.0.dist-info/WHEEL +4 -0
- codebase_index-1.6.0.dist-info/entry_points.txt +4 -0
- codebase_index-1.6.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
"""Cheap rule-first intent classifier (regex/keyword heuristics).
|
|
2
|
+
|
|
3
|
+
Each intent maps to retriever weights over {"path","symbol","fts","vector"}, a default
|
|
4
|
+
token budget, and a graph strategy (consumed later by M5).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
|
|
11
|
+
from .types import Intent, IntentPlan
|
|
12
|
+
|
|
13
|
+
# Ordered (pattern, intent) rules. detect_intent() runs every pattern against
# the query with re.search; when several match, the earliest entry in this list
# supplies the primary intent, so the order here is significant.
_RULES: list[tuple[re.Pattern[str], Intent]] = [
    # Unlike the rules below, this pattern has no \b anchors, so it also
    # matches inside longer words.
    (re.compile(r"traceback|stack ?trace|error:|exception|why does .* fail", re.I), Intent.DEBUG_ERROR),
    (re.compile(r"\b(who calls|find references|references to|callers of)\b", re.I), Intent.FIND_REFS),
    (re.compile(r"\b(what breaks|what depends on|impact of|affected if)\b", re.I), Intent.IMPACT),
    (re.compile(r"\b(data ?flow|where does .* get set|trace .* flow)\b", re.I), Intent.DATA_FLOW),
    (re.compile(r"\b(architecture|high-?level|overview|structure of)\b", re.I), Intent.ARCHITECTURE),
    (re.compile(r"\b(how does|how do|explain how|how .* works?)\b", re.I), Intent.HOW_IT_WORKS),
    (re.compile(r"\b(where is|find the|locate|implementation of|defined)\b", re.I), Intent.LOCATE_IMPL),
]
|
|
22
|
+
|
|
23
|
+
# Stock plan per intent. The positional IntentPlan arguments are presumably
# (intent, weights, token_budget) — TODO confirm against IntentPlan in .types.
# Intent.KEYWORD is the plan detect_intent() returns when no rule matches.
_PLANS: dict[Intent, IntentPlan] = {
    Intent.LOCATE_IMPL: IntentPlan(Intent.LOCATE_IMPL, {"symbol": 1.0, "path": 0.7, "fts": 0.4, "vector": 0.2}, 1500),
    Intent.HOW_IT_WORKS: IntentPlan(Intent.HOW_IT_WORKS, {"fts": 1.0, "symbol": 0.7, "path": 0.3, "vector": 0.8}, 2200, graph_strategy="down"),
    Intent.IMPACT: IntentPlan(Intent.IMPACT, {"symbol": 1.0, "path": 0.6, "fts": 0.3, "vector": 0.3}, 1800, graph_strategy="up"),
    Intent.FIND_REFS: IntentPlan(Intent.FIND_REFS, {"symbol": 1.0, "fts": 0.3, "path": 0.2, "vector": 0.2}, 1500, graph_strategy="refs"),
    Intent.DATA_FLOW: IntentPlan(Intent.DATA_FLOW, {"symbol": 0.9, "fts": 0.8, "path": 0.3, "vector": 0.6}, 2000, graph_strategy="both"),
    Intent.DEBUG_ERROR: IntentPlan(Intent.DEBUG_ERROR, {"fts": 1.0, "symbol": 0.6, "path": 0.3, "vector": 0.4}, 1800),
    Intent.ARCHITECTURE: IntentPlan(Intent.ARCHITECTURE, {"fts": 0.6, "symbol": 0.4, "path": 0.5, "vector": 0.5}, 2500, summaries_first=True),
    Intent.KEYWORD: IntentPlan(Intent.KEYWORD, {"fts": 1.0, "symbol": 0.6, "path": 0.5, "vector": 0.7}, 1500),
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def detect_intent(query: str) -> IntentPlan:
    """Classify `query` and return the retrieval plan to use for it.

    Every rule in `_RULES` is tried against the query. No match returns the
    KEYWORD plan, exactly one match returns that intent's stock plan, and
    several matches are combined into a new plan.
    """
    hits: list[IntentPlan] = []
    for pattern, intent in _RULES:
        if pattern.search(query):
            hits.append(_PLANS[intent])

    if len(hits) == 1:
        return hits[0]
    if not hits:
        return _PLANS[Intent.KEYWORD]

    # Several intents fired. The first match (in rule order) names the intent
    # and graph strategy; weights and token budget take the maximum over all
    # matches, and summaries_first is set if any match asks for it.
    lead = hits[0]
    combined: dict[str, float] = {}
    for plan in hits:
        for retriever, weight in plan.weights.items():
            combined[retriever] = max(combined.get(retriever, 0.0), weight)

    largest_budget = max(plan.token_budget for plan in hits)
    wants_summaries = any(plan.summaries_first for plan in hits)
    return IntentPlan(
        intent=lead.intent,
        weights=combined,
        token_budget=largest_budget,
        graph_strategy=lead.graph_strategy,
        summaries_first=wants_summaries,
    )
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""Orchestrate the hybrid retrieval pipeline (RETRIEVAL.md §1–§7).
|
|
2
|
+
|
|
3
|
+
query -> intent -> retrievers -> RRF fuse -> rerank -> budget -> payload.
|
|
4
|
+
Graph expansion (§5) is deferred to M5; vector retrieval (§2 vector) runs only when an enabled backend is passed.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
import sqlite3
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
from ..config import Config
|
|
15
|
+
from ..indexer.freshness import compute_freshness
|
|
16
|
+
from . import searchers
|
|
17
|
+
from .budget import apply_budget
|
|
18
|
+
from .fusion import fuse
|
|
19
|
+
from .intent import detect_intent
|
|
20
|
+
from .rerank import rerank
|
|
21
|
+
from .types import Confidence
|
|
22
|
+
|
|
23
|
+
# Query tokeniser: runs of ASCII letters, digits and underscores.
_TERM_RE = re.compile(r"[A-Za-z0-9_]+")
# Value passed as `k` to fuse() by search().
_RRF_K = 60
# Max results kept per file before extras are pushed to the tail. Bucketed fusion
# already collapses co-located hits; this caps the long tail of one big file
# dominating the page so distinct files get surfaced.
_MAX_PER_FILE = 3
# Query words (singular and plural) mapped to the symbol kind they name;
# consulted by _requested_symbol_kind() after lower-casing each query term.
_KIND_ALIASES = {
    "method": "method",
    "methods": "method",
    "function": "function",
    "functions": "function",
    "class": "class",
    "classes": "class",
    "interface": "interface",
    "interfaces": "interface",
    "enum": "enum",
    "enums": "enum",
    "type": "type",
    "types": "type",
}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _requested_symbol_kind(query: str) -> str | None:
    """Return the single symbol kind named in `query`, if there is exactly one.

    Query terms are lower-cased and looked up in `_KIND_ALIASES`. When the
    query names no kind, or names more than one distinct kind, None is
    returned.
    """
    found: set[str] = set()
    for token in _TERM_RE.findall(query):
        kind = _KIND_ALIASES.get(token.lower())
        if kind is not None:
            found.add(kind)

    if len(found) != 1:
        return None
    (only,) = found
    return only
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _run_retrievers(conn, query, *, mode, limit, weights, backend=None):
    """Run the retrievers selected by `mode` and return `(lists, weights)`.

    `lists` maps a retriever name ("fts", "symbol", "path", "vector") to its
    candidate list. "hybrid" runs fts, symbol and path, plus vector when
    `backend` is present and reports itself enabled; any other mode runs only
    the retriever of that name. For non-hybrid modes the returned weights are
    replaced by `{mode: 1.0}`; for hybrid the caller's `weights` pass through.
    """
    hybrid = mode == "hybrid"
    results = {}
    kind_filter = _requested_symbol_kind(query)

    if hybrid or mode == "fts":
        results["fts"] = searchers.fts_candidates(conn, query, limit=limit)
    if hybrid or mode == "symbol":
        results["symbol"] = searchers.symbol_candidates(conn, query, limit=limit, kind=kind_filter)
    if hybrid:
        results["path"] = searchers.path_candidates(conn, query, limit=limit)
    if (hybrid or mode == "vector") and backend is not None and getattr(backend, "enabled", False):
        results["vector"] = searchers.vector_candidates(conn, query, backend, limit=limit)

    effective_weights = weights if hybrid else {mode: 1.0}
    return results, effective_weights
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _confidence(ranked) -> Confidence:
    """Rate how trustworthy the top-ranked result is.

    An empty list or a non-positive top score is LOW; a single result is
    MEDIUM. Otherwise the rating depends on the top candidate's `exact_symbol`
    flag, its `agreeing_sources` count (default 1 when the attribute is
    missing), the relative score gap to the runner-up, and the result count.
    """
    if not ranked or ranked[0].score <= 0:
        return Confidence.LOW
    if len(ranked) == 1:
        return Confidence.MEDIUM

    best, runner_up = ranked[0], ranked[1]
    # Relative rather than absolute gap, so the thresholds do not depend on the
    # magnitude of the fused scores.
    rel_gap = (best.score - runner_up.score) / best.score
    agree = getattr(best, "agreeing_sources", 1)

    # An exact symbol match is always HIGH.
    if getattr(best, "exact_symbol", False):
        return Confidence.HIGH

    # The more retrievers agree on the winner, the smaller the gap required;
    # a lone source needs a dominant lead.
    clear_winner = (
        (agree >= 3 and rel_gap > 0.15)
        or (agree >= 2 and rel_gap > 0.25)
        or (agree == 1 and rel_gap > 0.5)
    )
    if clear_winner:
        return Confidence.HIGH

    if agree >= 2 or rel_gap > 0.1 or len(ranked) >= 5:
        return Confidence.MEDIUM
    return Confidence.LOW
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _diversify(ranked: list, *, per_file: int) -> list:
|
|
102
|
+
"""Stable reorder: keep the first `per_file` hits of each file in place, push
|
|
103
|
+
the rest to the tail (preserving their relative order). Nothing is dropped, so
|
|
104
|
+
recall is intact; the page just isn't monopolised by one file's many regions."""
|
|
105
|
+
kept: list = []
|
|
106
|
+
overflow: list = []
|
|
107
|
+
counts: dict[str, int] = {}
|
|
108
|
+
for c in ranked:
|
|
109
|
+
counts[c.path] = counts.get(c.path, 0) + 1
|
|
110
|
+
(kept if counts[c.path] <= per_file else overflow).append(c)
|
|
111
|
+
return kept + overflow
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _fallback_suggestions(query, ranked) -> dict:
    """Build ripgrep commands a caller can try when the index answer is weak.

    Returns `{}` when the query has no word tokens. Otherwise returns
    `{"ripgrep": [...]}` with one command for the longest token and, when the
    query has more than one token, a second command chaining the first three
    tokens with `.*`. `ranked` is accepted but not used.
    """
    words = _TERM_RE.findall(query)
    if not words:
        return {}

    commands = [f'rg -n "{max(words, key=len)}"']
    if len(words) > 1:
        chained = ".*".join(words[:3])
        commands.append(f'rg -n "{chained}"')
    return {"ripgrep": commands}
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def search(
    conn: sqlite3.Connection,
    query: str,
    *,
    mode: str,
    limit: int,
    token_budget: int,
    no_fallback: bool,
    backend=None,
    root: Optional[Path] = None,
    config: Optional[Config] = None,
    offset: int = 0,
    compact: bool = True,
    compact_min_reduction: float = 0.25,
) -> dict:
    """Run the retrieval pipeline for `query` and return the response payload.

    Steps: detect intent, run the retrievers selected by `mode`, fuse, rerank,
    diversify per file, rate confidence, apply the token budget, then slice
    out the requested page.

    Args:
        conn: Open connection to the index database.
        query: The user's query text.
        mode: "hybrid", or the name of a single retriever to run.
        limit: Page size.
        token_budget: Budget handed to `apply_budget`; a value <= 0 selects
            the detected intent's default budget.
        no_fallback: When true, never emit ripgrep fallback suggestions.
        backend: Optional embedding backend for the vector retriever.
        root: Repository root; used with `config` to compute freshness.
        config: Project config; used with `root` to compute freshness.
        offset: Number of leading results to skip. Negative values are
            treated as 0.
        compact: Forwarded to `make_compactor` as `enabled`.
        compact_min_reduction: Forwarded to `make_compactor` as `min_reduction`.

    Returns:
        A dict with keys "query", "intent", "mode", "index", "confidence",
        "results", "recommended_reads" and "fallback_suggestions", plus
        "pagination" when `offset > 0` or more results are available.
    """
    plan = detect_intent(query)
    if token_budget <= 0:
        token_budget = plan.token_budget

    # A negative offset would shrink the fetch window and turn the page slice
    # below into a from-the-end slice; treat it as the first page.
    offset = max(offset, 0)
    fetch_limit = limit + offset
    lists, weights = _run_retrievers(
        conn, query, mode=mode, limit=fetch_limit, weights=plan.weights, backend=backend
    )
    fused = fuse(lists, weights=weights, k=_RRF_K)
    ranked = _diversify(rerank(fused, query=query, intent=plan.intent), per_file=_MAX_PER_FILE)

    # Record whether candidates exist past this page *before* truncating.
    # After the slice, at most `fetch_limit` items remain, so comparing the
    # budgeted result count against `offset + limit` alone could not report
    # a further page.
    truncated = len(ranked) > fetch_limit
    ranked = ranked[:fetch_limit]
    confidence = _confidence(ranked)

    # Scale budget proportionally so later pages receive snippet coverage.
    scaled_budget = token_budget * fetch_limit // max(limit, 1) if offset > 0 else token_budget
    from .skeleton import make_compactor

    compactor = make_compactor(
        intent=plan.intent, query=query,
        enabled=compact, min_reduction=compact_min_reduction,
    )
    all_results, all_recommended = apply_budget(
        ranked, token_budget=scaled_budget, compactor=compactor
    )

    # Paginate: slice results and filter recommended_reads to the current page.
    paginated = all_results[offset:offset + limit]
    paginated_keys = {(r["path"], r["line_start"], r["line_end"]) for r in paginated}
    recommended = [
        r for r in all_recommended
        if (r["path"], r["line_start"], r["line_end"]) in paginated_keys
    ]
    has_more = truncated or len(all_results) > offset + limit

    fallback = {}
    if not no_fallback and confidence == Confidence.LOW:
        fallback = _fallback_suggestions(query, ranked)

    if config is not None and root is not None:
        freshness = compute_freshness(conn, root, config)
    else:
        # Without root/config, report only what is stored in the index
        # metadata, with stale=False and zero changed files.
        from ..models import IndexFreshness
        from ..storage import repo
        built_at = repo.get_meta(conn, "built_at")
        freshness = IndexFreshness(
            exists=built_at is not None,
            stale=False,
            files_changed_since_build=0,
            built_at=built_at,
            head_commit=repo.get_meta(conn, "head_commit"),
        )

    payload: dict = {
        "query": query,
        "intent": plan.intent.value,
        "mode": mode,
        "index": freshness.model_dump(),
        "confidence": confidence.value,
        "results": paginated,
        "recommended_reads": recommended,
        "fallback_suggestions": fallback,
    }
    # The pagination block is only attached when the caller is paging or
    # there is something further to fetch.
    if offset > 0 or has_more:
        payload["pagination"] = {
            "offset": offset,
            "limit": limit,
            "has_more": has_more,
            "next_offset": offset + limit if has_more else None,
        }
    return payload
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Explainable feature reranker layered on the fused order (RETRIEVAL.md §4).
|
|
2
|
+
|
|
3
|
+
Adds a bounded bonus/penalty to the fused RRF score and produces a human-readable
|
|
4
|
+
`reason` per candidate. No external model. Graph centrality uses the denormalized
|
|
5
|
+
symbols.in_degree/out_degree; cross-node graph expansion is M5.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import math
|
|
11
|
+
import re
|
|
12
|
+
|
|
13
|
+
from ..discovery.classify import is_test_path
|
|
14
|
+
from .types import Candidate, Intent
|
|
15
|
+
|
|
16
|
+
# Query tokeniser: runs of ASCII letters, digits and underscores.
_TERM_RE = re.compile(r"[A-Za-z0-9_]+")

# Graph-centrality bonus. Logarithmic (not linear) so a "god class" with hundreds
# of callers cannot dominate a genuinely relevant low-degree match on a stray-term
# tie. log1p compresses the tail — in_degree 4 → 10 → 100 yields a gently rising,
# capped bonus instead of saturating the cap by in_degree 10 — and the lower cap
# keeps centrality a tiebreak rather than an override. This dampens the god-class
# over-ranking documented in tests/benchmark_honest_RESULTS.md.
# rerank() multiplies log1p(degree) by _DEGREE_SCALE and bounds the result
# with _DEGREE_CAP.
_DEGREE_SCALE = 0.03
_DEGREE_CAP = 0.08
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def rerank(candidates: list[Candidate], *, query: str, intent: Intent) -> list[Candidate]:
    """Add feature bonuses/penalties to each candidate's score and re-sort.

    Each candidate is mutated in place: `score` receives the summed bonus and
    `reason` becomes the " · "-joined list of reasons, or the candidate's
    `source` when no reason applied. The input list is then sorted in place
    by descending score and returned.

    NOTE(review): the nesting of the consecutive `if` checks in the loop was
    reconstructed from a view without indentation; they are written here as
    independent checks — confirm against the packaged file.
    """
    terms = {t.lower() for t in _TERM_RE.findall(query)}
    for c in candidates:
        bonus = 0.0
        reasons: list[str] = []

        # Symbol-retriever hits of a definition-like kind get a small flat boost.
        if c.source == "symbol" and c.kind in {"function", "method", "class", "interface", "type"}:
            bonus += 0.05
        if c.exact_symbol:
            bonus += 0.20
            reasons.append("exact symbol match")
        # Symbol name equal (case-insensitively) to one of the query terms.
        if c.symbol and c.symbol.lower() in terms:
            bonus += 0.05

        # Any query term appearing as a substring of the lower-cased path.
        if any(t in c.path.lower() for t in terms):
            bonus += 0.05
            reasons.append(f"in {c.path.rsplit('/', 1)[0] or '.'}/")

        if c.in_degree:
            bonus += min(_DEGREE_CAP, math.log1p(c.in_degree) * _DEGREE_SCALE)
            reasons.append(f"{c.in_degree} callers")
        elif c.ref_count:
            # Precise in_degree is only computed for globally-unique symbol names
            # (ambiguous names never resolve), so common names like `run`/`handle`
            # always score 0. Fall back to a damped name-reference count — half the
            # scale and cap — so centrality still breaks ties without overriding the
            # precise signal where it exists.
            bonus += min(_DEGREE_CAP / 2, math.log1p(c.ref_count) * (_DEGREE_SCALE / 2))
            reasons.append(f"~{c.ref_count} refs by name")
        # Architecture queries additionally reward total connectivity
        # (in + out degree) at half scale; no reason string is recorded.
        if intent is Intent.ARCHITECTURE and (c.in_degree + c.out_degree):
            bonus += min(_DEGREE_CAP, math.log1p(c.in_degree + c.out_degree) * (_DEGREE_SCALE / 2))

        # Generated files are always demoted; test files are demoted unless
        # the query itself contains the term "test" or "tests".
        wants_tests = "test" in terms or "tests" in terms
        if c.is_generated or (is_test_path(c.path) and not wants_tests):
            bonus -= 0.15
            reasons.append("generated/test demoted")

        c.score += bonus
        c.reason = " · ".join(reasons) if reasons else c.source

    candidates.sort(key=lambda c: c.score, reverse=True)
    return candidates
|
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
"""Retrievers (fts, symbol, path, vector), each emitting a uniform list[Candidate].
|
|
2
|
+
|
|
3
|
+
Vector retrieval (RETRIEVAL.md §2) needs an enabled embedding backend; without one the
|
|
4
|
+
pipeline degrades to path+symbol+fts.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import re
|
|
10
|
+
import sqlite3
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
from ..config import Config
|
|
15
|
+
from ..indexer.freshness import compute_freshness
|
|
16
|
+
from ..models import (
|
|
17
|
+
GraphCoverage,
|
|
18
|
+
IndexFreshness,
|
|
19
|
+
RefSite,
|
|
20
|
+
RefsResponse,
|
|
21
|
+
SymbolDef,
|
|
22
|
+
SymbolResponse,
|
|
23
|
+
)
|
|
24
|
+
from ..storage import repo
|
|
25
|
+
from .types import Candidate as M4Candidate
|
|
26
|
+
|
|
27
|
+
# Tokeniser for queries: runs of ASCII letters, digits and underscores.
_WORD_RE = re.compile(r"[A-Za-z0-9_]+")
# camelCase piece matcher used by _subtokens(): either an uppercase run that is
# not followed by a lowercase letter, or an optional capital followed by
# lowercase letters/digits.
_CAMEL_RE = re.compile(r"[A-Z]+(?![a-z])|[A-Z]?[a-z0-9]+")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def fts_candidates(conn: sqlite3.Connection, query: str, *, limit: int) -> list[M4Candidate]:
    """Full-text retriever: turn `query` into a MATCH expression and search.

    Returns an empty list when `build_match_query` yields an empty expression.
    Each row from `repo.fts_search` becomes a candidate with source "fts"
    whose score is the negated `bm25` column.
    """
    expression = build_match_query(query)
    if not expression:
        return []
    return [
        M4Candidate(
            path=hit["path"],
            line_start=hit["line_start"],
            line_end=hit["line_end"],
            source="fts",
            score=-float(hit["bm25"]),
            content=hit["content"],
            token_est=int(hit["token_est"]),
        )
        for hit in repo.fts_search(conn, expression, limit=limit)
    ]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# Natural-language filler that is never a useful symbol query term. Kept deliberately small:
# anything that could plausibly be an identifier (get/set/run/...) is NOT a stopword.
# Consulted (on lower-cased terms) by _salient_terms() and build_match_query().
_SYMBOL_STOPWORDS = {
    "the", "a", "an", "is", "are", "was", "were", "be", "been", "being", "how", "does",
    "do", "did", "what", "where", "which", "who", "whom", "when", "why", "to", "of", "in",
    "on", "for", "and", "or", "with", "from", "it", "this", "that", "these", "those",
    "into", "during", "if", "via", "across", "between", "about", "their", "its",
}
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _salient_terms(query: str) -> list[str]:
    """Return the lower-cased query terms worth matching against symbol names.

    Terms shorter than three characters and stopwords are dropped; duplicates
    are removed while keeping first-occurrence order.
    """
    ordered: dict[str, None] = {}
    for word in _WORD_RE.findall(query):
        lowered = word.lower()
        if len(lowered) >= 3 and lowered not in _SYMBOL_STOPWORDS:
            ordered.setdefault(lowered, None)
    return list(ordered)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _name_subtokens(name: str) -> set[str]:
    """Return the lower-cased set of camelCase/snake_case pieces of `name`.

    For example ReligionManager -> {religion, manager} and
    refresh_access_token -> {refresh, access, token}.
    """
    lowered: set[str] = set()
    for piece in _subtokens(name):
        lowered.add(piece.lower())
    return lowered
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def symbol_candidates(
    conn: sqlite3.Connection, query: str, *, limit: int, kind: str | None = None
) -> list[M4Candidate]:
    """Symbol retriever that scores by how many query terms a symbol's name covers.

    The old behaviour searched only the single longest term, so "religion manager" matched
    the bare `Religion` class (exact) and never reached `ReligionManager`. Now every salient
    term is searched and candidates are ranked by camelCase/underscore-split *coverage* of the
    query, so the multi-word concept lands on the multi-word symbol.

    Args:
        conn: Open connection to the index database.
        query: Free-text query; reduced to salient terms via `_salient_terms`.
        limit: Per-term search limit and cap on the number of returned candidates.
        kind: Optional symbol-kind filter forwarded to `repo.symbol_search`.

    Returns:
        Up to `limit` candidates with source "symbol", best first. Returns an
        empty list when the query has no salient terms.
    """
    terms = _salient_terms(query)
    if not terms:
        return []

    term_set = set(terms)
    joined = "".join(terms)
    # Search every term and de-duplicate rows by (path, line_start, name),
    # keeping the first row seen for each key.
    rows_by_key: dict[tuple, sqlite3.Row] = {}
    for term in terms:
        for row in repo.symbol_search(conn, term, limit=limit, kind=kind):
            key = (row["path"], row["line_start"], row["name"])
            rows_by_key.setdefault(key, row)

    scored: list[tuple] = []
    for row in rows_by_key.values():
        # Normalise a missing name once, so subtoken splitting and the
        # lower-cased comparison below treat it the same way.
        name = row["name"] or ""
        subs = _name_subtokens(name)
        name_l = name.lower()
        covered = sum(1 for t in terms if t in subs or t in name_l)
        tightness = len(subs & term_set) / len(subs) if subs else 0.0
        # Exact-match precedence is for *precise* lookups only. With one salient term it's a
        # real identifier query; with many it must match the whole camelCase-joined name
        # (e.g. "religion manager" -> ReligionManager). A single shared term ("token" hitting
        # a generated `Token` type) must NOT count as exact.
        exact = (len(terms) == 1 and bool(row["is_exact"])) or (bool(joined) and name_l == joined)
        # Ranking: most query terms covered, then exact-name match, then a tighter name
        # (fewer junk subtokens), then more-referenced (in_degree), then a shorter name.
        sort_key = (covered, int(exact), tightness, int(row["in_degree"]), -len(name_l))
        score = covered + tightness + (2.0 if exact else 0.0)
        scored.append((sort_key, score, exact, row))

    scored.sort(key=lambda x: x[0], reverse=True)

    out: list[M4Candidate] = []
    for _, score, exact, row in scored[:limit]:
        out.append(
            M4Candidate(
                path=row["path"],
                line_start=row["line_start"],
                line_end=row["line_end"],
                source="symbol",
                score=float(score),
                kind=row["kind"],
                symbol=row["name"],
                content=row["signature"],
                # Token estimate is derived from signature length (len // 4, at least 1).
                token_est=max(1, len(row["signature"] or "") // 4),
                in_degree=int(row["in_degree"]),
                out_degree=int(row["out_degree"]),
                is_generated=bool(row["is_generated"]),
                exact_symbol=exact,
            )
        )

    # Damped centrality fallback: symbols whose name is not globally unique never
    # get a resolved in_degree, so back-fill a name-reference count for the zero ones.
    zero_deg = [c.symbol for c in out if not c.in_degree and c.symbol]
    if zero_deg:
        counts = repo.name_ref_counts(conn, zero_deg)
        for c in out:
            if not c.in_degree and c.symbol:
                c.ref_count = counts.get(c.symbol, 0)
    return out
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def path_candidates(conn: sqlite3.Connection, query: str, *, limit: int) -> list[M4Candidate]:
    """Path retriever: one candidate per row returned by `repo.path_search`.

    Each candidate points at line 1 of the file, has source "path", and is
    scored as the row's `hits` divided by its 1-based position in the result
    order.
    """
    return [
        M4Candidate(
            path=hit["path"],
            line_start=1,
            line_end=1,
            source="path",
            score=float(hit["hits"]) / (1 + position),
            is_generated=bool(hit["is_generated"]),
        )
        for position, hit in enumerate(repo.path_search(conn, query, limit=limit))
    ]
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _subtokens(term: str) -> list[str]:
    """Split `term` on underscores, then into camelCase pieces via `_CAMEL_RE`.

    Pieces keep their original case and order; pieces shorter than two
    characters are discarded.
    """
    return [
        found
        for segment in term.split("_")
        for found in _CAMEL_RE.findall(segment)
        if len(found) >= 2
    ]
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def build_match_query(query: str) -> str:
    """Build the FTS5 MATCH expression for `query`.

    Each word token of the query (as matched by `_WORD_RE`) expands to an OR
    group over the token and its camelCase/snake_case subtokens; groups are
    AND-ed. Natural-language filler ("how does X work") is dropped first:
    otherwise FTS would AND-in stopwords that code chunks never contain,
    collapsing recall to zero on the very intents (HOW_IT_WORKS / DEBUG_ERROR)
    that weight FTS highest. If *every* term is a stopword we fall back to the
    full set rather than emit an empty match.

    Returns:
        The MATCH expression, or an empty string when the query has no token
        of at least two characters.
    """
    groups: list[str] = []
    salient: list[str] = []
    for term in _WORD_RE.findall(query):
        variants = {term, *_subtokens(term)}
        variants = {v for v in variants if len(v) >= 2}
        if not variants:
            continue
        # Sort case-insensitively, breaking ties on the raw string: variants
        # that differ only in case would otherwise come out in set-iteration
        # order, making the generated expression vary between runs.
        ordered = sorted(variants, key=lambda v: (v.lower(), v))
        ored = " OR ".join(f'"{v}"' for v in ordered)
        # FTS5 rejects implicit AND (space) when a group contains parenthesised OR
        # expressions; explicit AND is required between all groups.
        group = f"({ored})" if len(variants) > 1 else ored
        groups.append(group)
        if term.lower() not in _SYMBOL_STOPWORDS:
            salient.append(group)
    return " AND ".join(salient or groups)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _freshness(
    conn: sqlite3.Connection, root: Optional[Path] = None, config: Optional[Config] = None
) -> IndexFreshness:
    """Return index freshness for a response.

    With both `root` and `config` supplied, this delegates to
    `compute_freshness`. Otherwise it reports only what is stored in the index
    metadata ("built_at", "head_commit"), with `stale=False` and zero changed
    files.
    """
    if root is None or config is None:
        built_at = repo.get_meta(conn, "built_at")
        head_commit = repo.get_meta(conn, "head_commit")
        return IndexFreshness(
            exists=built_at is not None,
            stale=False,
            files_changed_since_build=0,
            built_at=built_at,
            head_commit=head_commit,
        )
    return compute_freshness(conn, root, config)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def symbol_lookup(
    conn: sqlite3.Connection, name: str, *, kind: Optional[str], exact: bool
) -> SymbolResponse:
    """Look up symbol definitions by name and wrap them in a SymbolResponse.

    `kind` and `exact` are forwarded to `repo.symbols_by_name`; each returned
    row becomes a `SymbolDef`. Freshness comes from stored index metadata.
    """
    definitions: list[SymbolDef] = []
    for hit in repo.symbols_by_name(conn, name, kind=kind, exact=exact):
        definitions.append(
            SymbolDef(
                name=hit["name"],
                qualified=hit["qualified"],
                kind=hit["kind"],
                path=hit["path"],
                line_start=hit["line_start"],
                line_end=hit["line_end"],
                signature=hit["signature"],
            )
        )
    return SymbolResponse(query=name, index=_freshness(conn), symbols=definitions)
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def refs_lookup(conn: sqlite3.Connection, name: str, *, kind: str) -> RefsResponse:
    """Collect reference sites for the symbol `name` into a RefsResponse.

    Rows from `repo.refs_for_name` become sites of kind "call". When `kind` is
    "all", the symbol's exact-name definitions are added as sites of kind
    "definition". Freshness comes from stored index metadata.

    NOTE(review): the sort below was reconstructed from a view without
    indentation and is written as unconditional — confirm it is not meant to
    sit inside the `kind == "all"` branch.
    """
    defs = repo.symbols_by_name(conn, name, exact=True)
    sites = [
        RefSite(
            path=row["path"],
            line=row["line"],
            kind="call",
            # Rows without a "confidence" column default to "extracted".
            confidence=row["confidence"] if "confidence" in row.keys() else "extracted",
        )
        for row in repo.refs_for_name(conn, name)
    ]
    if kind == "all":
        sites.extend(
            # A definition is the symbol itself — exact by construction.
            RefSite(path=row["path"], line=row["line_start"], kind="definition")
            for row in defs
        )
    sites.sort(key=lambda site: (site.path, site.line, site.kind))
    # Coverage is judged by the symbol's defining language(s); fall back to the
    # call-site files when the symbol has no indexed definition.
    coverage_paths = [row["path"] for row in defs] or [s.path for s in sites]
    return RefsResponse(
        query=name,
        index=_freshness(conn),
        sites=sites,
        coverage=GraphCoverage.for_paths(coverage_paths),
    )
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def vector_candidates(
    conn: sqlite3.Connection, query: str, backend, *, limit: int
) -> list["M4Candidate"]:
    """Semantic retriever: embed the query and search by vector.

    Returns an empty list when `backend` is None, does not report itself as
    enabled, or the stripped query is empty. Otherwise the stripped query is
    embedded with `backend.embed` and passed to `repo.vector_search`; each row
    becomes a candidate with source "vector" whose score is the negated
    `distance`, so that higher is better.
    """
    usable = backend is not None and getattr(backend, "enabled", False)
    if not usable:
        return []

    text = query.strip()
    if not text:
        return []

    embedding = backend.embed([text])[0]
    return [
        M4Candidate(
            path=hit["path"],
            line_start=hit["line_start"],
            line_end=hit["line_end"],
            source="vector",
            score=-float(hit["distance"]),
            content=hit["content"],
            token_est=int(hit["token_est"]),
        )
        for hit in repo.vector_search(conn, embedding, limit=limit)
    ]
|