@pmaddire/gcie 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENT.md +256 -0
- package/AGENT_USAGE.md +231 -0
- package/ARCHITECTURE.md +151 -0
- package/CLAUDE.md +69 -0
- package/DEBUGGING_PLAYBOOK.md +160 -0
- package/KNOWLEDGE_INDEX.md +154 -0
- package/POTENTIAL_UPDATES +130 -0
- package/PROJECT.md +141 -0
- package/README.md +371 -0
- package/REPO_DIGITAL_TWIN.md +98 -0
- package/ROADMAP.md +301 -0
- package/SETUP_ANY_REPO.md +85 -0
- package/bin/gcie-init.js +20 -0
- package/bin/gcie.js +45 -0
- package/cli/__init__.py +1 -0
- package/cli/app.py +163 -0
- package/cli/commands/__init__.py +1 -0
- package/cli/commands/cache.py +35 -0
- package/cli/commands/context.py +2426 -0
- package/cli/commands/context_slices.py +617 -0
- package/cli/commands/debug.py +24 -0
- package/cli/commands/index.py +17 -0
- package/cli/commands/query.py +20 -0
- package/cli/commands/setup.py +73 -0
- package/config/__init__.py +1 -0
- package/config/scanner_config.py +82 -0
- package/context/__init__.py +1 -0
- package/context/architecture_bootstrap.py +170 -0
- package/context/architecture_index.py +185 -0
- package/context/architecture_parser.py +170 -0
- package/context/architecture_slicer.py +308 -0
- package/context/context_router.py +70 -0
- package/context/fallback_evaluator.py +21 -0
- package/coverage_integration/__init__.py +1 -0
- package/coverage_integration/coverage_loader.py +55 -0
- package/debugging/__init__.py +12 -0
- package/debugging/bug_localizer.py +81 -0
- package/debugging/execution_path_analyzer.py +42 -0
- package/embeddings/__init__.py +6 -0
- package/embeddings/encoder.py +45 -0
- package/embeddings/faiss_index.py +72 -0
- package/git_integration/__init__.py +1 -0
- package/git_integration/git_miner.py +78 -0
- package/graphs/__init__.py +17 -0
- package/graphs/call_graph.py +70 -0
- package/graphs/code_graph.py +81 -0
- package/graphs/execution_graph.py +35 -0
- package/graphs/git_graph.py +43 -0
- package/graphs/graph_store.py +25 -0
- package/graphs/node_factory.py +21 -0
- package/graphs/test_graph.py +65 -0
- package/graphs/validators.py +28 -0
- package/graphs/variable_graph.py +51 -0
- package/knowledge_index/__init__.py +1 -0
- package/knowledge_index/index_builder.py +60 -0
- package/knowledge_index/models.py +35 -0
- package/knowledge_index/query_api.py +38 -0
- package/knowledge_index/store.py +23 -0
- package/llm_context/__init__.py +6 -0
- package/llm_context/context_builder.py +67 -0
- package/llm_context/snippet_selector.py +57 -0
- package/package.json +14 -0
- package/parser/__init__.py +18 -0
- package/parser/ast_parser.py +216 -0
- package/parser/call_resolver.py +52 -0
- package/parser/models.py +75 -0
- package/parser/tree_sitter_adapter.py +56 -0
- package/parser/variable_extractor.py +31 -0
- package/retrieval/__init__.py +17 -0
- package/retrieval/cache.py +22 -0
- package/retrieval/hybrid_retriever.py +249 -0
- package/retrieval/query_parser.py +38 -0
- package/retrieval/ranking.py +43 -0
- package/retrieval/semantic_retriever.py +39 -0
- package/retrieval/symbolic_retriever.py +80 -0
- package/scanner/__init__.py +5 -0
- package/scanner/file_filters.py +37 -0
- package/scanner/models.py +44 -0
- package/scanner/repository_scanner.py +55 -0
- package/scripts/bootstrap_from_github.ps1 +41 -0
- package/tracing/__init__.py +1 -0
- package/tracing/runtime_tracer.py +60 -0
|
@@ -0,0 +1,2426 @@
|
|
|
1
|
+
"""CLI command: context."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
import networkx as nx
|
|
11
|
+
|
|
12
|
+
from config.scanner_config import ScannerConfig
|
|
13
|
+
from graphs.call_graph import build_call_graph
|
|
14
|
+
from graphs.code_graph import build_code_structure_graph
|
|
15
|
+
from graphs.variable_graph import build_variable_graph
|
|
16
|
+
from llm_context.context_builder import build_context
|
|
17
|
+
from llm_context.snippet_selector import RankedSnippet, estimate_tokens
|
|
18
|
+
from parser.ast_parser import parse_python_file, parse_python_source
|
|
19
|
+
from retrieval.hybrid_retriever import hybrid_retrieve
|
|
20
|
+
from scanner.repository_scanner import scan_repository
|
|
21
|
+
|
|
22
|
+
# Simple in-process cache for repo-wide context builds
# Keyed by repo signature; value is (graph, file_text, function_snippets, class_snippets).
_REPO_CACHE: dict[str, tuple[nx.DiGraph, dict[str, str], dict[str, str], dict[str, str]]] = {}

# Suffixes treated as frontend assets/code.
_FRONTEND_EXTENSIONS = {
    ".js", ".jsx", ".ts", ".tsx", ".css", ".scss", ".sass", ".less", ".html", ".vue",
}

# Suffixes considered "code" when classifying paths (frontend suffixes folded in;
# duplicates are harmless in a set literal).
_CODE_EXTENSIONS = {
    ".py", ".pyi", ".js", ".jsx", ".ts", ".tsx",
    ".json", ".yaml", ".yml", ".toml", ".ini", ".cfg",
    *sorted(_FRONTEND_EXTENSIONS),
}

# Plain-documentation suffixes.
_DOC_EXTENSIONS = {".md", ".txt", ".rst"}

# Every suffix eligible for context extraction.
_ALL_CONTEXT_EXTENSIONS = _CODE_EXTENSIONS | _DOC_EXTENSIONS

# Globs excluded from scanning (ja-JP / zh-CN doc trees).
_EXCLUDE_GLOBS = (
    "get-shit-done/docs/ja-JP/**",
    "get-shit-done/docs/zh-CN/**",
)

# Query words hinting at frontend/UI concerns.
_FRONTEND_KEYWORDS = {
    "frontend", "ui", "ux", "component", "react", "vue", "svelte", "angular",
    "css", "style", "layout", "toolbar", "canvas", "page", "view", "hook", "hooks",
}

# Low-signal words dropped during query-term extraction.
_STOPWORDS = {
    "how", "does", "when", "what", "why", "where", "which", "the", "this",
    "that", "into", "from", "with", "files", "file", "help", "doesnt",
    "using", "used",
}

# Doc filenames that are operational (agent/process) docs rather than general docs.
_OPERATIONAL_DOC_NAMES = {
    "agent.md", "agent_usage.md", "architecture.md", "project.md", "roadmap.md",
    "debugging_playbook.md", "readme.md", "skill.md", "claude.md", "contextgrabber.md",
}

# Path fragments that mark a doc file as operational.
_OPERATIONAL_PATH_HINTS = (
    ".planning/", ".gcie/", "/skills/",
    "get-shit-done/workflows/", "get-shit-done/commands/", "get-shit-done/templates/",
)

# Query-term expansion table: each extracted term also contributes its aliases.
_QUERY_ALIASES = {
    "routing": ("router", "route"),
    "router": ("routing", "route"),
    "bootstrapped": ("bootstrap", "init", "initialize", "managed"),
    "bootstrapping": ("bootstrap", "init", "initialize", "managed"),
    "bootstrap": ("init", "initialize", "managed", "index", "architecture"),
    "managed": ("index", "architecture"),
    "command": ("cli", "handler", "run", "context"),
    "commands": ("cli", "handler", "run", "context"),
    "context": ("builder", "command", "cli"),
    "pipeline": ("retrieval", "hybrid", "symbolic", "semantic", "ranking"),
    "retrieval": ("pipeline", "hybrid", "symbolic", "semantic", "ranking"),
    "hybrid": ("retrieval", "symbolic", "semantic", "ranking"),
    "builder": ("build", "index", "context"),
    "builders": ("builder", "build", "index", "context"),
    "build": ("builder", "context"),
    "plan": ("planner", "pipeline", "stage"),
    "planner": ("plan", "pipeline", "stage"),
    "convert": ("conversion", "api", "route"),
    "conversion": ("convert", "api", "route"),
    "analyze": ("analysis", "pipeline", "stage"),
    "analysis": ("analyze", "pipeline", "stage"),
    "extract": ("extraction", "pipeline", "stage"),
    "extraction": ("extract", "pipeline", "stage"),
    "stage": ("pipeline", "plan", "build"),
    "stages": ("stage", "pipeline", "plan", "build"),
    "scanning": ("scanner", "scan", "repository"),
    "scanner": ("scanning", "scan", "repository"),
    "tracing": ("trace", "tracer", "execution"),
    "represented": ("representation", "represent", "execution"),
    "generate": ("generation", "agent", "model", "stream"),
    "refine": ("refinement", "patch", "chat", "model"),
    "wiring": ("app", "main", "entry", "route", "router"),
}

# Entry-point files usually too generic to be useful ranking targets.
_GENERIC_ENTRYPOINT_STEMS = {"main", "index", "app"}
_GENERIC_ENTRYPOINT_PATHS = {
    "frontend/src/main.jsx",
    "frontend/index.html",
}
# Directory prefixes / filename tokens that suggest backend code.
_BACKEND_PATH_HINTS = ("backend/", "server/", "api/", "services/", "service/", "workers/", "worker/")
_BACKEND_FILE_HINTS = (
    "client", "service", "worker", "controller", "handler", "router", "route",
    "config", "settings", "pipeline", "plan", "build", "extract", "analyze",
)
# Terms suggesting a multi-stage ("chain"/pipeline) style question.
_CHAIN_TERMS = {
    "stage", "stages", "pipeline", "plan", "planner", "build", "builder",
    "convert", "analyze", "extract", "workflow",
}
# Path tokens too common to define a file "family" on their own.
_COMMON_FAMILY_TOKENS = {
    "src", "tests", "test", "commands", "command", "context", "cli", "core",
    "app", "file", "files", "index", "init", "main",
}

# Terms marking a system/architecture-level query.
_SYSTEM_QUERY_TERMS = {
    "architecture", "bootstrap", "command", "commands", "context", "pipeline",
    "retrieval", "workflow", "builder", "builders", "graph", "routing",
    "router", "generate", "refine", "wiring",
}

# Filename-role tokens for supporting modules (used for role classification).
_SUPPORT_ROLE_TOKENS = {
    "app", "main", "index", "entry", "router", "route", "context", "builder",
    "hook", "hooks", "provider", "service", "client", "store", "state",
    "handler", "controller", "bootstrap", "command", "commands", "retriever",
    "selector", "evaluator", "parser", "scanner", "generate", "refine",
}

# Query terms that may promote an otherwise support-level file in ranking.
_SUPPORT_PROMOTION_TERMS = {
    "routing", "router", "fallback", "bootstrap", "managed", "index",
    "builder", "build", "command", "commands", "context", "orchestration",
    "workflow",
}
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _snippet_from_lines(lines: list[str], max_lines: int) -> str:
|
|
281
|
+
if not lines:
|
|
282
|
+
return ""
|
|
283
|
+
return "\n".join(lines[:max_lines]).strip()
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def _repo_signature(repo_path: Path, manifest_files) -> str:
|
|
287
|
+
parts: list[str] = [repo_path.as_posix()]
|
|
288
|
+
for entry in manifest_files:
|
|
289
|
+
try:
|
|
290
|
+
stat = (repo_path / entry.relative_path).stat()
|
|
291
|
+
except OSError:
|
|
292
|
+
continue
|
|
293
|
+
parts.append(f"{entry.relative_path.as_posix()}:{stat.st_mtime_ns}:{stat.st_size}")
|
|
294
|
+
return "|".join(parts)
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
def _cache_dir(repo_path: Path) -> Path:
|
|
298
|
+
return repo_path / ".gcie" / "cache"
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def _cache_path(repo_path: Path) -> Path:
    """File path of the serialized context cache for *repo_path*."""
    base = _cache_dir(repo_path)
    return base / "context_cache.json"
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def _load_disk_cache(cache_path: Path) -> tuple[str, nx.DiGraph, dict[str, str], dict[str, str], dict[str, str]] | None:
|
|
306
|
+
if not cache_path.exists():
|
|
307
|
+
return None
|
|
308
|
+
try:
|
|
309
|
+
payload = json.loads(cache_path.read_text(encoding="utf-8"))
|
|
310
|
+
except Exception:
|
|
311
|
+
return None
|
|
312
|
+
|
|
313
|
+
signature = payload.get("signature")
|
|
314
|
+
graph_data = payload.get("graph")
|
|
315
|
+
if signature is None or graph_data is None:
|
|
316
|
+
return None
|
|
317
|
+
|
|
318
|
+
graph = nx.node_link_graph(graph_data, directed=True)
|
|
319
|
+
file_text = payload.get("file_text", {})
|
|
320
|
+
function_snippets = payload.get("function_snippets", {})
|
|
321
|
+
class_snippets = payload.get("class_snippets", {})
|
|
322
|
+
return signature, graph, file_text, function_snippets, class_snippets
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def _save_disk_cache(
    cache_path: Path,
    *,
    signature: str,
    graph: nx.DiGraph,
    file_text: dict[str, str],
    function_snippets: dict[str, str],
    class_snippets: dict[str, str],
) -> None:
    """Serialize the context cache (signature, graph, text/snippet maps) to *cache_path*."""
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    record = {
        "signature": signature,
        "graph": nx.node_link_data(graph),
        "file_text": file_text,
        "function_snippets": function_snippets,
        "class_snippets": class_snippets,
    }
    cache_path.write_text(json.dumps(record), encoding="utf-8")
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def _effective_intent(query: str, intent: str | None) -> str:
|
|
346
|
+
if intent:
|
|
347
|
+
return intent
|
|
348
|
+
text = query.lower()
|
|
349
|
+
if any(word in text for word in ("debug", "why", "error", "fail", "bug", "trace")):
|
|
350
|
+
return "debug"
|
|
351
|
+
if any(word in text for word in ("refactor", "rewrite", "migrate", "restructure")):
|
|
352
|
+
return "refactor"
|
|
353
|
+
if any(word in text for word in ("add", "change", "update", "extend", "modify", "remove", "rename")):
|
|
354
|
+
return "edit"
|
|
355
|
+
return "explore"
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def _query_terms(query: str) -> set[str]:
    """Extract significant lowercase terms from *query*, expanded via _QUERY_ALIASES.

    Identifiers are split on underscores; pieces shorter than 3 chars or in
    _STOPWORDS are discarded, and the same filter applies to aliases.
    """
    collected: set[str] = set()
    for token in re.findall(r"[a-zA-Z_][a-zA-Z0-9_]*", query.lower()):
        for piece in token.split("_"):
            if len(piece) < 3 or piece in _STOPWORDS:
                continue
            collected.add(piece)
            collected.update(
                alias
                for alias in _QUERY_ALIASES.get(piece, ())
                if len(alias) >= 3 and alias not in _STOPWORDS
            )
    return collected
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
def _is_system_query(query: str) -> bool:
    """True when the query shares at least one term with the system-level vocabulary."""
    terms = _query_terms(query)
    return not terms.isdisjoint(_SYSTEM_QUERY_TERMS)
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def _classify_path(path: str) -> str:
    """Classify *path* as "code", "operational_doc", or "general_doc".

    Non-code, non-doc suffixes default to "general_doc"; doc files are
    "operational" when named in _OPERATIONAL_DOC_NAMES, located under an
    operational path, or shaped like a plan/context note.
    """
    candidate = Path(path)
    suffix = candidate.suffix.lower()
    if suffix in _CODE_EXTENSIONS:
        return "code"
    if suffix not in _DOC_EXTENSIONS:
        return "general_doc"

    lowered = candidate.as_posix().lower()
    if candidate.name.lower() in _OPERATIONAL_DOC_NAMES:
        return "operational_doc"
    if any(hint in lowered for hint in _OPERATIONAL_PATH_HINTS):
        return "operational_doc"
    if "/plans/" in lowered or lowered.endswith(("-plan.md", "-context.md")):
        return "operational_doc"
    return "general_doc"
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def _normalized_query_text(query: str) -> str:
|
|
395
|
+
return query.lower().replace('\\', '/')
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def _explicit_file_mention_score(path: str, query: str) -> float:
    """Score (0..1.6) for how explicitly *query* mentions *path*.

    Full-path mention (1.2) > filename mention (0.75) > whole-word stem
    match (0.2); contributions stack but are capped at 1.6.
    """
    query_text = _normalized_query_text(query)
    path_text = path.lower().replace("\\", "/")
    candidate = Path(path)
    filename = candidate.name.lower()
    stem = candidate.stem.lower()

    total = 0.0
    if path_text and path_text in query_text:
        total += 1.2
    if filename and filename in query_text:
        total += 0.75
    if stem and re.search(rf"\b{re.escape(stem)}\b", query_text):
        total += 0.2
    return min(1.6, total)
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def _mentioned_file_paths(file_text: dict[str, str], query: str) -> list[tuple[float, str]]:
    """Known files the query mentions, as (score, path), best score first (ties by path)."""
    scored = [
        (score, rel_path)
        for rel_path in file_text
        if (score := _explicit_file_mention_score(rel_path, query)) > 0
    ]
    scored.sort(key=lambda pair: (-pair[0], pair[1]))
    return scored
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
def _layer_bucket(path: str | None) -> str:
|
|
426
|
+
if not path:
|
|
427
|
+
return "unknown"
|
|
428
|
+
normalized = path.lower().replace("\\", "/")
|
|
429
|
+
if normalized.startswith(("frontend/", "ui/", "web/")):
|
|
430
|
+
return "frontend"
|
|
431
|
+
if normalized.startswith(("backend/", "server/", "api/")):
|
|
432
|
+
return "backend"
|
|
433
|
+
if normalized.startswith(("tests/", "test/")):
|
|
434
|
+
return "test"
|
|
435
|
+
if normalized.startswith(("docs/", ".gcie/")) or normalized.endswith(".md"):
|
|
436
|
+
return "docs"
|
|
437
|
+
if any(token in normalized for token in ("build", "theme", "pptx", "worker", "job")):
|
|
438
|
+
return "build"
|
|
439
|
+
candidate = Path(path)
|
|
440
|
+
if candidate.parts:
|
|
441
|
+
return candidate.parts[0].lower()
|
|
442
|
+
return candidate.stem.lower()
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
def _is_edit_like_query(query: str, intent: str | None) -> bool:
    """True when the resolved intent, or edit-flavored wording, suggests modification."""
    if _effective_intent(query, intent) in {"edit", "refactor"}:
        return True
    text = query.lower()
    return any(marker in text for marker in ("fix", "modify", "patch", "rename", "update", "change"))
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def _is_backend_path(path: str) -> bool:
    """True when *path* sits under a backend-style directory or carries backend-ish name tokens."""
    normalized = path.lower().replace("\\", "/")
    if normalized.startswith(_BACKEND_PATH_HINTS):
        return True
    name_tokens = _family_tokens(path) | {Path(path).stem.lower()}
    return not name_tokens.isdisjoint(_BACKEND_FILE_HINTS)
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
def _query_shape(query: str, intent: str | None, explicit_paths: set[str]) -> str:
    """Categorize the query's structural shape from terms, intent, and explicit files.

    Returns one of: "multi_hop_chain", "cross_layer_ui_api",
    "builder_orchestrator", "backend_config_pair", "same_layer_pair",
    "single_file".
    """
    terms = _query_terms(query)
    effective = _effective_intent(query, intent)
    lowered = query.lower()
    layers = {_layer_bucket(path) for path in explicit_paths}
    mention_count = len(explicit_paths)

    frontend_involved = "frontend" in layers or "frontend/" in lowered
    backend_markers = ("/api/", "app.py", "main.py", "backend")
    backend_involved = bool(layers & {"backend", "api"}) or any(
        marker in lowered for marker in backend_markers
    )
    chainy = not terms.isdisjoint(_CHAIN_TERMS)

    if mention_count >= 4 or (mention_count >= 3 and chainy):
        return "multi_hop_chain"
    if frontend_involved and backend_involved:
        return "cross_layer_ui_api"
    if chainy and terms & {"build", "planner", "stage"}:
        return "builder_orchestrator"

    backend_mentions = [path for path in explicit_paths if _is_backend_path(path)]
    if len(backend_mentions) >= 2 and effective in {"explore", "debug", "edit", "refactor"}:
        return "backend_config_pair"

    if mention_count == 1:
        return "single_file"
    if mention_count >= 2 and len({_candidate_family(path) for path in explicit_paths}) == 1:
        return "same_layer_pair"
    return "single_file"
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
def _is_generic_entrypoint(path: str) -> bool:
    """True for well-known generic entry files (listed paths, or main/index/app stems)."""
    normalized = path.lower().replace("\\", "/")
    if normalized in _GENERIC_ENTRYPOINT_PATHS:
        return True
    return Path(path).stem.lower() in _GENERIC_ENTRYPOINT_STEMS
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
def _candidate_role(
    path: str | None,
    query: str,
    query_shape: str,
    explicit_targets: set[str],
    strong_paths: list[str],
) -> str:
    """Label a candidate's role relative to the query.

    Possible labels: "explicit_target", "generic_entrypoint",
    "caller_or_entry", "intermediate_pipeline", "sibling_module",
    "support_config" (also returned for a missing path).
    """
    if not path:
        return "support_config"

    normalized = path.lower().replace("\\", "/")
    candidate = Path(path)
    stem = candidate.stem.lower()
    role = _file_role(path)

    if path in explicit_targets:
        return "explicit_target"
    if _is_generic_entrypoint(path):
        return "generic_entrypoint"
    if role in {"app", "main", "index", "router", "route", "entry", "command"}:
        return "caller_or_entry"

    if query_shape == "multi_hop_chain":
        pipeline_tokens = {"plan", "build", "stage", "pipeline", "extract", "analyze"}
        if (_family_tokens(path) & pipeline_tokens) and stem not in _GENERIC_ENTRYPOINT_STEMS:
            return "intermediate_pipeline"

    if _is_backend_path(path):
        # A backend file sharing a directory with a strong backend anchor is a sibling.
        for anchor in strong_paths:
            if not _is_backend_path(anchor):
                continue
            if anchor != path and Path(anchor).parent == candidate.parent:
                return "sibling_module"

    config_suffixes = {".json", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".css", ".scss", ".sass", ".less", ".html"}
    if candidate.suffix.lower() in config_suffixes:
        return "support_config"

    tool_hints = ("tailwind", "vite", "postcss", "vercel", "package.json")
    if "config" in normalized or "settings" in normalized or any(hint in normalized for hint in tool_hints):
        return "support_config"

    return "sibling_module"
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
def _role_adjustment(role: str, query_shape: str, query: str, intent: str | None) -> float:
    """Score delta for a candidate's role, modulated by intent and query shape.

    Explicit targets get a large boost (bigger for edit-like queries);
    generic entrypoints and support/config files are penalized.
    """
    effective = _effective_intent(query, intent)
    edit_like = _is_edit_like_query(query, intent)

    if role == "explicit_target":
        return 0.95 if edit_like else 0.55
    if role == "generic_entrypoint":
        if edit_like:
            return -0.4
        return -0.12 if query_shape in {"multi_hop_chain", "builder_orchestrator"} else -0.24
    if role == "sibling_module":
        return 0.18 if query_shape in {"backend_config_pair", "same_layer_pair"} else 0.08
    if role == "caller_or_entry":
        return 0.16 if query_shape in {"cross_layer_ui_api", "multi_hop_chain", "builder_orchestrator"} else 0.04
    if role == "intermediate_pipeline":
        if query_shape == "multi_hop_chain":
            return 0.3
        return 0.16 if query_shape == "builder_orchestrator" else 0.08
    if role == "support_config":
        if effective == "debug":
            return -0.28
        return -0.16 if effective in {"edit", "refactor"} else -0.12
    return 0.0
|
|
585
|
+
|
|
586
|
+
|
|
587
|
+
|
|
588
|
+
def _subtree_locality_adjustment(path: str | None, explicit_targets: set[str], query_shape: str) -> float:
    """Score delta based on how close *path* sits to the explicitly-mentioned files.

    Exact target (0.18) < same-directory under local focus (0.22); family,
    root, and cross-layer cases apply progressively smaller boosts or
    penalties. Returns 0.0 when there is no path or no explicit targets.
    """
    if not path or not explicit_targets:
        return 0.0

    target_roots = {Path(item).parts[0].lower() for item in explicit_targets if Path(item).parts}
    target_parents = {
        Path(item).parent.as_posix().lower()
        for item in explicit_targets
        if Path(item).parent.as_posix() not in {"", "."}
    }
    target_families = {_candidate_family(item) for item in explicit_targets}
    # "Local focus": at least one explicit target nested >= 3 path segments deep.
    local_focus = any(len(Path(item).parts) >= 3 for item in explicit_targets)

    candidate = Path(path)
    root = candidate.parts[0].lower() if candidate.parts else ""
    family = _candidate_family(path)
    parent = candidate.parent.as_posix().lower()

    if path in explicit_targets:
        return 0.18
    if parent in target_parents and local_focus:
        return 0.22
    if family in target_families:
        return 0.14 if local_focus else 0.12
    if query_shape in {"cross_layer_ui_api", "backend_config_pair"}:
        return 0.04 if root in target_roots else -0.08
    if len(target_roots) == 1:
        if root not in target_roots:
            return -0.18 if local_focus else -0.14
        return 0.12
    if local_focus and root in target_roots:
        return 0.02
    return -0.04
|
|
621
|
+
|
|
622
|
+
|
|
623
|
+
def _support_config_penalty(path: str | None, role: str, explicit_targets: set[str]) -> float:
    """Penalty for support/doc roles, softened when *path* shares a family with an explicit target."""
    if not path or not explicit_targets:
        return 0.0
    if role not in {"support_config", "general_doc", "operational_doc"}:
        return 0.0
    target_families = {_candidate_family(item) for item in explicit_targets}
    if _candidate_family(path) in target_families:
        return -0.08
    return -0.26
|
|
633
|
+
|
|
634
|
+
|
|
635
|
+
def _promote_priority_first(
    ranked: list[RankedSnippet],
    explicit_priority_ids: set[str],
    linked_priority_ids: set[str],
    chain_priority_ids: set[str],
    explicit_targets: set[str] | None = None,
) -> list[RankedSnippet]:
    """Re-sort snippets so explicit, linked, and chain priorities come first.

    Ordering keys, in decreasing significance: explicit priority, linked
    priority, chain priority, file-level nodes, locality tier relative to the
    explicit targets, descending score, then node id for determinism.
    """
    explicit_ids = set(explicit_priority_ids)
    linked_ids = set(linked_priority_ids)
    chain_ids = set(chain_priority_ids)
    targets = set(explicit_targets or set())
    target_roots = {Path(p).parts[0].lower() for p in targets if Path(p).parts}
    target_families = {_candidate_family(p) for p in targets}
    target_parents = {
        Path(p).parent.as_posix().lower()
        for p in targets
        if Path(p).parent.as_posix() not in {"", "."}
    }

    def locality(node_id: str) -> int:
        # Smaller tier == closer to the explicitly-mentioned files.
        path = _node_file_path(node_id)
        if not path:
            return 6
        if path in targets:
            return 0
        candidate = Path(path)
        if candidate.parent.as_posix().lower() in target_parents:
            return 1
        if _candidate_family(path) in target_families:
            return 2
        root = candidate.parts[0].lower() if candidate.parts else ""
        return 3 if root in target_roots else 4

    def sort_key(item: RankedSnippet):
        return (
            0 if item.node_id in explicit_ids else 1,
            0 if item.node_id in linked_ids else 1,
            0 if item.node_id in chain_ids else 1,
            0 if item.node_id.startswith("file:") else 1,
            locality(item.node_id),
            -item.score,
            item.node_id,
        )

    return sorted(ranked, key=sort_key)
|
|
683
|
+
|
|
684
|
+
def _family_competition_adjustment(path: str | None, explicit_targets: set[str], query_shape: str) -> float:
    """Boost files sharing a naming family with explicit targets; mildly penalize others."""
    if not path or not explicit_targets:
        return 0.0
    target_roots = {Path(item).parts[0].lower() for item in explicit_targets if Path(item).parts}
    local_focus = any(len(Path(item).parts) >= 3 for item in explicit_targets)
    target_families = {_candidate_family(item) for item in explicit_targets}
    if _candidate_family(path) in target_families:
        return 0.24 if local_focus else 0.16
    if query_shape in {"cross_layer_ui_api", "same_layer_pair", "backend_config_pair"}:
        return -0.06
    if local_focus and len(target_roots) == 1:
        return -0.08
    return -0.02
|
|
698
|
+
|
|
699
|
+
def _entrypoint_penalty(path: str, explicit_targets: set[str]) -> float:
    """Penalty magnitude for a generic entry file when a more specific explicit peer exists.

    A "stronger peer" is a different, non-generic explicit target in the same
    layer or the same naming family. Returns 0.0 when no penalty applies.
    """
    if not explicit_targets or not _is_generic_entrypoint(path):
        return 0.0

    layer = _layer_bucket(path)
    family = _candidate_family(path)
    stronger_peer = any(
        target != path
        and not _is_generic_entrypoint(target)
        and (_layer_bucket(target) == layer or _candidate_family(target) == family)
        for target in explicit_targets
    )
    if not stronger_peer:
        return 0.0
    return 0.34 if Path(path).stem.lower() in {"main", "app"} else 0.28
|
|
719
|
+
|
|
720
|
+
|
|
721
|
+
def _explicit_priority_ids(file_text: dict[str, str], query: str, intent: str | None = None) -> set[str]:
|
|
722
|
+
threshold = 0.5 if _is_edit_like_query(query, intent) else 0.75
|
|
723
|
+
return {f"file:{path}" for score, path in _mentioned_file_paths(file_text, query) if score >= threshold}
|
|
724
|
+
|
|
725
|
+
|
|
726
|
+
def _layer_priority_ids(ranked: list[RankedSnippet], query: str, intent: str | None, explicit_priority_ids: set[str]) -> set[str]:
    """For edit intents spanning >= 2 layers, pick the best-ranked code file per explicit layer.

    A file's effective score is its rank score plus its explicit-mention
    score; ties prefer the lexically smaller node id. Returns an empty set
    when the intent is not "edit" or only one layer is involved.
    """
    if _effective_intent(query, intent) != "edit":
        return set()
    explicit_paths = [node_id[5:] for node_id in explicit_priority_ids if node_id.startswith("file:")]
    layers_of_interest = {_layer_bucket(path) for path in explicit_paths}
    if len(layers_of_interest) < 2:
        return set()

    winners: dict[str, tuple[float, str]] = {}
    for item in ranked:
        if not item.node_id.startswith("file:"):
            continue
        path = item.node_id[5:]
        if _classify_path(path) != "code":
            continue
        layer = _layer_bucket(path)
        if layer not in layers_of_interest:
            continue
        combined = item.score + _explicit_file_mention_score(path, query)
        best = winners.get(layer)
        if best is None or combined > best[0] or (combined == best[0] and item.node_id < best[1]):
            winners[layer] = (combined, item.node_id)
    return {node_id for _, node_id in winners.values()}
|
|
749
|
+
|
|
750
|
+
|
|
751
|
+
def _family_tokens(path: str) -> set[str]:
    """Collect lowercase name fragments (>= 3 chars, not in the common-token
    stoplist) from every path component and the file stem.

    These fragments act as a rough "feature family" signature used by the
    adjacency/support heuristics.
    """
    candidate = Path(path)
    raw_pieces: list[str] = []
    for part in candidate.parts:
        for chunk in re.split(r"[^a-zA-Z0-9_]+", part.lower()):
            raw_pieces.extend(chunk.split("_"))
    # The stem pieces are appended as well; duplicates collapse in the set.
    raw_pieces.extend(candidate.stem.lower().split("_"))
    return {
        piece
        for piece in raw_pieces
        if len(piece) >= 3 and piece not in _COMMON_FAMILY_TOKENS
    }
|
|
764
|
+
|
|
765
|
+
|
|
766
|
+
def _path_match_score(path: str, query: str) -> float:
    """Lexical relevance of *path* to *query*.

    When the query yields no usable terms, the explicit-mention score is
    returned as-is. Otherwise scoring restarts from zero (the mention score
    is deliberately not carried over) and accumulates: 0.2 per term found
    as a substring of the path, 0.15 per exact path-piece overlap, 0.08 per
    family-token overlap, minus 0.1 for ``tests/`` paths when the query is
    not about tests.
    """
    explicit = _explicit_file_mention_score(path, query)
    terms = _query_terms(query)
    if not terms:
        return explicit
    lowered = path.lower()
    total = 0.0
    for term in terms:
        if term in lowered:
            total += 0.2
    pieces = {piece for piece in re.split(r"[^a-zA-Z0-9_]+", lowered) if piece}
    exact_overlap = terms & pieces
    if exact_overlap:
        total += 0.15 * len(exact_overlap)
    family_overlap = terms & _family_tokens(path)
    if family_overlap:
        total += 0.08 * len(family_overlap)
    if lowered.startswith("tests/") and not ({"test", "tests"} & terms):
        total -= 0.1
    return total
|
|
786
|
+
|
|
787
|
+
|
|
788
|
+
def _content_match_score(content: str, query: str) -> float:
    """Score *content* by how many query terms it contains.

    0.07 per matching term, capped at 0.35; 0.0 when there are no terms or
    no matches.
    """
    terms = _query_terms(query)
    if not terms:
        return 0.0
    haystack = content.lower()
    matched = [term for term in terms if term in haystack]
    if not matched:
        return 0.0
    return min(0.35, len(matched) * 0.07)
|
|
797
|
+
|
|
798
|
+
|
|
799
|
+
def _class_weight(path: str, query: str, intent: str | None) -> float:
    """Intent-dependent weight for a file's class (code / operational doc / other).

    The debug, explore, and refactor intents share one decision shape with
    different constants: code files get a positive weight, anything under
    ``get-shit-done/`` is penalized, operational docs are penalized if they
    live in a ``templates`` directory and otherwise mildly weighted, and all
    remaining files take a per-intent fallback penalty. Any other intent
    uses the neutral defaults at the bottom.
    """
    file_class = _classify_path(path)
    lowered_path = path.lower().replace("\\", "/")
    effective_intent = _effective_intent(query, intent)
    # (code, get-shit-done, templates-doc, operational-doc, fallback)
    per_intent = {
        "debug": (0.4, -0.45, -0.28, 0.12, -0.35),
        "explore": (0.18, -0.32, -0.2, 0.22, -0.05),
        "refactor": (0.3, -0.3, -0.18, 0.1, -0.15),
    }
    weights = per_intent.get(effective_intent)
    if weights is not None:
        code_w, gsd_w, template_w, op_doc_w, other_w = weights
        if file_class == "code":
            return code_w
        if "get-shit-done/" in lowered_path:
            return gsd_w
        if file_class == "operational_doc":
            return template_w if "/templates/" in lowered_path else op_doc_w
        return other_w
    # Neutral defaults for any other intent.
    if file_class == "code":
        return 0.25
    if file_class == "operational_doc":
        return 0.1
    return -0.1
|
|
838
|
+
|
|
839
|
+
|
|
840
|
+
def _strong_path_matches(ranked: list[RankedSnippet], query: str, intent: str | None) -> list[str]:
    """Paths (in rank order) whose lexical + class score reaches 0.55.

    File, function, and class nodes all contribute their underlying file
    path; other node kinds are ignored. At most eight paths are returned.
    """
    matches: list[str] = []
    for snippet in ranked:
        node_id = snippet.node_id
        if node_id.startswith("file:"):
            rel_path = node_id[5:]
        elif node_id.startswith(("function:", "class:")):
            rel_path = node_id.split(":", 1)[1].split("::", 1)[0]
        else:
            continue
        combined = _path_match_score(rel_path, query) + _class_weight(rel_path, query, intent)
        if combined >= 0.55:
            matches.append(rel_path)
            if len(matches) == 8:
                break
    return matches
|
|
853
|
+
|
|
854
|
+
|
|
855
|
+
def _reference_tokens(path: str) -> set[str]:
|
|
856
|
+
candidate = Path(path)
|
|
857
|
+
dotted = ".".join(candidate.with_suffix("").parts)
|
|
858
|
+
tokens = {candidate.stem.lower(), dotted.lower()}
|
|
859
|
+
if candidate.parent.parts:
|
|
860
|
+
tokens.add(f"{candidate.parent.name.lower()}.{candidate.stem.lower()}")
|
|
861
|
+
return {token for token in tokens if token}
|
|
862
|
+
|
|
863
|
+
|
|
864
|
+
def _adjacency_boost(
    path: str,
    query: str,
    intent: str | None,
    strong_paths: list[str],
    file_text: dict[str, str],
) -> float:
    """Bonus for files that sit "near" already-strong matches.

    Proximity signals, accumulated per strong path and capped at 0.45 overall:
    shared family tokens, shared top-level directory, same directory, backend
    siblings that also match query terms, textual references to this file
    inside the strong file's text, and — for system-style queries — extra
    credit for code-to-code adjacency.
    """
    if not strong_paths:
        return 0.0
    current = Path(path)
    current_tokens = _family_tokens(path)
    current_class = _classify_path(path)
    terms = _query_terms(query)
    bonus = 0.0
    reference_tokens = _reference_tokens(path)
    for matched in strong_paths:
        if matched == path:
            continue
        matched_path = Path(matched)
        # Overlapping name fragments suggest both files belong to one feature.
        shared_tokens = current_tokens & _family_tokens(matched)
        if shared_tokens:
            bonus += min(0.24, 0.08 * len(shared_tokens))
        # Same top-level package, then same directory.
        if current.parts and matched_path.parts and current.parts[0] == matched_path.parts[0]:
            bonus += 0.06
        if current.parent == matched_path.parent:
            bonus += 0.08
        # Backend siblings in the same directory that also hit query terms.
        if _is_backend_path(path) and _is_backend_path(matched) and current.parent == matched_path.parent and (terms & current_tokens):
            bonus += 0.18
        # The strong file's text mentions this file (import-style reference).
        matched_text = file_text.get(matched, "").lower()
        if matched_text and any(token in matched_text for token in reference_tokens):
            bonus += 0.16
        # System-wide queries reward code-to-code adjacency a bit more.
        if _is_system_query(query) and current_class == "code" and _classify_path(matched) == "code":
            if shared_tokens:
                bonus += 0.05
            if current.parent == matched_path.parent:
                bonus += 0.04
    return min(0.45, bonus)
|
|
901
|
+
|
|
902
|
+
|
|
903
|
+
def _support_role_bonus(path: str, query: str, strong_paths: list[str], file_text: dict[str, str]) -> float:
    """Bonus for files that look like support modules (config, utils, ...).

    Flat bonuses come from support-role tokens in the path/stem and from
    entry-point stems under a frontend-biased query; per-anchor bonuses come
    from shared-family siblings under the same top-level directory and from
    anchors whose text references this file. Capped at 0.35.
    """
    candidate = Path(path)
    stem = candidate.stem.lower()
    tokens = _family_tokens(path)
    refs = _reference_tokens(path)
    bonus = 0.0
    if tokens & _SUPPORT_ROLE_TOKENS:
        bonus += 0.14
    if stem in _SUPPORT_ROLE_TOKENS:
        bonus += 0.12
    if _frontend_bias(query) and stem in {"app", "main", "index"}:
        bonus += 0.1
    for anchor in strong_paths:
        anchor_path = Path(anchor)
        if candidate == anchor_path:
            continue
        same_top = bool(candidate.parts and anchor_path.parts and candidate.parts[0] == anchor_path.parts[0])
        if same_top and (tokens & _family_tokens(anchor)):
            bonus += 0.08
        anchor_text = file_text.get(anchor, "").lower()
        if anchor_text and any(ref in anchor_text for ref in refs):
            bonus += 0.16
    return min(0.35, bonus)
|
|
924
|
+
|
|
925
|
+
|
|
926
|
+
def _mandatory_node_ids(
    ranked: list[RankedSnippet],
    query: str,
    intent: str | None,
    *,
    support_priority_ids: set[str] | None = None,
    explicit_priority_ids: set[str] | None = None,
) -> set[str]:
    """Node ids that must survive downstream trimming.

    Starts from the caller-provided support/explicit priority ids, then adds
    file nodes that either (a) clear a 0.45 lexical+class threshold, or
    (b) look like high-scoring support-role code while support promotion is
    active. Debug-intent queries never force general docs; non-code files are
    only kept when explicitly prioritized.
    """
    explicit = explicit_priority_ids or set()
    mandatory: set[str] = set(support_priority_ids or set()) | set(explicit)
    support_enabled = _support_promotion_enabled(query, intent)
    effective = _effective_intent(query, intent)
    for snippet in ranked:
        node_id = snippet.node_id
        if not node_id.startswith("file:"):
            continue
        rel_path = node_id[len("file:"):]
        file_class = _classify_path(rel_path)
        if file_class == "general_doc" and effective == "debug":
            continue
        if file_class != "code" and node_id not in explicit:
            continue
        if _path_match_score(rel_path, query) + _class_weight(rel_path, query, intent) >= 0.45:
            mandatory.add(node_id)
            continue
        role_tokens = _family_tokens(rel_path) | {Path(rel_path).stem.lower()}
        if (
            support_enabled
            and file_class == "code"
            and role_tokens & _SUPPORT_ROLE_TOKENS
            and snippet.score >= 1.25
        ):
            mandatory.add(node_id)
    return mandatory
|
|
952
|
+
|
|
953
|
+
|
|
954
|
+
|
|
955
|
+
def _linked_file_priority_ids(
    ranked: list[RankedSnippet],
    explicit_priority_ids: set[str],
    query_shape: str,
    query: str,
    intent: str | None,
) -> set[str]:
    """Pin up to three explicitly mentioned files for linked-file edit queries.

    Active only for edit-like queries whose shape implies a small set of
    coupled files (same-layer pair, UI/API pair, backend/config pair, or
    builder/orchestrator). Returns the best-scored explicit ``file:`` ids.

    Bug fix: the original iterated the score-sorted ranking into a *set*
    comprehension (so the sort was dead code) and then truncated with
    ``sorted(keep)[:3]`` — i.e. it kept the three *alphabetically first*
    ids rather than the three best-scored ones the sort clearly intended.
    We now truncate in score order, with node id as the tie-break.
    """
    if not explicit_priority_ids:
        return set()
    if not _is_edit_like_query(query, intent):
        return set()
    if query_shape not in {"same_layer_pair", "cross_layer_ui_api", "backend_config_pair", "builder_orchestrator"}:
        return set()

    kept: list[str] = []
    for item in sorted(ranked, key=lambda snippet: (-snippet.score, snippet.node_id)):
        if item.node_id in explicit_priority_ids and item.node_id not in kept:
            kept.append(item.node_id)
            if len(kept) == 3:
                break
    return set(kept)
|
|
975
|
+
|
|
976
|
+
|
|
977
|
+
def _chain_quota_priority_ids(
    ranked: list[RankedSnippet],
    query: str,
    intent: str | None,
    explicit_targets: set[str],
) -> set[str]:
    """For chain-shaped queries, pin one caller, one middle, and one downstream file.

    Only active for the "multi_hop_chain" / "builder_orchestrator" query shapes.
    Each slot takes the highest-scored code file whose role (as judged by
    ``_candidate_role``) fits that slot, skipping nodes already claimed by an
    earlier slot. Any slot may stay unfilled; only filled slots are returned.
    """
    query_shape = _query_shape(query, intent, explicit_targets)
    if query_shape not in {"multi_hop_chain", "builder_orchestrator"}:
        return set()

    file_candidates = [item for item in ranked if item.node_id.startswith("file:") and _classify_path(item.node_id[5:]) == "code"]
    if not file_candidates:
        return set()

    # Deterministic order: score descending, node id as tie-break.
    sorted_candidates = sorted(file_candidates, key=lambda item: (-item.score, item.node_id))
    # Slot 1: the entry point / caller of the chain.
    caller = next(
        (
            item.node_id
            for item in sorted_candidates
            if _candidate_role(item.node_id[5:], query, query_shape, explicit_targets, []) in {"caller_or_entry", "generic_entrypoint"}
        ),
        None,
    )

    # Slot 2: an intermediate pipeline stage or sibling module (not the caller).
    middle = next(
        (
            item.node_id
            for item in sorted_candidates
            if _candidate_role(item.node_id[5:], query, query_shape, explicit_targets, []) in {"intermediate_pipeline", "sibling_module"}
            and item.node_id != caller
        ),
        None,
    )

    # Slot 3: a downstream target — an explicit target, or any non-entrypoint role.
    downstream = next(
        (
            item.node_id
            for item in sorted_candidates
            if item.node_id != caller and item.node_id != middle and (
                item.node_id[5:] in explicit_targets or _candidate_role(item.node_id[5:], query, query_shape, explicit_targets, []) != "generic_entrypoint"
            )
        ),
        None,
    )

    return {node_id for node_id in (caller, middle, downstream) if node_id}
|
|
1023
|
+
|
|
1024
|
+
def _collect_repo_modules(repo_path: Path) -> tuple[list, dict, dict, dict, str, nx.DiGraph]:
    """Scan *repo_path* and build the retrieval corpus.

    Returns ``(modules, file_text, function_snippets, class_snippets,
    signature, graph)``. On a cache hit (in-memory or on-disk) the parsed
    ``modules`` list comes back empty — callers only receive parsed modules
    after a fresh scan.
    """
    config = ScannerConfig.from_extensions(
        sorted(_ALL_CONTEXT_EXTENSIONS),
        include_hidden=False,
    )
    config.exclude_globs = _EXCLUDE_GLOBS
    manifest = scan_repository(repo_path, config=config)
    # The signature fingerprints the current file set; caches key off it.
    signature = _repo_signature(repo_path, manifest.files)

    # Fast path: process-local cache.
    cache_hit = _REPO_CACHE.get(signature)
    if cache_hit is not None:
        graph, file_text, function_snippets, class_snippets = cache_hit
        return [], file_text, function_snippets, class_snippets, signature, graph

    # Second chance: on-disk cache; promote a match into the in-memory cache.
    disk_cache = _load_disk_cache(_cache_path(repo_path))
    if disk_cache is not None:
        cached_sig, graph, file_text, function_snippets, class_snippets = disk_cache
        if cached_sig == signature:
            # _REPO_CACHE holds at most one repo snapshot at a time.
            _REPO_CACHE.clear()
            _REPO_CACHE[signature] = (graph, file_text, function_snippets, class_snippets)
            return [], file_text, function_snippets, class_snippets, signature, graph

    modules = []
    file_text: dict[str, str] = {}
    function_snippets: dict[str, str] = {}
    class_snippets: dict[str, str] = {}

    graph = nx.DiGraph()

    for entry in manifest.files:
        file_rel = entry.relative_path.as_posix()
        file_path = repo_path / entry.relative_path

        try:
            source = file_path.read_text(encoding="utf-8")
        except Exception:
            # Unreadable / non-UTF-8 files are skipped silently (best effort).
            continue

        file_text[file_rel] = source
        file_node_id = f"file:{file_rel}"
        graph.add_node(
            file_node_id,
            type="file",
            label=file_rel,
            path=file_rel,
            file_class=_classify_path(file_rel),
        )

        # Only Python sources are parsed for function/class-level snippets.
        if entry.suffix in {".py", ".pyi"}:
            module = parse_python_source(source, file=Path(file_rel))
            modules.append(module)

            lines = source.splitlines()
            for fn in module.functions:
                # Clamp line numbers defensively (parsers may report 0 or inverted ranges).
                start = max(fn.start_line, 1)
                end = max(fn.end_line, start)
                snippet = "\n".join(lines[start - 1 : end]).strip()
                node_id = f"function:{Path(fn.file).as_posix()}::{fn.name}"
                if snippet:
                    function_snippets[node_id] = snippet

            for cls in module.classes:
                start = max(cls.start_line, 1)
                end = max(cls.end_line, start)
                snippet = "\n".join(lines[start - 1 : end]).strip()
                node_id = f"class:{Path(cls.file).as_posix()}::{cls.name}"
                if snippet:
                    class_snippets[node_id] = snippet

    if modules:
        # Merge the file nodes with the call / variable / structure graphs.
        graph = nx.compose(
            graph,
            nx.compose(
                nx.compose(build_call_graph(modules), build_variable_graph(modules)),
                build_code_structure_graph(modules),
            ),
        )

    _REPO_CACHE.clear()
    _REPO_CACHE[signature] = (graph, file_text, function_snippets, class_snippets)
    _save_disk_cache(
        _cache_path(repo_path),
        signature=signature,
        graph=graph,
        file_text=file_text,
        function_snippets=function_snippets,
        class_snippets=class_snippets,
    )
    return modules, file_text, function_snippets, class_snippets, signature, graph
|
|
1113
|
+
|
|
1114
|
+
|
|
1115
|
+
def _frontend_bias(query: str) -> bool:
    """Return True when the query mentions any frontend-oriented keyword."""
    lowered = query.lower()
    for keyword in _FRONTEND_KEYWORDS:
        if keyword in lowered:
            return True
    return False
|
|
1118
|
+
|
|
1119
|
+
|
|
1120
|
+
def _boost_score(
    node_id: str,
    base_score: float,
    query: str,
    intent: str | None,
    strong_paths: list[str] | None = None,
    file_text: dict[str, str] | None = None,
) -> float:
    """Apply the path/class/adjacency/support heuristics on top of *base_score*.

    File nodes additionally get a frontend-extension bonus (0.2) under a
    frontend-biased query and a flat 0.05 for common code suffixes; function
    and class nodes get a flat 0.2 on top of the path-derived boosts. Other
    node kinds pass through unchanged.
    """
    anchors = strong_paths or []
    texts = file_text or {}

    if node_id.startswith("file:"):
        rel_path = node_id[len("file:"):]
        suffix = Path(rel_path).suffix.lower()
        total = base_score
        total += _path_match_score(rel_path, query)
        total += _class_weight(rel_path, query, intent)
        total += _adjacency_boost(rel_path, query, intent, anchors, texts)
        total += _support_role_bonus(rel_path, query, anchors, texts)
        if _frontend_bias(query) and suffix in _FRONTEND_EXTENSIONS:
            total += 0.2
        if suffix in {".py", ".pyi", ".js", ".jsx", ".ts", ".tsx"}:
            total += 0.05
        return total

    if node_id.startswith(("function:", "class:")):
        rel_path = node_id.split(":", 1)[1].split("::", 1)[0]
        total = base_score
        total += _path_match_score(rel_path, query)
        total += _class_weight(rel_path, query, intent)
        total += _adjacency_boost(rel_path, query, intent, anchors, texts)
        total += _support_role_bonus(rel_path, query, anchors, texts)
        return total + 0.2

    return base_score
|
|
1153
|
+
|
|
1154
|
+
|
|
1155
|
+
def _supplemental_file_snippets(
    file_text: dict[str, str],
    query: str,
    intent: str | None,
    strong_paths: list[str],
    *,
    limit: int = 12,
) -> list[RankedSnippet]:
    """Score every repo file directly and return extra file-level snippets.

    Files are scored by the combined path/content/class/adjacency/support
    heuristics. General docs under a debug intent need >= 0.2; everything
    else needs > 0.18. System-wide queries are additionally capped at three
    files per top-level directory to spread coverage.
    """
    effective = _effective_intent(query, intent)
    system_query = _is_system_query(query)

    scored: list[RankedSnippet] = []
    for rel_path, text in file_text.items():
        total = (
            _path_match_score(rel_path, query)
            + _content_match_score(text[:4000], query)
            + _class_weight(rel_path, query, intent)
            + _adjacency_boost(rel_path, query, intent, strong_paths, file_text)
            + _support_role_bonus(rel_path, query, strong_paths, file_text)
        )
        if effective == "debug" and _classify_path(rel_path) == "general_doc":
            keep = total >= 0.2
        else:
            keep = total > 0.18
        if not keep:
            continue
        scored.append(
            RankedSnippet(
                node_id=f"file:{rel_path}",
                content=_snippet_from_lines(text.splitlines(), max_lines=120),
                score=total,
            )
        )
    scored.sort(key=lambda snippet: (-snippet.score, snippet.node_id))
    if not system_query:
        return scored[:limit]

    # System queries: diversify by top-level directory (max three per family).
    picked: list[RankedSnippet] = []
    per_family: dict[str, int] = {}
    for snippet in scored:
        parts = Path(snippet.node_id[5:]).parts
        family = parts[0] if parts else snippet.node_id
        if per_family.get(family, 0) >= 3:
            continue
        picked.append(snippet)
        per_family[family] = per_family.get(family, 0) + 1
        if len(picked) >= limit:
            break
    return picked
|
|
1200
|
+
|
|
1201
|
+
|
|
1202
|
+
def _top_anchor_paths(ranked: list[RankedSnippet], *, limit: int = 6) -> list[str]:
    """Return up to *limit* unique code-file paths from the best-scored nodes.

    File, function, and class nodes contribute their underlying file path;
    ties are broken by node id; non-code paths are skipped.
    """
    ordered = sorted(ranked, key=lambda snippet: (-snippet.score, snippet.node_id))
    anchors: list[str] = []
    for snippet in ordered:
        node_id = snippet.node_id
        if node_id.startswith("file:"):
            rel_path = node_id[5:]
        elif node_id.startswith(("function:", "class:")):
            rel_path = node_id.split(":", 1)[1].split("::", 1)[0]
        else:
            continue
        if _classify_path(rel_path) != "code" or rel_path in anchors:
            continue
        anchors.append(rel_path)
        if len(anchors) == limit:
            break
    return anchors
|
|
1221
|
+
|
|
1222
|
+
|
|
1223
|
+
def _seed_anchor_paths(file_text: dict[str, str], query: str, intent: str | None, *, limit: int = 4) -> list[str]:
    """Bootstrap anchor paths directly from file contents when ranking is empty.

    Code files are scored lexically (path + content + class + support role,
    with no anchors yet); only scores >= 0.35 qualify. Returns up to *limit*
    paths, best first with id tie-break.
    """
    candidates: list[tuple[float, str]] = []
    for rel_path, text in file_text.items():
        if _classify_path(rel_path) != "code":
            continue
        total = _path_match_score(rel_path, query)
        total += _content_match_score(text[:2000], query)
        total += _class_weight(rel_path, query, intent)
        total += _support_role_bonus(rel_path, query, [], file_text)
        if total >= 0.35:
            candidates.append((total, rel_path))
    candidates.sort(key=lambda entry: (-entry[0], entry[1]))
    return [rel_path for _, rel_path in candidates[:limit]]
|
|
1237
|
+
|
|
1238
|
+
|
|
1239
|
+
def _repair_candidate_bonus(path: str, query: str, intent: str | None, anchor_paths: list[str], file_text: dict[str, str]) -> float:
    """Extra credit for repair-phase candidates near the current anchors.

    Combines the support-role and adjacency heuristics, a small system-query
    bonus for support-role tokens, plus per-anchor same-directory (0.08) and
    same-top-level (0.05) bumps. Capped at 0.55; 0.0 without anchors.
    """
    if not anchor_paths:
        return 0.0
    bonus = _support_role_bonus(path, query, anchor_paths, file_text)
    bonus += _adjacency_boost(path, query, intent, anchor_paths, file_text)
    if _is_system_query(query) and (_family_tokens(path) & _SUPPORT_ROLE_TOKENS):
        bonus += 0.08
    candidate = Path(path)
    for anchor in anchor_paths:
        anchor_path = Path(anchor)
        if candidate.parent == anchor_path.parent:
            bonus += 0.08
        if candidate.parts and anchor_path.parts and candidate.parts[0] == anchor_path.parts[0]:
            bonus += 0.05
    return min(0.55, bonus)
|
|
1255
|
+
|
|
1256
|
+
|
|
1257
|
+
def _support_promotion_enabled(query: str, intent: str | None) -> bool:
    """Whether support-file promotion applies to this query.

    Requires one of the debug/explore/edit/refactor intents plus either a
    support-promotion term in the query or a system-wide query.
    """
    if _effective_intent(query, intent) not in {"debug", "explore", "edit", "refactor"}:
        return False
    if _query_terms(query) & _SUPPORT_PROMOTION_TERMS:
        return True
    return _is_system_query(query)
|
|
1262
|
+
|
|
1263
|
+
|
|
1264
|
+
def _support_promotion_score(path: str, query: str, anchor_paths: list[str], file_text: dict[str, str]) -> float:
    """Score how strongly *path* should be promoted as a support file.

    Flat bonuses for support-role tokens in the family tokens (0.14) and the
    stem (0.12), a capped lexical score (<= 0.22), then per-anchor bonuses:
    same directory 0.16, same top-level 0.06, shared family tokens 0.08, and
    0.18 when the anchor's text references this file. Capped at 0.7 overall;
    0.0 without anchors.
    """
    if not anchor_paths:
        return 0.0
    candidate = Path(path)
    tokens = _family_tokens(path)
    refs = _reference_tokens(path)
    total = 0.0
    if tokens & _SUPPORT_ROLE_TOKENS:
        total += 0.14
    if candidate.stem.lower() in _SUPPORT_ROLE_TOKENS:
        total += 0.12
    total += min(0.22, _path_match_score(path, query))

    for anchor in anchor_paths:
        anchor_path = Path(anchor)
        if candidate.parent == anchor_path.parent:
            total += 0.16
        if candidate.parts and anchor_path.parts and candidate.parts[0] == anchor_path.parts[0]:
            total += 0.06
        if tokens & _family_tokens(anchor):
            total += 0.08
        anchor_text = file_text.get(anchor, "").lower()
        if anchor_text and any(ref in anchor_text for ref in refs):
            total += 0.18
    return min(0.7, total)
|
|
1288
|
+
|
|
1289
|
+
|
|
1290
|
+
def _promoted_support_file_snippets(
    file_text: dict[str, str],
    query: str,
    intent: str | None,
    anchor_paths: list[str],
    existing_ids: set[str],
    *,
    limit: int = 2,
) -> list[RankedSnippet]:
    """Build snippets for support files worth promoting next to the anchors.

    Only runs when support promotion is enabled. Not-yet-selected code files
    whose promotion score reaches 0.38 become file snippets, scored by
    promotion plus a content match on the first 3000 chars. Best *limit* win.
    """
    if not _support_promotion_enabled(query, intent):
        return []

    candidates: list[RankedSnippet] = []
    for rel_path, text in file_text.items():
        node_id = f"file:{rel_path}"
        if node_id in existing_ids or _classify_path(rel_path) != "code":
            continue
        promotion = _support_promotion_score(rel_path, query, anchor_paths, file_text)
        if promotion < 0.38:
            continue
        candidates.append(
            RankedSnippet(
                node_id=node_id,
                content=_snippet_from_lines(text.splitlines(), max_lines=120),
                score=promotion + _content_match_score(text[:3000], query),
            )
        )

    candidates.sort(key=lambda snippet: (-snippet.score, snippet.node_id))
    return candidates[:limit]
|
|
1323
|
+
|
|
1324
|
+
|
|
1325
|
+
def _support_priority_ids(ranked: list[RankedSnippet], query: str, intent: str | None) -> set[str]:
    """Pick up to two ranked support files to treat as priorities.

    Only active when support promotion is enabled and anchors exist. A file
    qualifies when it carries a support-role token and either its promotion
    score reaches 0.2 or its rank score reaches 1.2; the two best by
    combined (rank + promotion) score are returned.
    """
    if not _support_promotion_enabled(query, intent):
        return set()
    anchors = _top_anchor_paths(ranked)
    if not anchors:
        return set()

    scored: list[tuple[float, str]] = []
    for snippet in ranked:
        if not snippet.node_id.startswith("file:"):
            continue
        rel_path = snippet.node_id[5:]
        if _classify_path(rel_path) != "code":
            continue
        promotion = _support_promotion_score(rel_path, query, anchors, {})
        role_tokens = _family_tokens(rel_path) | {Path(rel_path).stem.lower()}
        if not (role_tokens & _SUPPORT_ROLE_TOKENS):
            continue
        if promotion < 0.2 and snippet.score < 1.2:
            continue
        scored.append((snippet.score + promotion, snippet.node_id))

    scored.sort(key=lambda entry: (-entry[0], entry[1]))
    return {node_id for _, node_id in scored[:2]}
|
|
1349
|
+
|
|
1350
|
+
|
|
1351
|
+
def _collapse_support_query_snippets(
    ranked: list[RankedSnippet],
    query: str,
    intent: str | None,
    file_text: dict[str, str],
) -> tuple[list[RankedSnippet], set[str]]:
    """Reorder results for support-oriented queries around priority support files.

    When support files are promoted: drop function/class snippets subsumed by
    a retained whole-file snippet, drop non-operational markdown, then sort
    priority ids first, file nodes before symbols, score descending, id as
    tie-break. Returns the filtered list and the priority ids (or the input
    unchanged with an empty set when promotion is inactive).
    """
    support_priority_ids = _support_priority_ids(ranked, query, intent)
    if not support_priority_ids:
        return ranked, set()

    retained_files = {node_id[5:] for node_id in support_priority_ids if node_id.startswith("file:")}
    filtered: list[RankedSnippet] = []
    for item in ranked:
        if item.node_id.startswith(("function:", "class:")):
            parent_path = item.node_id.split(":", 1)[1].split("::", 1)[0]
            # The whole-file snippet already covers this symbol; skip the duplicate.
            if parent_path in retained_files:
                continue
        if item.node_id.startswith("file:"):
            path = item.node_id[5:]
            # Markdown competes with promoted support files; keep only operational docs.
            if path.endswith(".md") and retained_files:
                if _classify_path(path) != "operational_doc":
                    continue
        filtered.append(item)

    filtered.sort(
        key=lambda item: (
            0 if item.node_id in support_priority_ids else 1,
            0 if item.node_id.startswith("file:") else 1,
            -item.score,
            item.node_id,
        )
    )
    return filtered, support_priority_ids
|
|
1384
|
+
|
|
1385
|
+
|
|
1386
|
+
def _repair_file_snippets(
    file_text: dict[str, str],
    query: str,
    intent: str | None,
    ranked: list[RankedSnippet],
    existing_ids: set[str],
    *,
    limit: int = 4,
) -> list[RankedSnippet]:
    """Last-chance recovery of files the main ranking missed.

    Anchors come from the current ranking, or are seeded from file contents
    when ranking produced none. Promoted support files go in first (up to
    two), then any remaining non-doc file whose combined lexical, content,
    and repair-bonus score reaches 0.45. Best *limit* snippets win.
    """
    anchors = _top_anchor_paths(ranked) or _seed_anchor_paths(file_text, query, intent)
    if not anchors:
        return []

    promoted = _promoted_support_file_snippets(file_text, query, intent, anchors, existing_ids, limit=min(2, limit))
    results: list[RankedSnippet] = list(promoted)
    promoted_ids = {snippet.node_id for snippet in promoted}

    for rel_path, text in file_text.items():
        node_id = f"file:{rel_path}"
        if node_id in existing_ids or node_id in promoted_ids:
            continue
        if _classify_path(rel_path) == "general_doc":
            continue
        total = _path_match_score(rel_path, query) + _class_weight(rel_path, query, intent)
        total += _content_match_score(text[:4000], query)
        total += _repair_candidate_bonus(rel_path, query, intent, anchors, file_text)
        if total < 0.45:
            continue
        results.append(
            RankedSnippet(
                node_id=node_id,
                content=_snippet_from_lines(text.splitlines(), max_lines=120),
                score=total,
            )
        )

    results.sort(key=lambda snippet: (-snippet.score, snippet.node_id))
    return results[:limit]
|
|
1429
|
+
|
|
1430
|
+
|
|
1431
|
+
def _selected_anchor_paths(selected: tuple[RankedSnippet, ...], query: str, intent: str | None) -> list[str]:
    """Code paths among the already-selected snippets that carry a support-role token.

    Returns them in selection order, deduplicated; empty when support
    promotion is disabled for this query.
    """
    if not _support_promotion_enabled(query, intent):
        return []
    anchors: list[str] = []
    for snippet in selected:
        node_id = snippet.node_id
        if node_id.startswith("file:"):
            rel_path = node_id[5:]
        elif node_id.startswith(("function:", "class:")):
            rel_path = node_id.split(":", 1)[1].split("::", 1)[0]
        else:
            continue
        if _classify_path(rel_path) != "code":
            continue
        role_tokens = _family_tokens(rel_path) | {Path(rel_path).stem.lower()}
        if not (role_tokens & _SUPPORT_ROLE_TOKENS):
            continue
        if rel_path not in anchors:
            anchors.append(rel_path)
    return anchors
|
|
1453
|
+
|
|
1454
|
+
|
|
1455
|
+
def _reference_fallback_snippets(
    file_text: dict[str, str],
    query: str,
    intent: str | None,
    selected: tuple[RankedSnippet, ...],
    existing_ids: set[str],
    *,
    limit: int = 2,
) -> list[RankedSnippet]:
    """Recover support-role code files that the selected anchors textually reference.

    For each not-yet-selected code file carrying a support-role token, count
    how many anchor files mention it by reference token, share its directory,
    or share its top-level package. A file is recovered only when it is
    referenced at least once and its combined score clears 0.72. Returns at
    most *limit* snippets, best first.
    """
    anchors = _selected_anchor_paths(selected, query, intent)
    if not anchors:
        return []

    recovered: list[RankedSnippet] = []
    for path, text in file_text.items():
        node_id = f"file:{path}"
        if node_id in existing_ids:
            continue
        if _classify_path(path) != "code":
            continue
        tokens = _family_tokens(path) | {Path(path).stem.lower()}
        if not (tokens & _SUPPORT_ROLE_TOKENS):
            continue
        candidate = Path(path)
        reference_hits = 0
        same_dir_hits = 0
        family_hits = 0
        for anchor in anchors:
            anchor_path = Path(anchor)
            anchor_text = file_text.get(anchor, "").lower()
            if candidate.parent == anchor_path.parent:
                same_dir_hits += 1
            if candidate.parts and anchor_path.parts and candidate.parts[0] == anchor_path.parts[0]:
                family_hits += 1
            if anchor_text and any(token in anchor_text for token in _reference_tokens(path)):
                reference_hits += 1
        # A textual reference from at least one anchor is mandatory.
        if reference_hits == 0:
            continue
        total = (
            0.5 * reference_hits
            + 0.12 * same_dir_hits
            + 0.08 * family_hits
            + _path_match_score(path, query)
            + _content_match_score(text[:3000], query)
            + _class_weight(path, query, intent)
        )
        if total < 0.72:
            continue
        recovered.append(
            RankedSnippet(
                node_id=node_id,
                content=_snippet_from_lines(text.splitlines(), max_lines=120),
                score=total,
            )
        )

    recovered.sort(key=lambda item: (-item.score, item.node_id))
    return recovered[:limit]
|
|
1513
|
+
|
|
1514
|
+
|
|
1515
|
+
def _apply_reference_fallback(
    ranked: list[RankedSnippet],
    file_text: dict[str, str],
    query: str,
    intent: str | None,
    payload,
) -> tuple[list[RankedSnippet], set[str], str | None]:
    """Inject reference-recovered file snippets ahead of the existing ranking.

    Returns the (mutated) ranked list, the ids of recovered snippets, and the
    fallback label "reference_search" when anything was recovered, else None.
    """
    already_present = {item.node_id for item in ranked}
    extra = _reference_fallback_snippets(file_text, query, intent, payload.snippets, already_present)
    if not extra:
        return ranked, set(), None

    extra_ids = {item.node_id for item in extra}
    ranked.extend(extra)

    def _order(item: RankedSnippet) -> tuple[int, int, float, str]:
        # Recovered snippets first, then file-level nodes, then score/id.
        return (
            int(item.node_id not in extra_ids),
            int(not item.node_id.startswith("file:")),
            -item.score,
            item.node_id,
        )

    ranked.sort(key=_order)
    return ranked, extra_ids, "reference_search"
|
|
1538
|
+
|
|
1539
|
+
|
|
1540
|
+
@dataclass(frozen=True, slots=True)
class _ChannelCandidate:
    """One retrieval hit from a single channel, prior to channel fusion."""

    node_id: str  # graph node identifier, e.g. "file:pkg/mod.py"
    channel: str  # producing channel name ("target", "lex", "vec", "expand", "adj", "fallback")
    score: float  # channel-local relevance score (scales differ per channel)
    content: str  # snippet text shown for this node
    rationale: str  # human-readable scoring breakdown for debugging
|
|
1547
|
+
|
|
1548
|
+
|
|
1549
|
+
@dataclass(frozen=True, slots=True)
class _AdaptiveCompanion:
    """A candidate companion file considered during adaptive context completion."""

    path: str  # repo-relative file path
    score: float  # combined relevance score across all signals
    path_score: float  # contribution of the path/query match alone
    same_family: bool  # True when the file shares a family with an anchor file
    role: str  # role label as computed by _file_role()
|
|
1556
|
+
|
|
1557
|
+
|
|
1558
|
+
def _node_file_path(node_id: str) -> str | None:
|
|
1559
|
+
if node_id.startswith("file:"):
|
|
1560
|
+
return node_id[5:]
|
|
1561
|
+
if node_id.startswith(("function:", "class:")):
|
|
1562
|
+
return node_id.split(":", 1)[1].split("::", 1)[0]
|
|
1563
|
+
return None
|
|
1564
|
+
|
|
1565
|
+
|
|
1566
|
+
def _file_role(path: str | None) -> str:
    """Classify a file into a coarse role label.

    Priority order: a support-role token in the stem, then a support-role
    token among the family tokens, then the document class, else "module".
    """
    if not path:
        return "unknown"
    stem = Path(path).stem.lower()
    if stem in _SUPPORT_ROLE_TOKENS:
        return stem
    # Deterministic scan: family tokens are checked in sorted order.
    for token in sorted(_family_tokens(path)):
        if token in _SUPPORT_ROLE_TOKENS:
            return token
    file_class = _classify_path(path)
    if file_class in ("operational_doc", "general_doc"):
        return file_class
    return "module"
|
|
1582
|
+
|
|
1583
|
+
|
|
1584
|
+
def _candidate_family(path: str | None) -> str:
|
|
1585
|
+
if not path:
|
|
1586
|
+
return "unknown"
|
|
1587
|
+
candidate = Path(path)
|
|
1588
|
+
if len(candidate.parts) >= 2:
|
|
1589
|
+
return "/".join(candidate.parts[:2])
|
|
1590
|
+
if candidate.parts:
|
|
1591
|
+
return candidate.parts[0]
|
|
1592
|
+
return candidate.stem
|
|
1593
|
+
|
|
1594
|
+
|
|
1595
|
+
def _candidate_content(
    node_id: str,
    file_text: dict[str, str],
    function_snippets: dict[str, str],
    class_snippets: dict[str, str],
) -> str:
    """Resolve display content for *node_id*.

    Function snippets win over class snippets; when neither table has an
    entry (a None lookup, not merely an empty string), fall back to the
    head of the node's file text. Always returns a string.
    """
    for table in (function_snippets, class_snippets):
        found = table.get(node_id)
        if found is not None:
            # Preserve "empty snippet short-circuits" semantics.
            return found or ""
    file_path = _node_file_path(node_id)
    if file_path:
        text = file_text.get(file_path, "")
        if text:
            return _snippet_from_lines(text.splitlines(), max_lines=120)
    return ""
|
|
1611
|
+
|
|
1612
|
+
|
|
1613
|
+
def _query_variant(query: str, channel: str) -> str:
    """Return the query text used by a channel.

    Only the "expand" channel rewrites the query, appending its sorted
    terms; every other channel uses the query verbatim.
    """
    if channel != "expand":
        return query
    terms = sorted(_query_terms(query))
    if not terms:
        return query
    return f"{query} {' '.join(terms)}"
|
|
1618
|
+
|
|
1619
|
+
|
|
1620
|
+
def _file_channel_candidates(
    file_text: dict[str, str],
    query: str,
    intent: str | None,
    *,
    channel: str,
    anchor_paths: list[str] | None = None,
    limit: int = 16,
) -> list[_ChannelCandidate]:
    """Score whole files for one retrieval channel ("lex", "expand", or "adj").

    All channels share the same base signals (path match, content match,
    class weight); each channel then adds its own bonuses and acceptance
    threshold. Returns at most *limit* candidates, best score first.
    """
    # Adjacency is scored relative to anchor files; without anchors there
    # is nothing to be adjacent to.
    if channel == "adj" and not anchor_paths:
        return []

    variant = _query_variant(query, channel)
    out: list[_ChannelCandidate] = []
    for path, text in file_text.items():
        file_class = _classify_path(path)
        # The adjacency channel only ever proposes code files.
        if channel == "adj" and file_class != "code":
            continue
        path_score = _path_match_score(path, variant)
        # Only the head of the file is scanned to keep scoring cheap.
        content_score = _content_match_score(text[:4000], variant)
        class_score = _class_weight(path, query, intent)
        adjacency = 0.0
        support_bonus = 0.0
        threshold = 0.28  # default acceptance threshold

        if channel == "lex":
            threshold = 0.22
        elif channel == "expand":
            # Support-role files get a dampened bonus on the expanded query.
            support_bonus = 0.6 * _support_role_bonus(path, variant, anchor_paths or [], file_text)
            threshold = 0.3
        elif channel == "adj":
            adjacency = _repair_candidate_bonus(path, query, intent, anchor_paths or [], file_text)
            support_bonus = _support_role_bonus(path, query, anchor_paths or [], file_text)
            threshold = 0.46  # adjacency hits must clear a higher bar

        total = path_score + content_score + class_score + adjacency + support_bonus
        # General documentation is down-weighted for debugging queries.
        if file_class == "general_doc" and _effective_intent(query, intent) == "debug":
            total -= 0.2
        if total < threshold:
            continue

        out.append(
            _ChannelCandidate(
                node_id=f"file:{path}",
                channel=channel,
                score=total,
                content=_snippet_from_lines(text.splitlines(), max_lines=120),
                rationale=f"path={path_score:.2f},content={content_score:.2f},adj={adjacency:.2f},support={support_bonus:.2f}",
            )
        )

    out.sort(key=lambda item: (-item.score, item.node_id))
    return out[:limit]
|
|
1673
|
+
|
|
1674
|
+
|
|
1675
|
+
def _explicit_file_channel_candidates(
    file_text: dict[str, str],
    query: str,
    intent: str | None,
    *,
    limit: int = 8,
) -> list[_ChannelCandidate]:
    """Build "target" channel candidates for files the query names explicitly.

    Explicitly mentioned files get a flat +1.0 score bump; edit-like queries
    receive a longer snippet window.
    """
    results: list[_ChannelCandidate] = []
    # Invariant over the loop: whether the query reads like an edit request.
    snippet_lines = 220 if _is_edit_like_query(query, intent) else 120
    for score, rel_path in _mentioned_file_paths(file_text, query)[:limit]:
        text = file_text.get(rel_path, "")
        if not text:
            continue
        results.append(
            _ChannelCandidate(
                node_id=f"file:{rel_path}",
                channel="target",
                score=score + 1.0,
                content=_snippet_from_lines(text.splitlines(), max_lines=snippet_lines),
                rationale=f"explicit_file={score:.2f}",
            )
        )
    return results
|
|
1697
|
+
|
|
1698
|
+
|
|
1699
|
+
def _vector_channel_candidates(
    graph: nx.DiGraph,
    query: str,
    file_text: dict[str, str],
    function_snippets: dict[str, str],
    class_snippets: dict[str, str],
    *,
    top_k: int,
) -> list[_ChannelCandidate]:
    """Wrap hybrid-retrieval hits as "vec" channel candidates.

    Hits whose content cannot be resolved are dropped.
    """
    results: list[_ChannelCandidate] = []
    for hit in hybrid_retrieve(graph, query, top_k=top_k):
        body = _candidate_content(hit.node_id, file_text, function_snippets, class_snippets)
        if not body:
            continue
        results.append(
            _ChannelCandidate(
                node_id=hit.node_id,
                channel="vec",
                score=hit.score,
                content=body,
                rationale=hit.rationale,
            )
        )
    return results
|
|
1723
|
+
|
|
1724
|
+
|
|
1725
|
+
def _fuse_context_channels(
    channel_map: dict[str, list[_ChannelCandidate]],
    query: str,
    intent: str | None,
    file_text: dict[str, str],
    *,
    explicit_targets: set[str] | None = None,
    query_shape: str | None = None,
    limit: int = 48,
) -> tuple[list[RankedSnippet], dict[str, dict[str, object]]]:
    """Fuse per-channel candidates into one ranked list.

    Stage 1 merges all channels with weighted reciprocal-rank fusion (RRF,
    constant 50). Stage 2 derives the top "strong" code paths, then computes
    a final score per node from the RRF total plus a series of bonuses and
    penalties, attaching explanatory metadata per node.

    Returns (ranked snippets truncated to *limit*, metadata keyed by node id).
    """
    # Per-channel RRF weights; "target" (explicitly named files) dominates.
    channel_weights = {
        "target": 1.5,
        "lex": 1.0,
        "vec": 1.15,
        "expand": 0.9,
        "adj": 0.95,
        "fallback": 1.05,
    }
    merged: dict[str, dict[str, object]] = {}
    explicit_targets = explicit_targets or set()
    resolved_query_shape = query_shape or _query_shape(query, intent, explicit_targets)

    for channel_name in ("target", "lex", "vec", "expand", "adj", "fallback"):
        candidates = channel_map.get(channel_name, [])
        candidates = sorted(candidates, key=lambda item: (-item.score, item.node_id))
        for rank, item in enumerate(candidates, start=1):
            entry = merged.setdefault(
                item.node_id,
                {
                    "rrf": 0.0,
                    "best_score": item.score,
                    "content": item.content,
                    "channels": set(),
                    "rationales": [],
                },
            )
            # Weighted RRF contribution: weight / (50 + rank within channel).
            entry["rrf"] = float(entry["rrf"]) + channel_weights.get(channel_name, 1.0) / (50.0 + rank)
            entry["best_score"] = max(float(entry["best_score"]), item.score)
            # Keep the first non-empty content seen for this node.
            entry["content"] = entry["content"] or item.content
            cast_channels = entry["channels"]
            assert isinstance(cast_channels, set)
            cast_channels.add(channel_name)
            cast_rationales = entry["rationales"]
            assert isinstance(cast_rationales, list)
            cast_rationales.append(f"{channel_name}:{item.rationale}")

    # Provisional ordering by fused RRF, then best raw score, then id.
    preliminary = sorted(
        merged.items(),
        key=lambda pair: (-float(pair[1]["rrf"]), -float(pair[1]["best_score"]), pair[0]),
    )
    # Up to 8 distinct code paths at the head of the provisional ranking.
    strong_paths: list[str] = []
    seen_paths: set[str] = set()
    for node_id, _ in preliminary:
        path = _node_file_path(node_id)
        if not path or _classify_path(path) != "code":
            continue
        if path in seen_paths:
            continue
        strong_paths.append(path)
        seen_paths.add(path)
        if len(strong_paths) >= 8:
            break

    ranked: list[RankedSnippet] = []
    attached: dict[str, dict[str, object]] = {}
    for node_id, entry in preliminary:
        path = _node_file_path(node_id)
        channels = tuple(sorted(entry["channels"]))
        # Base: amplified RRF plus the node's boost score.
        final_score = float(entry["rrf"]) * 16.0 + _boost_score(
            node_id,
            float(entry["best_score"]),
            query,
            intent,
            strong_paths,
            file_text,
        )
        # Multi-channel agreement and channel-specific bonuses.
        final_score += 0.05 * len(channels)
        if "target" in channels:
            final_score += 0.55
        if path:
            final_score -= _entrypoint_penalty(path, explicit_targets)
        if "lex" in channels and "vec" in channels:
            final_score += 0.18
        if "adj" in channels:
            final_score += 0.14
        if "expand" in channels:
            final_score += 0.08
        base_role = _file_role(path)
        candidate_role = "ranked"
        if explicit_targets:
            candidate_role = _candidate_role(path, query, resolved_query_shape, explicit_targets, strong_paths)
        final_score += _role_adjustment(candidate_role, resolved_query_shape, query, intent)
        final_score += _family_competition_adjustment(path, explicit_targets, resolved_query_shape)
        final_score += _subtree_locality_adjustment(path, explicit_targets, resolved_query_shape)
        final_score += _support_config_penalty(path, candidate_role, explicit_targets)
        if base_role in _SUPPORT_ROLE_TOKENS and _support_promotion_enabled(query, intent):
            final_score += 0.1

        ranked.append(
            RankedSnippet(
                node_id=node_id,
                content=str(entry["content"]),
                score=final_score,
            )
        )
        attached[node_id] = {
            "channels": list(channels[:4]),
            "family": _candidate_family(path),
            "file_role": base_role,
            "candidate_role": candidate_role,
            "query_shape": resolved_query_shape,
            "file_class": _classify_path(path) if path else "unknown",
            "why_included": "+".join(channels) if channels else "ranked",
        }

    ranked.sort(key=lambda item: (-item.score, item.node_id))
    return ranked[:limit], attached
|
|
1842
|
+
|
|
1843
|
+
|
|
1844
|
+
def _selected_file_paths(selected: tuple[RankedSnippet, ...]) -> list[str]:
    """Unique file paths behind the selected snippets, in first-seen order."""
    # A dict preserves insertion order, giving deduplication for free.
    ordered: dict[str, None] = {}
    for item in selected:
        path = _node_file_path(item.node_id)
        if path:
            ordered.setdefault(path, None)
    return list(ordered)
|
|
1854
|
+
|
|
1855
|
+
|
|
1856
|
+
def _referenced_companion_paths(
    anchor_paths: list[str],
    file_text: dict[str, str],
    selected_paths: set[str],
) -> list[str]:
    """Return unselected code files that any anchor file textually references.

    A file counts as referenced when any of its reference tokens occurs in
    the lowered text of at least one anchor file. Output is sorted by path
    (iteration over sorted(file_text)).
    """
    # Fix: the original recomputed file_text.get(anchor, "").lower() for every
    # candidate path, re-lowercasing each anchor's full file text O(paths)
    # times. Hoist the lowered anchor texts out of the per-path loop.
    anchor_texts = [file_text.get(anchor, "").lower() for anchor in anchor_paths]
    referenced: list[str] = []
    for path in sorted(file_text):
        if path in selected_paths or _classify_path(path) != "code":
            continue
        tokens = _reference_tokens(path)
        # any(...) mirrors the original first-match break over anchors.
        if any(text and any(token in text for token in tokens) for text in anchor_texts):
            referenced.append(path)
    return referenced
|
|
1872
|
+
|
|
1873
|
+
|
|
1874
|
+
def _context_fallback_reason(
    payload,
    query: str,
    intent: str | None,
    file_text: dict[str, str],
    attached: dict[str, dict[str, object]],
) -> str | None:
    """Decide whether the built context payload warrants a fallback search.

    Returns a reason string ("insufficient_context_coverage",
    "support_family_missing", or "low_context_confidence") when the selection
    looks weak, or None when it looks sufficient.
    """
    selected_paths = _selected_file_paths(payload.snippets)
    code_paths = [path for path in selected_paths if _classify_path(path) == "code"]
    # No code at all in the selection is an immediate coverage failure.
    if not code_paths:
        return "insufficient_context_coverage"

    # Files referenced by selected code but not themselves selected.
    referenced_missing = _referenced_companion_paths(code_paths, file_text, set(selected_paths))
    if referenced_missing:
        return "support_family_missing"

    families = {_candidate_family(path) for path in code_paths}
    # A "strong" file was agreed on by at least two retrieval channels.
    strong_files = 0
    for snippet in payload.snippets:
        meta = attached.get(snippet.node_id, {})
        channels = meta.get("channels", []) if isinstance(meta, dict) else []
        if len(channels) >= 2 and (_node_file_path(snippet.node_id) or "") in code_paths:
            strong_files += 1

    # System-level queries need breadth: at least 2 code files and 2 families.
    if _is_system_query(query) and (len(code_paths) < 2 or len(families) < 2):
        return "insufficient_context_coverage"
    if _support_promotion_enabled(query, intent) and strong_files < 2:
        return "low_context_confidence"
    return None
|
|
1903
|
+
|
|
1904
|
+
|
|
1905
|
+
def _normal_search_fallback_snippets(
    file_text: dict[str, str],
    query: str,
    intent: str | None,
    selected: tuple[RankedSnippet, ...],
    existing_ids: set[str],
    *,
    limit: int = 4,
) -> list[_ChannelCandidate]:
    """Score every not-yet-selected file for the "fallback" channel.

    Combines path/content/class/adjacency/support signals, with a large
    bonus (+0.75) for files the selected code already references. Files
    below a 0.5 total are dropped. Returns at most *limit* candidates.
    """
    selected_paths = _selected_file_paths(selected)
    anchors = [path for path in selected_paths if _classify_path(path) == "code"]
    referenced = set(_referenced_companion_paths(anchors, file_text, set(selected_paths)))

    out: list[_ChannelCandidate] = []
    for path, text in file_text.items():
        node_id = f"file:{path}"
        if node_id in existing_ids:
            continue
        file_class = _classify_path(path)
        # Debug-intent queries skip general documentation entirely.
        if file_class == "general_doc" and _effective_intent(query, intent) == "debug":
            continue
        path_score = _path_match_score(path, query)
        # Scan a slightly larger head (5000 chars) than the primary channels.
        content_score = _content_match_score(text[:5000], query)
        class_score = _class_weight(path, query, intent)
        adjacency = _adjacency_boost(path, query, intent, anchors, file_text)
        support_bonus = _support_role_bonus(path, query, anchors, file_text)
        total = path_score + content_score + class_score + adjacency + support_bonus
        if path in referenced:
            total += 0.75
        if total < 0.5:
            continue
        out.append(
            _ChannelCandidate(
                node_id=node_id,
                channel="fallback",
                score=total,
                content=_snippet_from_lines(text.splitlines(), max_lines=120),
                rationale=f"fallback:path={path_score:.2f},content={content_score:.2f},referenced={path in referenced}",
            )
        )

    out.sort(key=lambda item: (-item.score, item.node_id))
    return out[:limit]
|
|
1948
|
+
|
|
1949
|
+
|
|
1950
|
+
def _apply_normal_search_fallback(
    ranked: list[RankedSnippet],
    file_text: dict[str, str],
    query: str,
    intent: str | None,
    payload,
    attached: dict[str, dict[str, object]],
) -> tuple[list[RankedSnippet], dict[str, dict[str, object]], set[str], str | None, bool]:
    """Run the normal-search fallback pass when context coverage looks weak.

    Returns (ranked, attached, fallback ids, fallback reason, used flag).
    When no fallback is warranted the inputs come back untouched.
    """
    reason = _context_fallback_reason(payload, query, intent, file_text, attached)
    if not reason:
        return ranked, attached, set(), None, False

    known_ids = {item.node_id for item in ranked}
    candidates = _normal_search_fallback_snippets(file_text, query, intent, payload.snippets, known_ids)
    if not candidates:
        # Fallback was warranted but nothing new was found.
        return ranked, attached, set(), reason, True

    fused_ranked, fused_attached = _fuse_context_channels(
        {"fallback": candidates}, query, intent, file_text, limit=len(candidates)
    )
    fallback_ids = {item.node_id for item in fused_ranked}
    attached.update(fused_attached)
    ranked.extend(fused_ranked)

    def _order(item: RankedSnippet) -> tuple[int, int, float, str]:
        # Fallback snippets first, then file-level nodes, then score/id.
        return (
            int(item.node_id not in fallback_ids),
            int(not item.node_id.startswith("file:")),
            -item.score,
            item.node_id,
        )

    ranked.sort(key=_order)
    return ranked, attached, fallback_ids, reason, True
|
|
1981
|
+
|
|
1982
|
+
|
|
1983
|
+
|
|
1984
|
+
def _skeletonize_content(content: str, max_lines: int = 60) -> str:
|
|
1985
|
+
lines = content.splitlines()
|
|
1986
|
+
if len(lines) <= max_lines:
|
|
1987
|
+
return content
|
|
1988
|
+
|
|
1989
|
+
signature_pattern = re.compile(r"^\s*(def\s+|class\s+|export\s+|function\s+|const\s+|let\s+|var\s+|@app\.route|if\s+__name__)", re.IGNORECASE)
|
|
1990
|
+
selected: list[str] = []
|
|
1991
|
+
for line in lines:
|
|
1992
|
+
if signature_pattern.search(line):
|
|
1993
|
+
selected.append(line)
|
|
1994
|
+
if len(selected) >= max_lines:
|
|
1995
|
+
break
|
|
1996
|
+
|
|
1997
|
+
if len(selected) < min(20, max_lines):
|
|
1998
|
+
selected = lines[:max_lines]
|
|
1999
|
+
|
|
2000
|
+
return "\n".join(selected).strip()
|
|
2001
|
+
|
|
2002
|
+
|
|
2003
|
+
def _packaging_sets(
    ranked: list[RankedSnippet],
    attached: dict[str, dict[str, object]],
    *,
    explicit_priority_ids: set[str],
    linked_priority_ids: set[str],
    chain_priority_ids: set[str],
    mandatory_node_ids: set[str],
) -> tuple[set[str], set[str]]:
    """Split ranked nodes into pivots (kept whole) and skeleton candidates.

    Pivots come from the priority id sets; without any, the top two
    file-level snippets serve as pivots. Non-pivot, non-mandatory file
    nodes with a demotable candidate role are marked for skeletonization.
    """
    pivot_ids = explicit_priority_ids | linked_priority_ids | chain_priority_ids
    if not pivot_ids:
        # No priority signal: promote the first two file-level snippets.
        for item in ranked:
            if item.node_id.startswith("file:"):
                pivot_ids.add(item.node_id)
                if len(pivot_ids) >= 2:
                    break

    demotable_roles = {"support_config", "sibling_module", "caller_or_entry", "generic_entrypoint"}
    skeleton_ids: set[str] = set()
    for item in ranked:
        nid = item.node_id
        if not nid.startswith("file:"):
            continue
        if nid in pivot_ids or nid in mandatory_node_ids:
            continue
        meta = attached.get(nid, {})
        if isinstance(meta, dict) and meta.get("candidate_role") in demotable_roles:
            skeleton_ids.add(nid)
    return pivot_ids, skeleton_ids
|
|
2031
|
+
|
|
2032
|
+
|
|
2033
|
+
def _apply_packaging(
    ranked: list[RankedSnippet],
    pivot_ids: set[str],
    skeleton_ids: set[str],
    *,
    max_skeleton_lines: int = 60,
) -> list[RankedSnippet]:
    """Return ranked snippets with skeleton-marked entries reduced to signatures.

    Only membership in *skeleton_ids* triggers reduction; all other items
    (including pivots) pass through unchanged.
    """

    def _maybe_shrink(item: RankedSnippet) -> RankedSnippet:
        # One-line purpose: shrink an item's content when it is skeleton-marked.
        if item.node_id not in skeleton_ids:
            return item
        return RankedSnippet(
            node_id=item.node_id,
            score=item.score,
            content=_skeletonize_content(item.content, max_lines=max_skeleton_lines),
        )

    return [_maybe_shrink(item) for item in ranked]
|
|
2053
|
+
|
|
2054
|
+
def run_context(path: str, query: str, budget: int | None, intent: str | None, top_k: int = 40) -> dict:
    """Build a context payload for *query* over a repo directory or single file.

    Pipeline: collect sources and graphs, run the retrieval channels,
    fuse them (twice — the second pass includes the adjacency channel),
    apply priority promotion and packaging, build the context, then
    optionally run a normal-search fallback pass and rebuild.

    Returns a dict with the query, a token estimate, the selected snippets
    (each with attached_context metadata), and fallback bookkeeping.
    """
    target = Path(path)

    if target.is_dir():
        _, file_text, function_snippets, class_snippets, _, graph = _collect_repo_modules(target)
    else:
        # Single-file mode: parse just this file and compose its graphs.
        module = parse_python_file(target)
        source = target.read_text(encoding="utf-8").splitlines()
        graph = nx.compose(
            nx.compose(build_call_graph((module,)), build_variable_graph((module,))),
            build_code_structure_graph((module,)),
        )
        file_rel = target.as_posix()
        file_text = {file_rel: "\n".join(source)}
        function_snippets = {}
        class_snippets = {}
        lines = source
        for fn in module.functions:
            # Clamp line ranges defensively before slicing the source.
            start = max(fn.start_line, 1)
            end = max(fn.end_line, start)
            snippet = "\n".join(lines[start - 1 : end]).strip()
            node_id = f"function:{Path(fn.file).as_posix()}::{fn.name}"
            if snippet:
                function_snippets[node_id] = snippet
        for cls in module.classes:
            start = max(cls.start_line, 1)
            end = max(cls.end_line, start)
            snippet = "\n".join(lines[start - 1 : end]).strip()
            node_id = f"class:{Path(cls.file).as_posix()}::{cls.name}"
            if snippet:
                class_snippets[node_id] = snippet

    explicit_priority_ids = _explicit_priority_ids(file_text, query, intent)
    # "file:" prefix is 5 characters; strip it to recover the raw paths.
    explicit_target_paths = {node_id[5:] for node_id in explicit_priority_ids if node_id.startswith("file:")}
    query_shape = _query_shape(query, intent, explicit_target_paths)

    channels: dict[str, list[_ChannelCandidate]] = {
        "target": _explicit_file_channel_candidates(file_text, query, intent),
        "vec": _vector_channel_candidates(graph, query, file_text, function_snippets, class_snippets, top_k=top_k),
        "lex": _file_channel_candidates(file_text, query, intent, channel="lex", limit=18),
        "expand": _file_channel_candidates(file_text, query, intent, channel="expand", limit=14),
    }
    # First fusion pass, without adjacency (anchors are not known yet).
    ranked, attached = _fuse_context_channels(
        channels,
        query,
        intent,
        file_text,
        explicit_targets=explicit_target_paths,
        query_shape=query_shape,
    )
    # Second pass adds the adjacency channel anchored on the first result.
    anchor_paths = _top_anchor_paths(ranked)
    channels["adj"] = _file_channel_candidates(file_text, query, intent, channel="adj", anchor_paths=anchor_paths, limit=10)
    ranked, attached = _fuse_context_channels(
        channels,
        query,
        intent,
        file_text,
        explicit_targets=explicit_target_paths,
        query_shape=query_shape,
    )

    ranked, support_priority_ids = _collapse_support_query_snippets(ranked, query, intent, file_text)
    # Rebuild explicit priorities from the target channel, then layer on top.
    explicit_priority_ids = {candidate.node_id for candidate in channels.get("target", [])}
    explicit_priority_ids |= _layer_priority_ids(ranked, query, intent, explicit_priority_ids)
    linked_priority_ids = _linked_file_priority_ids(ranked, explicit_priority_ids, query_shape, query, intent)
    chain_priority_ids = _chain_quota_priority_ids(ranked, query, intent, explicit_target_paths)
    ranked = _promote_priority_first(
        ranked,
        explicit_priority_ids,
        linked_priority_ids,
        chain_priority_ids,
        explicit_target_paths,
    )

    mandatory_node_ids = _mandatory_node_ids(
        ranked,
        query,
        intent,
        support_priority_ids=support_priority_ids | linked_priority_ids | chain_priority_ids,
        explicit_priority_ids=explicit_priority_ids | linked_priority_ids | chain_priority_ids,
    )

    pivot_node_ids, skeleton_node_ids = _packaging_sets(
        ranked,
        attached,
        explicit_priority_ids=explicit_priority_ids,
        linked_priority_ids=linked_priority_ids,
        chain_priority_ids=chain_priority_ids,
        mandatory_node_ids=mandatory_node_ids,
    )
    packed_ranked = _apply_packaging(ranked, pivot_node_ids, skeleton_node_ids)

    payload = build_context(
        query,
        packed_ranked,
        token_budget=budget,
        intent=intent,
        mandatory_node_ids=mandatory_node_ids,
    )

    ranked, attached, fallback_priority_ids, fallback_reason, fallback_search_used = _apply_normal_search_fallback(
        ranked,
        file_text,
        query,
        intent,
        payload,
        attached,
    )
    if fallback_priority_ids:
        # Fallback found new snippets: re-collapse, re-promote, re-package,
        # and rebuild the payload with the enlarged priority sets.
        combined_priority_ids = support_priority_ids | fallback_priority_ids | linked_priority_ids | chain_priority_ids
        ranked, support_priority_ids = _collapse_support_query_snippets(ranked, query, intent, file_text)
        combined_priority_ids |= support_priority_ids
        ranked = _promote_priority_first(
            ranked,
            explicit_priority_ids,
            linked_priority_ids,
            chain_priority_ids,
            explicit_target_paths,
        )
        mandatory_node_ids = _mandatory_node_ids(
            ranked,
            query,
            intent,
            support_priority_ids=combined_priority_ids,
            explicit_priority_ids=explicit_priority_ids | linked_priority_ids | chain_priority_ids,
        )
        pivot_node_ids, skeleton_node_ids = _packaging_sets(
            ranked,
            attached,
            explicit_priority_ids=explicit_priority_ids,
            linked_priority_ids=linked_priority_ids,
            chain_priority_ids=chain_priority_ids,
            mandatory_node_ids=mandatory_node_ids,
        )
        packed_ranked = _apply_packaging(ranked, pivot_node_ids, skeleton_node_ids)
        payload = build_context(
            query,
            packed_ranked,
            token_budget=budget,
            intent=intent,
            mandatory_node_ids=mandatory_node_ids,
        )

    snippets_out: list[dict[str, object]] = []
    for snippet in payload.snippets:
        # Default metadata for nodes that fusion never annotated.
        base_meta = attached.get(
            snippet.node_id,
            {
                "channels": [],
                "family": _candidate_family(_node_file_path(snippet.node_id)),
                "file_role": _file_role(_node_file_path(snippet.node_id)),
                "candidate_role": "ranked",
                "query_shape": query_shape,
                "file_class": _classify_path(_node_file_path(snippet.node_id) or ""),
                "why_included": "selected",
            },
        )
        # Copy before mutation so shared attached entries stay pristine.
        meta = dict(base_meta)
        if snippet.node_id in pivot_node_ids:
            meta["packaging_role"] = "pivot"
        elif snippet.node_id in skeleton_node_ids:
            meta["packaging_role"] = "adjacent_support"
        else:
            meta["packaging_role"] = "full"

        snippets_out.append(
            {
                "node_id": snippet.node_id,
                "score": snippet.score,
                "content": snippet.content,
                "attached_context": meta,
            }
        )

    return {
        "query": payload.query,
        "tokens": payload.total_tokens_estimate,
        "snippets": snippets_out,
        "fallback_search_used": fallback_search_used,
        "fallback_reason": fallback_reason,
    }
|
|
2235
|
+
|
|
2236
|
+
def _adaptive_companion_candidates(
    payload: dict,
    file_text: dict[str, str],
    query: str,
    intent: str | None,
) -> list[_AdaptiveCompanion]:
    """Score referenced-but-unselected code files as adaptive companions.

    Anchors are the code files already in the payload; candidates are files
    those anchors reference. Returns companions sorted best-first.
    """
    # Rehydrate lightweight RankedSnippet objects from the serialized payload.
    selected_paths = _selected_file_paths(
        tuple(
            RankedSnippet(
                node_id=item["node_id"],
                content=item.get("content", ""),
                score=float(item.get("score", 0.0)),
            )
            for item in payload.get("snippets", [])
        )
    )
    selected_set = set(selected_paths)
    anchors = [path for path in selected_paths if _classify_path(path) == "code"]
    if not anchors:
        return []

    anchor_families = {_candidate_family(path) for path in anchors}
    candidates: list[_AdaptiveCompanion] = []
    for path in _referenced_companion_paths(anchors, file_text, selected_set):
        if _classify_path(path) != "code":
            continue
        text = file_text.get(path, "")
        path_score = _path_match_score(path, query)
        same_family = _candidate_family(path) in anchor_families
        role = _file_role(path)
        # Aggregate the standard relevance signals, head-of-file only.
        score = path_score
        score += _content_match_score(text[:4000], query)
        score += _class_weight(path, query, intent)
        score += _support_role_bonus(path, query, anchors, file_text)
        score += _adjacency_boost(path, query, intent, anchors, file_text)
        if same_family:
            score += 0.18
        if role in _SUPPORT_ROLE_TOKENS:
            score += 0.08
        candidates.append(
            _AdaptiveCompanion(
                path=path,
                score=score,
                path_score=path_score,
                same_family=same_family,
                role=role,
            )
        )

    candidates.sort(key=lambda item: (-item.score, item.path))
    return candidates
|
|
2287
|
+
|
|
2288
|
+
|
|
2289
|
+
def _adaptive_missing_companions(
    payload: dict,
    file_text: dict[str, str],
    query: str,
    intent: str | None,
    *,
    limit: int = 1,
) -> list[str]:
    """Pick up to *limit* companion file paths worth pinning into the context.

    Applies only when the retrieval pass reported ``support_family_missing``
    as its fallback reason AND the query qualifies for support-file promotion
    (a system-style query, or promotion explicitly enabled for this
    query/intent). In every other case the answer is an empty list.

    A candidate qualifies when its combined score reaches 1.05 and it is
    either in the same family as an anchor, has a strong path match
    (>= 0.35), or plays a recognised support role.
    """
    if payload.get("fallback_reason") != "support_family_missing":
        return []
    promotion_allowed = _is_system_query(query) or _support_promotion_enabled(query, intent)
    if not promotion_allowed:
        return []

    chosen: list[str] = []
    for candidate in _adaptive_companion_candidates(payload, file_text, query, intent):
        if len(chosen) >= limit:
            break
        if candidate.score < 1.05:
            continue
        qualifies = (
            candidate.same_family
            or candidate.path_score >= 0.35
            or candidate.role in _SUPPORT_ROLE_TOKENS
        )
        if qualifies:
            chosen.append(candidate.path)
    return chosen
|
|
2308
|
+
|
|
2309
|
+
|
|
2310
|
+
def _adaptive_replace_index(snippets: list[dict], family: str) -> int | None:
|
|
2311
|
+
candidates: list[tuple[float, int]] = []
|
|
2312
|
+
for idx, item in enumerate(snippets):
|
|
2313
|
+
attached = item.get("attached_context", {})
|
|
2314
|
+
if attached.get("why_included") == "adaptive_pin":
|
|
2315
|
+
continue
|
|
2316
|
+
if attached.get("family") != family:
|
|
2317
|
+
continue
|
|
2318
|
+
candidates.append((float(item.get("score", 0.0)), idx))
|
|
2319
|
+
if not candidates:
|
|
2320
|
+
return None
|
|
2321
|
+
candidates.sort(key=lambda item: (item[0], item[1]))
|
|
2322
|
+
return candidates[0][1]
|
|
2323
|
+
|
|
2324
|
+
|
|
2325
|
+
def run_context_adaptive(
    path: str,
    query: str,
    budget: int | None,
    intent: str | None,
    top_k: int = 40,
    *,
    completion_limit: int = 1,
) -> dict:
    """Run the normal context pipeline, then pin missing companion files.

    Calls :func:`run_context` first, and when the retrieval pass reported a
    missing support family, promotes up to *completion_limit* companion
    files into the snippet list. A promoted file may replace the weakest
    existing snippet of the same family (see ``_adaptive_replace_index``),
    and is only added if the token budget still allows it.

    Args:
        path: File or directory to build context for; its directory is
            treated as the repo root for module collection.
        query: The user query driving retrieval.
        budget: Optional token budget; ``None`` means unlimited.
        intent: Optional intent hint forwarded to the pipeline.
        top_k: Number of snippets requested from the base pipeline.
        completion_limit: Maximum number of companion files to pin.

    Returns:
        The ``run_context`` payload, augmented with the keys
        ``adaptive_completion_used``, ``adaptive_completion_reason`` and
        ``adaptive_missing_files``.
    """
    payload = run_context(path, query, budget, intent, top_k=top_k)
    target = Path(path)
    repo_root = target if target.is_dir() else target.parent
    _, file_text, _, _, _, _ = _collect_repo_modules(repo_root)

    missing_paths = _adaptive_missing_companions(payload, file_text, query, intent, limit=completion_limit)
    if not missing_paths:
        payload["adaptive_completion_used"] = False
        payload["adaptive_completion_reason"] = None
        payload["adaptive_missing_files"] = []
        return payload

    remaining_budget = budget
    used_tokens = int(payload.get("tokens", 0))
    snippets = list(payload.get("snippets", []))
    existing_ids = {item["node_id"] for item in snippets}
    added: list[str] = []
    for rel_path in missing_paths:
        node_id = f"file:{rel_path}"
        if node_id in existing_ids:
            continue
        content = _snippet_from_lines(file_text.get(rel_path, "").splitlines(), max_lines=60)
        if not content:
            continue
        token_cost = estimate_tokens(content)
        family = _candidate_family(rel_path)
        replace_idx = _adaptive_replace_index(snippets, family)
        # BUG FIX: the budget check must run BEFORE popping the victim
        # snippet. Previously the victim was removed first; if the budget
        # check then failed, the victim was lost and the context silently
        # shrank with nothing added in its place.
        freed_tokens = 0
        if replace_idx is not None:
            freed_tokens = estimate_tokens(snippets[replace_idx].get("content", ""))
        if remaining_budget is not None and used_tokens - freed_tokens + token_cost > remaining_budget:
            continue
        if replace_idx is not None:
            replaced = snippets.pop(replace_idx)
            used_tokens -= freed_tokens
            existing_ids.discard(replaced["node_id"])
        snippets.append(
            {
                "node_id": node_id,
                # Sentinel score so pinned snippets outrank ranked ones.
                "score": 99.0,
                "content": content,
                "attached_context": {
                    "channels": ["adaptive_pin"],
                    "family": family,
                    "file_role": _file_role(rel_path),
                    "file_class": _classify_path(rel_path),
                    "why_included": "adaptive_pin",
                },
            }
        )
        used_tokens += token_cost
        existing_ids.add(node_id)
        added.append(rel_path)

    # Pinned snippets first, then by descending score, with node_id as a
    # deterministic tiebreaker.
    snippets.sort(key=lambda item: (0 if item.get("attached_context", {}).get("why_included") == "adaptive_pin" else 1, -float(item.get("score", 0.0)), item["node_id"]))
    payload["snippets"] = snippets
    payload["tokens"] = used_tokens
    payload["adaptive_completion_used"] = bool(added)
    payload["adaptive_completion_reason"] = "missing_referenced_companion" if added else None
    payload["adaptive_missing_files"] = added
    return payload
|
|
2392
|
+
|
|
2393
|
+
|
|
2394
|
+
|
|
2395
|
+
|
|
2396
|
+
|
|
2397
|
+
|
|
2398
|
+
|
|
2399
|
+
|
|
2400
|
+
|
|
2401
|
+
|
|
2402
|
+
|
|
2403
|
+
|
|
2404
|
+
|
|
2405
|
+
|
|
2406
|
+
|
|
2407
|
+
|
|
2408
|
+
|
|
2409
|
+
|
|
2410
|
+
|
|
2411
|
+
|
|
2412
|
+
|
|
2413
|
+
|
|
2414
|
+
|
|
2415
|
+
|
|
2416
|
+
|
|
2417
|
+
|
|
2418
|
+
|
|
2419
|
+
|
|
2420
|
+
|
|
2421
|
+
|
|
2422
|
+
|
|
2423
|
+
|
|
2424
|
+
|
|
2425
|
+
|
|
2426
|
+
|