@pmaddire/gcie 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENT.md +256 -0
- package/AGENT_USAGE.md +231 -0
- package/ARCHITECTURE.md +151 -0
- package/CLAUDE.md +69 -0
- package/DEBUGGING_PLAYBOOK.md +160 -0
- package/KNOWLEDGE_INDEX.md +154 -0
- package/POTENTIAL_UPDATES +130 -0
- package/PROJECT.md +141 -0
- package/README.md +371 -0
- package/REPO_DIGITAL_TWIN.md +98 -0
- package/ROADMAP.md +301 -0
- package/SETUP_ANY_REPO.md +85 -0
- package/bin/gcie-init.js +20 -0
- package/bin/gcie.js +45 -0
- package/cli/__init__.py +1 -0
- package/cli/app.py +163 -0
- package/cli/commands/__init__.py +1 -0
- package/cli/commands/cache.py +35 -0
- package/cli/commands/context.py +2426 -0
- package/cli/commands/context_slices.py +617 -0
- package/cli/commands/debug.py +24 -0
- package/cli/commands/index.py +17 -0
- package/cli/commands/query.py +20 -0
- package/cli/commands/setup.py +73 -0
- package/config/__init__.py +1 -0
- package/config/scanner_config.py +82 -0
- package/context/__init__.py +1 -0
- package/context/architecture_bootstrap.py +170 -0
- package/context/architecture_index.py +185 -0
- package/context/architecture_parser.py +170 -0
- package/context/architecture_slicer.py +308 -0
- package/context/context_router.py +70 -0
- package/context/fallback_evaluator.py +21 -0
- package/coverage_integration/__init__.py +1 -0
- package/coverage_integration/coverage_loader.py +55 -0
- package/debugging/__init__.py +12 -0
- package/debugging/bug_localizer.py +81 -0
- package/debugging/execution_path_analyzer.py +42 -0
- package/embeddings/__init__.py +6 -0
- package/embeddings/encoder.py +45 -0
- package/embeddings/faiss_index.py +72 -0
- package/git_integration/__init__.py +1 -0
- package/git_integration/git_miner.py +78 -0
- package/graphs/__init__.py +17 -0
- package/graphs/call_graph.py +70 -0
- package/graphs/code_graph.py +81 -0
- package/graphs/execution_graph.py +35 -0
- package/graphs/git_graph.py +43 -0
- package/graphs/graph_store.py +25 -0
- package/graphs/node_factory.py +21 -0
- package/graphs/test_graph.py +65 -0
- package/graphs/validators.py +28 -0
- package/graphs/variable_graph.py +51 -0
- package/knowledge_index/__init__.py +1 -0
- package/knowledge_index/index_builder.py +60 -0
- package/knowledge_index/models.py +35 -0
- package/knowledge_index/query_api.py +38 -0
- package/knowledge_index/store.py +23 -0
- package/llm_context/__init__.py +6 -0
- package/llm_context/context_builder.py +67 -0
- package/llm_context/snippet_selector.py +57 -0
- package/package.json +14 -0
- package/parser/__init__.py +18 -0
- package/parser/ast_parser.py +216 -0
- package/parser/call_resolver.py +52 -0
- package/parser/models.py +75 -0
- package/parser/tree_sitter_adapter.py +56 -0
- package/parser/variable_extractor.py +31 -0
- package/retrieval/__init__.py +17 -0
- package/retrieval/cache.py +22 -0
- package/retrieval/hybrid_retriever.py +249 -0
- package/retrieval/query_parser.py +38 -0
- package/retrieval/ranking.py +43 -0
- package/retrieval/semantic_retriever.py +39 -0
- package/retrieval/symbolic_retriever.py +80 -0
- package/scanner/__init__.py +5 -0
- package/scanner/file_filters.py +37 -0
- package/scanner/models.py +44 -0
- package/scanner/repository_scanner.py +55 -0
- package/scripts/bootstrap_from_github.ps1 +41 -0
- package/tracing/__init__.py +1 -0
- package/tracing/runtime_tracer.py +60 -0
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
"""Architecture-driven context slicing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
import re
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from llm_context.snippet_selector import estimate_tokens
|
|
10
|
+
|
|
11
|
+
from .architecture_index import load_architecture_index
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# If at least this fraction of the indexed key files is missing on disk,
# the architecture index is treated as stale and slicing gives up.
_MISSING_RATIO_FALLBACK = 0.5
# Common query words (3+ chars) that carry no matching signal; filtered
# out by _tokenize before scoring subsystems.
_STOPWORDS = {
    "for",
    "and",
    "the",
    "with",
    "from",
    "this",
    "that",
    "into",
    "onto",
    "over",
    "under",
    "fix",
    "add",
    "update",
    "refactor",
    "change",
    "when",
    "why",
    "how",
    "use",
    "using",
    "used",
    "make",
    "new",
}
# Tokens that mark a query as architecture-level; such queries additionally
# pull in core-infrastructure files.
_ARCH_KEYWORDS = {
    "fallback",
    "router",
    "routing",
    "context",
    "slicer",
    "architecture",
    "validation",
    "mode",
    "confidence",
}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
@dataclass
class ArchitectureSliceResult:
    """Outcome of architecture-driven context slicing for a single query."""

    # The query string the slice was computed for.
    query: str
    # Snippet dicts with "node_id", "score", and "content" keys.
    snippets: list[dict]
    # Match confidence in [0, 1]; 0.0 when slicing failed.
    confidence: float
    # Matched subsystems as {"name": ..., "score": ...} dicts.
    matched_subsystems: list[dict]
    # Indexed file paths that were missing or unreadable on disk.
    missing_files: list[str]
    # Failure reason (e.g. "index_missing", "low_match") or None on success.
    error: str | None = None
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _tokenize(text: str) -> set[str]:
    """Split *text* into lowercase alphanumeric tokens worth matching on.

    Splits on whitespace, underscores, and hyphens; drops tokens shorter
    than three characters and tokens listed in ``_STOPWORDS``.
    """
    result: set[str] = set()
    for piece in re.split(r"[\s_-]+", text.lower()):
        cleaned = "".join(ch for ch in piece if ch.isalnum() or ch == "_")
        if len(cleaned) >= 3 and cleaned not in _STOPWORDS:
            result.add(cleaned)
    return result
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _subsystem_blob(subsystem: dict) -> str:
|
|
75
|
+
parts = [subsystem.get("name", ""), subsystem.get("purpose", ""), subsystem.get("status", "")]
|
|
76
|
+
for field in (
|
|
77
|
+
subsystem.get("interfaces", []),
|
|
78
|
+
subsystem.get("depends_on", []),
|
|
79
|
+
subsystem.get("used_by", []),
|
|
80
|
+
subsystem.get("failure_modes", []),
|
|
81
|
+
subsystem.get("notes", []),
|
|
82
|
+
):
|
|
83
|
+
if field:
|
|
84
|
+
parts.extend(field)
|
|
85
|
+
return " ".join(parts)
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _score_subsystem(subsystem: dict, query_tokens: set[str]) -> float:
    """Score a subsystem as the fraction of query tokens found in its metadata."""
    if not query_tokens:
        return 0.0
    haystack = _subsystem_blob(subsystem).lower()
    hit_count = sum(token in haystack for token in query_tokens)
    return hit_count / max(len(query_tokens), 1)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _snippet_from_lines(lines: list[str], max_lines: int) -> str:
|
|
97
|
+
return "\n".join(lines[:max_lines]).strip()
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _collect_snippets(repo_path: Path, files: list[str], max_lines: int = 120) -> tuple[list[dict], list[str]]:
|
|
101
|
+
snippets: list[dict] = []
|
|
102
|
+
missing: list[str] = []
|
|
103
|
+
for rel_path in files:
|
|
104
|
+
file_path = repo_path / rel_path
|
|
105
|
+
if not file_path.exists():
|
|
106
|
+
missing.append(rel_path)
|
|
107
|
+
continue
|
|
108
|
+
try:
|
|
109
|
+
content = file_path.read_text(encoding="utf-8").splitlines()
|
|
110
|
+
except Exception:
|
|
111
|
+
missing.append(rel_path)
|
|
112
|
+
continue
|
|
113
|
+
snippet = _snippet_from_lines(content, max_lines=max_lines)
|
|
114
|
+
if snippet:
|
|
115
|
+
snippets.append(
|
|
116
|
+
{
|
|
117
|
+
"node_id": f"file:{rel_path}",
|
|
118
|
+
"score": 1.0,
|
|
119
|
+
"content": snippet,
|
|
120
|
+
}
|
|
121
|
+
)
|
|
122
|
+
return snippets, missing
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _validate_index(repo_path: Path, index_data: dict) -> tuple[list[dict], list[str], float]:
|
|
126
|
+
missing: list[str] = []
|
|
127
|
+
cleaned: list[dict] = []
|
|
128
|
+
total = 0
|
|
129
|
+
|
|
130
|
+
for subsystem in index_data.get("subsystems", []):
|
|
131
|
+
key_files = subsystem.get("key_files", []) or []
|
|
132
|
+
total += len(key_files)
|
|
133
|
+
valid_files: list[str] = []
|
|
134
|
+
for rel_path in key_files:
|
|
135
|
+
if (repo_path / rel_path).exists():
|
|
136
|
+
valid_files.append(rel_path)
|
|
137
|
+
else:
|
|
138
|
+
missing.append(rel_path)
|
|
139
|
+
cleaned.append({**subsystem, "key_files": valid_files})
|
|
140
|
+
|
|
141
|
+
if total == 0:
|
|
142
|
+
return cleaned, missing, 0.0
|
|
143
|
+
|
|
144
|
+
missing_ratio = len(missing) / total
|
|
145
|
+
return cleaned, missing, missing_ratio
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def _arch_query(query_tokens: set[str]) -> bool:
    """Return True when the query contains any architecture keyword."""
    return not query_tokens.isdisjoint(_ARCH_KEYWORDS)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _rank_core_files(core_files: list[str], query_tokens: set[str]) -> list[str]:
|
|
153
|
+
weights = {
|
|
154
|
+
"router": 3,
|
|
155
|
+
"routing": 3,
|
|
156
|
+
"fallback": 3,
|
|
157
|
+
"architecture": 2,
|
|
158
|
+
"slicer": 2,
|
|
159
|
+
"validation": 2,
|
|
160
|
+
"context": 1,
|
|
161
|
+
"mode": 1,
|
|
162
|
+
"confidence": 1,
|
|
163
|
+
}
|
|
164
|
+
ranked = []
|
|
165
|
+
for path in core_files:
|
|
166
|
+
lowered = path.lower()
|
|
167
|
+
score = 0
|
|
168
|
+
for key, weight in weights.items():
|
|
169
|
+
if key in lowered:
|
|
170
|
+
score += weight
|
|
171
|
+
if query_tokens:
|
|
172
|
+
score += sum(1 for token in query_tokens if token in lowered)
|
|
173
|
+
ranked.append((score, path))
|
|
174
|
+
ranked.sort(key=lambda item: item[0], reverse=True)
|
|
175
|
+
return [path for score, path in ranked]
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _select_core_files(index_data: dict, query_tokens: set[str]) -> list[str]:
    """Return the index's core-infrastructure files ranked for this query."""
    candidates = index_data.get("core_infrastructure", []) or []
    return _rank_core_files(candidates, query_tokens)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def slice_with_architecture(repo_path: Path, query: str) -> ArchitectureSliceResult:
    """Build a context slice for *query* driven by the architecture index.

    Loads ``.gcie/architecture_index.json`` under *repo_path*, scores each
    subsystem against the query tokens, and collects snippets from the key
    files of the top three matching subsystems.  Architecture-level queries
    (see ``_ARCH_KEYWORDS``) additionally include ranked core-infrastructure
    files.  On failure ``error`` names the reason ("index_missing",
    "no_subsystems", "index_missing_files", or "low_match") so the caller
    can fall back to normal retrieval.
    """
    index_path = repo_path / ".gcie" / "architecture_index.json"
    index_data = load_architecture_index(index_path)
    if index_data is None:
        return ArchitectureSliceResult(
            query=query,
            snippets=[],
            confidence=0.0,
            matched_subsystems=[],
            missing_files=[],
            error="index_missing",
        )

    subsystems, missing_files, missing_ratio = _validate_index(repo_path, index_data)
    if not subsystems:
        return ArchitectureSliceResult(
            query=query,
            snippets=[],
            confidence=0.0,
            matched_subsystems=[],
            missing_files=missing_files,
            error="no_subsystems",
        )

    # Too much of the index points at files that no longer exist: treat the
    # index as stale rather than serving a partial slice.
    if missing_ratio >= _MISSING_RATIO_FALLBACK and missing_files:
        return ArchitectureSliceResult(
            query=query,
            snippets=[],
            confidence=0.0,
            matched_subsystems=[],
            missing_files=missing_files,
            error="index_missing_files",
        )

    query_tokens = _tokenize(query)
    scored = sorted(
        ((_score_subsystem(subsystem, query_tokens), subsystem) for subsystem in subsystems),
        key=lambda item: item[0],
        reverse=True,
    )
    matched = [(score, subsystem) for score, subsystem in scored if score > 0]
    arch_query = _arch_query(query_tokens)

    if not matched:
        # No subsystem matched.  For architecture-level queries, serve core
        # infrastructure at a token low confidence instead of failing.
        if arch_query and index_data.get("core_infrastructure"):
            core_files = _select_core_files(index_data, query_tokens)
            snippets, missing = _collect_snippets(repo_path, core_files)
            missing_files.extend(missing)
            return ArchitectureSliceResult(
                query=query,
                snippets=snippets,
                confidence=0.25,
                matched_subsystems=[],
                missing_files=missing_files,
                error=None,
            )
        return ArchitectureSliceResult(
            query=query,
            snippets=[],
            confidence=0.0,
            matched_subsystems=[],
            missing_files=missing_files,
            error="low_match",
        )

    top_score = matched[0][0]
    if missing_ratio > 0:
        # Degrade confidence in proportion to how stale the index is.
        top_score = max(top_score * (1.0 - missing_ratio), 0.0)

    top_matches = matched[:3]
    selected_files: list[str] = []
    for _score, subsystem in top_matches:
        selected_files.extend(subsystem.get("key_files", []))

    # Architecture-level queries always include core infrastructure first.
    # (Previously decided by three conditions that all reduced to arch_query,
    # followed by a redundant re-collection pass when no snippets were found
    # -- core files were already part of the selection, so the retry could
    # only duplicate missing-file entries.)
    if arch_query:
        selected_files = _select_core_files(index_data, query_tokens) + selected_files

    # Fix: de-duplicate while preserving order.  A file shared by several
    # matched subsystems, or by core infrastructure and a subsystem, used to
    # produce duplicate snippets (wasting token budget downstream) and
    # duplicate missing-file entries.
    selected_files = list(dict.fromkeys(selected_files))

    snippets, missing = _collect_snippets(repo_path, selected_files)
    missing_files.extend(missing)

    return ArchitectureSliceResult(
        query=query,
        snippets=snippets,
        confidence=top_score,
        matched_subsystems=[
            {"name": subsystem.get("name", ""), "score": score}
            for score, subsystem in top_matches
        ],
        missing_files=missing_files,
        error=None,
    )
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def trim_snippets_to_budget(snippets: list[dict], max_total: int) -> list[dict]:
    """Keep snippets, in order, whose cumulative token estimate fits the budget.

    Oversized snippets are skipped (not truncated), so smaller snippets
    later in the list can still be included.
    """
    kept: list[dict] = []
    budget_used = 0
    for snippet in snippets:
        cost = estimate_tokens(snippet.get("content", ""))
        if budget_used + cost <= max_total:
            kept.append(snippet)
            budget_used += cost
    return kept
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Route context requests between architecture-driven and normal modes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Callable
|
|
8
|
+
|
|
9
|
+
from llm_context.snippet_selector import estimate_tokens
|
|
10
|
+
|
|
11
|
+
from .architecture_bootstrap import ensure_initialized
|
|
12
|
+
from .architecture_slicer import slice_with_architecture, trim_snippets_to_budget
|
|
13
|
+
from .fallback_evaluator import should_fallback
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
NormalRunner = Callable[[], dict]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _total_tokens(snippets: list[dict]) -> int:
    """Sum the estimated token counts of all snippet contents."""
    total = 0
    for snippet in snippets:
        total += estimate_tokens(snippet.get("content", ""))
    return total
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _record_fallback(repo_path: Path, reason: str | None, config: dict) -> None:
|
|
24
|
+
if reason is None:
|
|
25
|
+
return
|
|
26
|
+
config_path = repo_path / ".gcie" / "context_config.json"
|
|
27
|
+
config["fallback_reason"] = reason
|
|
28
|
+
try:
|
|
29
|
+
config_path.write_text(json.dumps(config, indent=2), encoding="utf-8")
|
|
30
|
+
except Exception:
|
|
31
|
+
return
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def route_context(
    repo: str,
    query: str,
    *,
    intent: str | None,
    max_total: int,
    profile: str | None,
    normal_runner: NormalRunner,
) -> dict:
    """Serve *query* via architecture slicing, else delegate to *normal_runner*.

    Falls back to the normal pipeline when the slicer is disabled in the
    repo config or when ``should_fallback`` rejects the slice.  The
    fallback reason is persisted in the repo config and echoed in the
    returned payload; successful slices return an "architecture"-mode
    payload trimmed to the *max_total* token budget.
    """
    repo_path = Path(repo)
    config = ensure_initialized(repo_path)

    def _fallback_payload(reason: str | None) -> dict:
        # Record the reason, run the normal pipeline, and tag its payload.
        _record_fallback(repo_path, reason, config)
        payload = normal_runner()
        payload["fallback_reason"] = reason
        return payload

    if not config.get("architecture_slicer_enabled", True):
        return _fallback_payload("architecture_disabled")

    arch_result = slice_with_architecture(repo_path, query)
    fallback, reason = should_fallback(arch_result, config)
    if fallback:
        return _fallback_payload(reason)

    trimmed = trim_snippets_to_budget(arch_result.snippets, max_total)
    return {
        "query": arch_result.query,
        "profile": profile,
        "mode": "architecture",
        "intent": intent,
        "confidence": arch_result.confidence,
        "matched_subsystems": arch_result.matched_subsystems,
        "snippets": trimmed,
        "token_estimate": _total_tokens(trimmed),
    }
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Evaluate whether to fall back to normal context."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .architecture_slicer import ArchitectureSliceResult
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def should_fallback(result: ArchitectureSliceResult, config: dict) -> tuple[bool, str | None]:
    """Decide whether architecture slicing is insufficient for this query.

    Returns ``(fallback, reason)``.  Falls back on any slicer error, on an
    empty snippet set, and -- unless disabled via the
    ``fallback_to_normal_on_low_confidence`` flag -- on confidence below
    the configured ``confidence_threshold`` (default 0.2).
    """
    if result.error:
        return True, result.error
    if not result.snippets:
        return True, "no_snippets"

    threshold = float(config.get("confidence_threshold", 0.2))
    below_threshold = result.confidence < threshold
    if below_threshold and config.get("fallback_to_normal_on_low_confidence", True):
        return True, "low_confidence"
    return False, None
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Coverage integration package."""
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Coverage.py JSON report loader."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True, slots=True)
class CoverageFileRecord:
    """Coverage record for a single file."""

    # POSIX-style path of the measured file.
    path: str
    # Line numbers that were executed at least once.
    executed_lines: tuple[int, ...]
    # Line numbers with no recorded execution.
    missing_lines: tuple[int, ...]
    # Percent of statements covered, as reported by Coverage.py.
    percent_covered: float
    # Counts taken from the report's per-file "summary" section.
    num_statements: int
    num_branches: int
    num_partial_branches: int
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass(frozen=True, slots=True)
class CoverageReport:
    """Loaded coverage report."""

    # Per-file coverage records, sorted by path.
    files: tuple[CoverageFileRecord, ...]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def load_coverage_json(path: str | Path) -> CoverageReport:
    """Load a Coverage.py JSON report from disk.

    A missing report file yields an empty ``CoverageReport``; malformed
    JSON still raises.  Records are returned sorted by path for
    deterministic output.
    """
    report_file = Path(path)
    if not report_file.exists():
        return CoverageReport(files=())

    payload = json.loads(report_file.read_text(encoding="utf-8"))

    records = []
    for raw_path, details in payload.get("files", {}).items():
        stats = details.get("summary", {})
        records.append(
            CoverageFileRecord(
                path=Path(raw_path).as_posix(),
                executed_lines=tuple(details.get("executed_lines", [])),
                missing_lines=tuple(details.get("missing_lines", [])),
                percent_covered=float(stats.get("percent_covered", 0.0)),
                num_statements=int(stats.get("num_statements", 0)),
                num_branches=int(stats.get("num_branches", 0)),
                num_partial_branches=int(stats.get("num_partial_branches", 0)),
            )
        )

    return CoverageReport(files=tuple(sorted(records, key=lambda r: r.path)))
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Debugging package."""
|
|
2
|
+
|
|
3
|
+
from .bug_localizer import LocalizedBugReport, localize_bug
|
|
4
|
+
from .execution_path_analyzer import ExecutionPath, neighborhood_path, shortest_path_between
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"ExecutionPath",
|
|
8
|
+
"LocalizedBugReport",
|
|
9
|
+
"localize_bug",
|
|
10
|
+
"neighborhood_path",
|
|
11
|
+
"shortest_path_between",
|
|
12
|
+
]
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""Bug localization workflow for GCIE debugging queries."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
import networkx as nx
|
|
8
|
+
|
|
9
|
+
from retrieval.hybrid_retriever import hybrid_retrieve
|
|
10
|
+
from retrieval.query_parser import parse_query
|
|
11
|
+
|
|
12
|
+
from .execution_path_analyzer import neighborhood_path
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass(frozen=True, slots=True)
class LocalizedBugReport:
    """Structured result of a bug-localization run."""

    # Original debugging query.
    query: str
    # Symbols extracted from the query by the query parser.
    target_symbols: tuple[str, ...]
    # Ranked candidates that are function nodes.
    relevant_functions: tuple[str, ...]
    # Graph neighborhood around the top-ranked function.
    call_chain: tuple[str, ...]
    # Function nodes with WRITES/MODIFIES/READS edges to the target symbols.
    variable_modifications: tuple[str, ...]
    # All hybrid-retrieval candidate node ids, best first.
    ranked_candidates: tuple[str, ...]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _function_nodes_touching_symbol(graph: nx.DiGraph, symbol: str) -> tuple[str, ...]:
|
|
26
|
+
out: set[str] = set()
|
|
27
|
+
needle = symbol.lower()
|
|
28
|
+
|
|
29
|
+
for src, dst, attrs in graph.edges(data=True):
|
|
30
|
+
edge_type = str(attrs.get("type", ""))
|
|
31
|
+
if edge_type not in {"WRITES", "MODIFIES", "READS"}:
|
|
32
|
+
continue
|
|
33
|
+
|
|
34
|
+
if str(dst).lower() == f"variable:{needle}" and str(src).startswith("function:"):
|
|
35
|
+
out.add(str(src))
|
|
36
|
+
|
|
37
|
+
return tuple(sorted(out))
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def localize_bug(
    graph: nx.DiGraph,
    query: str,
    *,
    git_recency_by_node: dict[str, float] | None = None,
    coverage_risk_by_node: dict[str, float] | None = None,
) -> LocalizedBugReport:
    """Localize likely bug sources from a debugging query.

    Combines variable write/modify/read analysis for the parsed symbols
    with hybrid retrieval (3 hops, top 10), then expands a two-hop graph
    neighborhood around the best-ranked function node.
    """
    parsed = parse_query(query)

    # Functions that touch any of the symbols mentioned in the query.
    touched: set[str] = set()
    for sym in parsed.symbols:
        touched |= set(_function_nodes_touching_symbol(graph, sym))

    candidates = hybrid_retrieve(
        graph,
        query,
        git_recency_by_node=git_recency_by_node,
        coverage_risk_by_node=coverage_risk_by_node,
        max_hops=3,
        top_k=10,
    )
    ranked_ids = tuple(candidate.node_id for candidate in candidates)
    functions = tuple(node for node in ranked_ids if node.startswith("function:"))

    call_chain: tuple[str, ...] = ()
    if functions:
        call_chain = neighborhood_path(graph, seed=functions[0], hops=2).nodes

    return LocalizedBugReport(
        query=query,
        target_symbols=parsed.symbols,
        relevant_functions=functions,
        call_chain=call_chain,
        variable_modifications=tuple(sorted(touched)),
        ranked_candidates=ranked_ids,
    )
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Execution path analysis helpers for debugging output."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
import networkx as nx
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True, slots=True)
class ExecutionPath:
    """A set of graph nodes plus a label describing how it was derived."""

    # Node identifiers belonging to the path/neighborhood.
    nodes: tuple[str, ...]
    # Derivation label, e.g. "shortest_undirected_path".
    reason: str
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def shortest_path_between(graph: nx.DiGraph, source: str, target: str) -> ExecutionPath | None:
    """Return the shortest undirected path between two nodes, if any.

    Edge direction is ignored so caller/callee order does not matter;
    ``None`` means the nodes are disconnected or absent from the graph.
    """
    undirected = graph.to_undirected()
    try:
        hops = nx.shortest_path(undirected, source=source, target=target)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        return None
    return ExecutionPath(nodes=tuple(hops), reason="shortest_undirected_path")
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def neighborhood_path(graph: nx.DiGraph, seed: str, hops: int = 2) -> ExecutionPath:
    """Return the bounded BFS neighborhood of *seed* (both edge directions).

    Expands up to *hops* levels, following predecessors and successors
    alike; the result is sorted for determinism.
    """
    visited: set[str] = {seed}
    current = {seed}

    for _ in range(max(hops, 0)):
        expansion: set[str] = set()
        for node in current:
            expansion.update(graph.predecessors(node))
            expansion.update(graph.successors(node))
        expansion.difference_update(visited)
        if not expansion:
            # Nothing new reachable -- stop early.
            break
        visited.update(expansion)
        current = expansion

    return ExecutionPath(nodes=tuple(sorted(visited)), reason=f"neighborhood_hops={hops}")
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Embedding encoder with SentenceTransformers fallback."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import math
|
|
7
|
+
from typing import Iterable
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _fallback_vector(text: str, dims: int = 64) -> list[float]:
|
|
11
|
+
vec = [0.0] * dims
|
|
12
|
+
tokens = text.lower().split()
|
|
13
|
+
if not tokens:
|
|
14
|
+
return vec
|
|
15
|
+
|
|
16
|
+
for tok in tokens:
|
|
17
|
+
digest = hashlib.sha256(tok.encode("utf-8")).digest()
|
|
18
|
+
idx = int.from_bytes(digest[:4], "big") % dims
|
|
19
|
+
sign = 1.0 if digest[4] % 2 == 0 else -1.0
|
|
20
|
+
vec[idx] += sign
|
|
21
|
+
|
|
22
|
+
norm = math.sqrt(sum(v * v for v in vec)) or 1.0
|
|
23
|
+
return [v / norm for v in vec]
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class TextEncoder:
    """Text encoder backed by SentenceTransformers when installed.

    When the optional dependency is missing (or the model fails to load),
    a deterministic hash-based embedding is used instead, so callers
    always receive vectors.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2") -> None:
        self.model_name = model_name
        self._model = None
        try:
            from sentence_transformers import SentenceTransformer  # type: ignore

            self._model = SentenceTransformer(model_name)
        except Exception:
            # Optional dependency absent or model load failed: use fallback.
            self._model = None

    def encode(self, texts: Iterable[str]) -> list[list[float]]:
        """Encode *texts* into normalized embedding vectors."""
        batch = list(texts)
        if self._model is None:
            return [_fallback_vector(item) for item in batch]
        raw = self._model.encode(batch, normalize_embeddings=True)
        return [[float(value) for value in row] for row in raw]
|