codebrain 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebrain/__init__.py +3 -0
- codebrain/__main__.py +6 -0
- codebrain/agent_bridge.py +162 -0
- codebrain/analyzer.py +943 -0
- codebrain/api.py +578 -0
- codebrain/api_models.py +102 -0
- codebrain/cli.py +1927 -0
- codebrain/comprehension.py +1939 -0
- codebrain/config.py +46 -0
- codebrain/context.py +276 -0
- codebrain/export.py +334 -0
- codebrain/graph/__init__.py +0 -0
- codebrain/graph/query.py +656 -0
- codebrain/graph/schema.py +113 -0
- codebrain/graph/store.py +295 -0
- codebrain/hook_runner.py +71 -0
- codebrain/hooks.py +107 -0
- codebrain/indexer.py +450 -0
- codebrain/llm.py +676 -0
- codebrain/logging.py +42 -0
- codebrain/mcp_server.py +1635 -0
- codebrain/memory/__init__.py +5 -0
- codebrain/memory/store.py +270 -0
- codebrain/parser/__init__.py +0 -0
- codebrain/parser/base.py +27 -0
- codebrain/parser/config_parser.py +228 -0
- codebrain/parser/models.py +44 -0
- codebrain/parser/python_parser.py +658 -0
- codebrain/parser/registry.py +144 -0
- codebrain/parser/typescript_parser.py +1189 -0
- codebrain/parser/typescript_treesitter.py +535 -0
- codebrain/py.typed +0 -0
- codebrain/resolver.py +171 -0
- codebrain/settings.py +88 -0
- codebrain/utils.py +59 -0
- codebrain/validator.py +563 -0
- codebrain/watcher/__init__.py +0 -0
- codebrain/watcher/file_watcher.py +173 -0
- codebrain-0.1.0.dist-info/METADATA +360 -0
- codebrain-0.1.0.dist-info/RECORD +44 -0
- codebrain-0.1.0.dist-info/WHEEL +5 -0
- codebrain-0.1.0.dist-info/entry_points.txt +6 -0
- codebrain-0.1.0.dist-info/licenses/LICENSE +21 -0
- codebrain-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1939 @@
|
|
|
1
|
+
"""Multi-resolution comprehension engine.
|
|
2
|
+
|
|
3
|
+
Generates layered views of a codebase at different zoom levels:
|
|
4
|
+
- System level: what is this codebase, what are its parts
|
|
5
|
+
- Module level: what does this file do, what does it depend on
|
|
6
|
+
- Symbol level: full context for a single function/class
|
|
7
|
+
- Risk level: where are the hotspots, what is fragile
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
import re
|
|
14
|
+
from collections import Counter, defaultdict
|
|
15
|
+
|
|
16
|
+
from codebrain.graph.query import QueryEngine
|
|
17
|
+
from codebrain.graph.store import GraphStore
|
|
18
|
+
from codebrain.utils import is_test_file
|
|
19
|
+
|
|
20
|
+
# Generic names that are too common for bare-name matching in hotspot scoring
# and in module_view's "imported_by" detection.
# Without this filter, e.g. ``MyCache.get`` would collect ALL ``.get()`` callers
# across the entire codebase, producing wildly inflated risk scores.
# Includes common CRUD/lifecycle verbs and the dunder protocol methods that
# appear on nearly every class. frozenset gives O(1) membership tests.
_GENERIC_NAMES = frozenset({
    "get", "set", "delete", "update", "create", "save", "load", "run",
    "put", "post", "patch", "read", "write", "send", "close", "open",
    "add", "remove", "pop", "push", "clear", "reset", "start", "stop",
    "execute", "call", "apply", "handle", "process", "validate",
    "init", "setup", "teardown", "configure",
    "__init__", "__str__", "__repr__", "__eq__", "__hash__",
    "__enter__", "__exit__", "__call__", "__getattr__", "__setattr__",
    "__len__", "__iter__", "__next__", "__contains__",
})
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ComprehensionEngine:
|
|
36
|
+
"""Produces deterministic, multi-resolution comprehension views."""
|
|
37
|
+
|
|
38
|
+
def __init__(self, store: GraphStore) -> None:
    """Bind the engine to *store* and attach a QueryEngine over it."""
    self.engine = QueryEngine(store)
    self.store = store
|
|
41
|
+
|
|
42
|
+
# ------------------------------------------------------------------
|
|
43
|
+
# Unified zoom interface
|
|
44
|
+
# ------------------------------------------------------------------
|
|
45
|
+
def zoom(self, target: str | None = None) -> dict:
|
|
46
|
+
"""Multi-resolution zoom — like Google Maps for architecture.
|
|
47
|
+
|
|
48
|
+
- No target: system level (what is this codebase?)
|
|
49
|
+
- Package name target: package level (what files are in this package?)
|
|
50
|
+
- File path target: module level (what does this file do?)
|
|
51
|
+
- Symbol name target: symbol level (full context for one symbol)
|
|
52
|
+
|
|
53
|
+
Each level includes navigation hints to drill down or zoom out.
|
|
54
|
+
"""
|
|
55
|
+
if target is None:
|
|
56
|
+
return self._zoom_system()
|
|
57
|
+
|
|
58
|
+
# Check if target looks like a file path
|
|
59
|
+
all_nodes = self.store.get_all_nodes()
|
|
60
|
+
file_paths = {n["file_path"] for n in all_nodes}
|
|
61
|
+
normalized = target.replace("\\", "/")
|
|
62
|
+
if normalized in file_paths or target in file_paths:
|
|
63
|
+
return self._zoom_module(normalized)
|
|
64
|
+
|
|
65
|
+
# Check if target is a package (directory prefix matching indexed files)
|
|
66
|
+
pkg_prefix = normalized.rstrip("/") + "/"
|
|
67
|
+
pkg_files = [fp for fp in file_paths if fp.startswith(pkg_prefix)]
|
|
68
|
+
if pkg_files:
|
|
69
|
+
return self._zoom_package(normalized.rstrip("/"), all_nodes)
|
|
70
|
+
|
|
71
|
+
# Also check if target matches a top-level package name exactly
|
|
72
|
+
top_packages = set()
|
|
73
|
+
for fp in file_paths:
|
|
74
|
+
parts = fp.split("/")
|
|
75
|
+
if len(parts) > 1:
|
|
76
|
+
top_packages.add(parts[0])
|
|
77
|
+
if normalized in top_packages:
|
|
78
|
+
return self._zoom_package(normalized, all_nodes)
|
|
79
|
+
|
|
80
|
+
# Try as symbol name
|
|
81
|
+
return self._zoom_symbol(target, all_nodes)
|
|
82
|
+
|
|
83
|
+
def _zoom_system(self) -> dict:
|
|
84
|
+
"""System-level zoom: overview + narrative + drill-down hints."""
|
|
85
|
+
narrative = self.system_narrative()
|
|
86
|
+
overview = self.system_overview()
|
|
87
|
+
|
|
88
|
+
# Discover packages with symbol counts for drill-down hints
|
|
89
|
+
all_nodes = self.store.get_all_nodes()
|
|
90
|
+
pkg_symbol_counts: dict[str, int] = {}
|
|
91
|
+
pkg_file_counts: dict[str, set[str]] = {}
|
|
92
|
+
for n in all_nodes:
|
|
93
|
+
parts = n["file_path"].split("/")
|
|
94
|
+
pkg = parts[0] if len(parts) > 1 else "(root)"
|
|
95
|
+
if n["type"] != "file":
|
|
96
|
+
pkg_symbol_counts[pkg] = pkg_symbol_counts.get(pkg, 0) + 1
|
|
97
|
+
if pkg not in pkg_file_counts:
|
|
98
|
+
pkg_file_counts[pkg] = set()
|
|
99
|
+
pkg_file_counts[pkg].add(n["file_path"])
|
|
100
|
+
|
|
101
|
+
top_packages = sorted(pkg_symbol_counts.items(), key=lambda x: -x[1])[:10]
|
|
102
|
+
|
|
103
|
+
return {
|
|
104
|
+
"level": "system",
|
|
105
|
+
"narrative": narrative,
|
|
106
|
+
"drill_down": [
|
|
107
|
+
{
|
|
108
|
+
"target": pkg,
|
|
109
|
+
"symbols": count,
|
|
110
|
+
"files": len(pkg_file_counts.get(pkg, set())),
|
|
111
|
+
"hint": f"zoom('{pkg}')",
|
|
112
|
+
}
|
|
113
|
+
for pkg, count in top_packages
|
|
114
|
+
],
|
|
115
|
+
"stats": overview.get("stats", {}),
|
|
116
|
+
"zoom_out": None,
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
def _zoom_package(self, package_name: str, all_nodes: list[dict] | None = None) -> dict:
|
|
120
|
+
"""Package-level zoom: files in a package with roles and stats."""
|
|
121
|
+
if all_nodes is None:
|
|
122
|
+
all_nodes = self.store.get_all_nodes()
|
|
123
|
+
|
|
124
|
+
pkg_prefix = package_name.rstrip("/") + "/"
|
|
125
|
+
|
|
126
|
+
# Collect files that belong to this package (direct children only)
|
|
127
|
+
file_nodes: dict[str, dict] = {}
|
|
128
|
+
file_symbols: dict[str, list[dict]] = defaultdict(list)
|
|
129
|
+
for n in all_nodes:
|
|
130
|
+
if not n["file_path"].startswith(pkg_prefix):
|
|
131
|
+
continue
|
|
132
|
+
# Check it's a direct child or one level deeper (sub-package __init__)
|
|
133
|
+
remainder = n["file_path"][len(pkg_prefix):]
|
|
134
|
+
if n["type"] == "file":
|
|
135
|
+
file_nodes[n["file_path"]] = n
|
|
136
|
+
else:
|
|
137
|
+
file_symbols[n["file_path"]].append(n)
|
|
138
|
+
|
|
139
|
+
if not file_nodes:
|
|
140
|
+
return {"level": "package", "error": f"No package: {package_name}"}
|
|
141
|
+
|
|
142
|
+
# Build file summaries
|
|
143
|
+
file_summaries = []
|
|
144
|
+
for fp, fnode in file_nodes.items():
|
|
145
|
+
syms = file_symbols.get(fp, [])
|
|
146
|
+
role = self._infer_module_role(fp, syms, 0, 0)
|
|
147
|
+
file_summaries.append({
|
|
148
|
+
"file_path": fp,
|
|
149
|
+
"role": role,
|
|
150
|
+
"symbol_count": len(syms),
|
|
151
|
+
"line_count": fnode.get("line_end", 0),
|
|
152
|
+
"drill_down": f"zoom('{fp}')",
|
|
153
|
+
})
|
|
154
|
+
|
|
155
|
+
file_summaries.sort(key=lambda x: x["symbol_count"], reverse=True)
|
|
156
|
+
|
|
157
|
+
# Detect sub-packages (directories within this package)
|
|
158
|
+
sub_packages: set[str] = set()
|
|
159
|
+
for fp in file_nodes:
|
|
160
|
+
remainder = fp[len(pkg_prefix):]
|
|
161
|
+
parts = remainder.split("/")
|
|
162
|
+
if len(parts) > 1:
|
|
163
|
+
sub_packages.add(package_name + "/" + parts[0])
|
|
164
|
+
|
|
165
|
+
# External dependencies: other packages this one depends on
|
|
166
|
+
all_edges = self.store.get_all_edges()
|
|
167
|
+
node_ids_in_pkg = set()
|
|
168
|
+
for fp in file_nodes:
|
|
169
|
+
for n in all_nodes:
|
|
170
|
+
if n["file_path"] == fp:
|
|
171
|
+
node_ids_in_pkg.add(n["id"])
|
|
172
|
+
|
|
173
|
+
ext_deps: set[str] = set()
|
|
174
|
+
int_deps: list[dict] = []
|
|
175
|
+
for e in all_edges:
|
|
176
|
+
if e["type"] not in ("IMPORTS", "CALLS"):
|
|
177
|
+
continue
|
|
178
|
+
src_fp = e.get("file_path", "")
|
|
179
|
+
if not src_fp.startswith(pkg_prefix):
|
|
180
|
+
continue
|
|
181
|
+
# Resolve target to a file path
|
|
182
|
+
tgt_fp = self._resolve_edge_target_file(e["target"], all_nodes)
|
|
183
|
+
if tgt_fp is None:
|
|
184
|
+
continue
|
|
185
|
+
if tgt_fp.startswith(pkg_prefix):
|
|
186
|
+
if src_fp != tgt_fp:
|
|
187
|
+
int_deps.append({"from": src_fp, "to": tgt_fp})
|
|
188
|
+
else:
|
|
189
|
+
tgt_parts = tgt_fp.split("/")
|
|
190
|
+
tgt_pkg = tgt_parts[0] if len(tgt_parts) > 1 else "(root)"
|
|
191
|
+
ext_deps.add(tgt_pkg)
|
|
192
|
+
|
|
193
|
+
return {
|
|
194
|
+
"level": "package",
|
|
195
|
+
"package": package_name,
|
|
196
|
+
"file_count": len(file_nodes),
|
|
197
|
+
"files": file_summaries,
|
|
198
|
+
"sub_packages": sorted(sub_packages),
|
|
199
|
+
"external_dependencies": sorted(ext_deps),
|
|
200
|
+
"internal_dependencies": int_deps[:50], # cap to avoid huge output
|
|
201
|
+
"zoom_out": {"hint": "zoom()"},
|
|
202
|
+
"drill_down": [
|
|
203
|
+
{"target": f["file_path"], "hint": f"zoom('{f['file_path']}')"} for f in file_summaries[:10]
|
|
204
|
+
],
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
def _resolve_edge_target_file(self, target: str, all_nodes: list[dict]) -> str | None:
|
|
208
|
+
"""Resolve an edge target (node ID, name, or qualified name) to a file path."""
|
|
209
|
+
for n in all_nodes:
|
|
210
|
+
if n["id"] == target or n["qualified_name"] == target:
|
|
211
|
+
return n["file_path"]
|
|
212
|
+
for n in all_nodes:
|
|
213
|
+
if n["name"] == target:
|
|
214
|
+
return n["file_path"]
|
|
215
|
+
return None
|
|
216
|
+
|
|
217
|
+
def _zoom_module(self, file_path: str) -> dict:
|
|
218
|
+
"""Module-level zoom: module narrative + symbol list + zoom hints."""
|
|
219
|
+
narrative = self.module_narrative(file_path)
|
|
220
|
+
view = self.module_view(file_path)
|
|
221
|
+
|
|
222
|
+
symbols = []
|
|
223
|
+
for s in view.get("symbols", []):
|
|
224
|
+
symbols.append({
|
|
225
|
+
"name": s.get("name", ""),
|
|
226
|
+
"type": s.get("type", ""),
|
|
227
|
+
"line": s.get("line_start", 0),
|
|
228
|
+
"hint": f"zoom('{s.get('name', '')}')",
|
|
229
|
+
})
|
|
230
|
+
|
|
231
|
+
# Derive the package from the file path for zoom_out
|
|
232
|
+
parts = file_path.split("/")
|
|
233
|
+
if len(parts) > 1:
|
|
234
|
+
pkg = "/".join(parts[:-1])
|
|
235
|
+
zoom_out = {"target": pkg, "hint": f"zoom('{pkg}')"}
|
|
236
|
+
else:
|
|
237
|
+
zoom_out = {"hint": "zoom()"}
|
|
238
|
+
|
|
239
|
+
return {
|
|
240
|
+
"level": "module",
|
|
241
|
+
"file": file_path,
|
|
242
|
+
"narrative": narrative,
|
|
243
|
+
"symbols": symbols,
|
|
244
|
+
"zoom_out": zoom_out,
|
|
245
|
+
"drill_down": [
|
|
246
|
+
{"target": s["name"], "hint": s["hint"]} for s in symbols[:10]
|
|
247
|
+
],
|
|
248
|
+
"dependencies": view.get("dependencies", []),
|
|
249
|
+
"dependents": view.get("dependents", []),
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
def _zoom_symbol(self, name: str, all_nodes: list[dict] | None = None) -> dict:
|
|
253
|
+
"""Symbol-level zoom: full context + narrative."""
|
|
254
|
+
if all_nodes is None:
|
|
255
|
+
all_nodes = self.store.get_all_nodes()
|
|
256
|
+
|
|
257
|
+
# Find the symbol
|
|
258
|
+
matches = [n for n in all_nodes if n["name"] == name or n["id"] == name]
|
|
259
|
+
if not matches:
|
|
260
|
+
return {"level": "symbol", "error": f"Symbol '{name}' not found"}
|
|
261
|
+
|
|
262
|
+
node = matches[0]
|
|
263
|
+
narrative = self.symbol_narrative(node["id"])
|
|
264
|
+
|
|
265
|
+
# Get callers and callees for navigation
|
|
266
|
+
callers = self.engine.impact_of_change(node["id"], max_depth=1)
|
|
267
|
+
callees = self.engine.get_call_chain(node["id"], max_depth=1)
|
|
268
|
+
|
|
269
|
+
return {
|
|
270
|
+
"level": "symbol",
|
|
271
|
+
"name": node["name"],
|
|
272
|
+
"type": node["type"],
|
|
273
|
+
"file": node["file_path"],
|
|
274
|
+
"line": node.get("line_start", 0),
|
|
275
|
+
"narrative": narrative,
|
|
276
|
+
"callers": [{"name": c.get("name", c["node_id"]), "hint": f"zoom('{c.get('name', c['node_id'])}')"} for c in callers[:10]],
|
|
277
|
+
"callees": [{"name": c.get("name", c["node_id"]), "hint": f"zoom('{c.get('name', c['node_id'])}')"} for c in callees[:10]],
|
|
278
|
+
"zoom_out": {"target": node["file_path"], "hint": f"zoom('{node['file_path']}')"},
|
|
279
|
+
}
|
|
280
|
+
|
|
281
|
+
# ------------------------------------------------------------------
|
|
282
|
+
# System level
|
|
283
|
+
# ------------------------------------------------------------------
|
|
284
|
+
def system_overview(self) -> dict:
    """Top-level view of the entire codebase.

    Returns a dict with store stats, per-package summaries (file count,
    symbol count, docstring, inter-package dependencies) and discovered
    entry points. Packages are approximated by the first path segment;
    files at the repo root are grouped under ``"(root)"``.
    """
    stats = self.store.get_stats()
    all_nodes = self.store.get_all_nodes()

    # Discover packages (top-level directories)
    packages: dict[str, dict] = {}
    for node in all_nodes:
        parts = node["file_path"].split("/")
        pkg = parts[0] if len(parts) > 1 else "(root)"
        if pkg not in packages:
            packages[pkg] = {"files": set(), "docstring": "", "node_count": 0}
        packages[pkg]["files"].add(node["file_path"])
        # Only non-file nodes (functions, classes, ...) count as symbols.
        if node["type"] != "file":
            packages[pkg]["node_count"] += 1

    # Get package docstrings from __init__.py nodes (already in all_nodes).
    # Later matches overwrite earlier ones; nested __init__.py files also
    # match the endswith check — the last one seen wins.
    for node in all_nodes:
        if node["type"] == "file" and node["docstring"] and node["file_path"].endswith("__init__.py"):
            parts = node["file_path"].split("/")
            pkg = parts[0] if len(parts) > 1 else "(root)"
            if pkg in packages:
                packages[pkg]["docstring"] = node["docstring"]

    # Entry points: functions named main, or in __main__.py
    entry_points: list[dict] = []
    for node in all_nodes:
        if node["name"] == "main" and node["type"] == "function":
            entry_points.append({
                "id": node["id"],
                "file_path": node["file_path"],
                "line": node["line_start"],
            })
        elif "__main__" in node["file_path"] and node["type"] == "file":
            # Whole-file entry point; report line 1 since there is no symbol.
            entry_points.append({
                "id": node["id"],
                "file_path": node["file_path"],
                "line": 1,
            })

    # Dependency flow between packages — use IMPORTS + CALLS edges
    # Build node lookup for resolving targets to packages
    node_by_id = {n["id"]: n for n in all_nodes}
    nodes_by_name: dict[str, list[dict]] = defaultdict(list)
    for n in all_nodes:
        nodes_by_name[n["name"]].append(n)
        if n["qualified_name"] != n["name"]:
            nodes_by_name[n["qualified_name"]].append(n)

    all_edges = self.store.get_all_edges()
    pkg_deps: dict[str, set[str]] = defaultdict(set)
    for e in all_edges:
        if e["type"] not in ("IMPORTS", "CALLS"):
            continue
        src_parts = e["file_path"].split("/")
        src_pkg = src_parts[0] if len(src_parts) > 1 else "(root)"

        # Resolve target to a package. A target may resolve to several
        # packages when a bare name is ambiguous — all are recorded.
        target_pkgs: set[str] = set()
        # Try exact node ID
        t_node = node_by_id.get(e["target"])
        if t_node:
            tp = t_node["file_path"].split("/")
            target_pkgs.add(tp[0] if len(tp) > 1 else "(root)")
        # Try name/qname lookup
        for m in nodes_by_name.get(e["target"], []):
            tp = m["file_path"].split("/")
            target_pkgs.add(tp[0] if len(tp) > 1 else "(root)")
        # Try dotted import prefix (e.g. "codebrain.indexer" -> "codebrain")
        if "." in e["target"]:
            first = e["target"].split(".")[0]
            if first in packages:
                target_pkgs.add(first)

        # Record only cross-package dependencies to known packages.
        for target_pkg in target_pkgs:
            if target_pkg != src_pkg and target_pkg in packages:
                pkg_deps[src_pkg].add(target_pkg)

    return {
        "stats": stats,
        "packages": {
            name: {
                "file_count": len(info["files"]),
                "node_count": info["node_count"],
                "docstring": info["docstring"],
                "depends_on": sorted(pkg_deps.get(name, set())),
            }
            for name, info in sorted(packages.items())
        },
        "entry_points": entry_points,
    }
|
|
375
|
+
|
|
376
|
+
# ------------------------------------------------------------------
|
|
377
|
+
# Package level
|
|
378
|
+
# ------------------------------------------------------------------
|
|
379
|
+
def package_view(self, package_name: str) -> dict:
    """View of a single package (directory).

    Bridges system_overview -> module_view.
    Lists all files in the package with their roles, symbol counts,
    dependencies, and which files are the most important.
    """
    # Delegate to the shared package-zoom implementation with a fresh
    # node snapshot.
    return self._zoom_package(package_name, self.store.get_all_nodes())
|
|
388
|
+
|
|
389
|
+
# ------------------------------------------------------------------
|
|
390
|
+
# Module level
|
|
391
|
+
# ------------------------------------------------------------------
|
|
392
|
+
def module_view(self, file_path: str) -> dict:
    """Detailed view of a single file/module.

    Returns symbols, exports, imports, reverse imports ("imported_by")
    and a coupling score, or an ``error`` dict when the file is not
    indexed. Reverse imports are detected from CALLS/IMPORTS edges in
    other files whose target matches a node in this file.
    """
    nodes = self.store.get_nodes_by_file(file_path)
    if not nodes:
        return {"error": f"No indexed file: {file_path}"}

    # Split the file node (module-level metadata) from its symbols.
    file_node = None
    symbols: list[dict] = []
    for n in nodes:
        if n["type"] == "file":
            file_node = n
        else:
            symbols.append({
                "name": n["name"],
                "type": n["type"],
                "line_start": n["line_start"],
                "line_end": n["line_end"],
                "signature": n["signature"],
                "is_exported": bool(n["is_exported"]),
                "docstring": n["docstring"],
            })

    # What does this module import?
    imports = self.engine.get_file_dependencies(file_path)

    # What imports this module? — batch fetch all incoming edges for all nodes in this file
    node_ids = {n["id"] for n in nodes}
    node_names = {n["name"] for n in nodes}
    node_qnames = {n["qualified_name"] for n in nodes}
    # Use a single query to find all edges targeting any node in this file
    all_edges = self.store.get_all_edges()
    imported_by_set: set[str] = set()

    # Detect language to filter cross-language false positives
    _py = (".py",)
    _js = (".ts", ".tsx", ".js", ".jsx")
    fp_ext = os.path.splitext(file_path)[1].lower() if file_path else ""
    fp_lang = "python" if fp_ext in _py else ("js" if fp_ext in _js else "other")

    for e in all_edges:
        # Only edges originating in OTHER files count as reverse imports.
        if e["type"] in ("CALLS", "IMPORTS") and e["file_path"] != file_path:
            # Match by node ID or qualified name (always safe)
            if e["target"] in node_ids or e["target"] in node_qnames:
                matched = True
            elif e["target"] in node_names and e["target"] not in _GENERIC_NAMES:
                # Bare-name match — skip generic names to avoid over-counting
                matched = True
            else:
                matched = False
            if matched:
                # Skip cross-language false positives (Python <-> JS/TS)
                e_ext = os.path.splitext(e["file_path"])[1].lower() if e["file_path"] else ""
                e_lang = "python" if e_ext in _py else ("js" if e_ext in _js else "other")
                if fp_lang != "other" and e_lang != "other" and fp_lang != e_lang:
                    continue
                imported_by_set.add(e["file_path"])

    # Coupling score: how many other files reference this one
    coupling_in = len(imported_by_set)
    coupling_out = len(imports)

    # Infer role
    role = self._infer_module_role(file_path, symbols, coupling_in, coupling_out)

    return {
        "file_path": file_path,
        "docstring": file_node["docstring"] if file_node else "",
        "line_count": file_node["line_end"] if file_node else 0,
        "role": role,
        "symbols": symbols,
        "exports": [s["name"] for s in symbols if s["is_exported"]],
        "imports": imports,
        "imported_by": sorted(imported_by_set),
        "coupling": {
            "incoming": coupling_in,
            "outgoing": coupling_out,
            "score": coupling_in + coupling_out,
        },
    }
|
|
471
|
+
|
|
472
|
+
# ------------------------------------------------------------------
|
|
473
|
+
# Risk hotspots
|
|
474
|
+
# ------------------------------------------------------------------
|
|
475
|
+
def risk_hotspots(self, top_n: int = 20) -> list[dict]:
    """Find the most structurally risky nodes in the codebase.

    Risk combines direct and external dependents, transitive impact, and
    the number of affected files. High-risk nodes are those where a change
    would cascade widely.

    Args:
        top_n: maximum number of hotspots to return.

    Returns:
        Hotspot dicts sorted by descending ``risk_score``.
    """
    all_nodes = self.store.get_all_nodes()
    all_edges = self.store.get_all_edges()

    # Preload reverse index for bulk impact_of_change() calls
    self.engine.preload_reverse_index()

    # Reverse index: edge target -> incoming edges, for O(1) caller lookups.
    # (A sibling ``nodes_by_name`` index used to be built here too but was
    # never read — removed as dead code.)
    incoming_by_target: dict[str, list[dict]] = defaultdict(list)
    for e in all_edges:
        incoming_by_target[e["target"]].append(e)

    # Node lookup for resolving impacted node files in phase 2.
    node_by_id = {n["id"]: n for n in all_nodes}

    # Phase 1: Compute direct dependents for all nodes (cheap, in-memory)
    candidates: list[tuple[dict, int, int]] = []  # (node, direct, external)

    for node in all_nodes:
        if node["type"] == "file":
            continue

        # Skip test files and test functions/classes
        if is_test_file(node["file_path"]):
            continue
        if node["name"].startswith(("test_", "Test")):
            continue

        # Count direct incoming CALLS/IMPORTS references using the index.
        callers: list[dict] = []
        for e in incoming_by_target.get(node["id"], []):
            if e["type"] in ("CALLS", "IMPORTS"):
                callers.append(e)
        # Skip bare-name matching for generic names to avoid inflated scores
        # (e.g. MyCache.get collecting ALL .get() callers across the codebase)
        if node["name"] not in _GENERIC_NAMES:
            for e in incoming_by_target.get(node["name"], []):
                if e["type"] in ("CALLS", "IMPORTS"):
                    callers.append(e)
        # Qualified-name matches are exact identifiers, so they are safe
        # even when the bare name is generic.
        if node["qualified_name"] != node["name"]:
            for e in incoming_by_target.get(node["qualified_name"], []):
                if e["type"] in ("CALLS", "IMPORTS"):
                    callers.append(e)

        # Filter out callers from test files
        callers = [e for e in callers if not is_test_file(e.get("file_path", ""))]

        # Deduplicate by (source, target, line)
        seen: set[tuple] = set()
        unique_callers = []
        for e in callers:
            key = (e["source"], e["target"], e["line"])
            if key not in seen:
                seen.add(key)
                unique_callers.append(e)

        direct_dependents = len(unique_callers)
        if direct_dependents == 0:
            continue

        external_dependents = len([e for e in unique_callers if e["file_path"] != node["file_path"]])
        candidates.append((node, direct_dependents, external_dependents))

    # Phase 2: Sort by direct dependents (proxy for risk), take top candidates.
    # Only compute expensive transitive impact for top candidates.
    candidate_limit = max(top_n * 4, 100)
    candidates.sort(key=lambda x: x[1] + x[2] * 2, reverse=True)
    candidates = candidates[:candidate_limit]

    hotspots: list[dict] = []
    for node, direct_dependents, external_dependents in candidates:
        # Transitive impact (limit depth for performance)
        impacted = self.engine.impact_of_change(node["id"], max_depth=3)
        transitive_count = len(impacted)

        # Affected files — use node_by_id lookup instead of per-node DB query
        affected_files = set()
        for entry in impacted:
            target = node_by_id.get(entry["node_id"])
            if target:
                affected_files.add(target["file_path"])
        affected_files.discard(node["file_path"])

        risk_score = (
            direct_dependents * 1.0
            + external_dependents * 2.0
            + transitive_count * 0.5
            + len(affected_files) * 3.0
        )

        hotspots.append({
            "node_id": node["id"],
            "name": node["name"],
            "type": node["type"],
            "file_path": node["file_path"],
            "line_start": node["line_start"],
            "direct_dependents": direct_dependents,
            "external_dependents": external_dependents,
            "transitive_impact": transitive_count,
            "affected_files": len(affected_files),
            "risk_score": round(risk_score, 1),
        })

    hotspots.sort(key=lambda h: h["risk_score"], reverse=True)
    return hotspots[:top_n]
|
|
592
|
+
|
|
593
|
+
# ------------------------------------------------------------------
|
|
594
|
+
# Health score
|
|
595
|
+
# ------------------------------------------------------------------
|
|
596
|
+
def health_score(self, hotspots: list[dict] | None = None) -> dict:
|
|
597
|
+
"""Compute codebase health metrics with individual scores per dimension.
|
|
598
|
+
|
|
599
|
+
Returns separate scores for each dimension rather than a single
|
|
600
|
+
misleading number. Each dimension is 0-100 (higher is better).
|
|
601
|
+
Pass pre-computed *hotspots* to avoid redundant computation.
|
|
602
|
+
"""
|
|
603
|
+
stats = self.store.get_stats()
|
|
604
|
+
total_nodes = stats.get("nodes", 0)
|
|
605
|
+
total_files = stats.get("files", 0)
|
|
606
|
+
|
|
607
|
+
# Dead code analysis
|
|
608
|
+
dead = self.engine.find_dead_code()
|
|
609
|
+
dead_ratio = len(dead) / max(total_nodes, 1)
|
|
610
|
+
dead_score = max(0, int(100 - dead_ratio * 200)) # 50% dead = 0
|
|
611
|
+
|
|
612
|
+
# Cycle analysis
|
|
613
|
+
cycles = self.engine.detect_cycles()
|
|
614
|
+
cycle_ratio = len(cycles) / max(total_files, 1)
|
|
615
|
+
cycle_score = max(0, int(100 - cycle_ratio * 500)) # 20% cyclic = 0
|
|
616
|
+
|
|
617
|
+
# Hotspot concentration
|
|
618
|
+
if hotspots is None:
|
|
619
|
+
hotspots = self.risk_hotspots(top_n=50)
|
|
620
|
+
high_risk = [h for h in hotspots if h["risk_score"] > 20]
|
|
621
|
+
hotspot_ratio = len(high_risk) / max(total_files, 1)
|
|
622
|
+
coupling_score = max(0, int(100 - hotspot_ratio * 300))
|
|
623
|
+
|
|
624
|
+
# Overall is weighted average (not a penalty system)
|
|
625
|
+
overall = int(dead_score * 0.3 + cycle_score * 0.3 + coupling_score * 0.4)
|
|
626
|
+
|
|
627
|
+
if overall >= 80:
|
|
628
|
+
grade = "A"
|
|
629
|
+
elif overall >= 60:
|
|
630
|
+
grade = "B"
|
|
631
|
+
elif overall >= 40:
|
|
632
|
+
grade = "C"
|
|
633
|
+
elif overall >= 20:
|
|
634
|
+
grade = "D"
|
|
635
|
+
else:
|
|
636
|
+
grade = "F"
|
|
637
|
+
|
|
638
|
+
return {
|
|
639
|
+
"score": overall,
|
|
640
|
+
"grade": grade,
|
|
641
|
+
"dimensions": {
|
|
642
|
+
"dead_code": {
|
|
643
|
+
"score": dead_score,
|
|
644
|
+
"count": len(dead),
|
|
645
|
+
"ratio": round(dead_ratio, 3),
|
|
646
|
+
},
|
|
647
|
+
"import_cycles": {
|
|
648
|
+
"score": cycle_score,
|
|
649
|
+
"count": len(cycles),
|
|
650
|
+
},
|
|
651
|
+
"coupling": {
|
|
652
|
+
"score": coupling_score,
|
|
653
|
+
"high_risk_hotspots": len(high_risk),
|
|
654
|
+
},
|
|
655
|
+
},
|
|
656
|
+
"details": {
|
|
657
|
+
"total_nodes": total_nodes,
|
|
658
|
+
"total_files": total_files,
|
|
659
|
+
"dead_code_count": len(dead),
|
|
660
|
+
"dead_code_ratio": round(dead_ratio, 3),
|
|
661
|
+
"import_cycles": len(cycles),
|
|
662
|
+
"high_risk_hotspots": len(high_risk),
|
|
663
|
+
},
|
|
664
|
+
}
|
|
665
|
+
|
|
666
|
+
# ------------------------------------------------------------------
|
|
667
|
+
# Dependency map
|
|
668
|
+
# ------------------------------------------------------------------
|
|
669
|
+
def dependency_map(self) -> dict:
    """File-level dependency graph for the entire codebase.

    Returns ``{"file_dependencies": {file_path: [dep_file, ...]}}`` with
    one sorted entry per indexed file, derived from IMPORTS/CALLS edges.
    Self-references are excluded.
    """
    file_nodes = self.store.get_all_nodes(type_filter="file")
    all_nodes = self.store.get_all_nodes()
    all_edges = self.store.get_all_edges()

    # Build multiple lookup indices to resolve edge targets
    node_by_id: dict[str, dict] = {}
    nodes_by_name: dict[str, list[dict]] = defaultdict(list)
    nodes_by_qname: dict[str, list[dict]] = defaultdict(list)
    for n in all_nodes:
        node_by_id[n["id"]] = n
        nodes_by_name[n["name"]].append(n)
        if n["qualified_name"] != n["name"]:
            nodes_by_qname[n["qualified_name"]].append(n)

    # Also map dotted import targets (e.g. "codebrain.indexer.full_index")
    # to file paths by converting dots to path separators
    file_paths = {n["file_path"] for n in file_nodes}

    def _resolve_target_files(target: str) -> list[str]:
        """Resolve an edge target to file path(s), most specific tier first."""
        # 1. Exact node ID
        node = node_by_id.get(target)
        if node:
            return [node["file_path"]]
        # 2. Qualified name
        matches = nodes_by_qname.get(target)
        if matches:
            return [m["file_path"] for m in matches]
        # 3. Simple name (may match multiple nodes)
        matches = nodes_by_name.get(target)
        if matches:
            return [m["file_path"] for m in matches]
        # 4. Dotted import path → try converting to file path, longest
        # prefix first. e.g. "codebrain.indexer.full_index" → "codebrain/indexer.py"
        parts = target.split(".")
        for i in range(len(parts), 0, -1):
            candidate = "/".join(parts[:i]) + ".py"
            if candidate in file_paths:
                return [candidate]
            # Also try as package __init__.py
            candidate_pkg = "/".join(parts[:i]) + "/__init__.py"
            if candidate_pkg in file_paths:
                return [candidate_pkg]
        return []

    # Build edges-by-source for IMPORTS/CALLS
    edges_by_file: dict[str, set[str]] = defaultdict(set)
    for e in all_edges:
        if e["type"] in ("IMPORTS", "CALLS"):
            src_file = e["file_path"]
            for target_file in _resolve_target_files(e["target"]):
                # Drop self-edges so a file never depends on itself.
                if target_file != src_file:
                    edges_by_file[src_file].add(target_file)

    deps: dict[str, list[str]] = {}
    for fn in file_nodes:
        deps[fn["file_path"]] = sorted(edges_by_file.get(fn["file_path"], set()))

    return {"file_dependencies": deps}
|
|
730
|
+
|
|
731
|
+
# ------------------------------------------------------------------
|
|
732
|
+
# Change impact summary
|
|
733
|
+
# ------------------------------------------------------------------
|
|
734
|
+
def change_summary(self, file_path: str) -> dict:
|
|
735
|
+
"""If this file changes, what is the blast radius?"""
|
|
736
|
+
nodes = self.store.get_nodes_by_file(file_path)
|
|
737
|
+
all_affected: set[str] = set()
|
|
738
|
+
all_impacted_nodes: list[dict] = []
|
|
739
|
+
|
|
740
|
+
# Pre-fetch all nodes for ID->file resolution
|
|
741
|
+
all_nodes_list = self.store.get_all_nodes()
|
|
742
|
+
node_by_id = {n["id"]: n for n in all_nodes_list}
|
|
743
|
+
|
|
744
|
+
for node in nodes:
|
|
745
|
+
if node["type"] == "file":
|
|
746
|
+
continue
|
|
747
|
+
impacted = self.engine.impact_of_change(node["id"], max_depth=5)
|
|
748
|
+
for entry in impacted:
|
|
749
|
+
target = node_by_id.get(entry["node_id"])
|
|
750
|
+
if target and target["file_path"] != file_path:
|
|
751
|
+
all_affected.add(target["file_path"])
|
|
752
|
+
all_impacted_nodes.append({
|
|
753
|
+
"node_id": entry["node_id"],
|
|
754
|
+
"depth": entry["depth"],
|
|
755
|
+
"via": entry["via"],
|
|
756
|
+
})
|
|
757
|
+
|
|
758
|
+
return {
|
|
759
|
+
"file_path": file_path,
|
|
760
|
+
"affected_file_count": len(all_affected),
|
|
761
|
+
"affected_files": sorted(all_affected),
|
|
762
|
+
"impacted_nodes": all_impacted_nodes,
|
|
763
|
+
}
|
|
764
|
+
|
|
765
|
+
# ------------------------------------------------------------------
|
|
766
|
+
# Narrative generation
|
|
767
|
+
# ------------------------------------------------------------------
|
|
768
|
+
def system_narrative(self) -> dict:
|
|
769
|
+
"""Generate a natural language narrative for the entire codebase.
|
|
770
|
+
|
|
771
|
+
Combines system_overview, health_score, and risk_hotspots into
|
|
772
|
+
human-readable summaries with relative comparisons.
|
|
773
|
+
"""
|
|
774
|
+
overview = self.system_overview()
|
|
775
|
+
hotspots = self.risk_hotspots(top_n=50)
|
|
776
|
+
health = self.health_score(hotspots=hotspots)
|
|
777
|
+
|
|
778
|
+
stats = overview["stats"]
|
|
779
|
+
packages = overview["packages"]
|
|
780
|
+
pkg_names = sorted(packages.keys())
|
|
781
|
+
total_files = stats.get("files", 0)
|
|
782
|
+
|
|
783
|
+
# Size characterization
|
|
784
|
+
if total_files < 20:
|
|
785
|
+
size = "small"
|
|
786
|
+
elif total_files < 200:
|
|
787
|
+
size = "medium-sized"
|
|
788
|
+
else:
|
|
789
|
+
size = "large"
|
|
790
|
+
|
|
791
|
+
summary = (
|
|
792
|
+
f"This is a {size} project with {total_files} files "
|
|
793
|
+
f"and {stats.get('nodes', 0)} symbols across "
|
|
794
|
+
f"{len(pkg_names)} package{'s' if len(pkg_names) != 1 else ''}"
|
|
795
|
+
f" ({', '.join(pkg_names)})."
|
|
796
|
+
)
|
|
797
|
+
|
|
798
|
+
# Architecture: identify the largest package
|
|
799
|
+
arch_parts = []
|
|
800
|
+
if packages:
|
|
801
|
+
largest_pkg = max(packages.items(), key=lambda x: x[1]["node_count"])
|
|
802
|
+
for pkg_name in pkg_names:
|
|
803
|
+
pkg = packages[pkg_name]
|
|
804
|
+
part = f"{pkg_name} ({pkg['file_count']} files, {pkg['node_count']} symbols)"
|
|
805
|
+
if pkg_name == largest_pkg[0] and len(pkg_names) > 1:
|
|
806
|
+
part += " — the largest package"
|
|
807
|
+
arch_parts.append(part)
|
|
808
|
+
architecture = ". ".join(arch_parts) + "." if arch_parts else ""
|
|
809
|
+
|
|
810
|
+
# Backbone: top 3 most-depended-on files from hotspots
|
|
811
|
+
backbone = []
|
|
812
|
+
if hotspots:
|
|
813
|
+
top = hotspots[:3]
|
|
814
|
+
backbone_names = [h["file_path"].rsplit("/", 1)[-1] for h in top]
|
|
815
|
+
summary += (
|
|
816
|
+
f" The backbone modules are {', '.join(backbone_names)}"
|
|
817
|
+
" — most of the codebase depends on them."
|
|
818
|
+
)
|
|
819
|
+
|
|
820
|
+
# Health summary with relative context
|
|
821
|
+
grade = health["grade"]
|
|
822
|
+
score = health["score"]
|
|
823
|
+
details = health["details"]
|
|
824
|
+
dims = health.get("dimensions", {})
|
|
825
|
+
health_parts = [f"Health: {score}/100 (grade {grade})."]
|
|
826
|
+
dead_ratio = details.get("dead_code_ratio", 0)
|
|
827
|
+
if dead_ratio > 0.1:
|
|
828
|
+
health_parts.append(
|
|
829
|
+
f"Dead code is {dead_ratio:.0%} of symbols — higher than ideal (aim for <5%)."
|
|
830
|
+
)
|
|
831
|
+
elif dead_ratio > 0:
|
|
832
|
+
health_parts.append(f"Dead code is low at {dead_ratio:.0%} — good hygiene.")
|
|
833
|
+
cycles = details.get("import_cycles", 0)
|
|
834
|
+
if cycles > 0:
|
|
835
|
+
health_parts.append(f"{cycles} import cycle(s) — consider refactoring to improve layering.")
|
|
836
|
+
hotspot_count = details.get("high_risk_hotspots", 0)
|
|
837
|
+
if hotspot_count > 3:
|
|
838
|
+
health_parts.append(
|
|
839
|
+
f"{hotspot_count} high-risk hotspots — more than typical. Reduce coupling where possible."
|
|
840
|
+
)
|
|
841
|
+
health_summary = " ".join(health_parts)
|
|
842
|
+
|
|
843
|
+
# Key components with why
|
|
844
|
+
key_components = []
|
|
845
|
+
for h in hotspots[:5]:
|
|
846
|
+
risk_level = "high" if h["risk_score"] > 20 else ("medium" if h["risk_score"] > 5 else "low")
|
|
847
|
+
pct = h["affected_files"] / max(total_files, 1) * 100
|
|
848
|
+
key_components.append({
|
|
849
|
+
"name": h["name"],
|
|
850
|
+
"role": h["type"],
|
|
851
|
+
"risk": risk_level,
|
|
852
|
+
"why": (
|
|
853
|
+
f"{h['direct_dependents']} dependents, affects {h['affected_files']} files "
|
|
854
|
+
f"({pct:.0f}% of codebase)"
|
|
855
|
+
),
|
|
856
|
+
})
|
|
857
|
+
|
|
858
|
+
# Recommendations
|
|
859
|
+
recommendations = []
|
|
860
|
+
if details["dead_code_count"] > 5:
|
|
861
|
+
recommendations.append(
|
|
862
|
+
f"Remove or document {details['dead_code_count']} potentially unused symbols."
|
|
863
|
+
)
|
|
864
|
+
if details.get("import_cycles", 0) > 0:
|
|
865
|
+
recommendations.append(
|
|
866
|
+
f"Resolve {details['import_cycles']} import cycle(s) to improve layering."
|
|
867
|
+
)
|
|
868
|
+
if hotspot_count > 3:
|
|
869
|
+
recommendations.append(
|
|
870
|
+
"Reduce coupling on high-risk hotspots to lower change risk."
|
|
871
|
+
)
|
|
872
|
+
if not recommendations:
|
|
873
|
+
recommendations.append("Codebase is in good shape. Keep monitoring hotspots.")
|
|
874
|
+
|
|
875
|
+
return {
|
|
876
|
+
"summary": summary,
|
|
877
|
+
"architecture": architecture,
|
|
878
|
+
"health_summary": health_summary,
|
|
879
|
+
"key_components": key_components,
|
|
880
|
+
"recommendations": recommendations,
|
|
881
|
+
}
|
|
882
|
+
|
|
883
|
+
    def module_narrative(self, file_path: str) -> dict:
        """Generate a natural language narrative for a single module.

        Uses relative comparisons to codebase averages for context.
        Returns the error dict from module_view unchanged when the file
        is unknown.
        """
        view = self.module_view(file_path)
        if "error" in view:
            return view

        role = view.get("role", "module")
        symbols = view.get("symbols", [])
        imports = view.get("imports", [])
        imported_by = view.get("imported_by", [])
        exports = view.get("exports", [])
        coupling = view.get("coupling", {})

        # Compute codebase baselines for relative comparisons:
        # symbols-per-file counts over every non-file node.
        all_nodes = self.store.get_all_nodes()
        symbols_per_file: dict[str, int] = {}
        for n in all_nodes:
            fp = n.get("file_path", "")
            if fp and n["type"] != "file":
                symbols_per_file[fp] = symbols_per_file.get(fp, 0) + 1
        avg_symbols = sum(symbols_per_file.values()) / max(len(symbols_per_file), 1)
        this_count = len(symbols)

        file_name = file_path.split("/")[-1]
        classes = [s for s in symbols if s["type"] == "class"]
        functions = [s for s in symbols if s["type"] in ("function", "method")]

        # Summary with relative size: only annotate when notably above
        # (>2x) or below (<0.5x) the average.
        ratio = this_count / max(avg_symbols, 1)
        if ratio > 2:
            size_note = f" — {ratio:.1f}x the codebase average ({avg_symbols:.0f})"
        elif ratio < 0.5 and avg_symbols > 0:
            size_note = f" — lightweight (codebase average: {avg_symbols:.0f})"
        else:
            size_note = ""
        summary = (
            f"{file_name} is a {role.replace('_', ' ')} module with "
            f"{this_count} symbols ({len(classes)} classes, {len(functions)} functions){size_note}."
        )

        # Importance with rank.
        # NOTE(review): the ranking below calls module_view for only the
        # first 50 files of symbols_per_file (dict insertion order), so on
        # larger codebases the rank is computed against a sample and may
        # be inaccurate — confirm whether this truncation is intentional
        # (it looks like a performance cap).
        incoming = coupling.get("incoming", 0)
        all_incoming = sorted(
            [v.get("coupling", {}).get("incoming", 0)
             for v in [self.module_view(fp) for fp in list(symbols_per_file.keys())[:50]]
             if "error" not in v],
            reverse=True,
        )
        # index() finds the first (best) position of this incoming count,
        # so ties share the better rank.
        rank = (all_incoming.index(incoming) + 1) if incoming in all_incoming else len(all_incoming)
        total_modules = len(symbols_per_file)

        if rank == 1 and incoming > 0:
            importance = f"Most depended-on file — {incoming} files depend on it. Changes here have the highest blast radius."
        elif rank <= 3 and incoming > 0:
            importance = f"Ranked #{rank} by dependents — {incoming} files depend on it. This is load-bearing infrastructure."
        elif incoming > 0:
            importance = f"{incoming} file(s) depend on it (ranked #{rank} of {total_modules})."
        else:
            importance = "No other files depend on this module. Changes here are low-risk."

        # Dependencies narrative: imports first, then importers, each
        # truncated to 10 entries with a "+N more" suffix.
        if imports:
            dep_str = f"Imports: {', '.join(imports[:10])}"
            if len(imports) > 10:
                dep_str += f" (+{len(imports) - 10} more)"
        else:
            dep_str = "No imports."
        if imported_by:
            dep_str += f". Imported by: {', '.join(imported_by[:10])}"
            if len(imported_by) > 10:
                dep_str += f" (+{len(imported_by) - 10} more)"
        dependencies = dep_str + "."

        # Exports summary
        if exports:
            exports_summary = f"Exports {len(exports)} symbols: {', '.join(exports[:10])}."
        else:
            exports_summary = "No exported symbols."

        # Risks with context (thresholds: >10 dependents, coupling
        # score > 20 — heuristic cutoffs).
        risks = []
        if incoming > 10:
            risks.append(f"High coupling: {incoming} dependents — more than 90% of modules.")
        if coupling.get("score", 0) > 20:
            risks.append(f"High total coupling score ({coupling['score']}). Consider splitting this module.")
        if not risks:
            risks.append("No significant structural risks.")

        top_callers = ", ".join(imported_by[:5]) if imported_by else "None"

        return {
            "summary": summary,
            "importance": importance,
            "dependencies": dependencies,
            "exports_summary": exports_summary,
            "risks": risks,
            "top_callers": top_callers,
        }
|
984
|
+
|
|
985
|
+
    def symbol_narrative(self, node_id: str) -> dict:
        """Generate a natural language narrative for a single symbol.

        Includes blast radius as percentage of codebase and relative comparisons.
        ``node_id`` may also be a symbol name; it is resolved via the
        engine when no exact node id matches.
        """
        node = self.store.get_node(node_id)
        if not node:
            # Try resolving by name; first match wins.
            resolved = self.engine.resolve_node(node_id)
            if resolved:
                node = resolved[0]
                node_id = node["id"]
            else:
                return {"error": f"Symbol not found: {node_id}"}

        name = node["name"]
        sym_type = node["type"]

        # Get callers, split by edge type.
        reverse_deps = self.engine.get_reverse_dependencies(node_id)
        callers = [d for d in reverse_deps if d["edge_type"] == "CALLS"]
        importers = [d for d in reverse_deps if d["edge_type"] == "IMPORTS"]

        # Impact + blast radius percentage.
        # NOTE(review): the "::" split assumes impacted node ids look like
        # "<file_path>::<symbol>" — confirm against the store's id scheme.
        impacted = self.engine.impact_of_change(node_id, max_depth=3)
        affected_files = {e.get("node_id", "").split("::")[0] for e in impacted}
        affected_files.discard("")
        affected_files.discard(node["file_path"])

        all_nodes = self.store.get_all_nodes()
        total_files = len({n["file_path"] for n in all_nodes if n.get("file_path")})
        blast_pct = len(affected_files) / max(total_files, 1) * 100

        # Summary: one line, plus the first docstring line if present.
        summary = f"{name} is a {sym_type} defined in {node['file_path']}."
        if node.get("docstring"):
            summary += f" {node['docstring'].split(chr(10))[0]}"

        # Importance with blast radius percentage (>5 dependents = high).
        total_deps = len(callers) + len(importers)
        if total_deps > 5:
            importance = (
                f"High — {total_deps} direct dependents, affects {len(affected_files)} files "
                f"({blast_pct:.0f}% of codebase)."
            )
            if blast_pct > 30:
                importance += " This is a foundational symbol — modify with extreme care."
        elif total_deps > 0:
            importance = f"Moderate — {total_deps} direct dependent(s), affects {blast_pct:.0f}% of codebase."
        else:
            importance = "Low — no known dependents. Changes here are safe."

        # Usage
        caller_files = {c["file_path"] for c in callers}
        if caller_files:
            usage = f"Called from {len(callers)} location(s) across {len(caller_files)} file(s)."
        else:
            usage = "No known call sites."

        # Callers summary — group by file with counts, top 5 files by
        # call count.
        callers_by_file: dict[str, int] = defaultdict(int)
        for c in callers:
            callers_by_file[c["file_path"]] += 1
        top_caller_files = sorted(callers_by_file.items(), key=lambda x: -x[1])[:5]

        if top_caller_files:
            caller_detail = [f"{fp} ({count}x)" for fp, count in top_caller_files]
            callers_summary = f"Called by: {', '.join(caller_detail)}."
            if len(callers) > 5:
                callers_summary += f" (+{len(callers) - 5} more)"
        else:
            callers_summary = "No known callers."

        caller_files_list = [{"file": fp, "count": ct} for fp, ct in top_caller_files]

        # Dependencies: outgoing CALLS targets, capped at 10.
        outgoing = self.store.get_edges_from(node_id)
        call_targets = [e["target"] for e in outgoing if e["type"] == "CALLS"]
        if call_targets:
            dependencies = f"Calls: {', '.join(call_targets[:10])}."
        else:
            dependencies = "No outgoing calls."

        return {
            "summary": summary,
            "importance": importance,
            "usage": usage,
            "callers_summary": callers_summary,
            "caller_files": caller_files_list,
            "dependencies": dependencies,
        }
|
1076
|
+
|
|
1077
|
+
# ------------------------------------------------------------------
|
|
1078
|
+
# Codebase anatomy
|
|
1079
|
+
# ------------------------------------------------------------------
|
|
1080
|
+
def codebase_anatomy(self) -> dict:
|
|
1081
|
+
"""Subsystem map for new developers — the 'truck parts' view.
|
|
1082
|
+
|
|
1083
|
+
Groups files into named subsystems with descriptions, importance
|
|
1084
|
+
rankings, and inter-subsystem connections.
|
|
1085
|
+
"""
|
|
1086
|
+
overview = self.system_overview()
|
|
1087
|
+
all_nodes = self.store.get_all_nodes()
|
|
1088
|
+
all_edges = self.store.get_all_edges()
|
|
1089
|
+
hotspots = self.risk_hotspots(top_n=50)
|
|
1090
|
+
|
|
1091
|
+
subsystems = self._group_into_subsystems(all_nodes, overview)
|
|
1092
|
+
connections = self._compute_subsystem_connections(subsystems, all_nodes, all_edges)
|
|
1093
|
+
self._score_subsystem_importance(subsystems, hotspots, connections)
|
|
1094
|
+
|
|
1095
|
+
for sub in subsystems:
|
|
1096
|
+
sub["description"] = self._describe_subsystem(sub, connections)
|
|
1097
|
+
|
|
1098
|
+
purpose = self._derive_purpose(overview, subsystems)
|
|
1099
|
+
|
|
1100
|
+
return {
|
|
1101
|
+
"purpose": purpose,
|
|
1102
|
+
"subsystems": subsystems,
|
|
1103
|
+
"connections": connections,
|
|
1104
|
+
"entry_points": overview.get("entry_points", []),
|
|
1105
|
+
"stats": overview.get("stats", {}),
|
|
1106
|
+
}
|
|
1107
|
+
|
|
1108
|
+
# ------------------------------------------------------------------
|
|
1109
|
+
# Anatomy helpers (private)
|
|
1110
|
+
# ------------------------------------------------------------------
|
|
1111
|
+
|
|
1112
|
+
# Known directory → display name mappings
|
|
1113
|
+
    # Maps a lowercase last-path-segment (directory or module stem) to a
    # human-friendly subsystem title; consulted by _subsystem_name().
    # Keys must be lowercase — lookups lowercase the stem first.
    _KNOWN_NAMES: dict[str, str] = {
        "graph": "Graph Engine",
        "parser": "Parser System",
        "web": "Web UI",
        "api": "REST API",
        "cli": "CLI Interface",
        "mcp": "MCP Server",
        "tests": "Test Suite",
        "test": "Test Suite",
    }
|
|
1123
|
+
|
|
1124
|
+
    def _group_into_subsystems(
        self, all_nodes: list[dict], overview: dict
    ) -> list[dict]:
        """Group files into subsystems using a 3-tier algorithm.

        Tiers: (1) group by directory (depth ≤ 2); tests are split out
        first; (2) merge tiny groups (<2 files) into role-based buckets;
        (3) promote large files to their own subsystem. The result is
        capped at ~15 subsystems, with the smallest merged into "Other".
        """
        # Build per-file info: file_path → list of non-file nodes
        file_nodes: dict[str, list[dict]] = defaultdict(list)
        for node in all_nodes:
            fp = node.get("file_path", "")
            if not fp:
                continue
            if node["type"] == "file":
                # ensure the file key exists even with no symbols
                if fp not in file_nodes:
                    file_nodes[fp] = []
            else:
                file_nodes[fp].append(node)

        # Tier 1: directory-based grouping by parent directory at depth 2
        dir_groups: dict[str, list[str]] = defaultdict(list)
        for fp in file_nodes:
            parts = fp.split("/")
            # Use the directory path (excluding filename), capped at depth 2
            dir_parts = parts[:-1] if len(parts) > 1 else []
            if len(dir_parts) >= 2:
                key = "/".join(dir_parts[:2])
            elif dir_parts:
                key = dir_parts[0]
            else:
                key = "(root)"
            dir_groups[key].append(fp)

        # Tier 4: separate test files into a single group
        test_files: list[str] = []
        non_test_groups: dict[str, list[str]] = {}
        for key, files in dir_groups.items():
            test = [f for f in files if is_test_file(f)]
            non_test = [f for f in files if not is_test_file(f)]
            if test:
                test_files.extend(test)
                # Keep the group only if any non-test files remain.
                if non_test:
                    non_test_groups[key] = non_test
            elif not test:
                # Key with no files left (shouldn't happen, but safe)
                non_test_groups[key] = files
        dir_groups = non_test_groups

        # Tier 2: merge small groups (<2 files) into role-based buckets
        role_merge: dict[str, list[str]] = defaultdict(list)
        final_groups: dict[str, list[str]] = {}
        for key, files in dir_groups.items():
            if len(files) < 2:
                for fp in files:
                    syms = file_nodes.get(fp, [])
                    sym_dicts = [{"name": s["name"], "type": s["type"]} for s in syms]
                    role = self._infer_module_role(fp, sym_dicts, 0, 0)
                    role_merge[role].append(fp)
            else:
                final_groups[key] = files

        # Tier 3: singleton promotion — large files get their own subsystem
        # First, handle role-merged files: >= 5 symbols stands alone,
        # smaller files fall into the "_utilities" catch-all.
        for role, files in role_merge.items():
            for fp in files:
                sym_count = len(file_nodes.get(fp, []))
                if sym_count >= 5:
                    # Key is the file path itself; slug/name derive from it.
                    final_groups[fp] = [fp]
                else:
                    final_groups.setdefault("_utilities", []).append(fp)

        # Also promote large files from big groups (>5 files) that have
        # many symbols (>= 15) — these are major modules worthy of their own subsystem
        PROMOTE_THRESHOLD = 15
        # Snapshot keys: the loop inserts new entries into final_groups.
        for key in list(final_groups.keys()):
            files = final_groups[key]
            if key.startswith("_") or len(files) <= 3:
                continue
            promoted = []
            kept = []
            for fp in files:
                sym_count = len(file_nodes.get(fp, []))
                if sym_count >= PROMOTE_THRESHOLD:
                    promoted.append(fp)
                else:
                    kept.append(fp)
            # Only split when both sides are non-empty; promoting *all*
            # files would just rename the group.
            if promoted and kept:
                final_groups[key] = kept
                for fp in promoted:
                    final_groups[fp] = [fp]

        if test_files:
            final_groups["_tests"] = test_files

        # Cap at ~15 subsystems: merge smallest (by symbol count) into "Other"
        MAX_SUBSYSTEMS = 15
        if len(final_groups) > MAX_SUBSYSTEMS:
            by_weight = sorted(
                final_groups.items(),
                key=lambda x: sum(len(file_nodes.get(f, [])) for f in x[1]),
            )
            # Keep the heaviest MAX-1 groups; the rest collapse into
            # "_other" so the total stays at MAX_SUBSYSTEMS.
            keep = dict(by_weight[-(MAX_SUBSYSTEMS - 1):])
            other_files: list[str] = []
            for key, files in by_weight[:len(by_weight) - MAX_SUBSYSTEMS + 1]:
                other_files.extend(files)
            if other_files:
                keep["_other"] = other_files
            final_groups = keep

        # Build subsystem dicts
        subsystems: list[dict] = []
        for key, files in sorted(final_groups.items()):
            slug = self._subsystem_slug(key)
            name = self._subsystem_name(key)

            # Collect symbols in this subsystem
            symbols: list[dict] = []
            for fp in files:
                symbols.extend(file_nodes.get(fp, []))

            # Determine dominant role (most common per-file role)
            roles = Counter()
            for fp in files:
                sym_dicts = [{"name": s["name"], "type": s["type"]} for s in file_nodes.get(fp, [])]
                r = self._infer_module_role(fp, sym_dicts, 0, 0)
                roles[r] += 1
            dominant_role = roles.most_common(1)[0][0] if roles else "module"

            # Map role to category
            ROLE_MAP = {
                "core_library": "core",
                "domain_model": "core",
                "data_model": "core",
                "orchestrator": "interface",
                "entry_point": "interface",
                "configuration": "infrastructure",
                "utility": "infrastructure",
                "package_init": "infrastructure",
                "test": "peripheral",
            }
            role_cat = ROLE_MAP.get(dominant_role, "peripheral")

            # Top symbols by name (alphabetical, first five)
            top_syms = sorted(
                [s for s in symbols if s["type"] in ("class", "function")],
                key=lambda s: s["name"],
            )[:5]

            # Find entry points in this subsystem
            eps = [
                ep for ep in overview.get("entry_points", [])
                if ep.get("file_path") in set(files)
            ]

            # importance/importance_label/description/depends_on/depended_by
            # are placeholders filled in by the scoring/connection passes.
            subsystems.append({
                "name": name,
                "slug": slug,
                "packages": sorted(set(
                    "/".join(fp.split("/")[:2]) if "/" in fp else fp
                    for fp in files
                )),
                "files": sorted(files),
                "file_count": len(files),
                "symbol_count": len(symbols),
                "role": role_cat,
                "importance": 0.0,
                "importance_label": "peripheral",
                "description": "",
                "depends_on": [],
                "depended_by": [],
                "entry_points": eps,
                "top_symbols": [s["name"] for s in top_syms],
                "risk_score": 0.0,
            })

        return subsystems
|
1298
|
+
|
|
1299
|
+
@staticmethod
|
|
1300
|
+
def _subsystem_slug(key: str) -> str:
|
|
1301
|
+
"""Derive a URL-safe slug from a grouping key."""
|
|
1302
|
+
if key == "_utilities":
|
|
1303
|
+
return "utilities"
|
|
1304
|
+
if key == "_tests":
|
|
1305
|
+
return "tests"
|
|
1306
|
+
if key == "_other":
|
|
1307
|
+
return "other"
|
|
1308
|
+
# Use last path segment, lowercase
|
|
1309
|
+
return key.split("/")[-1].removesuffix(".py").lower().replace(" ", "-")
|
|
1310
|
+
|
|
1311
|
+
def _subsystem_name(self, key: str) -> str:
|
|
1312
|
+
"""Derive a human-readable name from a grouping key."""
|
|
1313
|
+
if key == "_utilities":
|
|
1314
|
+
return "Utilities"
|
|
1315
|
+
if key == "_tests":
|
|
1316
|
+
return "Test Suite"
|
|
1317
|
+
if key == "_other":
|
|
1318
|
+
return "Other"
|
|
1319
|
+
|
|
1320
|
+
last = key.split("/")[-1].removesuffix(".py")
|
|
1321
|
+
# Check known mappings
|
|
1322
|
+
if last.lower() in self._KNOWN_NAMES:
|
|
1323
|
+
return self._KNOWN_NAMES[last.lower()]
|
|
1324
|
+
# Title case
|
|
1325
|
+
return last.replace("_", " ").replace("-", " ").title()
|
|
1326
|
+
|
|
1327
|
+
    def _compute_subsystem_connections(
        self,
        subsystems: list[dict],
        all_nodes: list[dict],
        all_edges: list[dict],
    ) -> list[dict]:
        """Count inter-subsystem edges to build connection list.

        Side effect: fills in each subsystem's ``depends_on`` and
        ``depended_by`` lists in place. Returns connection dicts sorted
        by descending strength.
        """
        # Map file_path → subsystem slug
        file_to_slug: dict[str, str] = {}
        for sub in subsystems:
            for fp in sub["files"]:
                file_to_slug[fp] = sub["slug"]

        # Map node id/name/qname → file_path for target resolution.
        # Later nodes overwrite earlier ones on name collisions — bare
        # names are therefore best-effort.
        node_file: dict[str, str] = {}
        for n in all_nodes:
            fp = n.get("file_path", "")
            if fp:
                node_file[n["id"]] = fp
                node_file[n["name"]] = fp
                if n["qualified_name"] != n["name"]:
                    node_file[n["qualified_name"]] = fp

        # Count edges between subsystems (CALLS/IMPORTS only;
        # intra-subsystem edges are skipped).
        edge_counts: dict[tuple[str, str], int] = Counter()
        for e in all_edges:
            if e["type"] not in ("CALLS", "IMPORTS"):
                continue
            src_file = e.get("file_path", "")
            src_slug = file_to_slug.get(src_file)
            if not src_slug:
                continue
            tgt_file = node_file.get(e["target"], "")
            tgt_slug = file_to_slug.get(tgt_file)
            if not tgt_slug or tgt_slug == src_slug:
                continue
            edge_counts[(src_slug, tgt_slug)] += 1

        # Build depends_on / depended_by on subsystems
        slug_lookup = {s["slug"]: s for s in subsystems}
        for (from_slug, to_slug), count in edge_counts.items():
            if to_slug not in slug_lookup.get(from_slug, {}).get("depends_on", []):
                slug_lookup[from_slug]["depends_on"].append(to_slug)
            if from_slug not in slug_lookup.get(to_slug, {}).get("depended_by", []):
                slug_lookup[to_slug]["depended_by"].append(from_slug)

        # Sort depends_on / depended_by (dedup defensively via set)
        for sub in subsystems:
            sub["depends_on"] = sorted(set(sub["depends_on"]))
            sub["depended_by"] = sorted(set(sub["depended_by"]))

        connections = [
            {
                "from": from_slug,
                "to": to_slug,
                "strength": count,
                "label": f"{count} calls + imports",
            }
            for (from_slug, to_slug), count in sorted(
                edge_counts.items(), key=lambda x: -x[1]
            )
        ]
        return connections
|
1390
|
+
|
|
1391
|
+
def _score_subsystem_importance(
|
|
1392
|
+
self,
|
|
1393
|
+
subsystems: list[dict],
|
|
1394
|
+
hotspots: list[dict],
|
|
1395
|
+
connections: list[dict],
|
|
1396
|
+
) -> None:
|
|
1397
|
+
"""Score each subsystem's importance (0.0–1.0) and assign labels."""
|
|
1398
|
+
total_subsystems = max(len(subsystems), 1)
|
|
1399
|
+
|
|
1400
|
+
# Build hotspot risk per file
|
|
1401
|
+
risk_by_file: dict[str, float] = defaultdict(float)
|
|
1402
|
+
for h in hotspots:
|
|
1403
|
+
risk_by_file[h["file_path"]] += h["risk_score"]
|
|
1404
|
+
|
|
1405
|
+
raw_scores: list[float] = []
|
|
1406
|
+
for sub in subsystems:
|
|
1407
|
+
incoming = len(sub.get("depended_by", []))
|
|
1408
|
+
risk_agg = sum(risk_by_file.get(fp, 0.0) for fp in sub["files"])
|
|
1409
|
+
sub["risk_score"] = round(risk_agg, 1)
|
|
1410
|
+
sym_count = sub["symbol_count"]
|
|
1411
|
+
has_entry = 1.0 if sub.get("entry_points") else 0.0
|
|
1412
|
+
conn_count = len(sub.get("depends_on", [])) + len(sub.get("depended_by", []))
|
|
1413
|
+
centrality = conn_count / max(total_subsystems, 1)
|
|
1414
|
+
|
|
1415
|
+
score = (
|
|
1416
|
+
incoming * 0.4
|
|
1417
|
+
+ risk_agg * 0.25
|
|
1418
|
+
+ sym_count * 0.15
|
|
1419
|
+
+ has_entry * 0.1
|
|
1420
|
+
+ centrality * 0.1
|
|
1421
|
+
)
|
|
1422
|
+
raw_scores.append(score)
|
|
1423
|
+
|
|
1424
|
+
# Normalize to 0.0–1.0
|
|
1425
|
+
max_raw = max(raw_scores) if raw_scores else 1.0
|
|
1426
|
+
if max_raw == 0:
|
|
1427
|
+
max_raw = 1.0
|
|
1428
|
+
|
|
1429
|
+
for sub, raw in zip(subsystems, raw_scores):
|
|
1430
|
+
normalized = round(raw / max_raw, 2)
|
|
1431
|
+
sub["importance"] = normalized
|
|
1432
|
+
if normalized >= 0.75:
|
|
1433
|
+
sub["importance_label"] = "critical"
|
|
1434
|
+
elif normalized >= 0.50:
|
|
1435
|
+
sub["importance_label"] = "important"
|
|
1436
|
+
elif normalized >= 0.25:
|
|
1437
|
+
sub["importance_label"] = "supporting"
|
|
1438
|
+
else:
|
|
1439
|
+
sub["importance_label"] = "peripheral"
|
|
1440
|
+
|
|
1441
|
+
def _describe_subsystem(self, subsystem: dict, connections: list[dict]) -> str:
|
|
1442
|
+
"""Generate a deterministic one-liner description for a subsystem."""
|
|
1443
|
+
# Try __init__.py docstring from the files
|
|
1444
|
+
for fp in subsystem["files"]:
|
|
1445
|
+
if fp.endswith("__init__.py"):
|
|
1446
|
+
nodes = self.store.get_nodes_by_file(fp)
|
|
1447
|
+
for n in nodes:
|
|
1448
|
+
if n["type"] == "file" and n.get("docstring"):
|
|
1449
|
+
return n["docstring"].split("\n")[0]
|
|
1450
|
+
|
|
1451
|
+
role = subsystem["role"].replace("_", " ").title()
|
|
1452
|
+
sym_count = subsystem["symbol_count"]
|
|
1453
|
+
top = subsystem["top_symbols"][:3]
|
|
1454
|
+
dep_count = len(subsystem.get("depended_by", []))
|
|
1455
|
+
|
|
1456
|
+
top_str = ", ".join(top) if top else "no public symbols"
|
|
1457
|
+
dep_str = f"Depended on by {dep_count} other subsystem{'s' if dep_count != 1 else ''}." if dep_count else "No dependents."
|
|
1458
|
+
|
|
1459
|
+
return f"{role} providing {sym_count} symbols including {top_str}. {dep_str}"
|
|
1460
|
+
|
|
1461
|
+
def _derive_purpose(self, overview: dict, subsystems: list[dict]) -> str:
|
|
1462
|
+
"""Generate a one-line codebase purpose statement."""
|
|
1463
|
+
stats = overview.get("stats", {})
|
|
1464
|
+
file_count = stats.get("files", 0)
|
|
1465
|
+
sub_count = len(subsystems)
|
|
1466
|
+
|
|
1467
|
+
# Find the most important non-utility subsystem
|
|
1468
|
+
real_subs = [s for s in subsystems if s["slug"] not in ("other", "tests", "utilities")]
|
|
1469
|
+
top_sub = max(real_subs, key=lambda s: s["importance"]) if real_subs else (
|
|
1470
|
+
max(subsystems, key=lambda s: s["importance"]) if subsystems else None
|
|
1471
|
+
)
|
|
1472
|
+
center = top_sub["name"] if top_sub else "unknown"
|
|
1473
|
+
|
|
1474
|
+
# Detect dominant language from file extensions
|
|
1475
|
+
all_files = []
|
|
1476
|
+
for sub in subsystems:
|
|
1477
|
+
all_files.extend(sub["files"])
|
|
1478
|
+
py_count = sum(1 for f in all_files if f.endswith(".py"))
|
|
1479
|
+
ts_count = sum(1 for f in all_files if f.endswith((".ts", ".tsx", ".js", ".jsx")))
|
|
1480
|
+
if py_count > ts_count:
|
|
1481
|
+
lang = "Python"
|
|
1482
|
+
elif ts_count > py_count:
|
|
1483
|
+
lang = "TypeScript"
|
|
1484
|
+
elif py_count and ts_count:
|
|
1485
|
+
lang = "Python + TypeScript"
|
|
1486
|
+
else:
|
|
1487
|
+
lang = "source"
|
|
1488
|
+
|
|
1489
|
+
return (
|
|
1490
|
+
f"A {lang} project with {file_count} files organized into "
|
|
1491
|
+
f"{sub_count} subsystem{'s' if sub_count != 1 else ''}, "
|
|
1492
|
+
f"centered around {center}."
|
|
1493
|
+
)
|
|
1494
|
+
|
|
1495
|
+
    # ------------------------------------------------------------------
    # Ask: natural language question answering
    # ------------------------------------------------------------------

    # Order matters: more specific patterns first, broad ones (overview) last.
    # Keys are intent names consumed by answer_question; values are regexes
    # matched against the lowercased question, first match wins.
    _INTENT_PATTERNS = {
        "risk": r"\b(risk\w*|dangerous|fragile|hotspot)\b",
        "health": r"\b(health|score|grade|quality)\b",
        "dead_code": r"\b(dead|unused|unreachable)\b",
        "cycles": r"\b(cycle|circular|import loop)\b",
        "impact": r"\b(impact|affect|change|break|depends?)\b.*\b(\w+)\b",
        "module": r"\b(module|file|package)\b.*\b(does?|about|role|purpose)\b",
        "anatomy": r"\b(anatomy|subsystem|parts?|structure|how.*organized)\b",
        "symbol": r"\b(function|class|method|symbol)\b.*\b(does?|what|where|who)\b",
        "overview": r"\b(overview|summary|what is|describe|about|main parts?|main packages?|main modules?)\b",
    }

    # Filler words excluded by _extract_keywords. All lowercase; note that
    # _extract_keywords only tests words longer than 3 characters, so the
    # 3-letter entries here are effectively inert.
    _STOP_WORDS = frozenset({
        "what", "does", "how", "the", "this", "that", "with", "from",
        "about", "which", "where", "when", "would", "could", "should",
        "have", "been", "being", "will", "they", "them", "their",
        "most", "more", "some", "each", "every", "there", "here",
        "into", "between", "through", "after", "before", "these",
        "those", "other", "than", "very", "just", "only",
    })

    # Canned follow-up questions returned per detected intent; the
    # "general" list also serves as the fallback for unknown intents.
    _SUGGESTIONS: dict[str, list[str]] = {
        "overview": [
            "What are the riskiest parts of the codebase?",
            "What's the health score?",
            "How is the codebase organized?",
        ],
        "module": [
            "What are the riskiest files?",
            "What's the health score?",
        ],
        "risk": [
            "What would break if I changed the riskiest symbol?",
            "What's the health score?",
            "Are there any import cycles?",
        ],
        "health": [
            "What are the riskiest files?",
            "What dead code should I clean up?",
            "Are there any import cycles?",
        ],
        "impact": [
            "What are the riskiest parts of the codebase?",
            "How is the codebase organized?",
        ],
        "anatomy": [
            "What are the main parts of this codebase?",
            "What are the riskiest files?",
            "What's the health score?",
        ],
        "dead_code": [
            "What's the health score?",
            "What are the riskiest files?",
        ],
        "cycles": [
            "What's the health score?",
            "How is the codebase organized?",
        ],
        "symbol": [
            "What are the riskiest parts of the codebase?",
            "What's the health score?",
        ],
        "general": [
            "What are the main parts of this codebase?",
            "What's the health score?",
            "What are the riskiest files?",
        ],
    }
|
|
1568
|
+
|
|
1569
|
+
@staticmethod
|
|
1570
|
+
def _extract_keywords(question: str) -> list[str]:
|
|
1571
|
+
"""Extract meaningful keywords from a question (stop-word filtered + bigrams)."""
|
|
1572
|
+
words = re.findall(r"[a-z_]+", question.lower())
|
|
1573
|
+
words = [w for w in words if len(w) > 3 and w not in ComprehensionEngine._STOP_WORDS]
|
|
1574
|
+
bigrams = [f"{words[i]} {words[i+1]}" for i in range(len(words) - 1)]
|
|
1575
|
+
return (words + bigrams)[:5]
|
|
1576
|
+
|
|
1577
|
+
    def answer_question(self, question: str) -> dict:
        """Answer a natural language question about the codebase.

        Uses regex intent detection to route the question to the right
        structural data source. Fully deterministic — no LLM call.

        Returns:
            {
                "question": str,
                "intent": str,
                "answer": str,
                "data": dict | list,
                "sources": list[str],
                "suggestions": list[str],
            }
        """
        q_lower = question.lower()

        # Detect intent: first matching pattern wins; _INTENT_PATTERNS is
        # ordered most-specific-first, with "overview" last.
        intent = "general"
        for name, pattern in self._INTENT_PATTERNS.items():
            if re.search(pattern, q_lower):
                intent = name
                break

        answer = ""
        data: dict | list = {}
        sources: list[str] = []

        if intent == "overview":
            # High-level stats plus the sorted list of package names.
            overview = self.system_overview()
            stats = overview["stats"]
            pkgs = sorted(overview["packages"].keys())
            answer = (
                f"This codebase has {stats.get('files', 0)} files and "
                f"{stats.get('nodes', 0)} symbols across "
                f"{len(pkgs)} package{'s' if len(pkgs) != 1 else ''}: "
                f"{', '.join(pkgs)}."
            )
            data = overview
            sources = ["system_overview"]

        elif intent == "module":
            # Try to extract a module/file name from the question
            mod_name = self._extract_module_name(question)
            if mod_name:
                view = self.module_view(mod_name)
                if "error" not in view:
                    # Module resolved: answer with its narrative summary.
                    narrative = self.module_narrative(mod_name)
                    answer = narrative.get("summary", f"Module {mod_name} found.")
                    data = view
                    sources = ["module_view", "module_narrative"]
                else:
                    answer = f"Could not find module '{mod_name}'. Try using the full file path."
                    data = {"error": view["error"]}
                    sources = ["module_view"]
            else:
                answer = "Please specify a module or file name in your question."
                data = {}
                sources = []

        elif intent == "risk":
            # Show at most the top five hotspots in the prose answer,
            # but return up to ten in the data payload.
            hotspots = self.risk_hotspots(top_n=10)
            if hotspots:
                lines = []
                for i, h in enumerate(hotspots[:5], 1):
                    lines.append(
                        f"  {i}. {h['name']} ({h['file_path']}) — "
                        f"risk: {h['risk_score']}, {h['direct_dependents']} dependents, "
                        f"{h['affected_files']} affected files"
                    )
                answer = "Top risk hotspots:\n" + "\n".join(lines)
            else:
                answer = "No significant risk hotspots found."
            data = hotspots
            sources = ["risk_hotspots"]

        elif intent == "health":
            health = self.health_score()
            d = health["details"]
            answer = (
                f"Health score: {health['score']}/100 (grade {health['grade']}). "
                f"Dead code: {d['dead_code_count']} symbols ({d['dead_code_ratio']:.0%}). "
                f"Import cycles: {d['import_cycles']}. "
                f"High-risk hotspots: {d['high_risk_hotspots']}."
            )
            data = health
            sources = ["health_score"]

        elif intent == "impact":
            # Try to find a file path first (e.g. "store.py")
            file_path_match = self._extract_module_name(question)
            # Extract a symbol name from the question.
            # Prefer CamelCase / capitalised names (likely real symbols)
            # over lowercase verbs like "depends", "change", "break".
            symbol_name = self._extract_symbol_name(question)
            # Also extract keywords and try each until one resolves
            keywords = self._extract_keywords(question)
            candidates = []
            if symbol_name:
                candidates.append(symbol_name)
            candidates.extend(keywords)

            # Resolve the first candidate that matches a known node;
            # tried_name remembers the last attempt for the error message.
            node = None
            tried_name = None
            for candidate in candidates:
                nodes = self.engine.resolve_node(candidate)
                if nodes:
                    node = nodes[0]
                    tried_name = candidate
                    break
                tried_name = candidate

            if node is not None:
                # Symbol-level impact: walk dependents up to depth 5 and
                # count the distinct files those dependents live in.
                impacted = self.engine.impact_of_change(node["id"], max_depth=5)
                affected_files: set[str] = set()
                all_nodes_list = self.store.get_all_nodes()
                node_by_id = {n["id"]: n for n in all_nodes_list}
                for entry in impacted:
                    t = node_by_id.get(entry["node_id"])
                    if t:
                        affected_files.add(t["file_path"])
                answer = (
                    f"Changing {node['name']} ({node['file_path']}) would impact "
                    f"{len(impacted)} symbols across {len(affected_files)} files."
                )
                data = {"target": node["id"], "impacted_count": len(impacted),
                        "affected_files": sorted(affected_files)}
                sources = ["impact_of_change"]
            elif file_path_match:
                # Fall back to change_summary for file-based impact
                summary = self.change_summary(file_path_match)
                answer = (
                    f"Changing {file_path_match} would affect "
                    f"{summary['affected_file_count']} files."
                )
                data = summary
                sources = ["change_summary"]
            elif tried_name:
                answer = f"No symbol found matching '{tried_name}'."
                data = {}
                sources = ["resolve_node"]
            else:
                answer = "Please mention a symbol name to analyze impact."
                data = {}
                sources = []

        elif intent == "anatomy":
            # Purpose statement plus a bullet per subsystem.
            anatomy = self.codebase_anatomy()
            sub_lines = []
            for sub in anatomy.get("subsystems", []):
                sub_lines.append(
                    f"  - {sub['name']} ({sub['file_count']} files, "
                    f"{sub['symbol_count']} symbols, {sub['importance_label']})"
                )
            answer = (
                anatomy.get("purpose", "Unknown purpose.") + "\n\n"
                "Subsystems:\n" + "\n".join(sub_lines)
            )
            data = anatomy
            sources = ["codebase_anatomy"]

        elif intent == "dead_code":
            # Sample the first five unused symbols for the prose answer.
            dead = self.engine.find_dead_code()
            if dead:
                sample = dead[:5]
                lines = [f"  - {d['name']} ({d['type']}) in {d['file_path']}" for d in sample]
                answer = (
                    f"Found {len(dead)} potentially unused symbols.\n"
                    + "\n".join(lines)
                )
                if len(dead) > 5:
                    answer += f"\n  ... and {len(dead) - 5} more."
            else:
                answer = "No dead code detected."
            data = dead
            sources = ["find_dead_code"]

        elif intent == "cycles":
            cycles = self.engine.detect_cycles()
            if cycles:
                lines = []
                for c in cycles[:5]:
                    # A cycle may be a list of node names or an opaque value.
                    path = " -> ".join(c) if isinstance(c, list) else str(c)
                    lines.append(f"  - {path}")
                answer = f"Found {len(cycles)} import cycle(s):\n" + "\n".join(lines)
                if len(cycles) > 5:
                    answer += f"\n  ... and {len(cycles) - 5} more."
            else:
                answer = "No import cycles detected."
            data = cycles
            sources = ["detect_cycles"]

        elif intent == "symbol":
            # Only the first extracted keyword is tried as a symbol name.
            keywords = self._extract_keywords(question)
            symbol_name = keywords[0] if keywords else None
            if symbol_name:
                nodes = self.engine.resolve_node(symbol_name)
                if nodes:
                    node = nodes[0]
                    narrative = self.symbol_narrative(node["id"])
                    answer = narrative.get("summary", f"Found {node['name']}.")
                    data = narrative
                    sources = ["symbol_narrative"]
                else:
                    answer = f"No symbol found matching '{symbol_name}'."
                    data = {}
                    sources = ["resolve_node"]
            else:
                answer = "Please mention a symbol name in your question."
                data = {}
                sources = []

        else:
            # General: keyword search + overview stats
            keywords = self._extract_keywords(question)
            search_results = []
            for kw in keywords[:5]:
                results = self.engine.search_nodes(kw, limit=5)
                search_results.extend(results)
            # Deduplicate by id
            seen_ids: set[str] = set()
            unique_results: list[dict] = []
            for r in search_results:
                if r["id"] not in seen_ids:
                    seen_ids.add(r["id"])
                    unique_results.append(r)

            overview = self.system_overview()
            stats = overview["stats"]
            if unique_results:
                lines = [f"  - {r['name']} ({r['type']}) in {r['file_path']}" for r in unique_results[:5]]
                answer = (
                    f"Found {len(unique_results)} matching symbols "
                    f"(in a codebase with {stats.get('files', 0)} files):\n"
                    + "\n".join(lines)
                )
            else:
                pkgs = sorted(overview["packages"].keys())
                answer = (
                    f"No specific matches found. This codebase has "
                    f"{stats.get('files', 0)} files and {stats.get('nodes', 0)} symbols "
                    f"across packages: {', '.join(pkgs)}."
                )
            data = {"search_results": unique_results[:15], "stats": stats}
            sources = ["search_nodes", "system_overview"]

        suggestions = self._SUGGESTIONS.get(intent, self._SUGGESTIONS["general"])

        return {
            "question": question,
            "intent": intent,
            "answer": answer,
            "data": data,
            "sources": sources,
            "suggestions": suggestions,
        }
|
|
1834
|
+
|
|
1835
|
+
def _extract_module_name(self, question: str) -> str | None:
|
|
1836
|
+
"""Try to extract a file/module path from the question."""
|
|
1837
|
+
# Look for explicit file paths (e.g. "mypackage/core.py")
|
|
1838
|
+
path_match = re.search(r"[\w/\\]+\.(?:py|ts|tsx|js|jsx)", question)
|
|
1839
|
+
if path_match:
|
|
1840
|
+
return path_match.group(0).replace("\\", "/")
|
|
1841
|
+
|
|
1842
|
+
# Look for quoted names
|
|
1843
|
+
quoted = re.search(r'["\']([^"\']+)["\']', question)
|
|
1844
|
+
if quoted:
|
|
1845
|
+
return quoted.group(1)
|
|
1846
|
+
|
|
1847
|
+
# Look for module-like words after "module" or "file"
|
|
1848
|
+
mod_match = re.search(r"\b(?:module|file)\s+(\w[\w./]*)", question, re.IGNORECASE)
|
|
1849
|
+
if mod_match:
|
|
1850
|
+
return mod_match.group(1)
|
|
1851
|
+
|
|
1852
|
+
return None
|
|
1853
|
+
|
|
1854
|
+
@staticmethod
|
|
1855
|
+
def _extract_symbol_name(question: str) -> str | None:
|
|
1856
|
+
"""Extract a likely symbol name from the question.
|
|
1857
|
+
|
|
1858
|
+
Prefers CamelCase identifiers (e.g. GraphStore, DataProcessor) or
|
|
1859
|
+
names inside quotes. Falls back to ``None`` so the caller can try
|
|
1860
|
+
keyword extraction instead.
|
|
1861
|
+
"""
|
|
1862
|
+
# Quoted names first
|
|
1863
|
+
quoted = re.search(r'["\']([^"\']+)["\']', question)
|
|
1864
|
+
if quoted:
|
|
1865
|
+
return quoted.group(1)
|
|
1866
|
+
|
|
1867
|
+
# CamelCase identifiers (at least two uppercase letters)
|
|
1868
|
+
camel = re.findall(r"\b([A-Z][a-z]+(?:[A-Z][a-z]*)+)\b", question)
|
|
1869
|
+
if camel:
|
|
1870
|
+
return camel[0]
|
|
1871
|
+
|
|
1872
|
+
# Single capitalised word that is NOT a sentence starter
|
|
1873
|
+
# e.g. "What depends on Store?" -> "Store"
|
|
1874
|
+
caps = re.findall(r"(?<!\.\s)(?<!^)\b([A-Z][a-z]{2,})\b", question)
|
|
1875
|
+
if caps:
|
|
1876
|
+
return caps[0]
|
|
1877
|
+
|
|
1878
|
+
return None
|
|
1879
|
+
|
|
1880
|
+
# ------------------------------------------------------------------
|
|
1881
|
+
# Helpers
|
|
1882
|
+
# ------------------------------------------------------------------
|
|
1883
|
+
def _extract_module_name_from_impact(self, question: str) -> str | None:
|
|
1884
|
+
"""Extract a file path from an impact-style question."""
|
|
1885
|
+
path_match = re.search(r"[\w/\\]+\.(?:py|ts|tsx|js|jsx)", question)
|
|
1886
|
+
if path_match:
|
|
1887
|
+
return path_match.group(0).replace("\\", "/")
|
|
1888
|
+
return None
|
|
1889
|
+
|
|
1890
|
+
def _infer_module_role(
|
|
1891
|
+
self, file_path: str, symbols: list[dict], coupling_in: int, coupling_out: int
|
|
1892
|
+
) -> str:
|
|
1893
|
+
"""Infer the architectural role of a module from its structure and coupling.
|
|
1894
|
+
|
|
1895
|
+
Uses a combination of filename conventions, symbol composition, and
|
|
1896
|
+
coupling patterns. Returns a role string. When multiple signals conflict,
|
|
1897
|
+
coupling-based signals take precedence over filename-based ones.
|
|
1898
|
+
"""
|
|
1899
|
+
name = file_path.split("/")[-1]
|
|
1900
|
+
# Remove extension (.py, .ts, .tsx, .js, .jsx)
|
|
1901
|
+
stem = name.rsplit(".", 1)[0] if "." in name else name
|
|
1902
|
+
|
|
1903
|
+
# Unambiguous filename matches
|
|
1904
|
+
if stem == "__init__":
|
|
1905
|
+
return "package_init"
|
|
1906
|
+
if stem == "__main__":
|
|
1907
|
+
return "entry_point"
|
|
1908
|
+
if stem.startswith("test_") or stem.endswith("_test") or stem.endswith(".test"):
|
|
1909
|
+
return "test"
|
|
1910
|
+
|
|
1911
|
+
classes = [s for s in symbols if s["type"] == "class"]
|
|
1912
|
+
functions = [s for s in symbols if s["type"] == "function"]
|
|
1913
|
+
methods = [s for s in symbols if s["type"] == "method"]
|
|
1914
|
+
|
|
1915
|
+
# Coupling-based roles (structural evidence, more reliable than name)
|
|
1916
|
+
if coupling_in > coupling_out * 2 and coupling_in > 3:
|
|
1917
|
+
return "core_library"
|
|
1918
|
+
if coupling_out > coupling_in * 3 and coupling_out > 3:
|
|
1919
|
+
return "orchestrator"
|
|
1920
|
+
if coupling_in == 0 and coupling_out == 0:
|
|
1921
|
+
return "isolated"
|
|
1922
|
+
|
|
1923
|
+
# Filename hints (only for common conventions)
|
|
1924
|
+
if stem in ("config", "settings", "constants", "conf"):
|
|
1925
|
+
return "configuration"
|
|
1926
|
+
if stem in ("utils", "helpers", "util", "common"):
|
|
1927
|
+
return "utility"
|
|
1928
|
+
if stem in ("models", "schemas", "types", "entities"):
|
|
1929
|
+
return "data_model"
|
|
1930
|
+
|
|
1931
|
+
# Composition-based roles
|
|
1932
|
+
if len(classes) > len(functions) and classes:
|
|
1933
|
+
return "domain_model"
|
|
1934
|
+
if functions and not classes:
|
|
1935
|
+
if coupling_in > 0:
|
|
1936
|
+
return "function_library"
|
|
1937
|
+
return "script"
|
|
1938
|
+
|
|
1939
|
+
return "module"
|