codebrain 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. codebrain/__init__.py +3 -0
  2. codebrain/__main__.py +6 -0
  3. codebrain/agent_bridge.py +162 -0
  4. codebrain/analyzer.py +943 -0
  5. codebrain/api.py +578 -0
  6. codebrain/api_models.py +102 -0
  7. codebrain/cli.py +1927 -0
  8. codebrain/comprehension.py +1939 -0
  9. codebrain/config.py +46 -0
  10. codebrain/context.py +276 -0
  11. codebrain/export.py +334 -0
  12. codebrain/graph/__init__.py +0 -0
  13. codebrain/graph/query.py +656 -0
  14. codebrain/graph/schema.py +113 -0
  15. codebrain/graph/store.py +295 -0
  16. codebrain/hook_runner.py +71 -0
  17. codebrain/hooks.py +107 -0
  18. codebrain/indexer.py +450 -0
  19. codebrain/llm.py +676 -0
  20. codebrain/logging.py +42 -0
  21. codebrain/mcp_server.py +1635 -0
  22. codebrain/memory/__init__.py +5 -0
  23. codebrain/memory/store.py +270 -0
  24. codebrain/parser/__init__.py +0 -0
  25. codebrain/parser/base.py +27 -0
  26. codebrain/parser/config_parser.py +228 -0
  27. codebrain/parser/models.py +44 -0
  28. codebrain/parser/python_parser.py +658 -0
  29. codebrain/parser/registry.py +144 -0
  30. codebrain/parser/typescript_parser.py +1189 -0
  31. codebrain/parser/typescript_treesitter.py +535 -0
  32. codebrain/py.typed +0 -0
  33. codebrain/resolver.py +171 -0
  34. codebrain/settings.py +88 -0
  35. codebrain/utils.py +59 -0
  36. codebrain/validator.py +563 -0
  37. codebrain/watcher/__init__.py +0 -0
  38. codebrain/watcher/file_watcher.py +173 -0
  39. codebrain-0.1.0.dist-info/METADATA +360 -0
  40. codebrain-0.1.0.dist-info/RECORD +44 -0
  41. codebrain-0.1.0.dist-info/WHEEL +5 -0
  42. codebrain-0.1.0.dist-info/entry_points.txt +6 -0
  43. codebrain-0.1.0.dist-info/licenses/LICENSE +21 -0
  44. codebrain-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1939 @@
1
+ """Multi-resolution comprehension engine.
2
+
3
+ Generates layered views of a codebase at different zoom levels:
4
+ - System level: what is this codebase, what are its parts
5
+ - Module level: what does this file do, what does it depend on
6
+ - Symbol level: full context for a single function/class
7
+ - Risk level: where are the hotspots, what is fragile
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import os
13
+ import re
14
+ from collections import Counter, defaultdict
15
+
16
+ from codebrain.graph.query import QueryEngine
17
+ from codebrain.graph.store import GraphStore
18
+ from codebrain.utils import is_test_file
19
+
20
+ # Generic names that are too common for bare-name matching in hotspot scoring.
21
+ # Without this filter, e.g. ``MyCache.get`` would collect ALL ``.get()`` callers
22
+ # across the entire codebase, producing wildly inflated risk scores.
23
+ _GENERIC_NAMES = frozenset({
24
+ "get", "set", "delete", "update", "create", "save", "load", "run",
25
+ "put", "post", "patch", "read", "write", "send", "close", "open",
26
+ "add", "remove", "pop", "push", "clear", "reset", "start", "stop",
27
+ "execute", "call", "apply", "handle", "process", "validate",
28
+ "init", "setup", "teardown", "configure",
29
+ "__init__", "__str__", "__repr__", "__eq__", "__hash__",
30
+ "__enter__", "__exit__", "__call__", "__getattr__", "__setattr__",
31
+ "__len__", "__iter__", "__next__", "__contains__",
32
+ })
33
+
34
+
35
+ class ComprehensionEngine:
36
+ """Produces deterministic, multi-resolution comprehension views."""
37
+
38
    def __init__(self, store: GraphStore) -> None:
        """Bind the engine to *store* and build its query helper.

        The QueryEngine wraps the same store and provides graph traversal
        (impact analysis, call chains, dead-code and cycle detection).
        """
        self.store = store
        self.engine = QueryEngine(store)
41
+
42
+ # ------------------------------------------------------------------
43
+ # Unified zoom interface
44
+ # ------------------------------------------------------------------
45
    def zoom(self, target: str | None = None) -> dict:
        """Multi-resolution zoom — like Google Maps for architecture.

        - No target: system level (what is this codebase?)
        - Package name target: package level (what files are in this package?)
        - File path target: module level (what does this file do?)
        - Symbol name target: symbol level (full context for one symbol)

        Each level includes navigation hints to drill down or zoom out.
        Interpretation order matters: file path > package prefix >
        top-level package name > symbol name.
        """
        if target is None:
            return self._zoom_system()

        # Check if target looks like an indexed file path.
        all_nodes = self.store.get_all_nodes()
        file_paths = {n["file_path"] for n in all_nodes}
        # Normalize Windows separators so both path styles match.
        normalized = target.replace("\\", "/")
        if normalized in file_paths or target in file_paths:
            return self._zoom_module(normalized)

        # Check if target is a package (directory prefix matching indexed files).
        pkg_prefix = normalized.rstrip("/") + "/"
        pkg_files = [fp for fp in file_paths if fp.startswith(pkg_prefix)]
        if pkg_files:
            return self._zoom_package(normalized.rstrip("/"), all_nodes)

        # Also check if target matches a top-level package name exactly.
        top_packages = set()
        for fp in file_paths:
            parts = fp.split("/")
            if len(parts) > 1:
                top_packages.add(parts[0])
        if normalized in top_packages:
            return self._zoom_package(normalized, all_nodes)

        # Fall through: try as symbol name (lowest-priority interpretation).
        return self._zoom_symbol(target, all_nodes)
82
+
83
    def _zoom_system(self) -> dict:
        """System-level zoom: overview + narrative + drill-down hints."""
        narrative = self.system_narrative()
        overview = self.system_overview()

        # Discover packages with symbol counts for drill-down hints.
        all_nodes = self.store.get_all_nodes()
        pkg_symbol_counts: dict[str, int] = {}
        pkg_file_counts: dict[str, set[str]] = {}
        for n in all_nodes:
            # A node's package is its top-level directory; files at the
            # repo root group under the pseudo-package "(root)".
            parts = n["file_path"].split("/")
            pkg = parts[0] if len(parts) > 1 else "(root)"
            if n["type"] != "file":
                pkg_symbol_counts[pkg] = pkg_symbol_counts.get(pkg, 0) + 1
            if pkg not in pkg_file_counts:
                pkg_file_counts[pkg] = set()
            pkg_file_counts[pkg].add(n["file_path"])

        # Ten largest packages by symbol count become drill-down targets.
        top_packages = sorted(pkg_symbol_counts.items(), key=lambda x: -x[1])[:10]

        return {
            "level": "system",
            "narrative": narrative,
            "drill_down": [
                {
                    "target": pkg,
                    "symbols": count,
                    "files": len(pkg_file_counts.get(pkg, set())),
                    "hint": f"zoom('{pkg}')",
                }
                for pkg, count in top_packages
            ],
            "stats": overview.get("stats", {}),
            # System is the top level — nothing to zoom out to.
            "zoom_out": None,
        }
118
+
119
+ def _zoom_package(self, package_name: str, all_nodes: list[dict] | None = None) -> dict:
120
+ """Package-level zoom: files in a package with roles and stats."""
121
+ if all_nodes is None:
122
+ all_nodes = self.store.get_all_nodes()
123
+
124
+ pkg_prefix = package_name.rstrip("/") + "/"
125
+
126
+ # Collect files that belong to this package (direct children only)
127
+ file_nodes: dict[str, dict] = {}
128
+ file_symbols: dict[str, list[dict]] = defaultdict(list)
129
+ for n in all_nodes:
130
+ if not n["file_path"].startswith(pkg_prefix):
131
+ continue
132
+ # Check it's a direct child or one level deeper (sub-package __init__)
133
+ remainder = n["file_path"][len(pkg_prefix):]
134
+ if n["type"] == "file":
135
+ file_nodes[n["file_path"]] = n
136
+ else:
137
+ file_symbols[n["file_path"]].append(n)
138
+
139
+ if not file_nodes:
140
+ return {"level": "package", "error": f"No package: {package_name}"}
141
+
142
+ # Build file summaries
143
+ file_summaries = []
144
+ for fp, fnode in file_nodes.items():
145
+ syms = file_symbols.get(fp, [])
146
+ role = self._infer_module_role(fp, syms, 0, 0)
147
+ file_summaries.append({
148
+ "file_path": fp,
149
+ "role": role,
150
+ "symbol_count": len(syms),
151
+ "line_count": fnode.get("line_end", 0),
152
+ "drill_down": f"zoom('{fp}')",
153
+ })
154
+
155
+ file_summaries.sort(key=lambda x: x["symbol_count"], reverse=True)
156
+
157
+ # Detect sub-packages (directories within this package)
158
+ sub_packages: set[str] = set()
159
+ for fp in file_nodes:
160
+ remainder = fp[len(pkg_prefix):]
161
+ parts = remainder.split("/")
162
+ if len(parts) > 1:
163
+ sub_packages.add(package_name + "/" + parts[0])
164
+
165
+ # External dependencies: other packages this one depends on
166
+ all_edges = self.store.get_all_edges()
167
+ node_ids_in_pkg = set()
168
+ for fp in file_nodes:
169
+ for n in all_nodes:
170
+ if n["file_path"] == fp:
171
+ node_ids_in_pkg.add(n["id"])
172
+
173
+ ext_deps: set[str] = set()
174
+ int_deps: list[dict] = []
175
+ for e in all_edges:
176
+ if e["type"] not in ("IMPORTS", "CALLS"):
177
+ continue
178
+ src_fp = e.get("file_path", "")
179
+ if not src_fp.startswith(pkg_prefix):
180
+ continue
181
+ # Resolve target to a file path
182
+ tgt_fp = self._resolve_edge_target_file(e["target"], all_nodes)
183
+ if tgt_fp is None:
184
+ continue
185
+ if tgt_fp.startswith(pkg_prefix):
186
+ if src_fp != tgt_fp:
187
+ int_deps.append({"from": src_fp, "to": tgt_fp})
188
+ else:
189
+ tgt_parts = tgt_fp.split("/")
190
+ tgt_pkg = tgt_parts[0] if len(tgt_parts) > 1 else "(root)"
191
+ ext_deps.add(tgt_pkg)
192
+
193
+ return {
194
+ "level": "package",
195
+ "package": package_name,
196
+ "file_count": len(file_nodes),
197
+ "files": file_summaries,
198
+ "sub_packages": sorted(sub_packages),
199
+ "external_dependencies": sorted(ext_deps),
200
+ "internal_dependencies": int_deps[:50], # cap to avoid huge output
201
+ "zoom_out": {"hint": "zoom()"},
202
+ "drill_down": [
203
+ {"target": f["file_path"], "hint": f"zoom('{f['file_path']}')"} for f in file_summaries[:10]
204
+ ],
205
+ }
206
+
207
+ def _resolve_edge_target_file(self, target: str, all_nodes: list[dict]) -> str | None:
208
+ """Resolve an edge target (node ID, name, or qualified name) to a file path."""
209
+ for n in all_nodes:
210
+ if n["id"] == target or n["qualified_name"] == target:
211
+ return n["file_path"]
212
+ for n in all_nodes:
213
+ if n["name"] == target:
214
+ return n["file_path"]
215
+ return None
216
+
217
+ def _zoom_module(self, file_path: str) -> dict:
218
+ """Module-level zoom: module narrative + symbol list + zoom hints."""
219
+ narrative = self.module_narrative(file_path)
220
+ view = self.module_view(file_path)
221
+
222
+ symbols = []
223
+ for s in view.get("symbols", []):
224
+ symbols.append({
225
+ "name": s.get("name", ""),
226
+ "type": s.get("type", ""),
227
+ "line": s.get("line_start", 0),
228
+ "hint": f"zoom('{s.get('name', '')}')",
229
+ })
230
+
231
+ # Derive the package from the file path for zoom_out
232
+ parts = file_path.split("/")
233
+ if len(parts) > 1:
234
+ pkg = "/".join(parts[:-1])
235
+ zoom_out = {"target": pkg, "hint": f"zoom('{pkg}')"}
236
+ else:
237
+ zoom_out = {"hint": "zoom()"}
238
+
239
+ return {
240
+ "level": "module",
241
+ "file": file_path,
242
+ "narrative": narrative,
243
+ "symbols": symbols,
244
+ "zoom_out": zoom_out,
245
+ "drill_down": [
246
+ {"target": s["name"], "hint": s["hint"]} for s in symbols[:10]
247
+ ],
248
+ "dependencies": view.get("dependencies", []),
249
+ "dependents": view.get("dependents", []),
250
+ }
251
+
252
    def _zoom_symbol(self, name: str, all_nodes: list[dict] | None = None) -> dict:
        """Symbol-level zoom: full context + narrative.

        *name* may be a bare symbol name or a node ID; when ambiguous, the
        first matching node in store order wins.
        """
        if all_nodes is None:
            all_nodes = self.store.get_all_nodes()

        # Find the symbol by name or exact node ID.
        matches = [n for n in all_nodes if n["name"] == name or n["id"] == name]
        if not matches:
            return {"level": "symbol", "error": f"Symbol '{name}' not found"}

        node = matches[0]
        narrative = self.symbol_narrative(node["id"])

        # Depth-1 impact/call-chain queries give the immediate neighbors
        # used for navigation.
        callers = self.engine.impact_of_change(node["id"], max_depth=1)
        callees = self.engine.get_call_chain(node["id"], max_depth=1)

        return {
            "level": "symbol",
            "name": node["name"],
            "type": node["type"],
            "file": node["file_path"],
            "line": node.get("line_start", 0),
            "narrative": narrative,
            # Fall back to the raw node_id when an entry carries no name.
            "callers": [{"name": c.get("name", c["node_id"]), "hint": f"zoom('{c.get('name', c['node_id'])}')"} for c in callers[:10]],
            "callees": [{"name": c.get("name", c["node_id"]), "hint": f"zoom('{c.get('name', c['node_id'])}')"} for c in callees[:10]],
            "zoom_out": {"target": node["file_path"], "hint": f"zoom('{node['file_path']}')"},
        }
280
+
281
+ # ------------------------------------------------------------------
282
+ # System level
283
+ # ------------------------------------------------------------------
284
    def system_overview(self) -> dict:
        """Top-level view of the entire codebase.

        Returns store stats, a per-package summary (file/symbol counts,
        docstring, inter-package dependencies) and detected entry points.
        """
        stats = self.store.get_stats()
        all_nodes = self.store.get_all_nodes()

        # Discover packages (top-level directories); root-level files group
        # under the pseudo-package "(root)".
        packages: dict[str, dict] = {}
        for node in all_nodes:
            parts = node["file_path"].split("/")
            pkg = parts[0] if len(parts) > 1 else "(root)"
            if pkg not in packages:
                packages[pkg] = {"files": set(), "docstring": "", "node_count": 0}
            packages[pkg]["files"].add(node["file_path"])
            if node["type"] != "file":
                packages[pkg]["node_count"] += 1

        # Get package docstrings from __init__.py nodes (already in all_nodes).
        for node in all_nodes:
            if node["type"] == "file" and node["docstring"] and node["file_path"].endswith("__init__.py"):
                parts = node["file_path"].split("/")
                pkg = parts[0] if len(parts) > 1 else "(root)"
                if pkg in packages:
                    packages[pkg]["docstring"] = node["docstring"]

        # Entry points: functions named main, or any __main__ file.
        entry_points: list[dict] = []
        for node in all_nodes:
            if node["name"] == "main" and node["type"] == "function":
                entry_points.append({
                    "id": node["id"],
                    "file_path": node["file_path"],
                    "line": node["line_start"],
                })
            elif "__main__" in node["file_path"] and node["type"] == "file":
                entry_points.append({
                    "id": node["id"],
                    "file_path": node["file_path"],
                    "line": 1,
                })

        # Dependency flow between packages — use IMPORTS + CALLS edges.
        # Build node lookups for resolving edge targets to packages.
        node_by_id = {n["id"]: n for n in all_nodes}
        nodes_by_name: dict[str, list[dict]] = defaultdict(list)
        for n in all_nodes:
            nodes_by_name[n["name"]].append(n)
            if n["qualified_name"] != n["name"]:
                nodes_by_name[n["qualified_name"]].append(n)

        all_edges = self.store.get_all_edges()
        pkg_deps: dict[str, set[str]] = defaultdict(set)
        for e in all_edges:
            if e["type"] not in ("IMPORTS", "CALLS"):
                continue
            src_parts = e["file_path"].split("/")
            src_pkg = src_parts[0] if len(src_parts) > 1 else "(root)"

            # Resolve the target to candidate packages by three strategies.
            target_pkgs: set[str] = set()
            # 1. Exact node ID.
            t_node = node_by_id.get(e["target"])
            if t_node:
                tp = t_node["file_path"].split("/")
                target_pkgs.add(tp[0] if len(tp) > 1 else "(root)")
            # 2. Name / qualified-name lookup.
            for m in nodes_by_name.get(e["target"], []):
                tp = m["file_path"].split("/")
                target_pkgs.add(tp[0] if len(tp) > 1 else "(root)")
            # 3. Dotted import prefix (e.g. "codebrain.indexer" -> "codebrain").
            if "." in e["target"]:
                first = e["target"].split(".")[0]
                if first in packages:
                    target_pkgs.add(first)

            # Only cross-package edges to known packages count as deps.
            for target_pkg in target_pkgs:
                if target_pkg != src_pkg and target_pkg in packages:
                    pkg_deps[src_pkg].add(target_pkg)

        return {
            "stats": stats,
            "packages": {
                name: {
                    "file_count": len(info["files"]),
                    "node_count": info["node_count"],
                    "docstring": info["docstring"],
                    "depends_on": sorted(pkg_deps.get(name, set())),
                }
                for name, info in sorted(packages.items())
            },
            "entry_points": entry_points,
        }
375
+
376
+ # ------------------------------------------------------------------
377
+ # Package level
378
+ # ------------------------------------------------------------------
379
+ def package_view(self, package_name: str) -> dict:
380
+ """View of a single package (directory).
381
+
382
+ Bridges system_overview -> module_view.
383
+ Lists all files in the package with their roles, symbol counts,
384
+ dependencies, and which files are the most important.
385
+ """
386
+ all_nodes = self.store.get_all_nodes()
387
+ return self._zoom_package(package_name, all_nodes)
388
+
389
+ # ------------------------------------------------------------------
390
+ # Module level
391
+ # ------------------------------------------------------------------
392
    def module_view(self, file_path: str) -> dict:
        """Detailed view of a single file/module.

        Returns docstring, symbols, exports, imports, reverse imports and a
        simple coupling score, or an ``error`` entry when the file is not
        indexed.
        """
        nodes = self.store.get_nodes_by_file(file_path)
        if not nodes:
            return {"error": f"No indexed file: {file_path}"}

        # Split the file node (module-level metadata) from its symbols.
        file_node = None
        symbols: list[dict] = []
        for n in nodes:
            if n["type"] == "file":
                file_node = n
            else:
                symbols.append({
                    "name": n["name"],
                    "type": n["type"],
                    "line_start": n["line_start"],
                    "line_end": n["line_end"],
                    "signature": n["signature"],
                    "is_exported": bool(n["is_exported"]),
                    "docstring": n["docstring"],
                })

        # What does this module import?
        imports = self.engine.get_file_dependencies(file_path)

        # What imports this module? Batch-scan all edges once rather than
        # querying per node.
        node_ids = {n["id"] for n in nodes}
        node_names = {n["name"] for n in nodes}
        node_qnames = {n["qualified_name"] for n in nodes}
        all_edges = self.store.get_all_edges()
        imported_by_set: set[str] = set()

        # Detect language from the extension to filter cross-language
        # false positives (a Python name should not match a TS caller).
        _py = (".py",)
        _js = (".ts", ".tsx", ".js", ".jsx")
        fp_ext = os.path.splitext(file_path)[1].lower() if file_path else ""
        fp_lang = "python" if fp_ext in _py else ("js" if fp_ext in _js else "other")

        for e in all_edges:
            if e["type"] in ("CALLS", "IMPORTS") and e["file_path"] != file_path:
                # Match by node ID or qualified name (always safe).
                if e["target"] in node_ids or e["target"] in node_qnames:
                    matched = True
                elif e["target"] in node_names and e["target"] not in _GENERIC_NAMES:
                    # Bare-name match — skip generic names to avoid over-counting.
                    matched = True
                else:
                    matched = False
                if matched:
                    # Skip cross-language false positives (Python <-> JS/TS).
                    e_ext = os.path.splitext(e["file_path"])[1].lower() if e["file_path"] else ""
                    e_lang = "python" if e_ext in _py else ("js" if e_ext in _js else "other")
                    if fp_lang != "other" and e_lang != "other" and fp_lang != e_lang:
                        continue
                    imported_by_set.add(e["file_path"])

        # Coupling score: how many other files reference this one, plus how
        # many it references.
        coupling_in = len(imported_by_set)
        coupling_out = len(imports)

        # Infer role (e.g. entry point, utility) from path + symbols + coupling.
        role = self._infer_module_role(file_path, symbols, coupling_in, coupling_out)

        return {
            "file_path": file_path,
            "docstring": file_node["docstring"] if file_node else "",
            "line_count": file_node["line_end"] if file_node else 0,
            "role": role,
            "symbols": symbols,
            "exports": [s["name"] for s in symbols if s["is_exported"]],
            "imports": imports,
            "imported_by": sorted(imported_by_set),
            "coupling": {
                "incoming": coupling_in,
                "outgoing": coupling_out,
                "score": coupling_in + coupling_out,
            },
        }
471
+
472
+ # ------------------------------------------------------------------
473
+ # Risk hotspots
474
+ # ------------------------------------------------------------------
475
+ def risk_hotspots(self, top_n: int = 20) -> list[dict]:
476
+ """Find the most structurally risky nodes in the codebase.
477
+
478
+ Risk = number of transitive dependents * centrality.
479
+ High-risk nodes are those where a change would cascade widely.
480
+ """
481
+ all_nodes = self.store.get_all_nodes()
482
+ all_edges = self.store.get_all_edges()
483
+
484
+ # Preload reverse index for bulk impact_of_change() calls
485
+ self.engine.preload_reverse_index()
486
+
487
+ # Build reverse index: target -> list of edges (for incoming lookups)
488
+ incoming_by_target: dict[str, list[dict]] = defaultdict(list)
489
+ for e in all_edges:
490
+ incoming_by_target[e["target"]].append(e)
491
+
492
+ # Build node lookup for resolving impacted node files
493
+ node_by_id = {n["id"]: n for n in all_nodes}
494
+
495
+ # Also index by name and qualified_name for edge target resolution
496
+ nodes_by_name: dict[str, list[dict]] = defaultdict(list)
497
+ for n in all_nodes:
498
+ nodes_by_name[n["name"]].append(n)
499
+ if n["qualified_name"] != n["name"]:
500
+ nodes_by_name[n["qualified_name"]].append(n)
501
+
502
+ # Phase 1: Compute direct dependents for all nodes (cheap, in-memory)
503
+ candidates: list[tuple[dict, int, int]] = [] # (node, direct, external)
504
+
505
+ for node in all_nodes:
506
+ if node["type"] == "file":
507
+ continue
508
+
509
+ # Skip test files and test functions/classes
510
+ if is_test_file(node["file_path"]):
511
+ continue
512
+ if node["name"].startswith("test_") or node["name"].startswith("Test"):
513
+ continue
514
+
515
+ # Count direct incoming references using pre-built index
516
+ callers: list[dict] = []
517
+ for e in incoming_by_target.get(node["id"], []):
518
+ if e["type"] in ("CALLS", "IMPORTS"):
519
+ callers.append(e)
520
+ # Skip bare-name matching for generic names to avoid inflated scores
521
+ # (e.g. MyCache.get collecting ALL .get() callers across the codebase)
522
+ if node["name"] not in _GENERIC_NAMES:
523
+ for e in incoming_by_target.get(node["name"], []):
524
+ if e["type"] in ("CALLS", "IMPORTS"):
525
+ callers.append(e)
526
+ if node["qualified_name"] != node["name"]:
527
+ for e in incoming_by_target.get(node["qualified_name"], []):
528
+ if e["type"] in ("CALLS", "IMPORTS"):
529
+ callers.append(e)
530
+
531
+ # Filter out callers from test files
532
+ callers = [e for e in callers if not is_test_file(e.get("file_path", ""))]
533
+
534
+ # Deduplicate by (source, target, line)
535
+ seen = set()
536
+ unique_callers = []
537
+ for e in callers:
538
+ key = (e["source"], e["target"], e["line"])
539
+ if key not in seen:
540
+ seen.add(key)
541
+ unique_callers.append(e)
542
+
543
+ direct_dependents = len(unique_callers)
544
+ if direct_dependents == 0:
545
+ continue
546
+
547
+ external_dependents = len([e for e in unique_callers if e["file_path"] != node["file_path"]])
548
+ candidates.append((node, direct_dependents, external_dependents))
549
+
550
+ # Phase 2: Sort by direct dependents (proxy for risk), take top candidates
551
+ # Only compute expensive transitive impact for top candidates
552
+ candidate_limit = max(top_n * 4, 100)
553
+ candidates.sort(key=lambda x: x[1] + x[2] * 2, reverse=True)
554
+ candidates = candidates[:candidate_limit]
555
+
556
+ hotspots: list[dict] = []
557
+ for node, direct_dependents, external_dependents in candidates:
558
+ # Transitive impact (limit depth for performance)
559
+ impacted = self.engine.impact_of_change(node["id"], max_depth=3)
560
+ transitive_count = len(impacted)
561
+
562
+ # Affected files — use node_by_id lookup instead of per-node DB query
563
+ affected_files = set()
564
+ for entry in impacted:
565
+ target = node_by_id.get(entry["node_id"])
566
+ if target:
567
+ affected_files.add(target["file_path"])
568
+ affected_files.discard(node["file_path"])
569
+
570
+ risk_score = (
571
+ direct_dependents * 1.0
572
+ + external_dependents * 2.0
573
+ + transitive_count * 0.5
574
+ + len(affected_files) * 3.0
575
+ )
576
+
577
+ hotspots.append({
578
+ "node_id": node["id"],
579
+ "name": node["name"],
580
+ "type": node["type"],
581
+ "file_path": node["file_path"],
582
+ "line_start": node["line_start"],
583
+ "direct_dependents": direct_dependents,
584
+ "external_dependents": external_dependents,
585
+ "transitive_impact": transitive_count,
586
+ "affected_files": len(affected_files),
587
+ "risk_score": round(risk_score, 1),
588
+ })
589
+
590
+ hotspots.sort(key=lambda h: h["risk_score"], reverse=True)
591
+ return hotspots[:top_n]
592
+
593
+ # ------------------------------------------------------------------
594
+ # Health score
595
+ # ------------------------------------------------------------------
596
+ def health_score(self, hotspots: list[dict] | None = None) -> dict:
597
+ """Compute codebase health metrics with individual scores per dimension.
598
+
599
+ Returns separate scores for each dimension rather than a single
600
+ misleading number. Each dimension is 0-100 (higher is better).
601
+ Pass pre-computed *hotspots* to avoid redundant computation.
602
+ """
603
+ stats = self.store.get_stats()
604
+ total_nodes = stats.get("nodes", 0)
605
+ total_files = stats.get("files", 0)
606
+
607
+ # Dead code analysis
608
+ dead = self.engine.find_dead_code()
609
+ dead_ratio = len(dead) / max(total_nodes, 1)
610
+ dead_score = max(0, int(100 - dead_ratio * 200)) # 50% dead = 0
611
+
612
+ # Cycle analysis
613
+ cycles = self.engine.detect_cycles()
614
+ cycle_ratio = len(cycles) / max(total_files, 1)
615
+ cycle_score = max(0, int(100 - cycle_ratio * 500)) # 20% cyclic = 0
616
+
617
+ # Hotspot concentration
618
+ if hotspots is None:
619
+ hotspots = self.risk_hotspots(top_n=50)
620
+ high_risk = [h for h in hotspots if h["risk_score"] > 20]
621
+ hotspot_ratio = len(high_risk) / max(total_files, 1)
622
+ coupling_score = max(0, int(100 - hotspot_ratio * 300))
623
+
624
+ # Overall is weighted average (not a penalty system)
625
+ overall = int(dead_score * 0.3 + cycle_score * 0.3 + coupling_score * 0.4)
626
+
627
+ if overall >= 80:
628
+ grade = "A"
629
+ elif overall >= 60:
630
+ grade = "B"
631
+ elif overall >= 40:
632
+ grade = "C"
633
+ elif overall >= 20:
634
+ grade = "D"
635
+ else:
636
+ grade = "F"
637
+
638
+ return {
639
+ "score": overall,
640
+ "grade": grade,
641
+ "dimensions": {
642
+ "dead_code": {
643
+ "score": dead_score,
644
+ "count": len(dead),
645
+ "ratio": round(dead_ratio, 3),
646
+ },
647
+ "import_cycles": {
648
+ "score": cycle_score,
649
+ "count": len(cycles),
650
+ },
651
+ "coupling": {
652
+ "score": coupling_score,
653
+ "high_risk_hotspots": len(high_risk),
654
+ },
655
+ },
656
+ "details": {
657
+ "total_nodes": total_nodes,
658
+ "total_files": total_files,
659
+ "dead_code_count": len(dead),
660
+ "dead_code_ratio": round(dead_ratio, 3),
661
+ "import_cycles": len(cycles),
662
+ "high_risk_hotspots": len(high_risk),
663
+ },
664
+ }
665
+
666
+ # ------------------------------------------------------------------
667
+ # Dependency map
668
+ # ------------------------------------------------------------------
669
    def dependency_map(self) -> dict:
        """File-level dependency graph for the entire codebase.

        Returns ``{"file_dependencies": {file_path: [dep_file, ...]}}`` —
        every indexed file mapped to the sorted set of other files it
        references through IMPORTS or CALLS edges.
        """
        file_nodes = self.store.get_all_nodes(type_filter="file")
        all_nodes = self.store.get_all_nodes()
        all_edges = self.store.get_all_edges()

        # Build multiple lookup indices to resolve edge targets.
        node_by_id: dict[str, dict] = {}
        nodes_by_name: dict[str, list[dict]] = defaultdict(list)
        nodes_by_qname: dict[str, list[dict]] = defaultdict(list)
        for n in all_nodes:
            node_by_id[n["id"]] = n
            nodes_by_name[n["name"]].append(n)
            if n["qualified_name"] != n["name"]:
                nodes_by_qname[n["qualified_name"]].append(n)

        # Also map dotted import targets (e.g. "codebrain.indexer.full_index")
        # to file paths by converting dots to path separators.
        file_paths = {n["file_path"] for n in file_nodes}

        def _resolve_target_files(target: str) -> list[str]:
            """Resolve an edge target to file path(s), most specific first."""
            # 1. Exact node ID.
            node = node_by_id.get(target)
            if node:
                return [node["file_path"]]
            # 2. Qualified name.
            matches = nodes_by_qname.get(target)
            if matches:
                return [m["file_path"] for m in matches]
            # 3. Simple name (may match multiple nodes).
            matches = nodes_by_name.get(target)
            if matches:
                return [m["file_path"] for m in matches]
            # 4. Dotted import path → try converting to a file path,
            #    longest prefix first so the most specific module wins.
            #    e.g. "codebrain.indexer.full_index" → "codebrain/indexer.py"
            parts = target.split(".")
            for i in range(len(parts), 0, -1):
                candidate = "/".join(parts[:i]) + ".py"
                if candidate in file_paths:
                    return [candidate]
                # Also try as package __init__.py.
                candidate_pkg = "/".join(parts[:i]) + "/__init__.py"
                if candidate_pkg in file_paths:
                    return [candidate_pkg]
            return []

        # Aggregate IMPORTS/CALLS edges per source file (self-edges dropped).
        edges_by_file: dict[str, set[str]] = defaultdict(set)
        for e in all_edges:
            if e["type"] in ("IMPORTS", "CALLS"):
                src_file = e["file_path"]
                for target_file in _resolve_target_files(e["target"]):
                    if target_file != src_file:
                        edges_by_file[src_file].add(target_file)

        deps: dict[str, list[str]] = {}
        for fn in file_nodes:
            deps[fn["file_path"]] = sorted(edges_by_file.get(fn["file_path"], set()))

        return {"file_dependencies": deps}
730
+
731
+ # ------------------------------------------------------------------
732
+ # Change impact summary
733
+ # ------------------------------------------------------------------
734
+ def change_summary(self, file_path: str) -> dict:
735
+ """If this file changes, what is the blast radius?"""
736
+ nodes = self.store.get_nodes_by_file(file_path)
737
+ all_affected: set[str] = set()
738
+ all_impacted_nodes: list[dict] = []
739
+
740
+ # Pre-fetch all nodes for ID->file resolution
741
+ all_nodes_list = self.store.get_all_nodes()
742
+ node_by_id = {n["id"]: n for n in all_nodes_list}
743
+
744
+ for node in nodes:
745
+ if node["type"] == "file":
746
+ continue
747
+ impacted = self.engine.impact_of_change(node["id"], max_depth=5)
748
+ for entry in impacted:
749
+ target = node_by_id.get(entry["node_id"])
750
+ if target and target["file_path"] != file_path:
751
+ all_affected.add(target["file_path"])
752
+ all_impacted_nodes.append({
753
+ "node_id": entry["node_id"],
754
+ "depth": entry["depth"],
755
+ "via": entry["via"],
756
+ })
757
+
758
+ return {
759
+ "file_path": file_path,
760
+ "affected_file_count": len(all_affected),
761
+ "affected_files": sorted(all_affected),
762
+ "impacted_nodes": all_impacted_nodes,
763
+ }
764
+
765
+ # ------------------------------------------------------------------
766
+ # Narrative generation
767
+ # ------------------------------------------------------------------
768
+ def system_narrative(self) -> dict:
769
+ """Generate a natural language narrative for the entire codebase.
770
+
771
+ Combines system_overview, health_score, and risk_hotspots into
772
+ human-readable summaries with relative comparisons.
773
+ """
774
+ overview = self.system_overview()
775
+ hotspots = self.risk_hotspots(top_n=50)
776
+ health = self.health_score(hotspots=hotspots)
777
+
778
+ stats = overview["stats"]
779
+ packages = overview["packages"]
780
+ pkg_names = sorted(packages.keys())
781
+ total_files = stats.get("files", 0)
782
+
783
+ # Size characterization
784
+ if total_files < 20:
785
+ size = "small"
786
+ elif total_files < 200:
787
+ size = "medium-sized"
788
+ else:
789
+ size = "large"
790
+
791
+ summary = (
792
+ f"This is a {size} project with {total_files} files "
793
+ f"and {stats.get('nodes', 0)} symbols across "
794
+ f"{len(pkg_names)} package{'s' if len(pkg_names) != 1 else ''}"
795
+ f" ({', '.join(pkg_names)})."
796
+ )
797
+
798
+ # Architecture: identify the largest package
799
+ arch_parts = []
800
+ if packages:
801
+ largest_pkg = max(packages.items(), key=lambda x: x[1]["node_count"])
802
+ for pkg_name in pkg_names:
803
+ pkg = packages[pkg_name]
804
+ part = f"{pkg_name} ({pkg['file_count']} files, {pkg['node_count']} symbols)"
805
+ if pkg_name == largest_pkg[0] and len(pkg_names) > 1:
806
+ part += " — the largest package"
807
+ arch_parts.append(part)
808
+ architecture = ". ".join(arch_parts) + "." if arch_parts else ""
809
+
810
+ # Backbone: top 3 most-depended-on files from hotspots
811
+ backbone = []
812
+ if hotspots:
813
+ top = hotspots[:3]
814
+ backbone_names = [h["file_path"].rsplit("/", 1)[-1] for h in top]
815
+ summary += (
816
+ f" The backbone modules are {', '.join(backbone_names)}"
817
+ " — most of the codebase depends on them."
818
+ )
819
+
820
+ # Health summary with relative context
821
+ grade = health["grade"]
822
+ score = health["score"]
823
+ details = health["details"]
824
+ dims = health.get("dimensions", {})
825
+ health_parts = [f"Health: {score}/100 (grade {grade})."]
826
+ dead_ratio = details.get("dead_code_ratio", 0)
827
+ if dead_ratio > 0.1:
828
+ health_parts.append(
829
+ f"Dead code is {dead_ratio:.0%} of symbols — higher than ideal (aim for <5%)."
830
+ )
831
+ elif dead_ratio > 0:
832
+ health_parts.append(f"Dead code is low at {dead_ratio:.0%} — good hygiene.")
833
+ cycles = details.get("import_cycles", 0)
834
+ if cycles > 0:
835
+ health_parts.append(f"{cycles} import cycle(s) — consider refactoring to improve layering.")
836
+ hotspot_count = details.get("high_risk_hotspots", 0)
837
+ if hotspot_count > 3:
838
+ health_parts.append(
839
+ f"{hotspot_count} high-risk hotspots — more than typical. Reduce coupling where possible."
840
+ )
841
+ health_summary = " ".join(health_parts)
842
+
843
+ # Key components with why
844
+ key_components = []
845
+ for h in hotspots[:5]:
846
+ risk_level = "high" if h["risk_score"] > 20 else ("medium" if h["risk_score"] > 5 else "low")
847
+ pct = h["affected_files"] / max(total_files, 1) * 100
848
+ key_components.append({
849
+ "name": h["name"],
850
+ "role": h["type"],
851
+ "risk": risk_level,
852
+ "why": (
853
+ f"{h['direct_dependents']} dependents, affects {h['affected_files']} files "
854
+ f"({pct:.0f}% of codebase)"
855
+ ),
856
+ })
857
+
858
+ # Recommendations
859
+ recommendations = []
860
+ if details["dead_code_count"] > 5:
861
+ recommendations.append(
862
+ f"Remove or document {details['dead_code_count']} potentially unused symbols."
863
+ )
864
+ if details.get("import_cycles", 0) > 0:
865
+ recommendations.append(
866
+ f"Resolve {details['import_cycles']} import cycle(s) to improve layering."
867
+ )
868
+ if hotspot_count > 3:
869
+ recommendations.append(
870
+ "Reduce coupling on high-risk hotspots to lower change risk."
871
+ )
872
+ if not recommendations:
873
+ recommendations.append("Codebase is in good shape. Keep monitoring hotspots.")
874
+
875
+ return {
876
+ "summary": summary,
877
+ "architecture": architecture,
878
+ "health_summary": health_summary,
879
+ "key_components": key_components,
880
+ "recommendations": recommendations,
881
+ }
882
+
883
    def module_narrative(self, file_path: str) -> dict:
        """Generate a natural language narrative for a single module.

        Uses relative comparisons to codebase averages for context.

        Args:
            file_path: Path of the module as stored in the graph.

        Returns:
            dict with keys: summary, importance, dependencies,
            exports_summary, risks, top_callers — or the ``module_view``
            error dict unchanged when the file is unknown.
        """
        view = self.module_view(file_path)
        if "error" in view:
            return view

        role = view.get("role", "module")
        symbols = view.get("symbols", [])
        imports = view.get("imports", [])
        imported_by = view.get("imported_by", [])
        exports = view.get("exports", [])
        coupling = view.get("coupling", {})

        # Compute codebase baselines for relative comparisons:
        # per-file symbol counts, excluding the "file" nodes themselves.
        all_nodes = self.store.get_all_nodes()
        symbols_per_file: dict[str, int] = {}
        for n in all_nodes:
            fp = n.get("file_path", "")
            if fp and n["type"] != "file":
                symbols_per_file[fp] = symbols_per_file.get(fp, 0) + 1
        avg_symbols = sum(symbols_per_file.values()) / max(len(symbols_per_file), 1)
        this_count = len(symbols)

        file_name = file_path.split("/")[-1]
        classes = [s for s in symbols if s["type"] == "class"]
        functions = [s for s in symbols if s["type"] in ("function", "method")]

        # Summary with relative size; only call out clearly large (>2x avg)
        # or clearly small (<0.5x avg) modules.
        ratio = this_count / max(avg_symbols, 1)
        if ratio > 2:
            size_note = f" — {ratio:.1f}x the codebase average ({avg_symbols:.0f})"
        elif ratio < 0.5 and avg_symbols > 0:
            size_note = f" — lightweight (codebase average: {avg_symbols:.0f})"
        else:
            size_note = ""
        summary = (
            f"{file_name} is a {role.replace('_', ' ')} module with "
            f"{this_count} symbols ({len(classes)} classes, {len(functions)} functions){size_note}."
        )

        # Importance with rank among other modules' incoming coupling.
        # NOTE(review): this samples only the first 50 files and calls
        # module_view once per sampled file (O(files) extra work). If this
        # module's `incoming` is not in the sample, rank falls back to
        # len(all_incoming), which can be 0 on an empty sample and then
        # renders as "ranked #0" — confirm this is acceptable.
        incoming = coupling.get("incoming", 0)
        all_incoming = sorted(
            [v.get("coupling", {}).get("incoming", 0)
             for v in [self.module_view(fp) for fp in list(symbols_per_file.keys())[:50]]
             if "error" not in v],
            reverse=True,
        )
        rank = (all_incoming.index(incoming) + 1) if incoming in all_incoming else len(all_incoming)
        total_modules = len(symbols_per_file)

        if rank == 1 and incoming > 0:
            importance = f"Most depended-on file — {incoming} files depend on it. Changes here have the highest blast radius."
        elif rank <= 3 and incoming > 0:
            importance = f"Ranked #{rank} by dependents — {incoming} files depend on it. This is load-bearing infrastructure."
        elif incoming > 0:
            importance = f"{incoming} file(s) depend on it (ranked #{rank} of {total_modules})."
        else:
            importance = "No other files depend on this module. Changes here are low-risk."

        # Dependencies narrative (listings capped at 10 names each way)
        if imports:
            dep_str = f"Imports: {', '.join(imports[:10])}"
            if len(imports) > 10:
                dep_str += f" (+{len(imports) - 10} more)"
        else:
            dep_str = "No imports."
        if imported_by:
            dep_str += f". Imported by: {', '.join(imported_by[:10])}"
            if len(imported_by) > 10:
                dep_str += f" (+{len(imported_by) - 10} more)"
        dependencies = dep_str + "."

        # Exports summary (capped at 10 names)
        if exports:
            exports_summary = f"Exports {len(exports)} symbols: {', '.join(exports[:10])}."
        else:
            exports_summary = "No exported symbols."

        # Risks with context.
        # NOTE(review): "more than 90% of modules" is a fixed phrase, not
        # computed from the actual coupling distribution — verify the claim
        # or soften the wording.
        risks = []
        if incoming > 10:
            risks.append(f"High coupling: {incoming} dependents — more than 90% of modules.")
        if coupling.get("score", 0) > 20:
            risks.append(f"High total coupling score ({coupling['score']}). Consider splitting this module.")
        if not risks:
            risks.append("No significant structural risks.")

        top_callers = ", ".join(imported_by[:5]) if imported_by else "None"

        return {
            "summary": summary,
            "importance": importance,
            "dependencies": dependencies,
            "exports_summary": exports_summary,
            "risks": risks,
            "top_callers": top_callers,
        }
984
+
985
+ def symbol_narrative(self, node_id: str) -> dict:
986
+ """Generate a natural language narrative for a single symbol.
987
+
988
+ Includes blast radius as percentage of codebase and relative comparisons.
989
+ """
990
+ node = self.store.get_node(node_id)
991
+ if not node:
992
+ # Try resolving by name
993
+ resolved = self.engine.resolve_node(node_id)
994
+ if resolved:
995
+ node = resolved[0]
996
+ node_id = node["id"]
997
+ else:
998
+ return {"error": f"Symbol not found: {node_id}"}
999
+
1000
+ name = node["name"]
1001
+ sym_type = node["type"]
1002
+
1003
+ # Get callers
1004
+ reverse_deps = self.engine.get_reverse_dependencies(node_id)
1005
+ callers = [d for d in reverse_deps if d["edge_type"] == "CALLS"]
1006
+ importers = [d for d in reverse_deps if d["edge_type"] == "IMPORTS"]
1007
+
1008
+ # Impact + blast radius percentage
1009
+ impacted = self.engine.impact_of_change(node_id, max_depth=3)
1010
+ affected_files = {e.get("node_id", "").split("::")[0] for e in impacted}
1011
+ affected_files.discard("")
1012
+ affected_files.discard(node["file_path"])
1013
+
1014
+ all_nodes = self.store.get_all_nodes()
1015
+ total_files = len({n["file_path"] for n in all_nodes if n.get("file_path")})
1016
+ blast_pct = len(affected_files) / max(total_files, 1) * 100
1017
+
1018
+ # Summary
1019
+ summary = f"{name} is a {sym_type} defined in {node['file_path']}."
1020
+ if node.get("docstring"):
1021
+ summary += f" {node['docstring'].split(chr(10))[0]}"
1022
+
1023
+ # Importance with blast radius percentage
1024
+ total_deps = len(callers) + len(importers)
1025
+ if total_deps > 5:
1026
+ importance = (
1027
+ f"High — {total_deps} direct dependents, affects {len(affected_files)} files "
1028
+ f"({blast_pct:.0f}% of codebase)."
1029
+ )
1030
+ if blast_pct > 30:
1031
+ importance += " This is a foundational symbol — modify with extreme care."
1032
+ elif total_deps > 0:
1033
+ importance = f"Moderate — {total_deps} direct dependent(s), affects {blast_pct:.0f}% of codebase."
1034
+ else:
1035
+ importance = "Low — no known dependents. Changes here are safe."
1036
+
1037
+ # Usage
1038
+ caller_files = {c["file_path"] for c in callers}
1039
+ if caller_files:
1040
+ usage = f"Called from {len(callers)} location(s) across {len(caller_files)} file(s)."
1041
+ else:
1042
+ usage = "No known call sites."
1043
+
1044
+ # Callers summary — group by file with counts
1045
+ callers_by_file: dict[str, int] = defaultdict(int)
1046
+ for c in callers:
1047
+ callers_by_file[c["file_path"]] += 1
1048
+ top_caller_files = sorted(callers_by_file.items(), key=lambda x: -x[1])[:5]
1049
+
1050
+ if top_caller_files:
1051
+ caller_detail = [f"{fp} ({count}x)" for fp, count in top_caller_files]
1052
+ callers_summary = f"Called by: {', '.join(caller_detail)}."
1053
+ if len(callers) > 5:
1054
+ callers_summary += f" (+{len(callers) - 5} more)"
1055
+ else:
1056
+ callers_summary = "No known callers."
1057
+
1058
+ caller_files_list = [{"file": fp, "count": ct} for fp, ct in top_caller_files]
1059
+
1060
+ # Dependencies
1061
+ outgoing = self.store.get_edges_from(node_id)
1062
+ call_targets = [e["target"] for e in outgoing if e["type"] == "CALLS"]
1063
+ if call_targets:
1064
+ dependencies = f"Calls: {', '.join(call_targets[:10])}."
1065
+ else:
1066
+ dependencies = "No outgoing calls."
1067
+
1068
+ return {
1069
+ "summary": summary,
1070
+ "importance": importance,
1071
+ "usage": usage,
1072
+ "callers_summary": callers_summary,
1073
+ "caller_files": caller_files_list,
1074
+ "dependencies": dependencies,
1075
+ }
1076
+
1077
+ # ------------------------------------------------------------------
1078
+ # Codebase anatomy
1079
+ # ------------------------------------------------------------------
1080
+ def codebase_anatomy(self) -> dict:
1081
+ """Subsystem map for new developers — the 'truck parts' view.
1082
+
1083
+ Groups files into named subsystems with descriptions, importance
1084
+ rankings, and inter-subsystem connections.
1085
+ """
1086
+ overview = self.system_overview()
1087
+ all_nodes = self.store.get_all_nodes()
1088
+ all_edges = self.store.get_all_edges()
1089
+ hotspots = self.risk_hotspots(top_n=50)
1090
+
1091
+ subsystems = self._group_into_subsystems(all_nodes, overview)
1092
+ connections = self._compute_subsystem_connections(subsystems, all_nodes, all_edges)
1093
+ self._score_subsystem_importance(subsystems, hotspots, connections)
1094
+
1095
+ for sub in subsystems:
1096
+ sub["description"] = self._describe_subsystem(sub, connections)
1097
+
1098
+ purpose = self._derive_purpose(overview, subsystems)
1099
+
1100
+ return {
1101
+ "purpose": purpose,
1102
+ "subsystems": subsystems,
1103
+ "connections": connections,
1104
+ "entry_points": overview.get("entry_points", []),
1105
+ "stats": overview.get("stats", {}),
1106
+ }
1107
+
1108
+ # ------------------------------------------------------------------
1109
+ # Anatomy helpers (private)
1110
+ # ------------------------------------------------------------------
1111
+
1112
    # Known directory → display name mappings, consulted by _subsystem_name
    # to give common directory names a polished label (keys are lowercase).
    _KNOWN_NAMES: dict[str, str] = {
        "graph": "Graph Engine",
        "parser": "Parser System",
        "web": "Web UI",
        "api": "REST API",
        "cli": "CLI Interface",
        "mcp": "MCP Server",
        "tests": "Test Suite",
        "test": "Test Suite",
    }
1123
+
1124
    def _group_into_subsystems(
        self, all_nodes: list[dict], overview: dict
    ) -> list[dict]:
        """Group files into subsystems using a 3-tier algorithm.

        Tier 1 groups files by directory (depth capped at 2) and splits test
        files into a dedicated "_tests" bucket; Tier 2 merges tiny groups
        (<2 files) into role-based buckets; Tier 3 promotes symbol-heavy
        files to their own subsystem. The result is capped at ~15 groups.

        Args:
            all_nodes: Every node in the graph store (file nodes + symbols).
            overview: Output of ``system_overview`` (used for entry points).

        Returns:
            List of subsystem dicts; importance/description/depends_on fields
            are zeroed here and filled in by later passes.
        """
        # Build per-file info: file_path → list of non-file nodes
        file_nodes: dict[str, list[dict]] = defaultdict(list)
        for node in all_nodes:
            fp = node.get("file_path", "")
            if not fp:
                continue
            if node["type"] == "file":
                # ensure the file key exists even with no symbols
                if fp not in file_nodes:
                    file_nodes[fp] = []
            else:
                file_nodes[fp].append(node)

        # Tier 1: directory-based grouping by parent directory at depth 2
        dir_groups: dict[str, list[str]] = defaultdict(list)
        for fp in file_nodes:
            parts = fp.split("/")
            # Use the directory path (excluding filename), capped at depth 2
            dir_parts = parts[:-1] if len(parts) > 1 else []
            if len(dir_parts) >= 2:
                key = "/".join(dir_parts[:2])
            elif dir_parts:
                key = dir_parts[0]
            else:
                key = "(root)"
            dir_groups[key].append(fp)

        # Pull test files out of every directory group; they are re-added
        # as a single "_tests" group at the end.
        test_files: list[str] = []
        non_test_groups: dict[str, list[str]] = {}
        for key, files in dir_groups.items():
            test = [f for f in files if is_test_file(f)]
            non_test = [f for f in files if not is_test_file(f)]
            if test:
                test_files.extend(test)
            if non_test:
                non_test_groups[key] = non_test
            elif not test:
                # Key with no files left (shouldn't happen, but safe)
                non_test_groups[key] = files
        dir_groups = non_test_groups

        # Tier 2: merge small groups (<2 files) into role-based buckets
        role_merge: dict[str, list[str]] = defaultdict(list)
        final_groups: dict[str, list[str]] = {}
        for key, files in dir_groups.items():
            if len(files) < 2:
                for fp in files:
                    syms = file_nodes.get(fp, [])
                    sym_dicts = [{"name": s["name"], "type": s["type"]} for s in syms]
                    role = self._infer_module_role(fp, sym_dicts, 0, 0)
                    role_merge[role].append(fp)
            else:
                final_groups[key] = files

        # Tier 3: singleton promotion — large files get their own subsystem
        # First, handle role-merged files: >=5 symbols earns its own group,
        # everything else lands in the "_utilities" bucket.
        for role, files in role_merge.items():
            for fp in files:
                sym_count = len(file_nodes.get(fp, []))
                if sym_count >= 5:
                    final_groups[fp] = [fp]
                else:
                    final_groups.setdefault("_utilities", []).append(fp)

        # Also promote large files from big groups (>3 files) that have
        # many symbols (>= 15) — these are major modules worthy of their own subsystem
        PROMOTE_THRESHOLD = 15
        for key in list(final_groups.keys()):
            files = final_groups[key]
            if key.startswith("_") or len(files) <= 3:
                continue
            promoted = []
            kept = []
            for fp in files:
                sym_count = len(file_nodes.get(fp, []))
                if sym_count >= PROMOTE_THRESHOLD:
                    promoted.append(fp)
                else:
                    kept.append(fp)
            # only split when both halves are non-empty, so no group vanishes
            if promoted and kept:
                final_groups[key] = kept
                for fp in promoted:
                    final_groups[fp] = [fp]

        if test_files:
            final_groups["_tests"] = test_files

        # Cap at ~15 subsystems: merge smallest (by symbol count) into "Other"
        MAX_SUBSYSTEMS = 15
        if len(final_groups) > MAX_SUBSYSTEMS:
            by_weight = sorted(
                final_groups.items(),
                key=lambda x: sum(len(file_nodes.get(f, [])) for f in x[1]),
            )
            # keep the heaviest MAX_SUBSYSTEMS-1 groups; the rest fold into "_other"
            keep = dict(by_weight[-(MAX_SUBSYSTEMS - 1):])
            other_files: list[str] = []
            for key, files in by_weight[:len(by_weight) - MAX_SUBSYSTEMS + 1]:
                other_files.extend(files)
            if other_files:
                keep["_other"] = other_files
            final_groups = keep

        # Build subsystem dicts
        subsystems: list[dict] = []
        for key, files in sorted(final_groups.items()):
            slug = self._subsystem_slug(key)
            name = self._subsystem_name(key)

            # Collect symbols in this subsystem
            symbols: list[dict] = []
            for fp in files:
                symbols.extend(file_nodes.get(fp, []))

            # Determine dominant role (most common per-file inferred role)
            roles = Counter()
            for fp in files:
                sym_dicts = [{"name": s["name"], "type": s["type"]} for s in file_nodes.get(fp, [])]
                r = self._infer_module_role(fp, sym_dicts, 0, 0)
                roles[r] += 1
            dominant_role = roles.most_common(1)[0][0] if roles else "module"

            # Map role to coarse category
            ROLE_MAP = {
                "core_library": "core",
                "domain_model": "core",
                "data_model": "core",
                "orchestrator": "interface",
                "entry_point": "interface",
                "configuration": "infrastructure",
                "utility": "infrastructure",
                "package_init": "infrastructure",
                "test": "peripheral",
            }
            role_cat = ROLE_MAP.get(dominant_role, "peripheral")

            # Top symbols: classes and functions only, alphabetical, first 5
            top_syms = sorted(
                [s for s in symbols if s["type"] in ("class", "function")],
                key=lambda s: s["name"],
            )[:5]

            # Find entry points in this subsystem
            eps = [
                ep for ep in overview.get("entry_points", [])
                if ep.get("file_path") in set(files)
            ]

            subsystems.append({
                "name": name,
                "slug": slug,
                "packages": sorted(set(
                    "/".join(fp.split("/")[:2]) if "/" in fp else fp
                    for fp in files
                )),
                "files": sorted(files),
                "file_count": len(files),
                "symbol_count": len(symbols),
                "role": role_cat,
                # importance/description/depends_on/depended_by are placeholders;
                # later passes (_compute_subsystem_connections,
                # _score_subsystem_importance, _describe_subsystem) fill them in
                "importance": 0.0,
                "importance_label": "peripheral",
                "description": "",
                "depends_on": [],
                "depended_by": [],
                "entry_points": eps,
                "top_symbols": [s["name"] for s in top_syms],
                "risk_score": 0.0,
            })

        return subsystems
1298
+
1299
+ @staticmethod
1300
+ def _subsystem_slug(key: str) -> str:
1301
+ """Derive a URL-safe slug from a grouping key."""
1302
+ if key == "_utilities":
1303
+ return "utilities"
1304
+ if key == "_tests":
1305
+ return "tests"
1306
+ if key == "_other":
1307
+ return "other"
1308
+ # Use last path segment, lowercase
1309
+ return key.split("/")[-1].removesuffix(".py").lower().replace(" ", "-")
1310
+
1311
+ def _subsystem_name(self, key: str) -> str:
1312
+ """Derive a human-readable name from a grouping key."""
1313
+ if key == "_utilities":
1314
+ return "Utilities"
1315
+ if key == "_tests":
1316
+ return "Test Suite"
1317
+ if key == "_other":
1318
+ return "Other"
1319
+
1320
+ last = key.split("/")[-1].removesuffix(".py")
1321
+ # Check known mappings
1322
+ if last.lower() in self._KNOWN_NAMES:
1323
+ return self._KNOWN_NAMES[last.lower()]
1324
+ # Title case
1325
+ return last.replace("_", " ").replace("-", " ").title()
1326
+
1327
+ def _compute_subsystem_connections(
1328
+ self,
1329
+ subsystems: list[dict],
1330
+ all_nodes: list[dict],
1331
+ all_edges: list[dict],
1332
+ ) -> list[dict]:
1333
+ """Count inter-subsystem edges to build connection list."""
1334
+ # Map file_path → subsystem slug
1335
+ file_to_slug: dict[str, str] = {}
1336
+ for sub in subsystems:
1337
+ for fp in sub["files"]:
1338
+ file_to_slug[fp] = sub["slug"]
1339
+
1340
+ # Map node id/name/qname → file_path for target resolution
1341
+ node_file: dict[str, str] = {}
1342
+ for n in all_nodes:
1343
+ fp = n.get("file_path", "")
1344
+ if fp:
1345
+ node_file[n["id"]] = fp
1346
+ node_file[n["name"]] = fp
1347
+ if n["qualified_name"] != n["name"]:
1348
+ node_file[n["qualified_name"]] = fp
1349
+
1350
+ # Count edges between subsystems
1351
+ edge_counts: dict[tuple[str, str], int] = Counter()
1352
+ for e in all_edges:
1353
+ if e["type"] not in ("CALLS", "IMPORTS"):
1354
+ continue
1355
+ src_file = e.get("file_path", "")
1356
+ src_slug = file_to_slug.get(src_file)
1357
+ if not src_slug:
1358
+ continue
1359
+ tgt_file = node_file.get(e["target"], "")
1360
+ tgt_slug = file_to_slug.get(tgt_file)
1361
+ if not tgt_slug or tgt_slug == src_slug:
1362
+ continue
1363
+ edge_counts[(src_slug, tgt_slug)] += 1
1364
+
1365
+ # Build depends_on / depended_by on subsystems
1366
+ slug_lookup = {s["slug"]: s for s in subsystems}
1367
+ for (from_slug, to_slug), count in edge_counts.items():
1368
+ if to_slug not in slug_lookup.get(from_slug, {}).get("depends_on", []):
1369
+ slug_lookup[from_slug]["depends_on"].append(to_slug)
1370
+ if from_slug not in slug_lookup.get(to_slug, {}).get("depended_by", []):
1371
+ slug_lookup[to_slug]["depended_by"].append(from_slug)
1372
+
1373
+ # Sort depends_on / depended_by
1374
+ for sub in subsystems:
1375
+ sub["depends_on"] = sorted(set(sub["depends_on"]))
1376
+ sub["depended_by"] = sorted(set(sub["depended_by"]))
1377
+
1378
+ connections = [
1379
+ {
1380
+ "from": from_slug,
1381
+ "to": to_slug,
1382
+ "strength": count,
1383
+ "label": f"{count} calls + imports",
1384
+ }
1385
+ for (from_slug, to_slug), count in sorted(
1386
+ edge_counts.items(), key=lambda x: -x[1]
1387
+ )
1388
+ ]
1389
+ return connections
1390
+
1391
+ def _score_subsystem_importance(
1392
+ self,
1393
+ subsystems: list[dict],
1394
+ hotspots: list[dict],
1395
+ connections: list[dict],
1396
+ ) -> None:
1397
+ """Score each subsystem's importance (0.0–1.0) and assign labels."""
1398
+ total_subsystems = max(len(subsystems), 1)
1399
+
1400
+ # Build hotspot risk per file
1401
+ risk_by_file: dict[str, float] = defaultdict(float)
1402
+ for h in hotspots:
1403
+ risk_by_file[h["file_path"]] += h["risk_score"]
1404
+
1405
+ raw_scores: list[float] = []
1406
+ for sub in subsystems:
1407
+ incoming = len(sub.get("depended_by", []))
1408
+ risk_agg = sum(risk_by_file.get(fp, 0.0) for fp in sub["files"])
1409
+ sub["risk_score"] = round(risk_agg, 1)
1410
+ sym_count = sub["symbol_count"]
1411
+ has_entry = 1.0 if sub.get("entry_points") else 0.0
1412
+ conn_count = len(sub.get("depends_on", [])) + len(sub.get("depended_by", []))
1413
+ centrality = conn_count / max(total_subsystems, 1)
1414
+
1415
+ score = (
1416
+ incoming * 0.4
1417
+ + risk_agg * 0.25
1418
+ + sym_count * 0.15
1419
+ + has_entry * 0.1
1420
+ + centrality * 0.1
1421
+ )
1422
+ raw_scores.append(score)
1423
+
1424
+ # Normalize to 0.0–1.0
1425
+ max_raw = max(raw_scores) if raw_scores else 1.0
1426
+ if max_raw == 0:
1427
+ max_raw = 1.0
1428
+
1429
+ for sub, raw in zip(subsystems, raw_scores):
1430
+ normalized = round(raw / max_raw, 2)
1431
+ sub["importance"] = normalized
1432
+ if normalized >= 0.75:
1433
+ sub["importance_label"] = "critical"
1434
+ elif normalized >= 0.50:
1435
+ sub["importance_label"] = "important"
1436
+ elif normalized >= 0.25:
1437
+ sub["importance_label"] = "supporting"
1438
+ else:
1439
+ sub["importance_label"] = "peripheral"
1440
+
1441
+ def _describe_subsystem(self, subsystem: dict, connections: list[dict]) -> str:
1442
+ """Generate a deterministic one-liner description for a subsystem."""
1443
+ # Try __init__.py docstring from the files
1444
+ for fp in subsystem["files"]:
1445
+ if fp.endswith("__init__.py"):
1446
+ nodes = self.store.get_nodes_by_file(fp)
1447
+ for n in nodes:
1448
+ if n["type"] == "file" and n.get("docstring"):
1449
+ return n["docstring"].split("\n")[0]
1450
+
1451
+ role = subsystem["role"].replace("_", " ").title()
1452
+ sym_count = subsystem["symbol_count"]
1453
+ top = subsystem["top_symbols"][:3]
1454
+ dep_count = len(subsystem.get("depended_by", []))
1455
+
1456
+ top_str = ", ".join(top) if top else "no public symbols"
1457
+ dep_str = f"Depended on by {dep_count} other subsystem{'s' if dep_count != 1 else ''}." if dep_count else "No dependents."
1458
+
1459
+ return f"{role} providing {sym_count} symbols including {top_str}. {dep_str}"
1460
+
1461
+ def _derive_purpose(self, overview: dict, subsystems: list[dict]) -> str:
1462
+ """Generate a one-line codebase purpose statement."""
1463
+ stats = overview.get("stats", {})
1464
+ file_count = stats.get("files", 0)
1465
+ sub_count = len(subsystems)
1466
+
1467
+ # Find the most important non-utility subsystem
1468
+ real_subs = [s for s in subsystems if s["slug"] not in ("other", "tests", "utilities")]
1469
+ top_sub = max(real_subs, key=lambda s: s["importance"]) if real_subs else (
1470
+ max(subsystems, key=lambda s: s["importance"]) if subsystems else None
1471
+ )
1472
+ center = top_sub["name"] if top_sub else "unknown"
1473
+
1474
+ # Detect dominant language from file extensions
1475
+ all_files = []
1476
+ for sub in subsystems:
1477
+ all_files.extend(sub["files"])
1478
+ py_count = sum(1 for f in all_files if f.endswith(".py"))
1479
+ ts_count = sum(1 for f in all_files if f.endswith((".ts", ".tsx", ".js", ".jsx")))
1480
+ if py_count > ts_count:
1481
+ lang = "Python"
1482
+ elif ts_count > py_count:
1483
+ lang = "TypeScript"
1484
+ elif py_count and ts_count:
1485
+ lang = "Python + TypeScript"
1486
+ else:
1487
+ lang = "source"
1488
+
1489
+ return (
1490
+ f"A {lang} project with {file_count} files organized into "
1491
+ f"{sub_count} subsystem{'s' if sub_count != 1 else ''}, "
1492
+ f"centered around {center}."
1493
+ )
1494
+
1495
+ # ------------------------------------------------------------------
1496
+ # Ask: natural language question answering
1497
+ # ------------------------------------------------------------------
1498
+
1499
    # Intent routing table for answer_question(). Order matters: more
    # specific patterns first, broad ones (overview) last — the first
    # pattern that matches the lowercased question wins.
    _INTENT_PATTERNS = {
        "risk": r"\b(risk\w*|dangerous|fragile|hotspot)\b",
        "health": r"\b(health|score|grade|quality)\b",
        "dead_code": r"\b(dead|unused|unreachable)\b",
        "cycles": r"\b(cycle|circular|import loop)\b",
        "impact": r"\b(impact|affect|change|break|depends?)\b.*\b(\w+)\b",
        "module": r"\b(module|file|package)\b.*\b(does?|about|role|purpose)\b",
        "anatomy": r"\b(anatomy|subsystem|parts?|structure|how.*organized)\b",
        "symbol": r"\b(function|class|method|symbol)\b.*\b(does?|what|where|who)\b",
        "overview": r"\b(overview|summary|what is|describe|about|main parts?|main packages?|main modules?)\b",
    }

    # Common English question words filtered out by _extract_keywords so
    # symbol/module names dominate the extracted candidates.
    _STOP_WORDS = frozenset({
        "what", "does", "how", "the", "this", "that", "with", "from",
        "about", "which", "where", "when", "would", "could", "should",
        "have", "been", "being", "will", "they", "them", "their",
        "most", "more", "some", "each", "every", "there", "here",
        "into", "between", "through", "after", "before", "these",
        "those", "other", "than", "very", "just", "only",
    })

    # Canned follow-up questions surfaced alongside each answer, keyed by
    # detected intent ("general" is the fallback bucket).
    _SUGGESTIONS: dict[str, list[str]] = {
        "overview": [
            "What are the riskiest parts of the codebase?",
            "What's the health score?",
            "How is the codebase organized?",
        ],
        "module": [
            "What are the riskiest files?",
            "What's the health score?",
        ],
        "risk": [
            "What would break if I changed the riskiest symbol?",
            "What's the health score?",
            "Are there any import cycles?",
        ],
        "health": [
            "What are the riskiest files?",
            "What dead code should I clean up?",
            "Are there any import cycles?",
        ],
        "impact": [
            "What are the riskiest parts of the codebase?",
            "How is the codebase organized?",
        ],
        "anatomy": [
            "What are the main parts of this codebase?",
            "What are the riskiest files?",
            "What's the health score?",
        ],
        "dead_code": [
            "What's the health score?",
            "What are the riskiest files?",
        ],
        "cycles": [
            "What's the health score?",
            "How is the codebase organized?",
        ],
        "symbol": [
            "What are the riskiest parts of the codebase?",
            "What's the health score?",
        ],
        "general": [
            "What are the main parts of this codebase?",
            "What's the health score?",
            "What are the riskiest files?",
        ],
    }
1568
+
1569
+ @staticmethod
1570
+ def _extract_keywords(question: str) -> list[str]:
1571
+ """Extract meaningful keywords from a question (stop-word filtered + bigrams)."""
1572
+ words = re.findall(r"[a-z_]+", question.lower())
1573
+ words = [w for w in words if len(w) > 3 and w not in ComprehensionEngine._STOP_WORDS]
1574
+ bigrams = [f"{words[i]} {words[i+1]}" for i in range(len(words) - 1)]
1575
+ return (words + bigrams)[:5]
1576
+
1577
+ def answer_question(self, question: str) -> dict:
1578
+ """Answer a natural language question about the codebase.
1579
+
1580
+ Uses regex intent detection to route the question to the right
1581
+ structural data source. Fully deterministic — no LLM call.
1582
+
1583
+ Returns:
1584
+ {
1585
+ "question": str,
1586
+ "intent": str,
1587
+ "answer": str,
1588
+ "data": dict | list,
1589
+ "sources": list[str],
1590
+ "suggestions": list[str],
1591
+ }
1592
+ """
1593
+ q_lower = question.lower()
1594
+
1595
+ # Detect intent
1596
+ intent = "general"
1597
+ for name, pattern in self._INTENT_PATTERNS.items():
1598
+ if re.search(pattern, q_lower):
1599
+ intent = name
1600
+ break
1601
+
1602
+ answer = ""
1603
+ data: dict | list = {}
1604
+ sources: list[str] = []
1605
+
1606
+ if intent == "overview":
1607
+ overview = self.system_overview()
1608
+ stats = overview["stats"]
1609
+ pkgs = sorted(overview["packages"].keys())
1610
+ answer = (
1611
+ f"This codebase has {stats.get('files', 0)} files and "
1612
+ f"{stats.get('nodes', 0)} symbols across "
1613
+ f"{len(pkgs)} package{'s' if len(pkgs) != 1 else ''}: "
1614
+ f"{', '.join(pkgs)}."
1615
+ )
1616
+ data = overview
1617
+ sources = ["system_overview"]
1618
+
1619
+ elif intent == "module":
1620
+ # Try to extract a module/file name from the question
1621
+ mod_name = self._extract_module_name(question)
1622
+ if mod_name:
1623
+ view = self.module_view(mod_name)
1624
+ if "error" not in view:
1625
+ narrative = self.module_narrative(mod_name)
1626
+ answer = narrative.get("summary", f"Module {mod_name} found.")
1627
+ data = view
1628
+ sources = ["module_view", "module_narrative"]
1629
+ else:
1630
+ answer = f"Could not find module '{mod_name}'. Try using the full file path."
1631
+ data = {"error": view["error"]}
1632
+ sources = ["module_view"]
1633
+ else:
1634
+ answer = "Please specify a module or file name in your question."
1635
+ data = {}
1636
+ sources = []
1637
+
1638
+ elif intent == "risk":
1639
+ hotspots = self.risk_hotspots(top_n=10)
1640
+ if hotspots:
1641
+ lines = []
1642
+ for i, h in enumerate(hotspots[:5], 1):
1643
+ lines.append(
1644
+ f" {i}. {h['name']} ({h['file_path']}) — "
1645
+ f"risk: {h['risk_score']}, {h['direct_dependents']} dependents, "
1646
+ f"{h['affected_files']} affected files"
1647
+ )
1648
+ answer = "Top risk hotspots:\n" + "\n".join(lines)
1649
+ else:
1650
+ answer = "No significant risk hotspots found."
1651
+ data = hotspots
1652
+ sources = ["risk_hotspots"]
1653
+
1654
+ elif intent == "health":
1655
+ health = self.health_score()
1656
+ d = health["details"]
1657
+ answer = (
1658
+ f"Health score: {health['score']}/100 (grade {health['grade']}). "
1659
+ f"Dead code: {d['dead_code_count']} symbols ({d['dead_code_ratio']:.0%}). "
1660
+ f"Import cycles: {d['import_cycles']}. "
1661
+ f"High-risk hotspots: {d['high_risk_hotspots']}."
1662
+ )
1663
+ data = health
1664
+ sources = ["health_score"]
1665
+
1666
+ elif intent == "impact":
1667
+ # Try to find a file path first (e.g. "store.py")
1668
+ file_path_match = self._extract_module_name(question)
1669
+ # Extract a symbol name from the question.
1670
+ # Prefer CamelCase / capitalised names (likely real symbols)
1671
+ # over lowercase verbs like "depends", "change", "break".
1672
+ symbol_name = self._extract_symbol_name(question)
1673
+ # Also extract keywords and try each until one resolves
1674
+ keywords = self._extract_keywords(question)
1675
+ candidates = []
1676
+ if symbol_name:
1677
+ candidates.append(symbol_name)
1678
+ candidates.extend(keywords)
1679
+
1680
+ node = None
1681
+ tried_name = None
1682
+ for candidate in candidates:
1683
+ nodes = self.engine.resolve_node(candidate)
1684
+ if nodes:
1685
+ node = nodes[0]
1686
+ tried_name = candidate
1687
+ break
1688
+ tried_name = candidate
1689
+
1690
+ if node is not None:
1691
+ impacted = self.engine.impact_of_change(node["id"], max_depth=5)
1692
+ affected_files: set[str] = set()
1693
+ all_nodes_list = self.store.get_all_nodes()
1694
+ node_by_id = {n["id"]: n for n in all_nodes_list}
1695
+ for entry in impacted:
1696
+ t = node_by_id.get(entry["node_id"])
1697
+ if t:
1698
+ affected_files.add(t["file_path"])
1699
+ answer = (
1700
+ f"Changing {node['name']} ({node['file_path']}) would impact "
1701
+ f"{len(impacted)} symbols across {len(affected_files)} files."
1702
+ )
1703
+ data = {"target": node["id"], "impacted_count": len(impacted),
1704
+ "affected_files": sorted(affected_files)}
1705
+ sources = ["impact_of_change"]
1706
+ elif file_path_match:
1707
+ # Fall back to change_summary for file-based impact
1708
+ summary = self.change_summary(file_path_match)
1709
+ answer = (
1710
+ f"Changing {file_path_match} would affect "
1711
+ f"{summary['affected_file_count']} files."
1712
+ )
1713
+ data = summary
1714
+ sources = ["change_summary"]
1715
+ elif tried_name:
1716
+ answer = f"No symbol found matching '{tried_name}'."
1717
+ data = {}
1718
+ sources = ["resolve_node"]
1719
+ else:
1720
+ answer = "Please mention a symbol name to analyze impact."
1721
+ data = {}
1722
+ sources = []
1723
+
1724
+ elif intent == "anatomy":
1725
+ anatomy = self.codebase_anatomy()
1726
+ sub_lines = []
1727
+ for sub in anatomy.get("subsystems", []):
1728
+ sub_lines.append(
1729
+ f" - {sub['name']} ({sub['file_count']} files, "
1730
+ f"{sub['symbol_count']} symbols, {sub['importance_label']})"
1731
+ )
1732
+ answer = (
1733
+ anatomy.get("purpose", "Unknown purpose.") + "\n\n"
1734
+ "Subsystems:\n" + "\n".join(sub_lines)
1735
+ )
1736
+ data = anatomy
1737
+ sources = ["codebase_anatomy"]
1738
+
1739
+ elif intent == "dead_code":
1740
+ dead = self.engine.find_dead_code()
1741
+ if dead:
1742
+ sample = dead[:5]
1743
+ lines = [f" - {d['name']} ({d['type']}) in {d['file_path']}" for d in sample]
1744
+ answer = (
1745
+ f"Found {len(dead)} potentially unused symbols.\n"
1746
+ + "\n".join(lines)
1747
+ )
1748
+ if len(dead) > 5:
1749
+ answer += f"\n ... and {len(dead) - 5} more."
1750
+ else:
1751
+ answer = "No dead code detected."
1752
+ data = dead
1753
+ sources = ["find_dead_code"]
1754
+
1755
+ elif intent == "cycles":
1756
+ cycles = self.engine.detect_cycles()
1757
+ if cycles:
1758
+ lines = []
1759
+ for c in cycles[:5]:
1760
+ path = " -> ".join(c) if isinstance(c, list) else str(c)
1761
+ lines.append(f" - {path}")
1762
+ answer = f"Found {len(cycles)} import cycle(s):\n" + "\n".join(lines)
1763
+ if len(cycles) > 5:
1764
+ answer += f"\n ... and {len(cycles) - 5} more."
1765
+ else:
1766
+ answer = "No import cycles detected."
1767
+ data = cycles
1768
+ sources = ["detect_cycles"]
1769
+
1770
+ elif intent == "symbol":
1771
+ keywords = self._extract_keywords(question)
1772
+ symbol_name = keywords[0] if keywords else None
1773
+ if symbol_name:
1774
+ nodes = self.engine.resolve_node(symbol_name)
1775
+ if nodes:
1776
+ node = nodes[0]
1777
+ narrative = self.symbol_narrative(node["id"])
1778
+ answer = narrative.get("summary", f"Found {node['name']}.")
1779
+ data = narrative
1780
+ sources = ["symbol_narrative"]
1781
+ else:
1782
+ answer = f"No symbol found matching '{symbol_name}'."
1783
+ data = {}
1784
+ sources = ["resolve_node"]
1785
+ else:
1786
+ answer = "Please mention a symbol name in your question."
1787
+ data = {}
1788
+ sources = []
1789
+
1790
+ else:
1791
+ # General: keyword search + overview stats
1792
+ keywords = self._extract_keywords(question)
1793
+ search_results = []
1794
+ for kw in keywords[:5]:
1795
+ results = self.engine.search_nodes(kw, limit=5)
1796
+ search_results.extend(results)
1797
+ # Deduplicate by id
1798
+ seen_ids: set[str] = set()
1799
+ unique_results: list[dict] = []
1800
+ for r in search_results:
1801
+ if r["id"] not in seen_ids:
1802
+ seen_ids.add(r["id"])
1803
+ unique_results.append(r)
1804
+
1805
+ overview = self.system_overview()
1806
+ stats = overview["stats"]
1807
+ if unique_results:
1808
+ lines = [f" - {r['name']} ({r['type']}) in {r['file_path']}" for r in unique_results[:5]]
1809
+ answer = (
1810
+ f"Found {len(unique_results)} matching symbols "
1811
+ f"(in a codebase with {stats.get('files', 0)} files):\n"
1812
+ + "\n".join(lines)
1813
+ )
1814
+ else:
1815
+ pkgs = sorted(overview["packages"].keys())
1816
+ answer = (
1817
+ f"No specific matches found. This codebase has "
1818
+ f"{stats.get('files', 0)} files and {stats.get('nodes', 0)} symbols "
1819
+ f"across packages: {', '.join(pkgs)}."
1820
+ )
1821
+ data = {"search_results": unique_results[:15], "stats": stats}
1822
+ sources = ["search_nodes", "system_overview"]
1823
+
1824
+ suggestions = self._SUGGESTIONS.get(intent, self._SUGGESTIONS["general"])
1825
+
1826
+ return {
1827
+ "question": question,
1828
+ "intent": intent,
1829
+ "answer": answer,
1830
+ "data": data,
1831
+ "sources": sources,
1832
+ "suggestions": suggestions,
1833
+ }
1834
+
1835
+ def _extract_module_name(self, question: str) -> str | None:
1836
+ """Try to extract a file/module path from the question."""
1837
+ # Look for explicit file paths (e.g. "mypackage/core.py")
1838
+ path_match = re.search(r"[\w/\\]+\.(?:py|ts|tsx|js|jsx)", question)
1839
+ if path_match:
1840
+ return path_match.group(0).replace("\\", "/")
1841
+
1842
+ # Look for quoted names
1843
+ quoted = re.search(r'["\']([^"\']+)["\']', question)
1844
+ if quoted:
1845
+ return quoted.group(1)
1846
+
1847
+ # Look for module-like words after "module" or "file"
1848
+ mod_match = re.search(r"\b(?:module|file)\s+(\w[\w./]*)", question, re.IGNORECASE)
1849
+ if mod_match:
1850
+ return mod_match.group(1)
1851
+
1852
+ return None
1853
+
1854
+ @staticmethod
1855
+ def _extract_symbol_name(question: str) -> str | None:
1856
+ """Extract a likely symbol name from the question.
1857
+
1858
+ Prefers CamelCase identifiers (e.g. GraphStore, DataProcessor) or
1859
+ names inside quotes. Falls back to ``None`` so the caller can try
1860
+ keyword extraction instead.
1861
+ """
1862
+ # Quoted names first
1863
+ quoted = re.search(r'["\']([^"\']+)["\']', question)
1864
+ if quoted:
1865
+ return quoted.group(1)
1866
+
1867
+ # CamelCase identifiers (at least two uppercase letters)
1868
+ camel = re.findall(r"\b([A-Z][a-z]+(?:[A-Z][a-z]*)+)\b", question)
1869
+ if camel:
1870
+ return camel[0]
1871
+
1872
+ # Single capitalised word that is NOT a sentence starter
1873
+ # e.g. "What depends on Store?" -> "Store"
1874
+ caps = re.findall(r"(?<!\.\s)(?<!^)\b([A-Z][a-z]{2,})\b", question)
1875
+ if caps:
1876
+ return caps[0]
1877
+
1878
+ return None
1879
+
1880
+ # ------------------------------------------------------------------
1881
+ # Helpers
1882
+ # ------------------------------------------------------------------
1883
+ def _extract_module_name_from_impact(self, question: str) -> str | None:
1884
+ """Extract a file path from an impact-style question."""
1885
+ path_match = re.search(r"[\w/\\]+\.(?:py|ts|tsx|js|jsx)", question)
1886
+ if path_match:
1887
+ return path_match.group(0).replace("\\", "/")
1888
+ return None
1889
+
1890
+ def _infer_module_role(
1891
+ self, file_path: str, symbols: list[dict], coupling_in: int, coupling_out: int
1892
+ ) -> str:
1893
+ """Infer the architectural role of a module from its structure and coupling.
1894
+
1895
+ Uses a combination of filename conventions, symbol composition, and
1896
+ coupling patterns. Returns a role string. When multiple signals conflict,
1897
+ coupling-based signals take precedence over filename-based ones.
1898
+ """
1899
+ name = file_path.split("/")[-1]
1900
+ # Remove extension (.py, .ts, .tsx, .js, .jsx)
1901
+ stem = name.rsplit(".", 1)[0] if "." in name else name
1902
+
1903
+ # Unambiguous filename matches
1904
+ if stem == "__init__":
1905
+ return "package_init"
1906
+ if stem == "__main__":
1907
+ return "entry_point"
1908
+ if stem.startswith("test_") or stem.endswith("_test") or stem.endswith(".test"):
1909
+ return "test"
1910
+
1911
+ classes = [s for s in symbols if s["type"] == "class"]
1912
+ functions = [s for s in symbols if s["type"] == "function"]
1913
+ methods = [s for s in symbols if s["type"] == "method"]
1914
+
1915
+ # Coupling-based roles (structural evidence, more reliable than name)
1916
+ if coupling_in > coupling_out * 2 and coupling_in > 3:
1917
+ return "core_library"
1918
+ if coupling_out > coupling_in * 3 and coupling_out > 3:
1919
+ return "orchestrator"
1920
+ if coupling_in == 0 and coupling_out == 0:
1921
+ return "isolated"
1922
+
1923
+ # Filename hints (only for common conventions)
1924
+ if stem in ("config", "settings", "constants", "conf"):
1925
+ return "configuration"
1926
+ if stem in ("utils", "helpers", "util", "common"):
1927
+ return "utility"
1928
+ if stem in ("models", "schemas", "types", "entities"):
1929
+ return "data_model"
1930
+
1931
+ # Composition-based roles
1932
+ if len(classes) > len(functions) and classes:
1933
+ return "domain_model"
1934
+ if functions and not classes:
1935
+ if coupling_in > 0:
1936
+ return "function_library"
1937
+ return "script"
1938
+
1939
+ return "module"