codebase-index 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_index/__init__.py +7 -0
- codebase_index/__main__.py +3 -0
- codebase_index/cli.py +916 -0
- codebase_index/config.py +110 -0
- codebase_index/discovery/__init__.py +10 -0
- codebase_index/discovery/classify.py +151 -0
- codebase_index/discovery/ignore.py +58 -0
- codebase_index/discovery/walker.py +75 -0
- codebase_index/doctor.py +138 -0
- codebase_index/embeddings/__init__.py +2 -0
- codebase_index/embeddings/backend.py +67 -0
- codebase_index/embeddings/external.py +56 -0
- codebase_index/embeddings/local.py +41 -0
- codebase_index/embeddings/noop.py +15 -0
- codebase_index/graph/__init__.py +8 -0
- codebase_index/graph/analysis.py +468 -0
- codebase_index/graph/builder.py +160 -0
- codebase_index/graph/expand.py +136 -0
- codebase_index/graph/export.py +381 -0
- codebase_index/graph/navigate.py +201 -0
- codebase_index/indexer/__init__.py +8 -0
- codebase_index/indexer/doc_chunks.py +202 -0
- codebase_index/indexer/freshness.py +109 -0
- codebase_index/indexer/pipeline.py +423 -0
- codebase_index/mcp/__init__.py +2 -0
- codebase_index/mcp/server.py +354 -0
- codebase_index/models.py +145 -0
- codebase_index/output/__init__.py +6 -0
- codebase_index/output/json.py +13 -0
- codebase_index/output/markdown.py +316 -0
- codebase_index/output/redact.py +31 -0
- codebase_index/parsers/__init__.py +9 -0
- codebase_index/parsers/base.py +47 -0
- codebase_index/parsers/languages.py +290 -0
- codebase_index/parsers/line_chunker.py +39 -0
- codebase_index/parsers/symbol_chunks.py +62 -0
- codebase_index/parsers/treesitter.py +439 -0
- codebase_index/retrieval/__init__.py +9 -0
- codebase_index/retrieval/budget.py +82 -0
- codebase_index/retrieval/fusion.py +62 -0
- codebase_index/retrieval/intent.py +56 -0
- codebase_index/retrieval/pipeline.py +207 -0
- codebase_index/retrieval/rerank.py +69 -0
- codebase_index/retrieval/searchers.py +291 -0
- codebase_index/retrieval/skeleton.py +251 -0
- codebase_index/retrieval/types.py +79 -0
- codebase_index/scaffold.py +399 -0
- codebase_index/service.py +158 -0
- codebase_index/skill_template/SKILL.md +198 -0
- codebase_index/skill_template/examples/hooks/settings.json +16 -0
- codebase_index/skill_template/scripts/cbx +25 -0
- codebase_index/skill_template/scripts/cbx.ps1 +25 -0
- codebase_index/skill_update.py +150 -0
- codebase_index/storage/__init__.py +8 -0
- codebase_index/storage/db.py +116 -0
- codebase_index/storage/repo.py +701 -0
- codebase_index/storage/schema.sql +125 -0
- codebase_index/watch/__init__.py +5 -0
- codebase_index/watch/watcher.py +93 -0
- codebase_index-1.6.0.dist-info/METADATA +748 -0
- codebase_index-1.6.0.dist-info/RECORD +64 -0
- codebase_index-1.6.0.dist-info/WHEEL +4 -0
- codebase_index-1.6.0.dist-info/entry_points.txt +4 -0
- codebase_index-1.6.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""Global graph pass: resolve unresolved edges against the whole repo and
|
|
2
|
+
denormalize symbol degrees.
|
|
3
|
+
|
|
4
|
+
Runs once after all files are indexed (it needs the complete symbol/file tables).
|
|
5
|
+
Symbol-target edges (call/reference/extends/implements) resolve only on an
|
|
6
|
+
UNAMBIGUOUS name match — if two definitions share a name, the edge is left
|
|
7
|
+
unresolved rather than guessed. Import edges resolve their module path to a file
|
|
8
|
+
by POSIX path-suffix match (e.g. 'auth.token' -> '%/auth/token.py').
|
|
9
|
+
|
|
10
|
+
The pass is batched: one query for globally-unique symbol names, one for file
|
|
11
|
+
paths (expanded into an in-memory suffix map), one executemany for the updates.
|
|
12
|
+
The per-edge variant did an indexed lookup per symbol edge and up to ~20
|
|
13
|
+
full-table LIKE scans per import edge, which dominated large builds.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import sqlite3
|
|
19
|
+
from typing import Optional
|
|
20
|
+
|
|
21
|
+
from ..storage import repo
|
|
22
|
+
|
|
23
|
+
# Edge types whose destination is a symbol name. resolve_edges() looks these up
# in the repo-wide unique-symbol map; 'import' edges are handled separately and
# resolve to files instead.
_SYMBOL_EDGE_TYPES = {"call", "reference", "extends", "implements"}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def build_graph(conn: sqlite3.Connection) -> dict[str, int]:
    """Run the global graph pass and report how many edges were resolved.

    Steps, in order: resolve pending edges, recompute the stored degrees via
    ``repo.recompute_degrees``, mark what is still unresolved as ambiguous, then
    refresh the cached architecture analytics.

    Args:
        conn: Open connection to the index database.

    Returns:
        ``{"resolved": n, "unresolved": m}`` where ``n`` is the number of edges
        resolved by this call and ``m`` the number still unresolved afterwards.
    """
    import logging

    resolved = resolve_edges(conn)
    repo.recompute_degrees(conn)
    # Everything still unresolved that names a target is, by definition, a target we
    # could not pin to a unique node — record it as 'ambiguous' for the honesty trail.
    repo.mark_ambiguous_edges(conn)
    total_unresolved = len(repo.unresolved_edges(conn))
    # Architecture analytics (communities / god nodes / surprising bridges) are a
    # derived view of the graph. Compute once per build and cache the JSON in meta so
    # the `architecture` command and the HTML export read it instantly. Never let an
    # analysis failure fail the build — the graph itself is already written.
    try:
        from . import analysis

        analysis.refresh_analysis(conn)
    except Exception:  # pragma: no cover - defensive; analytics are best-effort
        # Still best-effort, but no longer silent: leave a trace so a failed
        # refresh can be diagnosed instead of going unnoticed.
        logging.getLogger(__name__).warning(
            "architecture analysis refresh failed; graph build continues",
            exc_info=True,
        )
    return {"resolved": resolved, "unresolved": total_unresolved}
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def resolve_edges(conn: sqlite3.Connection) -> int:
    """Resolve every pending edge that has a unique target and persist the result.

    Import edges are matched to a file through the path-suffix map and recorded
    with confidence 'inferred'. Symbol edges (see ``_SYMBOL_EDGE_TYPES``) are
    matched through the repo-unique symbol-name map and recorded as 'extracted'.
    Edges of any other type, or without a unique hit, are left untouched.

    Returns:
        The number of edges handed to ``repo.resolve_edges_bulk``.
    """
    pending = repo.unresolved_edges(conn)
    if not pending:
        return 0

    symbol_ids = repo.unique_symbol_ids_by_name(conn)
    suffix_map = _path_suffix_map(repo.all_file_ids_with_paths(conn))

    # Each entry is (dst_kind, dst_id, edge_id, confidence).
    resolutions: list[tuple[str, int, int, str]] = []
    for edge in pending:
        target_name = edge["dst_name"]
        edge_type = edge["edge_type"]
        if edge_type == "import":
            dst_kind, confidence = "file", "inferred"
            dst_id = _module_to_file_id(suffix_map, target_name, lang=edge["lang"])
        elif edge_type in _SYMBOL_EDGE_TYPES:
            dst_kind, confidence = "symbol", "extracted"
            dst_id = symbol_ids.get(target_name)
        else:
            continue
        if dst_id is not None:
            resolutions.append((dst_kind, dst_id, edge["id"], confidence))

    repo.resolve_edges_bulk(conn, resolutions)
    return len(resolutions)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _path_suffix_map(rows: list[sqlite3.Row]) -> dict[str, Optional[int]]:
|
|
74
|
+
"""Map every '/'-aligned path suffix to its file id, or None when ambiguous.
|
|
75
|
+
|
|
76
|
+
Mirrors files_with_suffix(path = suffix OR path LIKE '%/suffix') semantics:
|
|
77
|
+
a suffix shared by several files resolves to None (like a multi-row result),
|
|
78
|
+
and matching is case-insensitive the way SQLite LIKE folds ASCII.
|
|
79
|
+
"""
|
|
80
|
+
mapping: dict[str, Optional[int]] = {}
|
|
81
|
+
for row in rows:
|
|
82
|
+
parts = row["path"].lower().split("/")
|
|
83
|
+
for i in range(len(parts)):
|
|
84
|
+
suffix = "/".join(parts[i:])
|
|
85
|
+
mapping[suffix] = None if suffix in mapping else int(row["id"])
|
|
86
|
+
return mapping
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _lang_suffixes(lang: Optional[str], base: str, rust_base: str, go_pkg: str) -> list[str]:
|
|
90
|
+
"""Import-path suffixes specific to one language, most-specific first."""
|
|
91
|
+
return {
|
|
92
|
+
"python": [f"{base}.py", f"{base}/__init__.py"],
|
|
93
|
+
"typescript": [f"{base}.ts", f"{base}.tsx", f"{base}/index.ts", f"{base}/index.tsx"],
|
|
94
|
+
"javascript": [f"{base}.js", f"{base}/index.js"],
|
|
95
|
+
"java": [f"{base}.java"],
|
|
96
|
+
"kotlin": [f"{base}.kt"],
|
|
97
|
+
"go": [f"{go_pkg}.go"],
|
|
98
|
+
"rust": [
|
|
99
|
+
f"{rust_base}.rs", f"{rust_base}/mod.rs",
|
|
100
|
+
f"src/{rust_base}.rs", f"src/{rust_base}/mod.rs",
|
|
101
|
+
],
|
|
102
|
+
"csharp": [f"{base}.cs"],
|
|
103
|
+
"ruby": [f"{base}.rb"],
|
|
104
|
+
"php": [f"{base}.php"],
|
|
105
|
+
}.get(lang or "", [])
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _module_to_file_id(
    suffix_map: dict[str, Optional[int]], module: str, lang: Optional[str] = None
) -> Optional[int]:
    """Map an import/module string to the id of the one file it points at.

    The module is lower-cased and turned into path-like forms ('.' -> '/' for
    most languages, '::' -> '/' for Rust, last segment for Go). Candidate
    suffixes for the importing file's ``lang`` are probed first, followed by a
    fixed cross-language fallback list. The first candidate that ``suffix_map``
    maps to a non-None id wins; None is returned when nothing matches or the
    module reduces to an empty path.
    """
    lowered = module.lower()
    base = lowered.replace(".", "/").strip("/")
    if not base:
        return None
    rust_base = lowered.replace("::", "/").strip("/")
    # Go resolves on the last path segment only.
    go_pkg = base.rsplit("/", 1)[-1]

    # Python, TypeScript / JavaScript, then Java / Kotlin / Scala.
    leading_tails = (
        ".py", "/__init__.py",
        ".ts", ".tsx", ".js", "/index.ts", "/index.tsx", "/index.js",
        ".java", ".kt", ".scala",
    )
    # C#, Ruby, PHP.
    trailing_tails = (".cs", ".rb", ".php")
    fallback = [base + tail for tail in leading_tails]
    fallback.append(f"{go_pkg}.go")
    fallback += [
        f"{rust_base}.rs",
        f"{rust_base}/mod.rs",
        f"src/{rust_base}.rs",
        f"src/{rust_base}/mod.rs",
    ]
    fallback += [base + tail for tail in trailing_tails]

    candidates = [*_lang_suffixes(lang, base, rust_base, go_pkg), *fallback]
    hits = (suffix_map.get(candidate) for candidate in candidates)
    return next((file_id for file_id in hits if file_id is not None), None)
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Impact analysis: bounded BFS over the resolved edge graph.
|
|
2
|
+
|
|
3
|
+
Direction semantics:
|
|
4
|
+
up -> dependents (who is affected if the target changes): incoming edges.
|
|
5
|
+
down -> dependencies (what the target relies on): outgoing edges.
|
|
6
|
+
both -> union of the two.
|
|
7
|
+
|
|
8
|
+
Target resolution: an exact file path -> a file node (seeded together with all
|
|
9
|
+
symbols defined in that file, so importers AND subclassers surface). Otherwise a
|
|
10
|
+
symbol name -> all symbol nodes with that name. A path suffix is the last resort.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import sqlite3
|
|
16
|
+
from collections import deque
|
|
17
|
+
from typing import Optional
|
|
18
|
+
|
|
19
|
+
from ..models import GraphCoverage, ImpactNode, ImpactResponse, IndexFreshness
|
|
20
|
+
from ..storage import repo
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _freshness(conn: sqlite3.Connection) -> IndexFreshness:
    """Build the freshness record from the 'built_at' and 'head_commit' meta keys.

    ``exists`` is always True and ``stale`` always False here.
    """
    built_at = repo.get_meta(conn, "built_at")
    head_commit = repo.get_meta(conn, "head_commit")
    return IndexFreshness(
        exists=True, stale=False, built_at=built_at, head_commit=head_commit
    )
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _seed_nodes(conn: sqlite3.Connection, target: str) -> list[tuple[str, int]]:
    """Turn a target string into the (kind, id) nodes a walk starts from.

    Lookup order: exact file path, then exact symbol name, then a path suffix
    that matches exactly one file. A file match is seeded together with every
    symbol defined in that file. An empty list means nothing matched.
    """

    def file_with_symbols(file_id: int) -> list[tuple[str, int]]:
        # The file node first, then one node per symbol it defines.
        nodes: list[tuple[str, int]] = [("file", file_id)]
        nodes.extend(
            ("symbol", int(sym["id"])) for sym in repo.symbols_in_file(conn, file_id)
        )
        return nodes

    file_row = repo.file_by_path(conn, target)
    if file_row is not None:
        return file_with_symbols(int(file_row["id"]))

    named = repo.symbols_by_name(conn, target, exact=True)
    if named:
        return [("symbol", int(sym["id"])) for sym in named]

    candidates = repo.files_with_suffix(conn, target)
    if len(candidates) != 1:
        return []
    return file_with_symbols(int(candidates[0]["id"]))
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _neighbors(conn, kind, node_id, direction):
    """Generate (next_kind, next_id, edge_type, confidence) tuples around a node.

    'up' follows incoming edges back to their sources, 'down' follows outgoing
    edges to their destinations, 'both' does the two in that order. Outgoing
    edges without a ``dst_id`` are skipped.
    """
    follow_incoming = direction == "up" or direction == "both"
    follow_outgoing = direction == "down" or direction == "both"

    if follow_incoming:
        for edge in repo.incoming_edges(conn, kind, node_id):
            yield edge["src_kind"], int(edge["src_id"]), edge["edge_type"], edge["confidence"]

    if follow_outgoing:
        for edge in repo.outgoing_edges(conn, kind, node_id):
            if edge["dst_id"] is None:
                continue
            yield edge["dst_kind"], int(edge["dst_id"]), edge["edge_type"], edge["confidence"]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _node_meta(conn, kind, node_id) -> Optional[ImpactNode]:
    """Load display data for one node as an ImpactNode with distance 0.

    File nodes carry only their path; any other kind is looked up as a symbol
    and carries name, start line and owning file path. Returns None when the
    row does not exist.
    """
    is_file = kind == "file"
    if is_file:
        sql = "SELECT path FROM files WHERE id = ?"
    else:
        sql = (
            "SELECT s.name AS name, s.line_start AS line_start, f.path AS path "
            "FROM symbols s JOIN files f ON f.id = s.file_id WHERE s.id = ?"
        )
    row = conn.execute(sql, (node_id,)).fetchone()
    if row is None:
        return None
    if is_file:
        return ImpactNode(kind="file", path=row["path"], distance=0)
    return ImpactNode(
        kind="symbol",
        path=row["path"],
        name=row["name"],
        line_start=row["line_start"],
        distance=0,
    )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def walk_impact(
    conn: sqlite3.Connection, target: str, *, depth: int, direction: str
) -> list[ImpactNode]:
    """Breadth-first walk from the target's seed nodes, at most ``depth`` hops.

    Seeds are never part of the result. Every other node is reported once, the
    first time it is reached, annotated with its hop distance and with the type
    and confidence of the edge that reached it. Nodes whose metadata cannot be
    loaded are neither reported nor expanded.
    """
    seeds = _seed_nodes(conn, target)
    if not seeds:
        return []

    seen: set[tuple[str, int]] = set(seeds)
    frontier: deque[tuple[str, int, int]] = deque(
        (seed_kind, seed_id, 0) for seed_kind, seed_id in seeds
    )
    reached: list[ImpactNode] = []

    while frontier:
        kind, node_id, dist = frontier.popleft()
        if dist >= depth:
            continue
        hop = dist + 1
        for next_kind, next_id, edge_type, confidence in _neighbors(
            conn, kind, node_id, direction
        ):
            node = (next_kind, next_id)
            if node in seen:
                continue
            seen.add(node)
            info = _node_meta(conn, next_kind, next_id)
            if info is not None:
                info.distance = hop
                info.via_edge = edge_type
                info.via_confidence = confidence
                reached.append(info)
                frontier.append((next_kind, next_id, hop))
    return reached
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _target_paths(conn: sqlite3.Connection, target: str) -> list[str]:
    """Return the file path(s) a target resolves to, used for coverage classification.

    Uses the same lookup order as seeding: exact file path (returned as given),
    exact symbol name (one path per matching symbol row), then a path suffix
    matching exactly one file. Empty when nothing matches.
    """
    if repo.file_by_path(conn, target) is not None:
        return [target]

    named = repo.symbols_by_name(conn, target, exact=True)
    if named:
        return [sym["path"] for sym in named]

    candidates = repo.files_with_suffix(conn, target)
    if len(candidates) != 1:
        return []
    return [candidates[0]["path"]]
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def impact_lookup(
    conn: sqlite3.Connection, target: str, *, depth: int, direction: str
) -> ImpactResponse:
    """Run the impact walk and package it as an ImpactResponse.

    ``files`` lists each distinct path among the reached nodes, ordered by the
    smallest distance at which the path was reached and then by path.
    """
    nodes = walk_impact(conn, target, depth=depth, direction=direction)

    nearest: dict[str, int] = {}
    for node in nodes:
        known = nearest.get(node.path)
        if known is None or node.distance < known:
            nearest[node.path] = node.distance
    # (distance, path) pairs sort by distance first, then alphabetically.
    files = [path for _, path in sorted((dist, path) for path, dist in nearest.items())]

    index = _freshness(conn)
    coverage = GraphCoverage.for_paths(_target_paths(conn, target))
    return ImpactResponse(
        target=target,
        direction=direction,
        depth=depth,
        index=index,
        nodes=nodes,
        files=files,
        coverage=coverage,
    )
|
|
@@ -0,0 +1,381 @@
|
|
|
1
|
+
"""HTML graph export for the indexed call/import/reference graph."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import math
|
|
7
|
+
import sqlite3
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from .expand import impact_lookup
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _edge_rows(
|
|
15
|
+
conn: sqlite3.Connection,
|
|
16
|
+
*,
|
|
17
|
+
target: str | None,
|
|
18
|
+
depth: int,
|
|
19
|
+
direction: str,
|
|
20
|
+
limit: int,
|
|
21
|
+
) -> list[sqlite3.Row]:
|
|
22
|
+
params: list[Any] = []
|
|
23
|
+
where = "WHERE e.resolved = 1"
|
|
24
|
+
if target:
|
|
25
|
+
impact = impact_lookup(conn, target, depth=depth, direction=direction)
|
|
26
|
+
paths = set(impact.files)
|
|
27
|
+
if "/" in target or "\\" in target:
|
|
28
|
+
paths.add(target.replace("\\", "/"))
|
|
29
|
+
if paths:
|
|
30
|
+
placeholders = ",".join("?" for _ in paths)
|
|
31
|
+
where += (
|
|
32
|
+
f" AND (src_file.path IN ({placeholders}) "
|
|
33
|
+
f"OR src_sym_file.path IN ({placeholders}) "
|
|
34
|
+
f"OR dst_file.path IN ({placeholders}) "
|
|
35
|
+
f"OR dst_sym_file.path IN ({placeholders}))"
|
|
36
|
+
)
|
|
37
|
+
ordered = sorted(paths)
|
|
38
|
+
params.extend(ordered)
|
|
39
|
+
params.extend(ordered)
|
|
40
|
+
params.extend(ordered)
|
|
41
|
+
params.extend(ordered)
|
|
42
|
+
|
|
43
|
+
params.append(limit)
|
|
44
|
+
return conn.execute(
|
|
45
|
+
f"""
|
|
46
|
+
SELECT e.edge_type, e.resolved, e.line, e.dst_name, e.confidence,
|
|
47
|
+
e.src_kind, e.dst_kind,
|
|
48
|
+
src_file.path AS src_file_path,
|
|
49
|
+
src_sym_file.path AS src_symbol_file_path,
|
|
50
|
+
src_sym.name AS src_symbol_name,
|
|
51
|
+
src_sym.kind AS src_symbol_kind,
|
|
52
|
+
dst_file.path AS dst_file_path,
|
|
53
|
+
dst_sym.name AS dst_symbol_name,
|
|
54
|
+
dst_sym.kind AS dst_symbol_kind,
|
|
55
|
+
dst_sym_file.path AS dst_symbol_file_path
|
|
56
|
+
FROM edges e
|
|
57
|
+
LEFT JOIN files src_file ON e.src_kind = 'file' AND src_file.id = e.src_id
|
|
58
|
+
LEFT JOIN symbols src_sym ON e.src_kind = 'symbol' AND src_sym.id = e.src_id
|
|
59
|
+
LEFT JOIN files src_sym_file ON src_sym_file.id = src_sym.file_id
|
|
60
|
+
LEFT JOIN files dst_file ON e.dst_kind = 'file' AND dst_file.id = e.dst_id
|
|
61
|
+
LEFT JOIN symbols dst_sym ON e.dst_kind = 'symbol' AND dst_sym.id = e.dst_id
|
|
62
|
+
LEFT JOIN files dst_sym_file ON dst_sym_file.id = dst_sym.file_id
|
|
63
|
+
{where}
|
|
64
|
+
ORDER BY e.edge_type, COALESCE(src_file.path, src_sym_file.path), e.line
|
|
65
|
+
LIMIT ?
|
|
66
|
+
""",
|
|
67
|
+
params,
|
|
68
|
+
).fetchall()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _node_key(kind: str, path: str, name: str | None = None) -> str:
|
|
72
|
+
return f"{kind}:{path}:{name or ''}"
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _graph_data(rows: list[sqlite3.Row]) -> dict[str, Any]:
    """Convert joined edge rows into ``{"nodes": [...], "edges": [...]}``.

    Rows lacking a source or destination path are dropped. Nodes are
    de-duplicated by key and annotated with a community id (-1 when none is
    assigned) and a weighted degree (0 when absent). Both figures are computed
    on an undirected adjacency built from the kept rows, ignoring self-loops.
    """
    from collections import Counter, defaultdict

    from .analysis import detect_communities, weighted_degree

    node_index: dict[str, dict[str, Any]] = {}
    edge_list: list[dict[str, Any]] = []
    adjacency: dict[str, Counter] = defaultdict(Counter)

    for row in rows:
        src_path = row["src_file_path"] or row["src_symbol_file_path"] or ""
        src_name = row["src_symbol_name"]
        src_kind = row["src_symbol_kind"] or "file"
        dst_path = row["dst_file_path"] or row["dst_symbol_file_path"] or ""
        dst_name = row["dst_symbol_name"]
        dst_kind = row["dst_symbol_kind"] or row["dst_kind"] or "file"
        if not (src_path and dst_path):
            continue

        endpoint_keys = []
        for path, name, kind in (
            (src_path, src_name, src_kind),
            (dst_path, dst_name, dst_kind),
        ):
            key = _node_key("symbol" if name else "file", path, name)
            if key not in node_index:
                node_index[key] = {"id": key, "path": path, "name": name, "kind": kind}
            endpoint_keys.append(key)
        src_key, dst_key = endpoint_keys

        has_confidence = "confidence" in row.keys()
        edge_list.append(
            {
                "source": src_key,
                "target": dst_key,
                "type": row["edge_type"],
                "line": row["line"],
                "confidence": row["confidence"] if has_confidence else "extracted",
            }
        )
        if src_key != dst_key:
            adjacency[src_key][dst_key] += 1
            adjacency[dst_key][src_key] += 1

    # Colour (community) and size (degree) are derived from the displayed
    # subgraph only; the analysis helpers are called with string node keys.
    communities = detect_communities(adjacency)
    degree = weighted_degree(adjacency)
    for key, node in node_index.items():
        node["community"] = communities.get(key, -1)
        node["degree"] = degree.get(key, 0)
    return {"nodes": list(node_index.values()), "edges": edge_list}
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _layout(nodes: list[dict[str, Any]], width: int = 1200, height: int = 760) -> None:
|
|
127
|
+
radius = min(width, height) * 0.38
|
|
128
|
+
cx = width / 2
|
|
129
|
+
cy = height / 2
|
|
130
|
+
count = max(1, len(nodes))
|
|
131
|
+
for idx, node in enumerate(nodes):
|
|
132
|
+
angle = 2 * math.pi * idx / count
|
|
133
|
+
node["x"] = round(cx + radius * math.cos(angle), 2)
|
|
134
|
+
node["y"] = round(cy + radius * math.sin(angle), 2)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def export_graph_html(
    conn: sqlite3.Connection,
    output: Path,
    *,
    target: str | None = None,
    depth: int = 2,
    direction: str = "both",
    limit: int = 500,
) -> dict[str, int]:
    """Write a self-contained HTML page visualising the resolved edge graph.

    The graph is embedded as JSON and rendered by inline JavaScript: an SVG
    with nodes on a circle, plus an edge table and a text filter.

    Args:
        conn: Open connection to the index database.
        output: Destination file; missing parent directories are created.
        target: Optional file path or symbol name to focus the graph on.
        depth: Hop limit for the impact walk when ``target`` is given.
        direction: 'up', 'down' or 'both', forwarded to the impact walk.
        limit: Maximum number of edge rows to fetch.

    Returns:
        ``{"nodes": n, "edges": m}`` counts of what was embedded.
    """
    from html import escape

    rows = _edge_rows(conn, target=target, depth=depth, direction=direction, limit=limit)
    data = _graph_data(rows)
    _layout(data["nodes"])
    # Rewrite "</" so text inside the JSON cannot close the <script> element early.
    payload = json.dumps(data).replace("</", "<\\/")
    # `target` is caller-supplied text; escape it so it cannot inject markup
    # into the <title> element.
    title = escape("codebase-index graph" + (f" - {target}" if target else ""))
    html = f"""<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{title}</title>
<style>
  body {{ margin:0; font:14px system-ui, Segoe UI, sans-serif; color:#1f2937; background:#f8fafc; }}
  header {{ padding:14px 18px; border-bottom:1px solid #d1d5db; background:#fff; display:flex; gap:14px; align-items:center; }}
  h1 {{ font-size:18px; margin:0; font-weight:650; }}
  input {{ width:320px; max-width:40vw; padding:8px 10px; border:1px solid #cbd5e1; border-radius:6px; }}
  main {{ display:grid; grid-template-columns:minmax(0,1fr) 420px; min-height:calc(100vh - 58px); }}
  svg {{ width:100%; height:calc(100vh - 58px); background:#fff; }}
  aside {{ border-left:1px solid #d1d5db; overflow:auto; background:#f8fafc; }}
  table {{ width:100%; border-collapse:collapse; font-size:12px; }}
  th,td {{ text-align:left; padding:8px; border-bottom:1px solid #e5e7eb; vertical-align:top; }}
  th {{ position:sticky; top:0; background:#f1f5f9; z-index:1; }}
  .edge {{ stroke:#94a3b8; stroke-width:1.3; }}
  .edge.inferred {{ stroke-dasharray:5 3; }} /* heuristic-resolved */
  .edge.ambiguous {{ stroke:#ef4444; stroke-dasharray:2 3; }} /* unresolved target */
  .node {{ cursor:pointer; }}
  .node circle {{ stroke:#1f2937; stroke-width:1.5; }}
  .node.file circle {{ stroke-width:2.5; }}
  .node text {{ font-size:11px; fill:#111827; }}
  .dim {{ opacity:.12; }}
  .selected circle {{ stroke:#111827; stroke-width:3; }}
  .legend {{ font-size:11px; color:#475569; display:flex; gap:14px; align-items:center; flex-wrap:wrap; }}
  .legend b {{ color:#1f2937; }}
  .legend svg {{ width:34px; height:8px; vertical-align:middle; }}
</style>
</head>
<body>
<header>
  <h1>codebase-index graph</h1>
  <input id="filter" placeholder="Filter nodes or edges">
  <span id="counts"></span>
  <span class="legend">
    <span><b>colour</b> = module</span>
    <span><b>size</b> = connectivity</span>
    <span><svg><line x1="0" y1="4" x2="34" y2="4" stroke="#94a3b8" stroke-width="1.3"/></svg> extracted</span>
    <span><svg><line x1="0" y1="4" x2="34" y2="4" stroke="#94a3b8" stroke-width="1.3" stroke-dasharray="5 3"/></svg> inferred</span>
    <span><svg><line x1="0" y1="4" x2="34" y2="4" stroke="#ef4444" stroke-width="1.3" stroke-dasharray="2 3"/></svg> ambiguous</span>
  </span>
</header>
<main>
  <svg id="graph" viewBox="0 0 1200 760" role="img" aria-label="code graph"></svg>
  <aside>
    <table>
      <thead><tr><th>type</th><th>source</th><th>target</th><th>line</th></tr></thead>
      <tbody id="edgeRows"></tbody>
    </table>
  </aside>
</main>
<script id="graph-data" type="application/json">{payload}</script>
<script>
const data = JSON.parse(document.getElementById('graph-data').textContent);
const svg = document.getElementById('graph');
const rows = document.getElementById('edgeRows');
const counts = document.getElementById('counts');
const byId = new Map(data.nodes.map(n => [n.id, n]));
// Stable, readable categorical palette; community id indexes into it.
const PALETTE = ['#2563eb','#059669','#d97706','#7c3aed','#db2777','#0891b2',
                 '#65a30d','#dc2626','#4f46e5','#ca8a04','#0d9488','#9333ea'];
function colorFor(n) {{
  const c = n.community;
  if (c === undefined || c < 0) return '#cbd5e1';
  return PALETTE[c % PALETTE.length];
}}
function radiusFor(n) {{ return (n.name ? 8 : 11) + Math.min(14, Math.sqrt(n.degree || 0) * 2); }}
function label(n) {{ return n.name ? `${{n.name}} (${{n.path}})` : n.path; }}
function draw(filter = '') {{
  svg.textContent = '';
  rows.textContent = '';
  const q = filter.toLowerCase();
  const visibleNode = n => !q || label(n).toLowerCase().includes(q);
  const visibleEdge = e => {{
    const s = byId.get(e.source), t = byId.get(e.target);
    return !q || e.type.toLowerCase().includes(q) || label(s).toLowerCase().includes(q) || label(t).toLowerCase().includes(q);
  }};
  for (const e of data.edges.filter(visibleEdge)) {{
    const s = byId.get(e.source), t = byId.get(e.target);
    const line = document.createElementNS('http://www.w3.org/2000/svg', 'line');
    line.setAttribute('x1', s.x); line.setAttribute('y1', s.y);
    line.setAttribute('x2', t.x); line.setAttribute('y2', t.y);
    line.setAttribute('class', 'edge ' + (e.confidence || 'extracted'));
    svg.appendChild(line);
    const tr = document.createElement('tr');
    for (const val of [e.type, label(s), label(t), e.line || '']) {{
      const td = document.createElement('td'); td.textContent = val; tr.appendChild(td);
    }}
    rows.appendChild(tr);
  }}
  for (const n of data.nodes.filter(visibleNode)) {{
    const g = document.createElementNS('http://www.w3.org/2000/svg', 'g');
    g.setAttribute('class', `node ${{n.name ? 'symbol' : 'file'}}`);
    g.setAttribute('transform', `translate(${{n.x}},${{n.y}})`);
    const c = document.createElementNS('http://www.w3.org/2000/svg', 'circle');
    c.setAttribute('r', radiusFor(n));
    c.setAttribute('fill', colorFor(n));
    c.setAttribute('fill-opacity', '0.85');
    const txt = document.createElementNS('http://www.w3.org/2000/svg', 'text');
    txt.setAttribute('x', 18); txt.setAttribute('y', 4);
    txt.textContent = n.name || n.path.split('/').pop();
    g.appendChild(c); g.appendChild(txt);
    g.addEventListener('click', () => document.getElementById('filter').value = n.name || n.path);
    svg.appendChild(g);
  }}
  counts.textContent = `${{data.nodes.length}} nodes / ${{data.edges.length}} edges`;
}}
document.getElementById('filter').addEventListener('input', e => draw(e.target.value));
draw();
</script>
</body>
</html>
"""
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(html, encoding="utf-8")
    return {"nodes": len(data["nodes"]), "edges": len(data["edges"])}
|
|
269
|
+
|
|
270
|
+
|
|
271
|
+
# ---------------------------------------------------------------------------
|
|
272
|
+
# Interop exports — GraphML (Gephi/yEd), DOT (Graphviz), Cypher (Neo4j).
|
|
273
|
+
# All reuse _edge_rows + _graph_data, so they carry the same community/degree/
|
|
274
|
+
# confidence enrichment as the HTML view. Pure-stdlib, zero dependencies.
|
|
275
|
+
# ---------------------------------------------------------------------------
|
|
276
|
+
|
|
277
|
+
def _collect(conn, *, target, depth, direction, limit) -> dict[str, Any]:
    """Fetch the edge rows for the given filters and convert them to graph data."""
    rows = _edge_rows(conn, target=target, depth=depth, direction=direction, limit=limit)
    return _graph_data(rows)
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
def _write(output: Path, text: str) -> None:
|
|
282
|
+
output.parent.mkdir(parents=True, exist_ok=True)
|
|
283
|
+
output.write_text(text, encoding="utf-8")
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def export_graph_graphml(
    conn: sqlite3.Connection, output: Path, *,
    target: str | None = None, depth: int = 2, direction: str = "both", limit: int = 500,
) -> dict[str, int]:
    """Export the graph as GraphML (Gephi / yEd / NetworkX).

    Nodes get dense ids (n0, n1, …) and carry kind, name, path, community and
    degree; edges carry edge_type and confidence. Returns the node and edge
    counts of the collected graph.
    """
    from xml.sax.saxutils import escape, quoteattr

    data = _collect(conn, target=target, depth=depth, direction=direction, limit=limit)
    nodes, edges = data["nodes"], data["edges"]
    dense = {node["id"]: f"n{pos}" for pos, node in enumerate(nodes)}

    node_attrs = (
        ("kind", "string"),
        ("name", "string"),
        ("path", "string"),
        ("community", "long"),
        ("degree", "long"),
    )
    out = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<graphml xmlns="http://graphml.graphdrawing.org/xmlns">',
    ]
    out.extend(
        f' <key id="{key}" for="node" attr.name="{key}" attr.type="{typ}"/>'
        for key, typ in node_attrs
    )
    out.extend(
        f' <key id="{key}" for="edge" attr.name="{key}" attr.type="string"/>'
        for key in ("edge_type", "confidence")
    )
    out.append(' <graph edgedefault="directed">')

    for node in nodes:
        out += [
            f' <node id={quoteattr(dense[node["id"]])}>',
            f' <data key="kind">{escape(node.get("kind") or "")}</data>',
            f' <data key="name">{escape(node.get("name") or "")}</data>',
            f' <data key="path">{escape(node.get("path") or "")}</data>',
            f' <data key="community">{int(node.get("community", -1))}</data>',
            f' <data key="degree">{int(node.get("degree", 0))}</data>',
            " </node>",
        ]

    for pos, edge in enumerate(edges):
        src = dense.get(edge["source"])
        dst = dense.get(edge["target"])
        if src is None or dst is None:
            continue
        out += [
            f' <edge id="e{pos}" source={quoteattr(src)} target={quoteattr(dst)}>',
            f' <data key="edge_type">{escape(edge["type"])}</data>',
            f' <data key="confidence">{escape(edge.get("confidence") or "")}</data>',
            " </edge>",
        ]

    out += [" </graph>", "</graphml>", ""]
    _write(output, "\n".join(out))
    return {"nodes": len(nodes), "edges": len(edges)}
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
def export_graph_dot(
    conn: sqlite3.Connection, output: Path, *,
    target: str | None = None, depth: int = 2, direction: str = "both", limit: int = 500,
) -> dict[str, int]:
    """Export the graph as Graphviz DOT.

    Edge style reflects confidence: extracted -> solid, inferred -> dashed,
    ambiguous -> dotted; anything else falls back to solid. Returns the node
    and edge counts of the collected graph.
    """
    data = _collect(conn, target=target, depth=depth, direction=direction, limit=limit)
    nodes, edges = data["nodes"], data["edges"]
    dense = {node["id"]: f"n{pos}" for pos, node in enumerate(nodes)}
    style = {"extracted": "solid", "inferred": "dashed", "ambiguous": "dotted"}

    def esc(s: str) -> str:
        # Backslashes first, so the ones added for quotes are not doubled.
        doubled = s.replace("\\", "\\\\")
        return doubled.replace('"', '\\"')

    out = ["digraph codebase_index {", " rankdir=LR;", ' node [shape=box, fontsize=10];']

    for node in nodes:
        if node.get("name"):
            text = f'{node["name"]}\n{node["path"]}'
        else:
            text = node.get("path") or ""
        out.append(f' {dense[node["id"]]} [label="{esc(text)}"];')

    for edge in edges:
        src = dense.get(edge["source"])
        dst = dense.get(edge["target"])
        if src is None or dst is None:
            continue
        line_style = style.get(edge.get("confidence") or "extracted", "solid")
        out.append(f' {src} -> {dst} [label="{esc(edge["type"])}", style={line_style}];')

    out += ["}", ""]
    _write(output, "\n".join(out))
    return {"nodes": len(nodes), "edges": len(edges)}
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
def export_graph_neo4j(
    conn: sqlite3.Connection, output: Path, *,
    target: str | None = None, depth: int = 2, direction: str = "both", limit: int = 500,
) -> dict[str, int]:
    """Export the graph as a Cypher script of MERGE statements (Neo4j / FalkorDB).

    Nodes with a name are labelled ``Symbol``, the rest ``File``. Relationship
    types are the upper-cased edge types ('EDGE' when the type is empty).
    Returns the node and edge counts of the collected graph.
    """
    data = _collect(conn, target=target, depth=depth, direction=direction, limit=limit)
    nodes, edges = data["nodes"], data["edges"]

    def lit(s: str) -> str:
        # Single-quoted literal; backslashes are escaped before quotes.
        body = (s or "").replace("\\", "\\\\").replace("'", "\\'")
        return "'" + body + "'"

    out = ["// codebase-index graph export for Neo4j / FalkorDB"]

    for node in nodes:
        node_label = "Symbol" if node.get("name") else "File"
        props = ", ".join(
            (
                f"key:{lit(node['id'])}",
                f"name:{lit(node.get('name') or '')}",
                f"path:{lit(node.get('path') or '')}",
                f"community:{int(node.get('community', -1))}",
                f"degree:{int(node.get('degree', 0))}",
            )
        )
        out.append(f"MERGE (:{node_label} {{{props}}});")

    for edge in edges:
        rel = (edge["type"] or "edge").upper()
        src = lit(edge["source"])
        dst = lit(edge["target"])
        conf = lit(edge.get("confidence") or "extracted")
        out.append(
            f"MATCH (a {{key:{src}}}), (b {{key:{dst}}}) "
            f"MERGE (a)-[:{rel} {{confidence:{conf}}}]->(b);"
        )

    out.append("")
    _write(output, "\n".join(out))
    return {"nodes": len(nodes), "edges": len(edges)}
|