codebase-index 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. codebase_index/__init__.py +7 -0
  2. codebase_index/__main__.py +3 -0
  3. codebase_index/cli.py +916 -0
  4. codebase_index/config.py +110 -0
  5. codebase_index/discovery/__init__.py +10 -0
  6. codebase_index/discovery/classify.py +151 -0
  7. codebase_index/discovery/ignore.py +58 -0
  8. codebase_index/discovery/walker.py +75 -0
  9. codebase_index/doctor.py +138 -0
  10. codebase_index/embeddings/__init__.py +2 -0
  11. codebase_index/embeddings/backend.py +67 -0
  12. codebase_index/embeddings/external.py +56 -0
  13. codebase_index/embeddings/local.py +41 -0
  14. codebase_index/embeddings/noop.py +15 -0
  15. codebase_index/graph/__init__.py +8 -0
  16. codebase_index/graph/analysis.py +468 -0
  17. codebase_index/graph/builder.py +160 -0
  18. codebase_index/graph/expand.py +136 -0
  19. codebase_index/graph/export.py +381 -0
  20. codebase_index/graph/navigate.py +201 -0
  21. codebase_index/indexer/__init__.py +8 -0
  22. codebase_index/indexer/doc_chunks.py +202 -0
  23. codebase_index/indexer/freshness.py +109 -0
  24. codebase_index/indexer/pipeline.py +423 -0
  25. codebase_index/mcp/__init__.py +2 -0
  26. codebase_index/mcp/server.py +354 -0
  27. codebase_index/models.py +145 -0
  28. codebase_index/output/__init__.py +6 -0
  29. codebase_index/output/json.py +13 -0
  30. codebase_index/output/markdown.py +316 -0
  31. codebase_index/output/redact.py +31 -0
  32. codebase_index/parsers/__init__.py +9 -0
  33. codebase_index/parsers/base.py +47 -0
  34. codebase_index/parsers/languages.py +290 -0
  35. codebase_index/parsers/line_chunker.py +39 -0
  36. codebase_index/parsers/symbol_chunks.py +62 -0
  37. codebase_index/parsers/treesitter.py +439 -0
  38. codebase_index/retrieval/__init__.py +9 -0
  39. codebase_index/retrieval/budget.py +82 -0
  40. codebase_index/retrieval/fusion.py +62 -0
  41. codebase_index/retrieval/intent.py +56 -0
  42. codebase_index/retrieval/pipeline.py +207 -0
  43. codebase_index/retrieval/rerank.py +69 -0
  44. codebase_index/retrieval/searchers.py +291 -0
  45. codebase_index/retrieval/skeleton.py +251 -0
  46. codebase_index/retrieval/types.py +79 -0
  47. codebase_index/scaffold.py +399 -0
  48. codebase_index/service.py +158 -0
  49. codebase_index/skill_template/SKILL.md +198 -0
  50. codebase_index/skill_template/examples/hooks/settings.json +16 -0
  51. codebase_index/skill_template/scripts/cbx +25 -0
  52. codebase_index/skill_template/scripts/cbx.ps1 +25 -0
  53. codebase_index/skill_update.py +150 -0
  54. codebase_index/storage/__init__.py +8 -0
  55. codebase_index/storage/db.py +116 -0
  56. codebase_index/storage/repo.py +701 -0
  57. codebase_index/storage/schema.sql +125 -0
  58. codebase_index/watch/__init__.py +5 -0
  59. codebase_index/watch/watcher.py +93 -0
  60. codebase_index-1.6.0.dist-info/METADATA +748 -0
  61. codebase_index-1.6.0.dist-info/RECORD +64 -0
  62. codebase_index-1.6.0.dist-info/WHEEL +4 -0
  63. codebase_index-1.6.0.dist-info/entry_points.txt +4 -0
  64. codebase_index-1.6.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,160 @@
1
+ """Global graph pass: resolve unresolved edges against the whole repo and
2
+ denormalize symbol degrees.
3
+
4
+ Runs once after all files are indexed (it needs the complete symbol/file tables).
5
+ Symbol-target edges (call/reference/extends/implements) resolve only on an
6
+ UNAMBIGUOUS name match — if two definitions share a name, the edge is left
7
+ unresolved rather than guessed. Import edges resolve their module path to a file
8
+ by POSIX path-suffix match (e.g. 'auth.token' -> '%/auth/token.py').
9
+
10
+ The pass is batched: one query for globally-unique symbol names, one for file
11
+ paths (expanded into an in-memory suffix map), one executemany for the updates.
12
+ The per-edge variant did an indexed lookup per symbol edge and up to ~20
13
+ full-table LIKE scans per import edge, which dominated large builds.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import sqlite3
19
+ from typing import Optional
20
+
21
+ from ..storage import repo
22
+
23
+ _SYMBOL_EDGE_TYPES = {"call", "reference", "extends", "implements"}
24
+
25
+
26
def build_graph(conn: sqlite3.Connection) -> dict[str, int]:
    """Run the global graph pass over an already-indexed repository.

    Steps, in order: resolve unresolved edges, recompute the denormalized
    degrees, mark whatever is still unresolved as ambiguous, then refresh the
    architecture analysis (best-effort).

    Args:
        conn: Open connection to the index database.

    Returns:
        ``{"resolved": n, "unresolved": m}`` where ``n`` is the number of edges
        resolved by this pass and ``m`` the number still unresolved afterwards.
    """
    resolved = resolve_edges(conn)
    repo.recompute_degrees(conn)
    # Everything still unresolved that names a target is, by definition, a target we
    # could not pin to a unique node — record it as 'ambiguous' for the honesty trail.
    repo.mark_ambiguous_edges(conn)
    total_unresolved = len(repo.unresolved_edges(conn))
    # Architecture analytics (communities / god nodes / surprising bridges) are a
    # derived view of the graph. Compute once per build and cache the JSON in meta so
    # the `architecture` command and the HTML export read it instantly. Never let an
    # analysis failure fail the build — the graph itself is already written.
    try:
        from . import analysis

        analysis.refresh_analysis(conn)
    except Exception:  # pragma: no cover - defensive; analytics are best-effort
        # Still non-fatal, but no longer invisible: a silently skipped refresh
        # would leave stale or missing analytics with no hint as to why.
        import logging

        logging.getLogger(__name__).warning(
            "architecture analysis refresh failed; continuing without it",
            exc_info=True,
        )
    return {"resolved": resolved, "unresolved": total_unresolved}
44
+
45
+
46
def resolve_edges(conn: sqlite3.Connection) -> int:
    """Resolve as many unresolved edges as possible in one batched pass.

    Import edges are matched to a file through the path-suffix map and tagged
    'inferred' (a best-effort heuristic). Symbol edges (see
    ``_SYMBOL_EDGE_TYPES``) are matched through the unique-symbol-name lookup
    and tagged 'extracted'. Edges of any other type are left alone.

    Returns:
        The number of edges handed to ``repo.resolve_edges_bulk``.
    """
    pending = repo.unresolved_edges(conn)
    if not pending:
        return 0

    symbol_ids = repo.unique_symbol_ids_by_name(conn)
    suffix_to_file = _path_suffix_map(repo.all_file_ids_with_paths(conn))

    # Each entry is (dst_kind, dst_id, edge_id, confidence).
    updates: list[tuple[str, int, int, str]] = []
    for edge in pending:
        target_name = edge["dst_name"]
        edge_type = edge["edge_type"]
        if edge_type == "import":
            target_id = _module_to_file_id(suffix_to_file, target_name, lang=edge["lang"])
            dst_kind, confidence = "file", "inferred"
        elif edge_type in _SYMBOL_EDGE_TYPES:
            target_id = symbol_ids.get(target_name)
            dst_kind, confidence = "symbol", "extracted"
        else:
            continue
        if target_id is None:
            continue
        updates.append((dst_kind, target_id, edge["id"], confidence))

    repo.resolve_edges_bulk(conn, updates)
    return len(updates)
71
+
72
+
73
+ def _path_suffix_map(rows: list[sqlite3.Row]) -> dict[str, Optional[int]]:
74
+ """Map every '/'-aligned path suffix to its file id, or None when ambiguous.
75
+
76
+ Mirrors files_with_suffix(path = suffix OR path LIKE '%/suffix') semantics:
77
+ a suffix shared by several files resolves to None (like a multi-row result),
78
+ and matching is case-insensitive the way SQLite LIKE folds ASCII.
79
+ """
80
+ mapping: dict[str, Optional[int]] = {}
81
+ for row in rows:
82
+ parts = row["path"].lower().split("/")
83
+ for i in range(len(parts)):
84
+ suffix = "/".join(parts[i:])
85
+ mapping[suffix] = None if suffix in mapping else int(row["id"])
86
+ return mapping
87
+
88
+
89
+ def _lang_suffixes(lang: Optional[str], base: str, rust_base: str, go_pkg: str) -> list[str]:
90
+ """Import-path suffixes specific to one language, most-specific first."""
91
+ return {
92
+ "python": [f"{base}.py", f"{base}/__init__.py"],
93
+ "typescript": [f"{base}.ts", f"{base}.tsx", f"{base}/index.ts", f"{base}/index.tsx"],
94
+ "javascript": [f"{base}.js", f"{base}/index.js"],
95
+ "java": [f"{base}.java"],
96
+ "kotlin": [f"{base}.kt"],
97
+ "go": [f"{go_pkg}.go"],
98
+ "rust": [
99
+ f"{rust_base}.rs", f"{rust_base}/mod.rs",
100
+ f"src/{rust_base}.rs", f"src/{rust_base}/mod.rs",
101
+ ],
102
+ "csharp": [f"{base}.cs"],
103
+ "ruby": [f"{base}.rb"],
104
+ "php": [f"{base}.php"],
105
+ }.get(lang or "", [])
106
+
107
+
108
def _module_to_file_id(
    suffix_map: dict[str, Optional[int]], module: str, lang: Optional[str] = None
) -> Optional[int]:
    """Map a module/import path to the id of the single file it names, or None.

    Candidate path suffixes are tried in order and the first one with a
    non-None entry in ``suffix_map`` wins. The suffixes for the importing
    file's ``lang`` come first, followed by a fixed language-agnostic order
    (Python, TypeScript/JavaScript, Java/Kotlin/Scala, Go, Rust, C#, Ruby, PHP).
    A suffix that is missing or maps to None (ambiguous) is skipped.
    """
    lowered = module.lower()
    base = lowered.replace(".", "/").strip("/")
    rust_base = lowered.replace("::", "/").strip("/")
    if not base:
        return None
    # Go resolves on the last path segment; with no '/' that is the whole base.
    go_pkg = base.rsplit("/", 1)[-1]

    python_like = [f"{base}.py", f"{base}/__init__.py"]
    script_like = [
        f"{base}.ts",
        f"{base}.tsx",
        f"{base}.js",
        f"{base}/index.ts",
        f"{base}/index.tsx",
        f"{base}/index.js",
    ]
    # The dot-to-slash mapping in `base` already suits JVM package paths.
    jvm_like = [f"{base}.java", f"{base}.kt", f"{base}.scala"]
    go_like = [f"{go_pkg}.go"]
    # Rust uses the '::'-derived base, also tried under src/.
    rust_like = [
        f"{rust_base}.rs",
        f"{rust_base}/mod.rs",
        f"src/{rust_base}.rs",
        f"src/{rust_base}/mod.rs",
    ]
    remaining = [f"{base}.cs", f"{base}.rb", f"{base}.php"]

    candidates = [
        *_lang_suffixes(lang, base, rust_base, go_pkg),
        *python_like,
        *script_like,
        *jvm_like,
        *go_like,
        *rust_like,
        *remaining,
    ]
    hits = (suffix_map.get(candidate) for candidate in candidates)
    return next((file_id for file_id in hits if file_id is not None), None)
@@ -0,0 +1,136 @@
1
+ """Impact analysis: bounded BFS over the resolved edge graph.
2
+
3
+ Direction semantics:
4
+ up -> dependents (who is affected if the target changes): incoming edges.
5
+ down -> dependencies (what the target relies on): outgoing edges.
6
+ both -> union of the two.
7
+
8
+ Target resolution: an exact file path -> a file node (seeded together with all
9
+ symbols defined in that file, so importers AND subclassers surface). Otherwise a
10
+ symbol name -> all symbol nodes with that name. A path suffix is the last resort.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import sqlite3
16
+ from collections import deque
17
+ from typing import Optional
18
+
19
+ from ..models import GraphCoverage, ImpactNode, ImpactResponse, IndexFreshness
20
+ from ..storage import repo
21
+
22
+
23
def _freshness(conn: sqlite3.Connection) -> IndexFreshness:
    """Build the IndexFreshness record from the 'built_at' and 'head_commit' meta keys.

    The record is always reported as existing and not stale.
    """
    built_at = repo.get_meta(conn, "built_at")
    head_commit = repo.get_meta(conn, "head_commit")
    return IndexFreshness(
        exists=True, stale=False, built_at=built_at, head_commit=head_commit
    )
30
+
31
+
32
def _seed_nodes(conn: sqlite3.Connection, target: str) -> list[tuple[str, int]]:
    """Turn a target string into the (kind, id) nodes a traversal starts from.

    Resolution order: an exact file path, then an exact symbol name, then a
    path suffix matching exactly one file. A file seed is always accompanied
    by every symbol defined in that file. Returns [] when nothing matches.
    """

    def file_and_symbols(file_id: int) -> list[tuple[str, int]]:
        nodes: list[tuple[str, int]] = [("file", file_id)]
        nodes.extend(
            ("symbol", int(sym["id"])) for sym in repo.symbols_in_file(conn, file_id)
        )
        return nodes

    file_row = repo.file_by_path(conn, target)
    if file_row is not None:
        return file_and_symbols(int(file_row["id"]))

    named = repo.symbols_by_name(conn, target, exact=True)
    if named:
        return [("symbol", int(row["id"])) for row in named]

    matches = repo.files_with_suffix(conn, target)
    if len(matches) != 1:
        return []
    return file_and_symbols(int(matches[0]["id"]))
51
+
52
+
53
def _neighbors(conn, kind, node_id, direction):
    """Generate (next_kind, next_id, edge_type, confidence) tuples around a node.

    'up' follows incoming edges back to their sources, 'down' follows outgoing
    edges to their destinations (skipping edges without a dst_id), and 'both'
    yields the incoming edges first, then the outgoing ones. Any other
    direction yields nothing.
    """
    follow_incoming = direction in ("up", "both")
    follow_outgoing = direction in ("down", "both")

    if follow_incoming:
        for edge in repo.incoming_edges(conn, kind, node_id):
            yield edge["src_kind"], int(edge["src_id"]), edge["edge_type"], edge["confidence"]

    if follow_outgoing:
        for edge in repo.outgoing_edges(conn, kind, node_id):
            if edge["dst_id"] is None:
                continue
            yield edge["dst_kind"], int(edge["dst_id"]), edge["edge_type"], edge["confidence"]
62
+
63
+
64
def _node_meta(conn, kind, node_id) -> Optional[ImpactNode]:
    """Load the display metadata for one graph node as an ImpactNode.

    Returns None when the file or symbol row no longer exists. The distance is
    set to 0 here; the caller overwrites it.
    """
    if kind != "file":
        # Anything that is not a file is looked up as a symbol.
        symbol_row = conn.execute(
            "SELECT s.name AS name, s.line_start AS line_start, f.path AS path "
            "FROM symbols s JOIN files f ON f.id = s.file_id WHERE s.id = ?",
            (node_id,),
        ).fetchone()
        if symbol_row is None:
            return None
        return ImpactNode(
            kind="symbol",
            path=symbol_row["path"],
            name=symbol_row["name"],
            line_start=symbol_row["line_start"],
            distance=0,
        )

    file_row = conn.execute("SELECT path FROM files WHERE id = ?", (node_id,)).fetchone()
    if file_row is None:
        return None
    return ImpactNode(kind="file", path=file_row["path"], distance=0)
79
+
80
+
81
def walk_impact(
    conn: sqlite3.Connection, target: str, *, depth: int, direction: str
) -> list[ImpactNode]:
    """Breadth-first walk from the target's seed nodes, at most ``depth`` hops out.

    Returns the reached nodes in discovery order. The seeds themselves are not
    included. Each node records its hop distance plus the type and confidence
    of the edge it was first reached through. A node whose metadata row is
    missing is skipped and not expanded further.
    """
    start = _seed_nodes(conn, target)
    if not start:
        return []

    seen: set[tuple[str, int]] = set(start)
    frontier: deque[tuple[str, int, int]] = deque(
        (seed_kind, seed_id, 0) for seed_kind, seed_id in start
    )
    reached: list[ImpactNode] = []

    while frontier:
        kind, node_id, hops = frontier.popleft()
        if hops < depth:
            next_hops = hops + 1
            for nxt_kind, nxt_id, edge_type, confidence in _neighbors(
                conn, kind, node_id, direction
            ):
                key = (nxt_kind, nxt_id)
                if key in seen:
                    continue
                seen.add(key)
                info = _node_meta(conn, nxt_kind, nxt_id)
                if info is None:
                    continue
                info.distance = next_hops
                info.via_edge = edge_type
                info.via_confidence = confidence
                reached.append(info)
                frontier.append((nxt_kind, nxt_id, next_hops))
    return reached
108
+
109
+
110
def _target_paths(conn: sqlite3.Connection, target: str) -> list[str]:
    """Return the file path(s) a target resolves to, for coverage classification.

    Uses the same order as seeding: exact file path, then exact symbol name
    (one path per matching symbol row), then a path suffix that matches
    exactly one file. Returns [] when nothing matches.
    """
    exact_file = repo.file_by_path(conn, target)
    if exact_file is not None:
        return [target]

    named = repo.symbols_by_name(conn, target, exact=True)
    if named:
        return [row["path"] for row in named]

    matches = repo.files_with_suffix(conn, target)
    if len(matches) != 1:
        return []
    return [matches[0]["path"]]
121
+
122
+
123
def impact_lookup(
    conn: sqlite3.Connection, target: str, *, depth: int, direction: str
) -> ImpactResponse:
    """Run the impact walk for ``target`` and package it as an ImpactResponse.

    ``files`` lists the distinct paths of the reached nodes, ordered by their
    smallest hop distance and then alphabetically.
    """
    reached = walk_impact(conn, target, depth=depth, direction=direction)

    # Smallest distance at which each path was reached.
    nearest: dict[str, int] = {}
    for node in reached:
        known = nearest.get(node.path)
        if known is None or node.distance < known:
            nearest[node.path] = node.distance

    ranked = sorted((distance, path) for path, distance in nearest.items())
    files = [path for _, path in ranked]

    freshness = _freshness(conn)
    coverage = GraphCoverage.for_paths(_target_paths(conn, target))
    return ImpactResponse(
        target=target,
        direction=direction,
        depth=depth,
        index=freshness,
        nodes=reached,
        files=files,
        coverage=coverage,
    )
@@ -0,0 +1,381 @@
1
+ """HTML graph export for the indexed call/import/reference graph."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import math
7
+ import sqlite3
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from .expand import impact_lookup
12
+
13
+
14
+ def _edge_rows(
15
+ conn: sqlite3.Connection,
16
+ *,
17
+ target: str | None,
18
+ depth: int,
19
+ direction: str,
20
+ limit: int,
21
+ ) -> list[sqlite3.Row]:
22
+ params: list[Any] = []
23
+ where = "WHERE e.resolved = 1"
24
+ if target:
25
+ impact = impact_lookup(conn, target, depth=depth, direction=direction)
26
+ paths = set(impact.files)
27
+ if "/" in target or "\\" in target:
28
+ paths.add(target.replace("\\", "/"))
29
+ if paths:
30
+ placeholders = ",".join("?" for _ in paths)
31
+ where += (
32
+ f" AND (src_file.path IN ({placeholders}) "
33
+ f"OR src_sym_file.path IN ({placeholders}) "
34
+ f"OR dst_file.path IN ({placeholders}) "
35
+ f"OR dst_sym_file.path IN ({placeholders}))"
36
+ )
37
+ ordered = sorted(paths)
38
+ params.extend(ordered)
39
+ params.extend(ordered)
40
+ params.extend(ordered)
41
+ params.extend(ordered)
42
+
43
+ params.append(limit)
44
+ return conn.execute(
45
+ f"""
46
+ SELECT e.edge_type, e.resolved, e.line, e.dst_name, e.confidence,
47
+ e.src_kind, e.dst_kind,
48
+ src_file.path AS src_file_path,
49
+ src_sym_file.path AS src_symbol_file_path,
50
+ src_sym.name AS src_symbol_name,
51
+ src_sym.kind AS src_symbol_kind,
52
+ dst_file.path AS dst_file_path,
53
+ dst_sym.name AS dst_symbol_name,
54
+ dst_sym.kind AS dst_symbol_kind,
55
+ dst_sym_file.path AS dst_symbol_file_path
56
+ FROM edges e
57
+ LEFT JOIN files src_file ON e.src_kind = 'file' AND src_file.id = e.src_id
58
+ LEFT JOIN symbols src_sym ON e.src_kind = 'symbol' AND src_sym.id = e.src_id
59
+ LEFT JOIN files src_sym_file ON src_sym_file.id = src_sym.file_id
60
+ LEFT JOIN files dst_file ON e.dst_kind = 'file' AND dst_file.id = e.dst_id
61
+ LEFT JOIN symbols dst_sym ON e.dst_kind = 'symbol' AND dst_sym.id = e.dst_id
62
+ LEFT JOIN files dst_sym_file ON dst_sym_file.id = dst_sym.file_id
63
+ {where}
64
+ ORDER BY e.edge_type, COALESCE(src_file.path, src_sym_file.path), e.line
65
+ LIMIT ?
66
+ """,
67
+ params,
68
+ ).fetchall()
69
+
70
+
71
+ def _node_key(kind: str, path: str, name: str | None = None) -> str:
72
+ return f"{kind}:{path}:{name or ''}"
73
+
74
+
75
def _graph_data(rows: list[sqlite3.Row]) -> dict[str, Any]:
    """Convert joined edge rows into a {"nodes": [...], "edges": [...]} structure.

    Rows lacking a source or destination path are dropped. Nodes are
    de-duplicated by key, and the first row to mention a node fixes its
    attributes. Every node is then given a "community" and a "degree" computed
    on the undirected adjacency of the rows kept (self-loops excluded).
    """
    from collections import Counter, defaultdict

    from .analysis import detect_communities, weighted_degree

    node_index: dict[str, dict[str, Any]] = {}
    edge_list: list[dict[str, Any]] = []
    adjacency: dict[str, Counter] = defaultdict(Counter)

    def register(path: str, name: Any, kind: str) -> str:
        """Record the node if it is new and return its key."""
        key = _node_key("symbol" if name else "file", path, name)
        if key not in node_index:
            node_index[key] = {"id": key, "path": path, "name": name, "kind": kind}
        return key

    for row in rows:
        src_path = row["src_file_path"] or row["src_symbol_file_path"] or ""
        dst_path = row["dst_file_path"] or row["dst_symbol_file_path"] or ""
        src_kind = row["src_symbol_kind"] or "file"
        dst_kind = row["dst_symbol_kind"] or row["dst_kind"] or "file"
        if not (src_path and dst_path):
            continue

        src_key = register(src_path, row["src_symbol_name"], src_kind)
        dst_key = register(dst_path, row["dst_symbol_name"], dst_kind)

        if "confidence" in row.keys():
            confidence = row["confidence"]
        else:
            confidence = "extracted"
        edge_list.append(
            {
                "source": src_key,
                "target": dst_key,
                "type": row["edge_type"],
                "line": row["line"],
                "confidence": confidence,
            }
        )
        if src_key != dst_key:
            adjacency[src_key][dst_key] += 1
            adjacency[dst_key][src_key] += 1

    # Community (colour) and degree (size) are computed on the displayed subgraph,
    # using the string node keys.
    community_of = detect_communities(adjacency)
    degree_of = weighted_degree(adjacency)
    for key, node in node_index.items():
        node["community"] = community_of.get(key, -1)
        node["degree"] = degree_of.get(key, 0)
    return {"nodes": list(node_index.values()), "edges": edge_list}
124
+
125
+
126
+ def _layout(nodes: list[dict[str, Any]], width: int = 1200, height: int = 760) -> None:
127
+ radius = min(width, height) * 0.38
128
+ cx = width / 2
129
+ cy = height / 2
130
+ count = max(1, len(nodes))
131
+ for idx, node in enumerate(nodes):
132
+ angle = 2 * math.pi * idx / count
133
+ node["x"] = round(cx + radius * math.cos(angle), 2)
134
+ node["y"] = round(cy + radius * math.sin(angle), 2)
135
+
136
+
137
def export_graph_html(
    conn: sqlite3.Connection,
    output: Path,
    *,
    target: str | None = None,
    depth: int = 2,
    direction: str = "both",
    limit: int = 500,
) -> dict[str, int]:
    """Write a self-contained HTML view of the resolved edge graph.

    Args:
        conn: Open connection to the index database.
        output: File to write; missing parent directories are created.
        target: Optional file path or symbol name to scope the graph to.
        depth: Traversal depth used when scoping by ``target``.
        direction: 'up', 'down' or 'both', used when scoping by ``target``.
        limit: Maximum number of edge rows to load.

    Returns:
        ``{"nodes": n, "edges": m}`` counts of what was written.
    """
    from html import escape

    rows = _edge_rows(conn, target=target, depth=depth, direction=direction, limit=limit)
    data = _graph_data(rows)
    _layout(data["nodes"])
    # Escape every '<' so no path or symbol name can close the data <script>
    # element or open an HTML comment inside it. "\u003c" is a plain JSON
    # string escape, so JSON.parse yields exactly the same data.
    payload = json.dumps(data).replace("<", "\\u003c")
    # `target` is caller-supplied text and must not be interpreted as markup.
    title = escape("codebase-index graph" + (f" - {target}" if target else ""))
    page = f"""<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{title}</title>
<style>
body {{ margin:0; font:14px system-ui, Segoe UI, sans-serif; color:#1f2937; background:#f8fafc; }}
header {{ padding:14px 18px; border-bottom:1px solid #d1d5db; background:#fff; display:flex; gap:14px; align-items:center; }}
h1 {{ font-size:18px; margin:0; font-weight:650; }}
input {{ width:320px; max-width:40vw; padding:8px 10px; border:1px solid #cbd5e1; border-radius:6px; }}
main {{ display:grid; grid-template-columns:minmax(0,1fr) 420px; min-height:calc(100vh - 58px); }}
svg {{ width:100%; height:calc(100vh - 58px); background:#fff; }}
aside {{ border-left:1px solid #d1d5db; overflow:auto; background:#f8fafc; }}
table {{ width:100%; border-collapse:collapse; font-size:12px; }}
th,td {{ text-align:left; padding:8px; border-bottom:1px solid #e5e7eb; vertical-align:top; }}
th {{ position:sticky; top:0; background:#f1f5f9; z-index:1; }}
.edge {{ stroke:#94a3b8; stroke-width:1.3; }}
.edge.inferred {{ stroke-dasharray:5 3; }} /* heuristic-resolved */
.edge.ambiguous {{ stroke:#ef4444; stroke-dasharray:2 3; }} /* unresolved target */
.node {{ cursor:pointer; }}
.node circle {{ stroke:#1f2937; stroke-width:1.5; }}
.node.file circle {{ stroke-width:2.5; }}
.node text {{ font-size:11px; fill:#111827; }}
.dim {{ opacity:.12; }}
.selected circle {{ stroke:#111827; stroke-width:3; }}
.legend {{ font-size:11px; color:#475569; display:flex; gap:14px; align-items:center; flex-wrap:wrap; }}
.legend b {{ color:#1f2937; }}
.legend svg {{ width:34px; height:8px; vertical-align:middle; }}
</style>
</head>
<body>
<header>
<h1>codebase-index graph</h1>
<input id="filter" placeholder="Filter nodes or edges">
<span id="counts"></span>
<span class="legend">
<span><b>colour</b> = module</span>
<span><b>size</b> = connectivity</span>
<span><svg><line x1="0" y1="4" x2="34" y2="4" stroke="#94a3b8" stroke-width="1.3"/></svg> extracted</span>
<span><svg><line x1="0" y1="4" x2="34" y2="4" stroke="#94a3b8" stroke-width="1.3" stroke-dasharray="5 3"/></svg> inferred</span>
<span><svg><line x1="0" y1="4" x2="34" y2="4" stroke="#ef4444" stroke-width="1.3" stroke-dasharray="2 3"/></svg> ambiguous</span>
</span>
</header>
<main>
<svg id="graph" viewBox="0 0 1200 760" role="img" aria-label="code graph"></svg>
<aside>
<table>
<thead><tr><th>type</th><th>source</th><th>target</th><th>line</th></tr></thead>
<tbody id="edgeRows"></tbody>
</table>
</aside>
</main>
<script id="graph-data" type="application/json">{payload}</script>
<script>
const data = JSON.parse(document.getElementById('graph-data').textContent);
const svg = document.getElementById('graph');
const rows = document.getElementById('edgeRows');
const counts = document.getElementById('counts');
const byId = new Map(data.nodes.map(n => [n.id, n]));
// Stable, readable categorical palette; community id indexes into it.
const PALETTE = ['#2563eb','#059669','#d97706','#7c3aed','#db2777','#0891b2',
'#65a30d','#dc2626','#4f46e5','#ca8a04','#0d9488','#9333ea'];
function colorFor(n) {{
  const c = n.community;
  if (c === undefined || c < 0) return '#cbd5e1';
  return PALETTE[c % PALETTE.length];
}}
function radiusFor(n) {{ return (n.name ? 8 : 11) + Math.min(14, Math.sqrt(n.degree || 0) * 2); }}
function label(n) {{ return n.name ? `${{n.name}} (${{n.path}})` : n.path; }}
function draw(filter = '') {{
  svg.textContent = '';
  rows.textContent = '';
  const q = filter.toLowerCase();
  const visibleNode = n => !q || label(n).toLowerCase().includes(q);
  const visibleEdge = e => {{
    const s = byId.get(e.source), t = byId.get(e.target);
    return !q || e.type.toLowerCase().includes(q) || label(s).toLowerCase().includes(q) || label(t).toLowerCase().includes(q);
  }};
  for (const e of data.edges.filter(visibleEdge)) {{
    const s = byId.get(e.source), t = byId.get(e.target);
    const line = document.createElementNS('http://www.w3.org/2000/svg', 'line');
    line.setAttribute('x1', s.x); line.setAttribute('y1', s.y);
    line.setAttribute('x2', t.x); line.setAttribute('y2', t.y);
    line.setAttribute('class', 'edge ' + (e.confidence || 'extracted'));
    svg.appendChild(line);
    const tr = document.createElement('tr');
    for (const val of [e.type, label(s), label(t), e.line || '']) {{
      const td = document.createElement('td'); td.textContent = val; tr.appendChild(td);
    }}
    rows.appendChild(tr);
  }}
  for (const n of data.nodes.filter(visibleNode)) {{
    const g = document.createElementNS('http://www.w3.org/2000/svg', 'g');
    g.setAttribute('class', `node ${{n.name ? 'symbol' : 'file'}}`);
    g.setAttribute('transform', `translate(${{n.x}},${{n.y}})`);
    const c = document.createElementNS('http://www.w3.org/2000/svg', 'circle');
    c.setAttribute('r', radiusFor(n));
    c.setAttribute('fill', colorFor(n));
    c.setAttribute('fill-opacity', '0.85');
    const txt = document.createElementNS('http://www.w3.org/2000/svg', 'text');
    txt.setAttribute('x', 18); txt.setAttribute('y', 4);
    txt.textContent = n.name || n.path.split('/').pop();
    g.appendChild(c); g.appendChild(txt);
    // Setting .value from script fires no 'input' event, so redraw explicitly.
    g.addEventListener('click', () => {{
      const box = document.getElementById('filter');
      box.value = n.name || n.path;
      draw(box.value);
    }});
    svg.appendChild(g);
  }}
  counts.textContent = `${{data.nodes.length}} nodes / ${{data.edges.length}} edges`;
}}
document.getElementById('filter').addEventListener('input', e => draw(e.target.value));
draw();
</script>
</body>
</html>
"""
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(page, encoding="utf-8")
    return {"nodes": len(data["nodes"]), "edges": len(data["edges"])}
269
+
270
+
271
+ # ---------------------------------------------------------------------------
272
+ # Interop exports — GraphML (Gephi/yEd), DOT (Graphviz), Cypher (Neo4j).
273
+ # All reuse _edge_rows + _graph_data, so they carry the same community/degree/
274
+ # confidence enrichment as the HTML view. Pure-stdlib, zero dependencies.
275
+ # ---------------------------------------------------------------------------
276
+
277
def _collect(conn, *, target, depth, direction, limit) -> dict[str, Any]:
    """Load the (optionally target-scoped) edge rows and turn them into graph data."""
    edge_rows = _edge_rows(
        conn, target=target, depth=depth, direction=direction, limit=limit
    )
    return _graph_data(edge_rows)
279
+
280
+
281
+ def _write(output: Path, text: str) -> None:
282
+ output.parent.mkdir(parents=True, exist_ok=True)
283
+ output.write_text(text, encoding="utf-8")
284
+
285
+
286
def export_graph_graphml(
    conn: sqlite3.Connection, output: Path, *,
    target: str | None = None, depth: int = 2, direction: str = "both", limit: int = 500,
) -> dict[str, int]:
    """Write the graph as GraphML (Gephi / yEd / NetworkX).

    Nodes get dense ids (n0, n1, …) and carry kind, name, path, community and
    degree. Edges carry edge_type and confidence. Returns the node and edge
    counts of the collected graph.
    """
    from xml.sax.saxutils import escape, quoteattr

    data = _collect(conn, target=target, depth=depth, direction=direction, limit=limit)
    dense = {node["id"]: f"n{idx}" for idx, node in enumerate(data["nodes"])}

    node_attrs = (
        ("kind", "string"),
        ("name", "string"),
        ("path", "string"),
        ("community", "long"),
        ("degree", "long"),
    )
    edge_attrs = ("edge_type", "confidence")

    out = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<graphml xmlns="http://graphml.graphdrawing.org/xmlns">',
    ]
    out.extend(
        f' <key id="{key}" for="node" attr.name="{key}" attr.type="{typ}"/>'
        for key, typ in node_attrs
    )
    out.extend(
        f' <key id="{key}" for="edge" attr.name="{key}" attr.type="string"/>'
        for key in edge_attrs
    )
    out.append(' <graph edgedefault="directed">')

    for node in data["nodes"]:
        out.extend(
            [
                f' <node id={quoteattr(dense[node["id"]])}>',
                f' <data key="kind">{escape(node.get("kind") or "")}</data>',
                f' <data key="name">{escape(node.get("name") or "")}</data>',
                f' <data key="path">{escape(node.get("path") or "")}</data>',
                f' <data key="community">{int(node.get("community", -1))}</data>',
                f' <data key="degree">{int(node.get("degree", 0))}</data>',
                " </node>",
            ]
        )

    for idx, edge in enumerate(data["edges"]):
        src = dense.get(edge["source"])
        dst = dense.get(edge["target"])
        if src is None or dst is None:
            continue
        out.extend(
            [
                f' <edge id="e{idx}" source={quoteattr(src)} target={quoteattr(dst)}>',
                f' <data key="edge_type">{escape(edge["type"])}</data>',
                f' <data key="confidence">{escape(edge.get("confidence") or "")}</data>',
                " </edge>",
            ]
        )

    out.extend([" </graph>", "</graphml>", ""])
    _write(output, "\n".join(out))
    return {"nodes": len(data["nodes"]), "edges": len(data["edges"])}
325
+
326
+
327
def export_graph_dot(
    conn: sqlite3.Connection, output: Path, *,
    target: str | None = None, depth: int = 2, direction: str = "both", limit: int = 500,
) -> dict[str, int]:
    """Write the graph as Graphviz DOT.

    Symbol nodes are labelled with name and path on separate lines; file nodes
    with their path. Edge style encodes confidence: extracted is solid,
    inferred is dashed, ambiguous is dotted, and anything else is solid.

    Returns:
        ``{"nodes": n, "edges": m}`` counts of the collected graph.
    """
    data = _collect(conn, target=target, depth=depth, direction=direction, limit=limit)
    ids = {n["id"]: f"n{i}" for i, n in enumerate(data["nodes"])}
    style = {"extracted": "solid", "inferred": "dashed", "ambiguous": "dotted"}

    def esc(s: str) -> str:
        # Backslashes first, so the escapes added afterwards are not doubled.
        # A newline becomes the DOT "\n" escape, which renders as the same line
        # break but keeps every statement on one physical line.
        return s.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")

    lines = ["digraph codebase_index {", " rankdir=LR;", ' node [shape=box, fontsize=10];']
    for n in data["nodes"]:
        lbl = esc(f'{n["name"]}\n{n["path"]}' if n.get("name") else (n.get("path") or ""))
        lines.append(f' {ids[n["id"]]} [label="{lbl}"];')
    for e in data["edges"]:
        s = ids.get(e["source"])
        t = ids.get(e["target"])
        if s is None or t is None:
            continue
        st = style.get(e.get("confidence") or "extracted", "solid")
        lines.append(f' {s} -> {t} [label="{esc(e["type"])}", style={st}];')
    lines += ["}", ""]
    _write(output, "\n".join(lines))
    return {"nodes": len(data["nodes"]), "edges": len(data["edges"])}
353
+
354
+
355
def export_graph_neo4j(
    conn: sqlite3.Connection, output: Path, *,
    target: str | None = None, depth: int = 2, direction: str = "both", limit: int = 500,
) -> dict[str, int]:
    """Write a Cypher script of MERGE statements for Neo4j / FalkorDB.

    Each node becomes a ``Symbol`` (when it has a name) or a ``File`` and is
    keyed by its graph id. Each edge becomes a relationship whose type is the
    upper-cased edge type, with the confidence as a property. Returns the node
    and edge counts of the collected graph.
    """
    data = _collect(conn, target=target, depth=depth, direction=direction, limit=limit)

    def lit(s: str) -> str:
        # Single-quoted string literal; backslashes are doubled before quotes
        # are escaped so the added backslashes stay single.
        body = (s or "").replace("\\", "\\\\").replace("'", "\\'")
        return f"'{body}'"

    script = ["// codebase-index graph export for Neo4j / FalkorDB"]

    for node in data["nodes"]:
        label = "Symbol" if node.get("name") else "File"
        props = ", ".join(
            (
                f"key:{lit(node['id'])}",
                f"name:{lit(node.get('name') or '')}",
                f"path:{lit(node.get('path') or '')}",
                f"community:{int(node.get('community', -1))}",
                f"degree:{int(node.get('degree', 0))}",
            )
        )
        script.append(f"MERGE (:{label} {{{props}}});")

    for edge in data["edges"]:
        rel_type = (edge["type"] or "edge").upper()
        src = lit(edge["source"])
        dst = lit(edge["target"])
        confidence = lit(edge.get("confidence") or "extracted")
        script.append(
            f"MATCH (a {{key:{src}}}), (b {{key:{dst}}}) "
            f"MERGE (a)-[:{rel_type} {{confidence:{confidence}}}]->(b);"
        )

    script.append("")
    _write(output, "\n".join(script))
    return {"nodes": len(data["nodes"]), "edges": len(data["edges"])}