cerebro-code-memory 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cerebro/__init__.py +3 -0
- cerebro/callgraph.py +38 -0
- cerebro/cli.py +348 -0
- cerebro/config.py +136 -0
- cerebro/db.py +245 -0
- cerebro/docaudit.py +174 -0
- cerebro/embeddings.py +175 -0
- cerebro/gitsync.py +124 -0
- cerebro/graph.py +77 -0
- cerebro/indexer.py +854 -0
- cerebro/insights.py +217 -0
- cerebro/notes.py +70 -0
- cerebro/server.py +382 -0
- cerebro/summaries.py +66 -0
- cerebro/summarizer.py +109 -0
- cerebro/tsconfig.py +159 -0
- cerebro/views.py +52 -0
- cerebro/viz.py +374 -0
- cerebro_code_memory-0.1.0.dist-info/METADATA +160 -0
- cerebro_code_memory-0.1.0.dist-info/RECORD +23 -0
- cerebro_code_memory-0.1.0.dist-info/WHEEL +4 -0
- cerebro_code_memory-0.1.0.dist-info/entry_points.txt +11 -0
- cerebro_code_memory-0.1.0.dist-info/licenses/LICENSE +21 -0
cerebro/summaries.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Cached English summaries (plan layer 2) and summary-staleness.
|
|
2
|
+
|
|
3
|
+
A summary is tied to the file version it described via `source_hash`. When the
|
|
4
|
+
file's current hash differs, the summary is flagged stale so a session knows to
|
|
5
|
+
re-read just that file instead of trusting an outdated trace.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from datetime import datetime, timezone
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string, truncated to seconds."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat(timespec="seconds")
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def record(conn, path: str, summary: str, model: str | None = None) -> dict:
    """Store (or overwrite) the summary for `path` and refresh its search entry.

    The summary is stamped with the hash currently stored in `files` for that
    path (None when there is no such row), upserted into `summaries`, mirrored
    into `fts` under kind 'summary', and the transaction is committed.

    Returns {"path": path, "indexed": bool}; "indexed" is True when a non-NULL
    hash was found in `files`.
    """
    file_row = conn.execute("SELECT hash FROM files WHERE path=?", (path,)).fetchone()
    if file_row:
        source_hash = file_row["hash"]
    else:
        source_hash = None

    upsert_params = (path, summary, model, source_hash, now_iso())
    conn.execute(
        """INSERT INTO summaries(path, summary_en, model, source_hash, updated_at)
           VALUES(?,?,?,?,?)
           ON CONFLICT(path) DO UPDATE SET
             summary_en=excluded.summary_en, model=excluded.model,
             source_hash=excluded.source_hash, updated_at=excluded.updated_at""",
        upsert_params,
    )

    # Drop any earlier summary text for this path before inserting the new one.
    conn.execute("DELETE FROM fts WHERE path=? AND kind='summary'", (path,))
    conn.execute(
        "INSERT INTO fts(path, kind, text) VALUES(?, 'summary', ?)", (path, summary)
    )
    conn.commit()

    was_indexed = source_hash is not None
    return {"path": path, "indexed": was_indexed}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get(conn, path: str, current_hash: str | None = None) -> dict | None:
    """Fetch the cached summary for `path`, or None when none is stored.

    Staleness is judged against `current_hash` when the caller supplies it
    (e.g. the live on-disk hash). Otherwise the hash stored in `files` is used,
    which only moves after a reindex. A summary is reported stale only when
    both hashes are present and differ.
    """
    summary_row = conn.execute(
        "SELECT * FROM summaries WHERE path=?", (path,)
    ).fetchone()
    if not summary_row:
        return None

    if current_hash is None:
        indexed = conn.execute(
            "SELECT hash FROM files WHERE path=?", (path,)
        ).fetchone()
        if indexed:
            current_hash = indexed["hash"]

    recorded_hash = summary_row["source_hash"]
    is_stale = bool(recorded_hash and current_hash and current_hash != recorded_hash)

    result = {"path": path}
    for column in ("summary_en", "model", "updated_at"):
        result[column] = summary_row[column]
    result["stale"] = is_stale
    return result
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def stale_summaries(conn) -> list[str]:
    """List, in path order, every summarized file whose indexed hash no longer
    matches the hash the summary was written against."""
    query = """SELECT s.path FROM summaries s JOIN files f ON f.path = s.path
               WHERE s.source_hash IS NOT NULL AND s.source_hash != f.hash
               ORDER BY s.path"""
    hits = conn.execute(query).fetchall()
    return [hit["path"] for hit in hits]
|
cerebro/summarizer.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""Batch summary generation (plan layer 2, warmed proactively).
|
|
2
|
+
|
|
3
|
+
Generates English summaries for the most central files so even a first-time query
|
|
4
|
+
is cheap, instead of waiting for sessions to fill them in lazily. Uses headless
|
|
5
|
+
`claude -p` so it needs no API key — it rides the user's existing Claude Code auth.
|
|
6
|
+
A cheap model (Haiku by default) keeps the one-time cost low.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import concurrent.futures as cf
|
|
11
|
+
import os
|
|
12
|
+
import shutil
|
|
13
|
+
import subprocess
|
|
14
|
+
|
|
15
|
+
from . import config as cfg
|
|
16
|
+
from . import db, graph, summaries
|
|
17
|
+
|
|
18
|
+
# Prompt preamble placed ahead of every file sent to the model; it asks for a
# 1-3 sentence plain-text English summary with no markup.
INSTRUCTION = (
    "You are summarizing a source file for a code-navigation index. In 1-3 dense "
    "sentences, in English, describe what this file does and its role in the system "
    "(key responsibilities, important types/functions, how it fits in). Output ONLY "
    "the summary text — no preamble, no markdown, no bullet points."
)
# Only the first MAX_CHARS characters of a file are included in the prompt.
MAX_CHARS = 16000
# Model used when neither the caller nor the CLI `--model` flag picks one.
DEFAULT_MODEL = "claude-haiku-4-5"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _claude_bin() -> str:
|
|
29
|
+
return os.environ.get("CEREBRO_CLAUDE") or shutil.which("claude") or "claude"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def summarize_one(config, rel: str, model: str) -> str | None:
    """Generate a summary for one file via `claude -p`.

    Args:
        config: object exposing `root`, the directory `rel` is relative to.
        rel: path of the file to summarize, relative to `config.root`.
        model: model name forwarded to `claude --model`.

    Returns:
        The stripped stdout of the `claude` process, or None when the file
        cannot be read, the process cannot be run or times out (180 s), it
        exits non-zero, or it prints nothing.
    """
    abs_path = config.root / rel
    try:
        # Only the leading MAX_CHARS characters are sent, to bound prompt size.
        content = abs_path.read_text(encoding="utf-8", errors="ignore")[:MAX_CHARS]
    except OSError:
        return None
    prompt = f"{INSTRUCTION}\n\nFile path: {rel}\n\n```\n{content}\n```\n"
    try:
        out = subprocess.run(
            [_claude_bin(), "-p", "--model", model],
            input=prompt,
            capture_output=True,
            text=True,
            # Pin the pipe encoding: with text=True alone the locale encoding is
            # used, which on non-UTF-8 locales either fails to encode the prompt
            # (silently skipping the file) or garbles non-ASCII characters.
            encoding="utf-8",
            errors="replace",
            timeout=180,
        )
    except Exception:
        # Best effort: a missing binary, timeout, etc. just means "no summary".
        return None
    if out.returncode != 0:
        return None
    return out.stdout.strip() or None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def select_central_missing(conn, limit: int, prefix: str | None = None) -> list[str]:
    """Top files by dependency centrality that have no summary yet.

    Args:
        conn: database connection holding the `summaries` table.
        limit: maximum number of paths to return; values <= 0 yield [].
        prefix: when given, only paths starting with this string are kept.

    Returns:
        Up to `limit` paths, in the order `graph.rank` yields them, skipping
        paths that already have a summary or that `Config.lang_for` does not
        recognise.
    """
    # Previously the limit was only checked after appending, so a limit of 0
    # (or a negative one) still returned one path.
    if limit <= 0:
        return []
    have = {r["path"] for r in conn.execute("SELECT path FROM summaries")}
    out: list[str] = []
    for path, _score in graph.rank(conn):
        if path in have or cfg.Config.lang_for(path) is None:
            continue
        if prefix and not path.startswith(prefix):
            continue
        out.append(path)
        if len(out) >= limit:
            break
    return out
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def run(config, conn, rels: list[str], model: str = DEFAULT_MODEL, workers: int = 4) -> dict:
    """Summarize files in parallel (claude -p subprocesses), then record serially
    (one sqlite writer).

    Args:
        config: passed through to `summarize_one`.
        conn: database connection handed to `summaries.record`.
        rels: repo-relative paths to summarize.
        model: model name used for generation and stored with each summary.
        workers: thread-pool size; values below 1 are treated as 1.

    Returns:
        {"requested": len(rels), "summarized": number of summaries produced}.
    """
    produced: dict[str, str] = {}
    # ThreadPoolExecutor raises ValueError for max_workers <= 0; clamp instead
    # of crashing on e.g. `--workers 0`.
    pool_size = max(1, workers)
    with cf.ThreadPoolExecutor(max_workers=pool_size) as ex:
        futs = {ex.submit(summarize_one, config, r, model): r for r in rels}
        for fut in cf.as_completed(futs):
            summary = fut.result()
            if summary:
                produced[futs[fut]] = summary
    # Writes happen on the calling thread only, after all workers finished.
    for rel, summary in produced.items():
        summaries.record(conn, rel, summary, model=model)
    return {"requested": len(rels), "summarized": len(produced)}
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def main():  # `cerebro-summarize` entry point
    """Parse CLI flags, pick the most central files lacking a summary, summarize
    them, and print a JSON report to stdout."""
    import argparse
    import json

    parser = argparse.ArgumentParser(
        description="Pre-generate Cerebro summaries via claude -p"
    )
    parser.add_argument("--limit", type=int, default=20, help="max files to summarize")
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument(
        "--prefix", default=None, help="only files under this path prefix"
    )
    parser.add_argument("--workers", type=int, default=4)
    opts = parser.parse_args()

    config = cfg.Config.load()
    conn = db.connect(config.db_path)
    targets = select_central_missing(conn, opts.limit, opts.prefix)
    if not targets:
        print(json.dumps({"summarized": 0, "note": "nothing missing in scope"}))
        return

    report = run(config, conn, targets, model=opts.model, workers=opts.workers)
    report.update(model=opts.model, root=str(config.root))
    print(json.dumps(report, indent=2))
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# Allow running this module directly as a script; it does the same thing as the
# `cerebro-summarize` entry point.
if __name__ == "__main__":
    main()
|
cerebro/tsconfig.py
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""tsconfig / jsconfig path-alias resolution.
|
|
2
|
+
|
|
3
|
+
Next.js and NestJS projects import via aliases like `@/components/Button` instead
|
|
4
|
+
of relative paths. Those are declared in `compilerOptions.paths` (relative to
|
|
5
|
+
`baseUrl`). Without expanding them, the dependency graph misses most edges in
|
|
6
|
+
alias-heavy frontends. This module parses those configs (tolerating JSONC and a
|
|
7
|
+
chain of path-like `extends`) and expands an aliased import to candidate repo-relative
|
|
8
|
+
module paths. The indexer then resolves a candidate to a real file.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import posixpath
|
|
14
|
+
import re
|
|
15
|
+
from dataclasses import dataclass
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class AliasConfig:
    """Path-alias settings taken from one tsconfig.json / jsconfig.json file.

    All directories are POSIX-style and relative to the repository root.
    """

    dir: str  # posix dir of the config, relative to root ("" == root)
    base_url: str  # posix dir that `paths` resolve from, relative to root
    patterns: dict[str, list[str]]  # e.g. {"@/*": ["./*"]}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _strip_comments(text: str) -> str:
|
|
27
|
+
"""Remove // and /* */ comments while respecting string literals, so glob
|
|
28
|
+
patterns like "@/*" and "**/*.ts" (which contain /* and */) are not mistaken
|
|
29
|
+
for comment delimiters."""
|
|
30
|
+
out = []
|
|
31
|
+
i, n = 0, len(text)
|
|
32
|
+
in_str = False
|
|
33
|
+
while i < n:
|
|
34
|
+
ch = text[i]
|
|
35
|
+
if in_str:
|
|
36
|
+
out.append(ch)
|
|
37
|
+
if ch == "\\" and i + 1 < n:
|
|
38
|
+
out.append(text[i + 1])
|
|
39
|
+
i += 2
|
|
40
|
+
continue
|
|
41
|
+
if ch == '"':
|
|
42
|
+
in_str = False
|
|
43
|
+
i += 1
|
|
44
|
+
continue
|
|
45
|
+
if ch == '"':
|
|
46
|
+
in_str = True
|
|
47
|
+
out.append(ch)
|
|
48
|
+
i += 1
|
|
49
|
+
continue
|
|
50
|
+
if ch == "/" and i + 1 < n and text[i + 1] == "/":
|
|
51
|
+
i += 2
|
|
52
|
+
while i < n and text[i] not in "\n\r":
|
|
53
|
+
i += 1
|
|
54
|
+
continue
|
|
55
|
+
if ch == "/" and i + 1 < n and text[i + 1] == "*":
|
|
56
|
+
i += 2
|
|
57
|
+
while i + 1 < n and not (text[i] == "*" and text[i + 1] == "/"):
|
|
58
|
+
i += 1
|
|
59
|
+
i += 2
|
|
60
|
+
continue
|
|
61
|
+
out.append(ch)
|
|
62
|
+
i += 1
|
|
63
|
+
return "".join(out)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _loads_jsonc(text: str):
    """Parse JSON-with-comments text (as used by tsconfig/jsconfig files).

    Comments are removed with `_strip_comments`, then trailing commas before a
    closing `}` or `]` are dropped, and the result is handed to `json.loads`.

    Raises whatever `json.loads` raises for text that is still invalid.
    """
    text = _strip_comments(text)
    # Match whole string literals first and put them back unchanged, so a comma
    # inside a string (e.g. "a,]") is never mistaken for a trailing comma.
    text = re.sub(
        r'("(?:\\.|[^"\\])*")|,(\s*[}\]])',
        lambda m: m.group(1) if m.group(1) is not None else m.group(2),
        text,
    )
    return json.loads(text)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _resolve_extends(base_dir: Path, ext: str) -> Path | None:
|
|
73
|
+
# Only follow path-like extends (./base, ../tsconfig.base.json); skip packages.
|
|
74
|
+
if not (ext.startswith(".") or "/" in ext):
|
|
75
|
+
return None
|
|
76
|
+
cand = base_dir / ext
|
|
77
|
+
if cand.suffix != ".json":
|
|
78
|
+
cand = base_dir / (ext + ".json")
|
|
79
|
+
return cand if cand.exists() else None
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _read_with_extends(abs_path: Path, seen: set) -> dict:
|
|
83
|
+
rp = abs_path.resolve()
|
|
84
|
+
if rp in seen:
|
|
85
|
+
return {}
|
|
86
|
+
seen.add(rp)
|
|
87
|
+
try:
|
|
88
|
+
data = _loads_jsonc(abs_path.read_text(encoding="utf-8", errors="ignore"))
|
|
89
|
+
except Exception:
|
|
90
|
+
return {}
|
|
91
|
+
co = data.get("compilerOptions") or {}
|
|
92
|
+
result = {"baseUrl": co.get("baseUrl"), "paths": co.get("paths")}
|
|
93
|
+
ext = data.get("extends")
|
|
94
|
+
if isinstance(ext, str) and (result["baseUrl"] is None or result["paths"] is None):
|
|
95
|
+
parent = _resolve_extends(abs_path.parent, ext)
|
|
96
|
+
if parent is not None:
|
|
97
|
+
pdata = _read_with_extends(parent, seen)
|
|
98
|
+
for key in ("baseUrl", "paths"):
|
|
99
|
+
if result[key] is None:
|
|
100
|
+
result[key] = pdata.get(key)
|
|
101
|
+
return result
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def load_alias_configs(config) -> list[AliasConfig]:
    """Scan the repo for tsconfig.json / jsconfig.json files that declare paths.

    Args:
        config: object whose `iter_files()` yields (repo-relative posix path,
            absolute path) pairs.

    Returns:
        One AliasConfig per config file that ends up with a non-empty `paths`
        mapping (its own or inherited through `extends`).
    """
    out: list[AliasConfig] = []
    for rel, abs_path in config.iter_files():
        if posixpath.basename(rel) not in ("tsconfig.json", "jsconfig.json"):
            continue
        merged = _read_with_extends(abs_path, set())
        raw_paths = merged.get("paths")
        # `paths` comes straight from user JSON: skip anything that is not a
        # non-empty object instead of crashing on `.items()`.
        if not isinstance(raw_paths, dict) or not raw_paths:
            continue
        patterns: dict[str, list[str]] = {}
        for alias, value in raw_paths.items():
            targets = value if isinstance(value, list) else [value]
            # Non-string targets cannot be substituted into a path later on.
            patterns[alias] = [t for t in targets if isinstance(t, str)]
        cfg_dir = posixpath.dirname(rel)
        base_url = merged.get("baseUrl")
        if not isinstance(base_url, str) or not base_url:
            base_url = "."
        base = posixpath.normpath(posixpath.join(cfg_dir, base_url))
        out.append(AliasConfig(dir=cfg_dir, base_url=base, patterns=patterns))
    return out
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _nearest(configs: list[AliasConfig], importer_rel: str) -> AliasConfig | None:
|
|
125
|
+
best = None
|
|
126
|
+
for c in configs:
|
|
127
|
+
prefix = (c.dir + "/") if c.dir else ""
|
|
128
|
+
if importer_rel.startswith(prefix):
|
|
129
|
+
if best is None or len(c.dir) > len(best.dir):
|
|
130
|
+
best = c
|
|
131
|
+
return best
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _match(pattern: str, name: str) -> str | None:
|
|
135
|
+
"""Return the wildcard capture if `name` matches `pattern`, else None.
|
|
136
|
+
Exact (no-`*`) patterns return '' on an exact match."""
|
|
137
|
+
if "*" in pattern:
|
|
138
|
+
pre, post = pattern.split("*", 1)
|
|
139
|
+
if name.startswith(pre) and name.endswith(post) and len(name) >= len(pre) + len(post):
|
|
140
|
+
return name[len(pre): len(name) - len(post)] if post else name[len(pre):]
|
|
141
|
+
return None
|
|
142
|
+
return "" if name == pattern else None
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def expand(import_str: str, importer_rel: str, configs: list[AliasConfig]) -> list[str]:
    """Map an aliased import to candidate repo-relative module paths.

    The config nearest to `importer_rel` supplies the patterns; every pattern
    that matches `import_str` contributes one normalized path per target, with
    the target's first `*` replaced by the captured text. Paths carry no file
    extension; the indexer resolves them against the known file set.
    """
    owner = _nearest(configs, importer_rel)
    if owner is None:
        return []
    candidates: list[str] = []
    for alias, targets in owner.patterns.items():
        captured = _match(alias, import_str)
        if captured is not None:
            for target in targets:
                # str.replace leaves a target without `*` unchanged.
                filled = target.replace("*", captured, 1)
                joined = posixpath.join(owner.base_url, filled)
                candidates.append(posixpath.normpath(joined))
    return candidates
|
cerebro/views.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""FastMCP-free renderers for the read tools.
|
|
2
|
+
|
|
3
|
+
The MCP SDK (FastMCP → pydantic/starlette/uvicorn) costs ~230ms to import and is
|
|
4
|
+
only needed to *serve* MCP. The CLI and the SessionStart hook just need the text
|
|
5
|
+
these tools produce, so the rendering logic lives here — importable without pulling
|
|
6
|
+
in the server module — and `server.py` delegates to it.
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from . import db, graph, notes, summaries
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def map_text(conn, root, top: int = 30) -> str:
    """Render the repository map as text: a header line with file and language
    counts, the `top` most central modules (each with its cached summary, if
    any, cut to 90 characters and tagged when stale), and a count of files that
    still lack a summary."""
    file_count = conn.execute("SELECT COUNT(*) AS n FROM files").fetchone()["n"]
    if file_count == 0:
        return "Index is empty. Run cerebro_reindex() first to build the map."

    lang_parts = [f"{r['lang']}:{r['n']}" for r in db.lang_counts(conn)]
    langs = ", ".join(lang_parts)
    last = conn.execute("SELECT value FROM meta WHERE key='last_reindex'").fetchone()
    last_reindex = last["value"] if last else "n/a"

    lines = [
        f"# Cerebro map — {root}",
        f"{file_count} files | {langs} | last reindex: {last_reindex}",
        "",
        f"## Top {top} modules by centrality (most depended-upon):",
    ]
    for path, score in graph.rank(conn, top=top):
        cached = summaries.get(conn, path)
        if cached:
            stale_tag = " (STALE)" if cached["stale"] else ""
            suffix = f" — {cached['summary_en'][:90]}{stale_tag}"
        else:
            suffix = ""
        lines.append(f" {score:.3f} {path}{suffix}")

    missing = conn.execute(
        "SELECT COUNT(*) AS n FROM files f "
        "LEFT JOIN summaries s ON s.path=f.path WHERE s.path IS NULL"
    ).fetchone()["n"]
    lines.extend(
        [
            "",
            f"{missing} files have no summary yet. As you learn a file, call "
            f"cerebro_record(path, summary) so future sessions skip re-reading it.",
        ]
    )
    return "\n".join(lines)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def recall_text(conn, query: str = "", limit: int = 10) -> str:
    """Render notes returned by `notes.recall` as text, one entry per note
    (id, optional topic, creation time, then the content on its own line), or
    a short message when nothing is found."""
    found = notes.recall(conn, query, limit=limit)
    if not found:
        if not query:
            return "No notes recorded yet."
        return f"No notes match '{query}'."

    rendered = []
    for note in found:
        topic = f" [{note['topic']}]" if note["topic"] else ""
        head = f"#{note['id']}{topic}"
        rendered.append(f"{head} ({note['created_at']})\n {note['content']}")
    return "\n".join(rendered)
|