cerebro-code-memory 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cerebro/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """Cerebro — persistent code-knowledge memory across AI chat sessions."""
2
+
3
+ __version__ = "0.1.0"
cerebro/callgraph.py ADDED
@@ -0,0 +1,38 @@
1
+ """Symbol-level call graph queries over the `calls` table.
2
+
3
+ Resolution is by NAME (tree-sitter, no type inference), so results can include
4
+ same-named symbols from different files — fast and dependency-free, but not as
5
+ precise as an LSP. Good for "who calls X?" and "what does this file call?".
6
+ """
7
+ from __future__ import annotations
8
+
9
+
10
def callers(conn, name: str, limit: int = 300) -> dict:
    """Report where the symbol called ``name`` is defined and called from.

    Matching is purely by name, so same-named symbols from different files
    are reported together.

    Args:
        conn: Database connection whose rows can be indexed by column name.
        name: Symbol name to look up.
        limit: Maximum number of call sites returned in ``sites``.

    Returns:
        A dict with ``name``, ``defined_in`` (sorted, de-duplicated file paths
        from ``symbols``), ``count`` (number of call sites before truncation)
        and ``sites`` (``(src_path, src_symbol, line)`` tuples ordered by path
        then line, at most ``limit`` of them).
    """
    definition_rows = conn.execute(
        "SELECT file_path FROM symbols WHERE name=?", (name,)
    )
    defined_in = sorted({row["file_path"] for row in definition_rows})

    call_rows = conn.execute(
        "SELECT src_path, src_symbol, line FROM calls WHERE dst_name=? "
        "ORDER BY src_path, line",
        (name,),
    ).fetchall()
    all_sites = []
    for row in call_rows:
        all_sites.append((row["src_path"], row["src_symbol"], row["line"]))

    return {
        "name": name,
        "defined_in": defined_in,
        "count": len(all_sites),
        "sites": all_sites[:limit],
    }
22
+
23
+
24
def calls_from(conn, path: str, limit: int = 300) -> dict:
    """List the internal calls made from the file at ``path``.

    A call counts as internal when its callee name matches any symbol name
    stored in ``symbols``; every other callee (e.g. an external library call)
    is dropped.

    Args:
        conn: Database connection whose rows can be indexed by column name.
        path: Value matched against ``calls.src_path``.
        limit: Maximum number of calls returned in ``calls``.

    Returns:
        A dict with ``path``, ``count`` (internal calls before truncation) and
        ``calls`` (distinct ``(src_symbol, dst_name, line)`` tuples ordered by
        line, at most ``limit`` of them).
    """
    known_names = set()
    for row in conn.execute("SELECT DISTINCT name FROM symbols"):
        known_names.add(row["name"])

    call_rows = conn.execute(
        "SELECT DISTINCT src_symbol, dst_name, line FROM calls WHERE src_path=? "
        "ORDER BY line",
        (path,),
    ).fetchall()

    internal_calls = []
    for row in call_rows:
        if row["dst_name"] not in known_names:
            continue
        internal_calls.append((row["src_symbol"], row["dst_name"], row["line"]))

    return {
        "path": path,
        "count": len(internal_calls),
        "calls": internal_calls[:limit],
    }
cerebro/cli.py ADDED
@@ -0,0 +1,348 @@
1
+ """Unified `cerebro` command.
2
+
3
+ With NO subcommand it runs the MCP server (stdio) — so the registration
4
+ `... run cerebro` keeps working. With a subcommand it acts as a normal CLI
5
+ (`cerebro setup`, `cerebro search ...`, `cerebro graph`, etc.).
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import argparse
10
+ import json
11
+ import sys
12
+ from pathlib import Path
13
+
14
+ from . import config as cfg
15
+ from . import db
16
+
17
+
18
def _ctx():
    """Load the project configuration and open its database.

    Returns:
        A ``(config, conn)`` pair: the result of ``cfg.Config.load()`` and the
        connection ``db.connect`` returns for ``config.db_path``.
    """
    loaded = cfg.Config.load()
    return loaded, db.connect(loaded.db_path)
22
+
23
+
24
def _bind_server(config, conn):
    """Install ``config``/``conn`` on the server module and return that module.

    The CLI reuses the server's tool functions; setting ``server._CONFIG`` and
    ``server._CONN`` points them at this invocation's config and connection.
    """
    from . import server

    server._CONFIG = config
    server._CONN = conn
    return server
29
+
30
+
31
def _project_root() -> Path:
    """Return ``parents[2]`` of this module's resolved file path."""
    module_file = Path(__file__).resolve()
    return module_file.parents[2]
33
+
34
+
35
+ # --- subcommands -------------------------------------------------------------
36
+
37
def cmd_serve(_args):
    """Run the server's ``main()``; the parsed arguments are ignored."""
    # Imported lazily so other subcommands do not load the server module.
    from . import server as mcp_server

    mcp_server.main()
40
+
41
+
42
def cmd_index(args):
    """Build or refresh the index and print the result as indented JSON.

    With ``args.paths`` only those paths are re-indexed (mode
    ``incremental``); paths that ``indexer._to_rel`` maps to a falsy value
    are skipped. Without paths the whole repo is re-indexed and the mode is
    ``full-force`` or ``full`` depending on ``args.force``.
    """
    from . import indexer

    config, conn = _ctx()
    if not args.paths:
        result = indexer.reindex(config, conn, force=args.force)
        result["mode"] = "full-force" if args.force else "full"
    else:
        candidates = (indexer._to_rel(config, raw) for raw in args.paths)
        relative = [rel for rel in candidates if rel]
        result = indexer.reindex_paths(config, conn, relative)
        result["mode"] = "incremental"
    print(json.dumps(result, indent=2))
53
+
54
+
55
def cmd_search(args):
    """Print the server's search result for the space-joined query words."""
    config, conn = _ctx()
    server = _bind_server(config, conn)
    query = " ".join(args.query)
    print(server.cerebro_search(query))
58
+
59
+
60
def cmd_map(args):
    """Print the project overview text, passing ``args.top`` to ``map_text``."""
    from . import views

    config, conn = _ctx()
    overview = views.map_text(conn, config.root, args.top)
    print(overview)
64
+
65
+
66
def cmd_impact(args):
    """Print the server's impact report for ``args.path``."""
    config, conn = _ctx()
    server = _bind_server(config, conn)
    print(server.cerebro_impact(args.path))
69
+
70
+
71
def cmd_cycles(_args):
    """Print the server's cycle report; the parsed arguments are ignored."""
    config, conn = _ctx()
    server = _bind_server(config, conn)
    print(server.cerebro_cycles())
74
+
75
+
76
def cmd_orphans(args):
    """Print the server's file-level orphan report for ``args.prefix``."""
    config, conn = _ctx()
    server = _bind_server(config, conn)
    print(server.cerebro_orphans(args.prefix))
79
+
80
+
81
def cmd_orphans_symbols(args):
    """Print the server's dead-symbol report for ``args.prefix``."""
    config, conn = _ctx()
    server = _bind_server(config, conn)
    print(server.cerebro_dead_symbols(args.prefix))
84
+
85
+
86
def cmd_callers(args):
    """Print the server's caller report for the symbol ``args.name``."""
    config, conn = _ctx()
    server = _bind_server(config, conn)
    print(server.cerebro_callers(args.name))
89
+
90
+
91
def cmd_calls(args):
    """Print the server's report of calls made by the file ``args.path``."""
    config, conn = _ctx()
    server = _bind_server(config, conn)
    print(server.cerebro_calls(args.path))
94
+
95
+
96
def cmd_recall(args):
    """Print recorded notes matching the space-joined query words.

    ``args.query`` may be empty, in which case an empty string is passed to
    ``views.recall_text``.
    """
    from . import views

    config, conn = _ctx()
    query = " ".join(args.query)
    print(views.recall_text(conn, query))
100
+
101
+
102
def cmd_graph(args):
    """Write the dependency-graph HTML and print where it went as JSON.

    The file goes to ``args.out`` when given, otherwise to
    ``cerebro-graph.html`` next to the database. ``args.limit`` and
    ``args.prefix`` are forwarded to ``viz.graph_html``.
    """
    from . import viz

    config, conn = _ctx()
    if args.out:
        target = Path(args.out)
    else:
        target = config.db_path.parent / "cerebro-graph.html"
    html = viz.graph_html(conn, args.limit, args.prefix)
    target.write_text(html, encoding="utf-8")
    report = {"html": str(target), "open_with": f"open '{target}'"}
    print(json.dumps(report))
108
+
109
+
110
def cmd_graph_all(_args):
    """Regenerate every scoped graph declared in `.cerebro/graphs.toml`.

    Opt-in per project: with no config file nothing is written, so the
    SessionStart hook can call it for every project without writing graphs into
    repos that never use them. Each [[graph]] entry writes
    `cerebro-graph-<name>.html` (an entry with no name writes the global
    `cerebro-graph.html`); `prefix`/`limit`/`out` mirror the `graph` command.

    Prints a JSON object: ``{"skipped": reason}`` when there is no config file
    or ``tomllib`` is unavailable, otherwise ``{"written": [paths]}``.

    Raises:
        SystemExit: If `graphs.toml` is not valid TOML; the message names the
            file and the parse error instead of surfacing a raw traceback.
    """
    from . import viz

    config, conn = _ctx()
    brain_dir = config.db_path.parent
    cfg_path = brain_dir / "graphs.toml"
    if not cfg_path.exists():
        print(json.dumps({"skipped": "no .cerebro/graphs.toml"}))
        return
    try:
        import tomllib  # stdlib on Python 3.11+
    except ModuleNotFoundError:
        print(json.dumps({"skipped": "tomllib unavailable (needs Python 3.11+)"}))
        return
    try:
        spec = tomllib.loads(cfg_path.read_text(encoding="utf-8"))
    except tomllib.TOMLDecodeError as exc:
        # Still a failure (non-zero exit), but with an actionable message.
        sys.exit(f"cerebro: invalid {cfg_path}: {exc}")
    written = []
    for g in spec.get("graph", []):
        prefix = g.get("prefix") or None  # empty string means "no prefix"
        limit = int(g.get("limit", 400))
        if g.get("out"):
            out = Path(g["out"])
        elif g.get("name"):
            out = brain_dir / f"cerebro-graph-{g['name']}.html"
        else:
            out = brain_dir / "cerebro-graph.html"
        # A user-supplied `out` may point into a directory that does not
        # exist yet; create it rather than failing on write.
        out.parent.mkdir(parents=True, exist_ok=True)
        out.write_text(viz.graph_html(conn, limit, prefix), encoding="utf-8")
        written.append(str(out))
    print(json.dumps({"written": written}))
144
+
145
+
146
def cmd_obsidian(args):
    """Export an Obsidian vault and print the exporter's result as JSON.

    The vault goes to ``args.out`` when given, otherwise to a ``vault``
    directory next to the database.
    """
    from . import viz

    config, conn = _ctx()
    if args.out:
        destination = Path(args.out)
    else:
        destination = config.db_path.parent / "vault"
    result = viz.export_obsidian(config, conn, destination)
    print(json.dumps(result))
151
+
152
+
153
def cmd_summarize(args):
    """Summarize the files picked by ``select_central_missing``; print JSON.

    ``args.limit`` and ``args.prefix`` drive the selection and
    ``args.workers`` is forwarded to ``summarizer.run``.
    """
    from . import summarizer

    config, conn = _ctx()
    targets = summarizer.select_central_missing(conn, args.limit, args.prefix)
    outcome = summarizer.run(config, conn, targets, workers=args.workers)
    print(json.dumps(outcome))
158
+
159
+
160
def cmd_embed(_args):
    """Run ``embeddings.build`` and print its result as JSON."""
    from . import embeddings

    config, conn = _ctx()
    outcome = embeddings.build(config, conn)
    print(json.dumps(outcome))
164
+
165
+
166
# Preamble that cmd_session prints ahead of the project map. It is runtime
# text read by the assistant, so wording changes alter behavior.
_SESSION_DIRECTIVE = (
    "## Cerebro is active for this project\n"
    "This repo has a Cerebro brain: a cached index of its structure, symbols, "
    "dependencies and summaries. Querying it costs far fewer tokens than "
    "re-reading files. RULES for this session:\n"
    "1. BEFORE exploring with grep/find/ls/Read — or spawning an Explore/"
    "general-purpose subagent to look around — call cerebro_search(query) to "
    "locate code and cerebro_get(path) to learn a file (summary + symbols + "
    "dependencies) without opening it.\n"
    "2. Open a file's full contents ONLY when cerebro_get reports no summary or "
    "flags it STALE, or when you need exact implementation detail.\n"
    "3. After you understand a file you had to read, call "
    "cerebro_record(path, <1-3 sentence English summary>) so the next session "
    "reuses it instead of re-reading.\n"
    "4. Record decisions/domain rules/gotchas with cerebro_note, and call "
    "cerebro_recall before re-deriving the 'why' behind the code.\n\n"
    # The map text is appended directly after this heading.
    "Current map:\n\n"
)
184
+
185
+
186
def cmd_session(_args):
    """Print the combined SessionStart context (git-sync + map + decisions).

    Everything runs in one process and goes through `views` rather than
    `server`, keeping the server import off this per-session hot path.
    Nothing is printed when the map text is empty or reports an empty index.
    """
    from . import gitsync, views

    config, conn = _ctx()
    # Catch branch switches, pulls and external edits before building the map.
    gitsync.sync(config, conn)

    overview = views.map_text(conn, config.root)
    if not overview or "Index is empty" in overview:
        return

    sections = [_SESSION_DIRECTIVE + overview]
    decisions = views.recall_text(conn)
    if decisions and "No notes recorded" not in decisions:
        sections.append(
            "\n\n## Decisions on record (from past sessions):\n" + decisions
        )
    print("".join(sections))
201
+
202
+
203
def cmd_doc_audit(args):
    """Living docs: flag vault notes whose referenced code changed or vanished.

    Reads ``args.vault``, ``args.aliases`` (comma-separated ``key=value``
    pairs; surrounding whitespace is ignored and pairs without ``=`` are
    skipped), ``args.json``, ``args.fix`` and ``args.show_hints``.

    With ``args.json`` the full result list is printed as JSON and the
    function returns before the ``--fix`` step, so no note is patched in
    that mode. Otherwise a summary line and one entry per stale note are
    printed, optionally followed by up to 20 heuristic hints.
    """
    from . import docaudit

    _, conn = _ctx()
    vault = Path(args.vault).expanduser()

    aliases = {}
    for pair in (args.aliases or "").split(","):
        key, sep, value = pair.partition("=")
        if sep:
            # Tolerate "a=b, c=d": stray spaces must not become part of names.
            aliases[key.strip()] = value.strip()

    results = docaudit.audit_vault(conn, vault, aliases)
    stale = [r for r in results if r["status"] == "stale"]
    hints = [r for r in results if r["status"] == "hint"]

    if args.json:
        payload = [
            {"note": str(r["note"]), "status": r["status"], "issues": r["issues"]}
            for r in results
        ]
        print(json.dumps(payload, indent=2, ensure_ascii=False))
        return

    fresh = len(results) - len(stale) - len(hints)
    print(f"Audited {len(results)} notes with code refs: "
          f"⚠ {len(stale)} stale · {len(hints)} hints · {fresh} fresh\n")
    for r in stale:
        print(f"⚠ STALE {r['note'].name}")
        for kind, msg in r["issues"]:
            if kind in ("broken", "changed"):
                print(f" [{kind}] {msg}")
        if args.fix and docaudit.mark_stale(r["note"]):
            print(" → frontmatter set to estado: revisar")

    if args.show_hints and hints:
        print("\nHints (heuristic — symbol names not found):")
        for r in hints[:20]:
            names = []
            for kind, msg in r["issues"]:
                if kind != "symbol?":
                    continue
                parts = msg.split("`")
                # The name is the first backtick-quoted span; fall back to
                # the whole message rather than raising when there is none.
                names.append(parts[1] if len(parts) > 1 else msg)
            syms = ", ".join(names)
            print(f" {r['note'].name}: {syms}")
235
+
236
+
237
def cmd_doc_refresh(args):
    """Re-audit the single note at ``args.note`` and print its briefing.

    ``args.aliases`` is an optional comma-separated list of ``key=value``
    pairs; pairs without ``=`` are skipped.
    """
    from . import docaudit

    _, conn = _ctx()

    aliases = {}
    if args.aliases:
        for pair in args.aliases.split(","):
            if "=" in pair:
                key, value = pair.split("=", 1)
                aliases[key] = value

    note_path = Path(args.note).expanduser()
    briefing = docaudit.refresh_briefing(conn, note_path, aliases)
    print(docaudit.format_briefing(briefing))
246
+
247
+
248
def cmd_setup(args):
    """One-command onboarding for the current repo.

    Indexes the repo, records the git baseline, optionally warms summaries
    (``args.summarize`` is the number of files; 0 disables it) and builds the
    semantic index (``args.embed``), then prints the follow-up commands.

    Paths inside the printed shell commands are quoted with ``shlex.quote``
    so they stay copy-pasteable when a path contains spaces or quotes.
    """
    import shlex

    from . import indexer, gitsync

    config, conn = _ctx()
    print(f"🧠 Cerebro setup — {config.root}\n")
    res = indexer.reindex(config, conn)
    print(f" ✓ indexed {res['total_files']} files ({res['new']} new, {res['changed']} changed)")
    gitsync.sync(config, conn)
    print(" ✓ git baseline recorded")

    if args.summarize:
        from . import summarizer
        rels = summarizer.select_central_missing(conn, args.summarize)
        sr = summarizer.run(config, conn, rels)
        print(f" ✓ summarized {sr['summarized']} central files (claude -p)")
    if args.embed:
        from . import embeddings
        er = embeddings.build(config, conn)
        # Prefer the builder's own explanation when it provides one.
        msg = er.get("reason") or f"{er.get('embedded', 0)} files"
        print(f" ✓ semantic index: {msg}")

    proj = _project_root()
    root_arg = shlex.quote(str(config.root))
    proj_arg = shlex.quote(str(proj))
    print("\nNext steps:")
    print(" 1) Register the MCP server with Claude Code:")
    print(
        f" claude mcp add cerebro -s user -e CEREBRO_ROOT={root_arg} "
        f"-- uv --directory {proj_arg} run cerebro"
    )
    print(f" (or install globally: uv tool install --from {proj_arg} cerebro )")
    print(" 2) Optional auto-use: enable the SessionStart/PostToolUse hooks in")
    print(f" {proj}/plugin/hooks (see README) and copy plugin/skills/cerebro to ~/.claude/skills/")
    print(" 3) Reload your editor, open a chat, and the cerebro_* tools are available.")
280
+
281
+
282
# Dispatch table used by main(): subcommand name -> handler taking the parsed
# argparse namespace. Keys must match the names registered in _build_parser().
_COMMANDS = {
    "serve": cmd_serve, "setup": cmd_setup, "session-context": cmd_session,
    "doc-audit": cmd_doc_audit, "doc-refresh": cmd_doc_refresh,
    "index": cmd_index, "search": cmd_search,
    "map": cmd_map, "graph": cmd_graph, "graph-all": cmd_graph_all,
    "obsidian": cmd_obsidian, "summarize": cmd_summarize,
    "embed": cmd_embed, "impact": cmd_impact, "cycles": cmd_cycles, "orphans": cmd_orphans,
    # The symbol-level variant is exposed under the name "dead-symbols".
    "dead-symbols": cmd_orphans_symbols,
    "callers": cmd_callers, "calls": cmd_calls, "recall": cmd_recall,
}
292
+
293
+
294
def _build_parser() -> argparse.ArgumentParser:
    """Build the ``cerebro`` argument parser with one sub-parser per command.

    The chosen subcommand is stored on the namespace as ``cmd`` (``None``
    when no subcommand was given).
    """
    parser = argparse.ArgumentParser(prog="cerebro", description="Persistent code-knowledge brain")
    commands = parser.add_subparsers(dest="cmd")

    commands.add_parser("serve", help="run the MCP server (stdio) — the default with no args")
    commands.add_parser("session-context", help="combined SessionStart context (used by the hook)")

    doc_audit = commands.add_parser(
        "doc-audit", help="flag vault notes whose referenced code changed (living docs)")
    doc_audit.add_argument("vault", help="path to the markdown knowledge vault")
    doc_audit.add_argument(
        "--aliases", help="map wiki app names to repo dirs: 'backend_app=fenix-store-backend,...'")
    doc_audit.add_argument(
        "--fix", action="store_true", help="patch stale notes' frontmatter to estado: revisar")
    doc_audit.add_argument("--json", action="store_true")
    doc_audit.add_argument(
        "--show-hints", action="store_true", help="also show heuristic symbol-name hints")

    doc_refresh = commands.add_parser(
        "doc-refresh", help="re-audit one note vs live code → refresh briefing")
    doc_refresh.add_argument("note", help="path to the stale note")
    doc_refresh.add_argument("--aliases", help="map wiki app names to repo dirs")

    setup = commands.add_parser("setup", help="index this repo + print MCP registration")
    # Bare `--summarize` means 30 files; omitting the flag disables it (0).
    setup.add_argument("--summarize", type=int, nargs="?", const=30, default=0,
                       help="also warm summaries for the top N central files")
    setup.add_argument("--embed", action="store_true", help="also build the semantic index")

    index = commands.add_parser("index", help="build/refresh the index")
    index.add_argument("paths", nargs="*")
    index.add_argument("--force", action="store_true")

    search = commands.add_parser("search", help="hybrid semantic + keyword search")
    search.add_argument("query", nargs="+")

    project_map = commands.add_parser("map", help="project overview")
    project_map.add_argument("--top", type=int, default=30)

    graph = commands.add_parser("graph", help="write the interactive dependency-graph HTML")
    graph.add_argument("--limit", type=int, default=400)
    graph.add_argument("--prefix")
    graph.add_argument("-o", "--out")

    commands.add_parser("graph-all", help="regenerate every graph declared in .cerebro/graphs.toml")

    obsidian = commands.add_parser("obsidian", help="export an Obsidian vault")
    obsidian.add_argument("-o", "--out")

    summarize = commands.add_parser("summarize", help="warm summaries via claude -p")
    summarize.add_argument("--limit", type=int, default=20)
    summarize.add_argument("--prefix")
    summarize.add_argument("--workers", type=int, default=4)

    commands.add_parser("embed", help="build the semantic index (needs --extra semantic)")

    impact = commands.add_parser("impact", help="transitive blast radius of a file")
    impact.add_argument("path")

    commands.add_parser("cycles", help="circular-import groups")

    orphans = commands.add_parser("orphans", help="dead-code candidates (file-level)")
    orphans.add_argument("--prefix", default="")

    dead_symbols = commands.add_parser(
        "dead-symbols", help="unused-export candidates (symbol-level)")
    dead_symbols.add_argument("--prefix", default="")

    callers_cmd = commands.add_parser("callers", help="call sites of a symbol")
    callers_cmd.add_argument("name")

    calls_cmd = commands.add_parser("calls", help="internal calls a file makes")
    calls_cmd.add_argument("path")

    recall = commands.add_parser("recall", help="recall recorded decisions")
    recall.add_argument("query", nargs="*")

    return parser
333
+
334
+
335
def main(argv=None):
    """CLI entry point: dispatch a subcommand, or run the server without one.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.
    """
    if argv is None:
        argv = sys.argv[1:]
    # An empty argv skips argparse entirely (keeps `... run cerebro` working).
    args = _build_parser().parse_args(argv) if argv else None
    if args is None or not args.cmd:
        cmd_serve(None)
        return
    handler = _COMMANDS[args.cmd]
    handler(args)
345
+
346
+
347
# Run the CLI when this module is executed as a script.
if __name__ == "__main__":
    main()
cerebro/config.py ADDED
@@ -0,0 +1,136 @@
1
+ """Project configuration: repo root discovery, ignore rules, file walking.
2
+
3
+ The "root" is the directory whose code Cerebro indexes. Resolution order:
4
+ 1. CEREBRO_ROOT env var (explicit override)
5
+ 2. the nearest ancestor holding a built brain (.cerebro/brain.db) — so a
6
+ polyrepo subfolder (each its own git repo) resolves to the shared root
7
+ brain instead of re-indexing itself. Mirrors the SessionStart hook.
8
+ 3. the enclosing git toplevel (if inside a git repo)
9
+ 4. the current working directory
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import os
14
+ import subprocess
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+
18
+ import pathspec
19
+
20
# Baseline ignore patterns (gitignore syntax) that Config.load combines with
# the project's own .gitignore / .cerebroignore lines. Entries ending in "/"
# are directory patterns.
DEFAULT_IGNORES = [
    ".git/",
    ".cerebro/",
    ".cerebroignore",
    "node_modules/",
    ".venv/",
    "venv/",
    "__pycache__/",
    ".mypy_cache/",
    ".pytest_cache/",
    ".dart_tool/",
    "dist/",
    "build/",
    ".next/",
    ".cache/",
    "*.pyc",
    "*.lock",
    "*.min.js",
    "*.map",
    ".DS_Store",
    # Secrets / local env — never index (privacy invariant: nothing leaves the
    # machine). Nested sub-repos in a polyrepo .gitignore these, but Cerebro only
    # reads the root .gitignore, so make them a hard default.
    ".env",
    ".env.*",
]
47
+
48
# File extension -> tree-sitter-language-pack language name.
# Config.lang_for lower-cases the suffix before the lookup, so keys must be
# lower-case and include the leading dot.
LANG_BY_EXT = {
    ".py": "python",
    ".js": "javascript",
    ".jsx": "javascript",
    ".mjs": "javascript",
    ".cjs": "javascript",
    ".ts": "typescript",
    ".tsx": "tsx",
    ".dart": "dart",
}
59
+
60
+
61
def find_root(start: str | None = None) -> Path:
    """Resolve the directory Cerebro should treat as the project root.

    Resolution order:
      1. the ``CEREBRO_ROOT`` environment variable, if set and non-empty;
      2. the nearest directory at or above ``start`` that contains
         ``.cerebro/brain.db``;
      3. the git toplevel reported for the starting directory;
      4. the starting directory itself.

    Args:
        start: A directory or file path to start from; defaults to the
            current working directory. For a path that is not a directory,
            its parent directory is used as the starting directory.

    Returns:
        The resolved root path.
    """
    env = os.environ.get("CEREBRO_ROOT")
    if env:
        return Path(env).expanduser().resolve()
    start_path = Path(start or os.getcwd()).resolve()
    # Walk up to an existing brain first: in a polyrepo each subfolder is its own
    # git repo, so the git step below would resolve to the subfolder and miss the
    # shared root brain. Only a *built* brain matches, so first-time `setup`
    # still falls through to git below.
    probe = start_path if start_path.is_dir() else start_path.parent
    for d in (probe, *probe.parents):
        if (d / ".cerebro" / "brain.db").exists():
            return d
    try:
        # Run git from the directory (`probe`): `git -C <file>` always fails,
        # which used to make a file `start` fall through to the last step.
        out = subprocess.run(
            ["git", "-C", str(probe), "rev-parse", "--show-toplevel"],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (OSError, subprocess.SubprocessError, UnicodeError):
        # git missing or not runnable, timed out, or produced undecodable
        # output: best-effort step, fall back to the starting directory.
        pass
    else:
        toplevel = out.stdout.strip()
        if out.returncode == 0 and toplevel:
            return Path(toplevel).resolve()
    # Return the directory, never a file, so `<root>/.cerebro` is meaningful.
    return probe
88
+
89
+
90
@dataclass
class Config:
    """Resolved project settings: root directory, database path, ignore rules."""

    root: Path  # directory whose files are indexed
    db_path: Path  # set by load() to <root>/.cerebro/brain.db
    spec: pathspec.PathSpec  # compiled ignore patterns, matched on root-relative paths

    @classmethod
    def load(cls, start: str | None = None) -> "Config":
        """Build a Config for the root that ``find_root(start)`` resolves.

        Ignore patterns come from the root ``.gitignore`` and an optional
        ``.cerebroignore`` (same gitignore syntax), followed by
        ``DEFAULT_IGNORES``.
        """
        root = find_root(start)
        db_path = root / ".cerebro" / "brain.db"
        patterns: list[str] = []
        # .gitignore + an optional .cerebroignore (same gitignore syntax) let the
        # user exclude heavy non-source dirs (backups, uploads) without polluting
        # their VCS config.
        for fname in (".gitignore", ".cerebroignore"):
            f = root / fname
            if f.is_file():
                patterns += f.read_text(encoding="utf-8", errors="ignore").splitlines()
        # The last matching gitignore pattern wins, so the defaults go last:
        # a user negation such as `!.env` must not re-include paths the
        # defaults are meant to exclude unconditionally.
        patterns += DEFAULT_IGNORES
        spec = pathspec.PathSpec.from_lines("gitignore", patterns)
        return cls(root=root, db_path=db_path, spec=spec)

    def is_ignored(self, path: Path) -> bool:
        """Return True if ``path`` is outside ``root`` or matches an ignore rule."""
        try:
            rel = path.relative_to(self.root).as_posix()
        except ValueError:
            # Not under the root at all: treat as ignored.
            return True
        if path.is_dir():
            # Trailing slash lets directory-only patterns ("build/") match.
            rel += "/"
        return self.spec.match_file(rel)

    def iter_files(self):
        """Yield (relative_posix_path, absolute_Path) for every indexable file."""
        for dirpath, dirnames, filenames in os.walk(self.root):
            d = Path(dirpath)
            # Prune ignored directories in place so os.walk never descends them.
            dirnames[:] = sorted(
                dn for dn in dirnames if not self.is_ignored(d / dn)
            )
            for fn in sorted(filenames):
                fp = d / fn
                if self.is_ignored(fp):
                    continue
                yield fp.relative_to(self.root).as_posix(), fp

    @staticmethod
    def lang_for(rel: str) -> str | None:
        """Return the language mapped to ``rel``'s lower-cased suffix, or None."""
        return LANG_BY_EXT.get(Path(rel).suffix.lower())