cerebro-code-memory 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cerebro/__init__.py +3 -0
- cerebro/callgraph.py +38 -0
- cerebro/cli.py +348 -0
- cerebro/config.py +136 -0
- cerebro/db.py +245 -0
- cerebro/docaudit.py +174 -0
- cerebro/embeddings.py +175 -0
- cerebro/gitsync.py +124 -0
- cerebro/graph.py +77 -0
- cerebro/indexer.py +854 -0
- cerebro/insights.py +217 -0
- cerebro/notes.py +70 -0
- cerebro/server.py +382 -0
- cerebro/summaries.py +66 -0
- cerebro/summarizer.py +109 -0
- cerebro/tsconfig.py +159 -0
- cerebro/views.py +52 -0
- cerebro/viz.py +374 -0
- cerebro_code_memory-0.1.0.dist-info/METADATA +160 -0
- cerebro_code_memory-0.1.0.dist-info/RECORD +23 -0
- cerebro_code_memory-0.1.0.dist-info/WHEEL +4 -0
- cerebro_code_memory-0.1.0.dist-info/entry_points.txt +11 -0
- cerebro_code_memory-0.1.0.dist-info/licenses/LICENSE +21 -0
cerebro/db.py
ADDED
|
@@ -0,0 +1,245 @@
|
|
|
1
|
+
"""SQLite storage: schema, connection, and low-level write/query helpers.
|
|
2
|
+
|
|
3
|
+
A single file, `.cerebro/brain.db`, holds every "trace": the structural index
|
|
4
|
+
(files, symbols, edges), the cached English summaries, the decision notes
|
|
5
|
+
(reserved for v2), and an FTS5 index used by keyword search.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import sqlite3
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
SCHEMA = """
|
|
13
|
+
CREATE TABLE IF NOT EXISTS files (
|
|
14
|
+
path TEXT PRIMARY KEY,
|
|
15
|
+
lang TEXT,
|
|
16
|
+
hash TEXT NOT NULL,
|
|
17
|
+
mtime REAL,
|
|
18
|
+
size INTEGER,
|
|
19
|
+
indexed_at TEXT NOT NULL
|
|
20
|
+
);
|
|
21
|
+
|
|
22
|
+
CREATE TABLE IF NOT EXISTS symbols (
|
|
23
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
24
|
+
file_path TEXT NOT NULL,
|
|
25
|
+
kind TEXT NOT NULL,
|
|
26
|
+
name TEXT NOT NULL,
|
|
27
|
+
line INTEGER,
|
|
28
|
+
signature TEXT
|
|
29
|
+
);
|
|
30
|
+
CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_path);
|
|
31
|
+
CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name);
|
|
32
|
+
|
|
33
|
+
CREATE TABLE IF NOT EXISTS edges (
|
|
34
|
+
src_path TEXT NOT NULL,
|
|
35
|
+
dst_path TEXT NOT NULL,
|
|
36
|
+
kind TEXT NOT NULL DEFAULT 'import',
|
|
37
|
+
PRIMARY KEY (src_path, dst_path, kind)
|
|
38
|
+
);
|
|
39
|
+
CREATE INDEX IF NOT EXISTS idx_edges_dst ON edges(dst_path);
|
|
40
|
+
|
|
41
|
+
-- Symbol-level call graph (tree-sitter, name-resolved): src_symbol in src_path
|
|
42
|
+
-- calls/references a symbol named dst_name. src_symbol is NULL at module scope.
|
|
43
|
+
CREATE TABLE IF NOT EXISTS calls (
|
|
44
|
+
src_path TEXT NOT NULL,
|
|
45
|
+
src_symbol TEXT,
|
|
46
|
+
dst_name TEXT NOT NULL,
|
|
47
|
+
line INTEGER
|
|
48
|
+
);
|
|
49
|
+
CREATE INDEX IF NOT EXISTS idx_calls_dst ON calls(dst_name);
|
|
50
|
+
CREATE INDEX IF NOT EXISTS idx_calls_src ON calls(src_path);
|
|
51
|
+
|
|
52
|
+
-- Identifier references: a distinct (path, name) for every name USED in a file
|
|
53
|
+
-- (calls, JSX, type annotations, value reads), EXCLUDING the file's own
|
|
54
|
+
-- definition sites. Lets dead_symbols() ask "is this symbol's name referenced
|
|
55
|
+
-- anywhere?" — a name absent from refs entirely is an unused-export candidate.
|
|
56
|
+
CREATE TABLE IF NOT EXISTS refs (
|
|
57
|
+
path TEXT NOT NULL,
|
|
58
|
+
name TEXT NOT NULL,
|
|
59
|
+
PRIMARY KEY (path, name)
|
|
60
|
+
);
|
|
61
|
+
CREATE INDEX IF NOT EXISTS idx_refs_name ON refs(name);
|
|
62
|
+
|
|
63
|
+
CREATE TABLE IF NOT EXISTS summaries (
|
|
64
|
+
path TEXT PRIMARY KEY,
|
|
65
|
+
summary_en TEXT NOT NULL,
|
|
66
|
+
model TEXT,
|
|
67
|
+
source_hash TEXT,
|
|
68
|
+
updated_at TEXT NOT NULL
|
|
69
|
+
);
|
|
70
|
+
|
|
71
|
+
-- Reserved for v2 (decision log). Created now so the schema is stable.
|
|
72
|
+
CREATE TABLE IF NOT EXISTS notes (
|
|
73
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
74
|
+
topic TEXT,
|
|
75
|
+
content TEXT NOT NULL,
|
|
76
|
+
created_at TEXT NOT NULL
|
|
77
|
+
);
|
|
78
|
+
|
|
79
|
+
CREATE TABLE IF NOT EXISTS meta (
|
|
80
|
+
key TEXT PRIMARY KEY,
|
|
81
|
+
value TEXT
|
|
82
|
+
);
|
|
83
|
+
|
|
84
|
+
-- Optional semantic layer (legacy, file-level): one vector per file. Superseded
|
|
85
|
+
-- by symbol_embeddings below; kept so older brains still open cleanly.
|
|
86
|
+
CREATE TABLE IF NOT EXISTS embeddings (
|
|
87
|
+
path TEXT PRIMARY KEY,
|
|
88
|
+
dim INTEGER,
|
|
89
|
+
vec BLOB NOT NULL,
|
|
90
|
+
doc_hash TEXT,
|
|
91
|
+
updated_at TEXT NOT NULL
|
|
92
|
+
);
|
|
93
|
+
|
|
94
|
+
-- Symbol-level semantic vectors: one row per symbol (function/class), plus one
|
|
95
|
+
-- whole-file row (name IS NULL) for files with no indexable symbols. Lets search
|
|
96
|
+
-- land on the exact symbol + line, not just the file. doc_hash is a per-file
|
|
97
|
+
-- fingerprint (identical on every row of a file) so build() skips unchanged files.
|
|
98
|
+
CREATE TABLE IF NOT EXISTS symbol_embeddings (
|
|
99
|
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
100
|
+
path TEXT NOT NULL,
|
|
101
|
+
name TEXT,
|
|
102
|
+
line INTEGER,
|
|
103
|
+
kind TEXT,
|
|
104
|
+
dim INTEGER,
|
|
105
|
+
vec BLOB NOT NULL,
|
|
106
|
+
doc_hash TEXT,
|
|
107
|
+
updated_at TEXT NOT NULL
|
|
108
|
+
);
|
|
109
|
+
CREATE INDEX IF NOT EXISTS idx_symemb_path ON symbol_embeddings(path);
|
|
110
|
+
|
|
111
|
+
-- One row per (path, kind). kind='symbol' aggregates a file's symbol names;
|
|
112
|
+
-- kind='summary' holds the file's English summary. Maintained manually.
|
|
113
|
+
CREATE VIRTUAL TABLE IF NOT EXISTS fts USING fts5(
|
|
114
|
+
path UNINDEXED,
|
|
115
|
+
kind UNINDEXED,
|
|
116
|
+
text
|
|
117
|
+
);
|
|
118
|
+
"""
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def connect(db_path: Path) -> sqlite3.Connection:
    """Open the brain database at *db_path*, creating directory and schema as needed.

    Rows are returned as ``sqlite3.Row`` and the journal is switched to WAL.
    """
    db_path.parent.mkdir(parents=True, exist_ok=True)

    connection = sqlite3.connect(str(db_path))
    connection.row_factory = sqlite3.Row

    # Order matters: the busy timeout has to be in place BEFORE the WAL switch,
    # because that switch needs a write lock. A second connection opening at
    # the same moment (e.g. the post-edit hook reindexing while the MCP server
    # connects) then waits instead of failing with "database is locked".
    for pragma in ("PRAGMA busy_timeout=5000", "PRAGMA journal_mode=WAL"):
        connection.execute(pragma)

    connection.executescript(SCHEMA)
    return connection
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# --- structural index writes -------------------------------------------------
|
|
135
|
+
|
|
136
|
+
def upsert_file(conn, path, lang, file_hash, mtime, size, indexed_at):
    """Insert the ``files`` row for *path*, or overwrite all its other columns.

    The caller is responsible for committing.
    """
    statement = """INSERT INTO files(path, lang, hash, mtime, size, indexed_at)
        VALUES(?,?,?,?,?,?)
        ON CONFLICT(path) DO UPDATE SET
        lang=excluded.lang, hash=excluded.hash, mtime=excluded.mtime,
        size=excluded.size, indexed_at=excluded.indexed_at"""
    values = (path, lang, file_hash, mtime, size, indexed_at)
    conn.execute(statement, values)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def replace_symbols(conn, path, symbols):
    """Replace the stored symbols of *path* and refresh its keyword-search row.

    symbols: iterable of (kind, name, line, signature). Any iterable is
    accepted, including a one-shot generator.

    The file's rows in ``symbols`` are replaced wholesale, and its single
    ``kind='symbol'`` row in ``fts`` is rebuilt from the distinct symbol names
    (space-joined, sorted). No FTS row is written when there are no names.
    The caller is responsible for committing.
    """
    # Materialise once: the rows are walked twice (table insert, then the FTS
    # name list). A generator would be exhausted by the first pass and the
    # file would silently lose its keyword-search entry.
    symbols = list(symbols)

    conn.execute("DELETE FROM symbols WHERE file_path=?", (path,))
    conn.executemany(
        "INSERT INTO symbols(file_path, kind, name, line, signature) VALUES(?,?,?,?,?)",
        [(path, k, n, ln, sig) for (k, n, ln, sig) in symbols],
    )

    names = " ".join(sorted({n for (_, n, _, _) in symbols}))
    conn.execute("DELETE FROM fts WHERE path=? AND kind='symbol'", (path,))
    if names:
        conn.execute(
            "INSERT INTO fts(path, kind, text) VALUES(?, 'symbol', ?)", (path, names)
        )
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def replace_edges(conn, src_path, edges):
    """Replace every outgoing edge of *src_path*.

    edges: either an iterable of destination paths (each stored with kind
    'import'), or a mapping {dst_path: kind} when the edge kind matters
    (e.g. type-only TS imports). Duplicates collapse to one row.
    """
    if isinstance(edges, dict):
        pairs = list(edges.items())
    else:
        pairs = [(dst, "import") for dst in edges]

    unique_rows = {(src_path, dst, kind) for dst, kind in pairs}

    conn.execute("DELETE FROM edges WHERE src_path=?", (src_path,))
    conn.executemany(
        "INSERT OR IGNORE INTO edges(src_path, dst_path, kind) VALUES(?,?,?)",
        sorted(unique_rows),
    )
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def replace_calls(conn, src_path, calls):
    """Replace the recorded call sites of *src_path*.

    calls: iterable of (src_symbol|None, dst_name, line).
    """
    conn.execute("DELETE FROM calls WHERE src_path=?", (src_path,))

    rows = []
    for caller, callee, lineno in calls:
        rows.append((src_path, caller, callee, lineno))

    conn.executemany(
        "INSERT INTO calls(src_path, src_symbol, dst_name, line) VALUES(?,?,?,?)",
        rows,
    )
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def replace_refs(conn, src_path, names):
    """Replace the identifier references recorded for *src_path*.

    names: iterable of identifier names used (referenced) in src_path;
    duplicates are collapsed.
    """
    conn.execute("DELETE FROM refs WHERE path=?", (src_path,))

    distinct = sorted(set(names))
    rows = [(src_path, ident) for ident in distinct]
    conn.executemany("INSERT OR IGNORE INTO refs(path, name) VALUES(?,?)", rows)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def forget_file(conn, path):
    """Remove every trace of a file that no longer exists on disk."""
    # (statement, number of times *path* is bound). Edges are the only table
    # purged in both directions: as source and as destination.
    purge_plan = (
        ("DELETE FROM files WHERE path=?", 1),
        ("DELETE FROM symbols WHERE file_path=?", 1),
        ("DELETE FROM edges WHERE src_path=? OR dst_path=?", 2),
        ("DELETE FROM calls WHERE src_path=?", 1),
        ("DELETE FROM refs WHERE path=?", 1),
        ("DELETE FROM summaries WHERE path=?", 1),
        ("DELETE FROM embeddings WHERE path=?", 1),
        ("DELETE FROM symbol_embeddings WHERE path=?", 1),
        ("DELETE FROM fts WHERE path=?", 1),
    )
    for statement, arity in purge_plan:
        conn.execute(statement, (path,) * arity)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
# --- reads -------------------------------------------------------------------
|
|
206
|
+
|
|
207
|
+
def stored_hashes(conn) -> dict[str, str]:
    """Return {path: content hash} for every row of ``files``."""
    hashes = {}
    for record in conn.execute("SELECT path, hash FROM files"):
        hashes[record["path"]] = record["hash"]
    return hashes
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def symbols_for(conn, path):
    """Return the (kind, name, line, signature) rows of *path*, ordered by line."""
    cursor = conn.execute(
        "SELECT kind, name, line, signature FROM symbols WHERE file_path=? ORDER BY line",
        (path,),
    )
    return cursor.fetchall()
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def lang_counts(conn):
    """Count indexed files per language, largest group first.

    Files with no language are reported under the label 'other'.
    """
    query = (
        "SELECT COALESCE(lang,'other') AS lang, COUNT(*) AS n "
        "FROM files GROUP BY lang ORDER BY n DESC"
    )
    cursor = conn.execute(query)
    return cursor.fetchall()
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def search(conn, query: str, limit: int = 15):
    """Keyword search over symbol names + summaries.

    The query is first tried as an FTS5 MATCH expression, with summary hits
    ranked ahead of symbol-name hits. If FTS5 rejects the raw query, or the
    match finds nothing, the search falls back to a literal,
    case-insensitive substring match on the indexed text.

    Rows of kind 'note' are excluded from both paths. Returns a list of rows
    with columns (path, kind, snip), at most *limit* long.
    """
    try:
        rows = conn.execute(
            # summaries are higher-signal than symbol-name matches → rank them first
            "SELECT path, kind, snippet(fts, 2, '[', ']', '…', 12) AS snip "
            "FROM fts WHERE fts MATCH ? AND kind != 'note' "
            "ORDER BY (kind != 'summary'), rank LIMIT ?",
            (query, limit),
        ).fetchall()
        if rows:
            return rows
    except sqlite3.OperationalError:
        # Not valid FTS5 syntax (e.g. stray punctuation) — use the substring path.
        pass

    # '%' and '_' are LIKE wildcards. Escape them so a query such as "foo_bar"
    # matches that text literally instead of also matching "fooXbar". The
    # backslash must be doubled first because it is the escape character.
    escaped = query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    return conn.execute(
        "SELECT path, kind, substr(text,1,120) AS snip FROM fts "
        "WHERE text LIKE ? ESCAPE '\\' AND kind != 'note' LIMIT ?",
        (f"%{escaped}%", limit),
    ).fetchall()
|
cerebro/docaudit.py
ADDED
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
"""Living-docs audit: cross-check a knowledge vault (Markdown notes) against the
|
|
2
|
+
Cerebro code index to find notes whose referenced code has changed or vanished.
|
|
3
|
+
|
|
4
|
+
This automates the rule every wiki states but no wiki enforces — "if the doc
|
|
5
|
+
contradicts the code, the code wins, mark the doc stale". A note is flagged when:
|
|
6
|
+
- a code path it references no longer exists in the index (broken), or
|
|
7
|
+
- a referenced file changed after the note's `ultima_verificacion` / `fecha`, or
|
|
8
|
+
- a referenced symbol (method/class) is no longer defined anywhere (heuristic).
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
from datetime import datetime
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
from . import db as _db
|
|
17
|
+
from . import graph as _graph
|
|
18
|
+
|
|
19
|
+
_EXT = r"(?:ts|tsx|js|jsx|mjs|cjs|py)"
|
|
20
|
+
# a real code path: at least one '/', ending in a code extension, optional :line
|
|
21
|
+
_PATH_RE = re.compile(r"([\w.@-]+(?:/[\w.@-]+)+\." + _EXT + r")(?::(\d+))?")
|
|
22
|
+
_FM_RE = re.compile(r"^---\s*\n(.*?)\n---", re.S)
|
|
23
|
+
_DATE_RE = re.compile(r"(?:ultima_verificacion|fecha)\s*:\s*(\d{4}-\d{2}-\d{2})")
|
|
24
|
+
_BACKTICK_RE = re.compile(r"`([^`\n]+)`")
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def parse_note(path: Path) -> dict:
    """Extract the verification date, code paths and symbol names from a note.

    Returns a dict with:
      - "date": the first `ultima_verificacion` / `fecha` value found in the
        frontmatter (``YYYY-MM-DD`` string), or None;
      - "files": set of (path, line-or-None) pairs matched anywhere in the text;
      - "symbols": set of identifiers taken from backtick spans only.
    """
    text = path.read_text(encoding="utf-8", errors="ignore")

    front = _FM_RE.match(text)
    date_hit = _DATE_RE.search(front.group(1) if front else "")

    files = set()
    for hit in _PATH_RE.finditer(text):
        files.add((hit.group(1), hit.group(2)))

    symbols = set()
    for span in _BACKTICK_RE.findall(text):
        # Anything that looks like a call: foo(...)
        symbols.update(re.findall(r"\b([A-Za-z_]\w{2,})\s*\(", span))
        # Mixed-case identifiers: at least one lowercase and one uppercase letter.
        for token in re.findall(r"\b([A-Za-z_]\w{2,})\b", span):
            if re.search(r"[a-z]", token) and re.search(r"[A-Z]", token):
                symbols.add(token)

    return {
        "date": date_hit.group(1) if date_hit else None,
        "files": files,
        "symbols": symbols,
    }
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _normalize_ref(ref: str, aliases: dict[str, str]) -> str:
|
|
45
|
+
"""Resolve wiki naming to real repo paths: map logical app aliases
|
|
46
|
+
(backend_app -> fenix-store-backend) and strip ../ / cross-machine prefixes."""
|
|
47
|
+
parts = [p for p in ref.split("/") if p not in ("", ".")]
|
|
48
|
+
for i, p in enumerate(parts): # cut at the first known app-alias segment
|
|
49
|
+
if p in aliases:
|
|
50
|
+
parts = [aliases[p]] + parts[i + 1:]
|
|
51
|
+
return "/".join(parts)
|
|
52
|
+
while parts and parts[0] == "..":
|
|
53
|
+
parts.pop(0)
|
|
54
|
+
return "/".join(parts)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _resolve(conn, ref: str, aliases: dict[str, str]) -> str | None:
    """Map a path as written in a note to exactly one indexed file path.

    The reference is normalized first. An exact match in ``files`` wins.
    Otherwise the reference is treated as a path suffix and accepted only
    when exactly one indexed path ends with "/<ref>". Returns None when
    nothing matches or the suffix is ambiguous.
    """
    ref = _normalize_ref(ref, aliases)
    if conn.execute("SELECT 1 FROM files WHERE path=?", (ref,)).fetchone():
        return ref

    # '_' and '%' are legal in file names but are LIKE wildcards. Unescaped,
    # "my_file.py" would also match "myXfile.py" and could turn a unique
    # suffix into an ambiguous (or wrong) one. Backslash is doubled first
    # because it is the escape character.
    escaped = ref.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    rows = conn.execute(
        "SELECT path FROM files WHERE path LIKE ? ESCAPE '\\'", ("%/" + escaped,)
    ).fetchall()
    return rows[0]["path"] if len(rows) == 1 else None
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _epoch(date: str) -> float:
|
|
66
|
+
return datetime.strptime(date, "%Y-%m-%d").timestamp()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def audit_note(conn, note: dict, known_symbols: set[str], aliases: dict[str, str]) -> list[tuple[str, str]]:
    """Check one parsed note against the index and list what looks out of date.

    note: a dict as produced by parse_note ("date", "files", "symbols").
    known_symbols: every symbol name currently defined in the index.

    Returns (kind, message) pairs, where kind is:
      - "broken":  a referenced path cannot be resolved to an indexed file;
      - "changed": the file's mtime is later than the note's date (only
        checked when the note has a date);
      - "symbol?": a referenced identifier is not in known_symbols.
    """
    issues: list[tuple[str, str]] = []
    note_epoch = _epoch(note["date"]) if note["date"] else None

    # Audit each distinct path once. note["files"] holds (path, line) pairs
    # whose line may be None; sorting the pairs directly raises TypeError as
    # soon as one path appears both with and without a ":line" suffix
    # (None vs str), and would report such a file twice.
    for ref in sorted({path for path, _line in note["files"]}):
        resolved = _resolve(conn, ref, aliases)
        if resolved is None:
            issues.append(("broken", f"{ref} — not in the index"))
            continue
        if note_epoch is not None:
            row = conn.execute("SELECT mtime FROM files WHERE path=?", (resolved,)).fetchone()
            if row and row["mtime"] and row["mtime"] > note_epoch:
                mod = datetime.fromtimestamp(row["mtime"]).date().isoformat()
                issues.append(("changed", f"{resolved} changed {mod} (note verified {note['date']})"))

    for s in sorted(note["symbols"]):
        if s not in known_symbols:
            issues.append(("symbol?", f"`{s}` — not defined anywhere (renamed/removed?)"))
    return issues
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def audit_vault(conn, vault: Path, aliases: dict[str, str] | None = None) -> list[dict]:
    """Audit every Markdown note under *vault* against the code index.

    Notes inside dot-directories of the vault (.obsidian etc.) are skipped, as
    are notes that reference neither code paths nor symbols.

    Returns one dict per audited note with keys "note" (its Path), "status",
    "date" and "issues". Status is "stale" when any issue is "broken" or
    "changed", "hint" when there are only symbol hints, else "fresh".
    """
    aliases = aliases or {}
    known = {r["name"] for r in conn.execute("SELECT DISTINCT name FROM symbols")}
    results = []
    for md in sorted(vault.rglob("*.md")):
        # Only path components INSIDE the vault count as hidden. Checking the
        # full path would skip every note whenever the vault itself lives
        # under a dot-directory or is addressed through "..".
        if any(part.startswith(".") for part in md.relative_to(vault).parts):
            continue
        note = parse_note(md)
        if not note["files"] and not note["symbols"]:
            continue  # purely conceptual note — nothing to verify against code
        issues = audit_note(conn, note, known, aliases)
        hard = [i for i in issues if i[0] in ("broken", "changed")]
        status = "stale" if hard else ("hint" if issues else "fresh")
        results.append({"note": md, "status": status, "date": note["date"], "issues": issues})
    return results
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def relocate(conn, ref: str) -> list[str]:
    """Where a moved/renamed file likely lives now — same basename in the index.

    Returns every indexed path whose final component equals the basename of
    *ref*, sorted by path. A file sitting at the repository root (no
    directory part) is included.
    """
    base = ref.split("/")[-1]
    # Escape LIKE wildcards: '_' is common in file names and would otherwise
    # match any single character, producing false relocation candidates.
    escaped = base.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    rows = conn.execute(
        # "path = ?" covers a root-level file, which has no "/" for the
        # suffix pattern to anchor on.
        "SELECT path FROM files WHERE path = ? OR path LIKE ? ESCAPE '\\' ORDER BY path",
        (base, "%/" + escaped),
    ).fetchall()
    return [r["path"] for r in rows]
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def refresh_briefing(conn, note_path: Path, aliases: dict[str, str] | None = None) -> dict:
    """Re-audit a stale note against the LIVE code.

    For each distinct code path the note references, collect the current
    facts (symbols, summary, dependents, last-change date), or relocation
    candidates when the path no longer resolves. This is the structured
    context an agent uses to propose the update.

    Returns {"note": str, "date": str | None, "refs": [...]}. Each ref entry
    has "ref" and "status" ("moved/missing", "changed" or "current"); entries
    that did not resolve carry "candidates", the others carry "resolved",
    "changed_date", "symbols", "summary" and up to 8 "dependents".
    """
    aliases = aliases or {}
    note = parse_note(Path(note_path))
    note_epoch = _epoch(note["date"]) if note["date"] else None
    refs = []

    # One entry per distinct path. Sorting the raw (path, line) pairs raises
    # TypeError when a path is cited both with and without ":line" (None vs
    # str), and the line is not used here anyway.
    for ref in sorted({path for path, _line in note["files"]}):
        resolved = _resolve(conn, ref, aliases)
        if resolved is None:
            refs.append({"ref": ref, "status": "moved/missing", "candidates": relocate(conn, ref)})
            continue
        row = conn.execute("SELECT mtime FROM files WHERE path=?", (resolved,)).fetchone()
        changed = bool(note_epoch and row and row["mtime"] and row["mtime"] > note_epoch)
        srow = conn.execute("SELECT summary_en FROM summaries WHERE path=?", (resolved,)).fetchone()
        refs.append({
            "ref": ref, "resolved": resolved,
            "status": "changed" if changed else "current",
            "changed_date": datetime.fromtimestamp(row["mtime"]).date().isoformat() if row and row["mtime"] else None,
            "symbols": [f"L{s['line']} {s['kind']} {s['signature'] or s['name']}" for s in _db.symbols_for(conn, resolved)],
            "summary": srow["summary_en"] if srow else None,
            "dependents": _graph.dependents(conn, resolved)[:8],
        })
    return {"note": str(note_path), "date": note["date"], "refs": refs}
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def format_briefing(b: dict) -> str:
    """Render a refresh_briefing() result as Markdown text.

    A title line is followed by one section per reference. Unresolved
    references list their relocation candidates. Resolved ones show the
    change tag and date, then the summary, up to 25 symbols and the
    dependents, each only when present.
    """
    note_name = Path(b["note"]).name
    verified = b["date"] or "n/a"
    lines = [f"# Refresh briefing — {note_name} (verified {verified})", ""]

    for entry in b["refs"]:
        if entry["status"] == "moved/missing":
            candidates = ", ".join(entry["candidates"]) or "no candidate found"
            lines.append(f"## ⚠ MOVED/MISSING: {entry['ref']}\n likely now → {candidates}\n")
        else:
            if entry["status"] == "changed":
                tag = "CHANGED since note"
            else:
                tag = "current"
            lines.append(f"## {entry['resolved']} [{tag}, last change {entry['changed_date']}]")
            if entry["summary"]:
                lines.append(f" summary: {entry['summary']}")
            if entry["symbols"]:
                lines.append(" symbols now:")
                lines.extend(f" {sym}" for sym in entry["symbols"][:25])
            if entry["dependents"]:
                lines.append(" used by: " + ", ".join(entry["dependents"]))
            lines.append("")

    return "\n".join(lines)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
_ESTADO_RE = re.compile(r"^(estado|status)\s*:.*$", re.M)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def mark_stale(path: Path) -> bool:
    """Set ``estado: revisar`` in the note's frontmatter (their convention).

    The first existing `estado:` / `status:` line is replaced; if there is
    none, the key is appended to the end of the frontmatter block. The file is
    rewritten in place. Returns False, leaving the file untouched, when the
    note has no frontmatter.
    """
    text = path.read_text(encoding="utf-8", errors="ignore")
    front = _FM_RE.match(text)
    if front is None:
        return False

    body = front.group(1)
    if _ESTADO_RE.search(body) is None:
        patched = body + "\nestado: revisar"
    else:
        patched = _ESTADO_RE.sub("estado: revisar", body, count=1)

    # Splice the patched block back between the untouched head and tail.
    head, tail = text[: front.start(1)], text[front.end(1):]
    path.write_text(head + patched + tail, encoding="utf-8")
    return True
|
cerebro/embeddings.py
ADDED
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"""Optional semantic search layer.
|
|
2
|
+
|
|
3
|
+
Keyword search (FTS5) misses intent — "where do we validate stock during purchase?"
|
|
4
|
+
shares no keywords with the checkout service. This embeds one vector per SYMBOL
|
|
5
|
+
(function/class: `path kind name signature` + the file summary), plus a whole-file
|
|
6
|
+
vector for files with no indexable symbols, with a small LOCAL model (model2vec, no
|
|
7
|
+
torch, no API key, nothing leaves the machine) and ranks by cosine similarity in
|
|
8
|
+
numpy — so a hit lands on the exact symbol + line, not just the file.
|
|
9
|
+
|
|
10
|
+
It is fully optional: install with `uv sync --extra semantic`. Without the extra,
|
|
11
|
+
every function degrades to a no-op and search stays keyword-only.
|
|
12
|
+
"""
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import hashlib
|
|
16
|
+
import logging
|
|
17
|
+
import os
|
|
18
|
+
|
|
19
|
+
from . import db
|
|
20
|
+
from . import summaries
|
|
21
|
+
|
|
22
|
+
# Keep the local model quiet — no progress bars / HTTP chatter (set before any
|
|
23
|
+
# huggingface import, which model2vec does lazily inside _model()).
|
|
24
|
+
os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
|
|
25
|
+
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
|
|
26
|
+
logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
27
|
+
|
|
28
|
+
try: # optional dependencies — guarded so the module imports either way
|
|
29
|
+
import numpy as np
|
|
30
|
+
except Exception: # pragma: no cover
|
|
31
|
+
np = None
|
|
32
|
+
|
|
33
|
+
_MODEL = None
|
|
34
|
+
_MODEL_NAME = "minishlab/potion-base-8M"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def model_available() -> bool:
    """Report whether semantic search can run in this environment.

    True only when numpy was importable at module load and `model2vec` can be
    imported now. Any import failure counts as "not available".
    """
    if np is None:
        return False
    try:
        import model2vec  # noqa: F401
    except Exception:
        return False
    return True
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _model():
    """Return the shared embedding model, loading it on first use.

    The model2vec import is kept inside the function so the module still
    imports when the optional dependency is missing.
    """
    global _MODEL
    if _MODEL is not None:
        return _MODEL

    from model2vec import StaticModel

    _MODEL = StaticModel.from_pretrained(_MODEL_NAME)
    return _MODEL
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _docs_for(conn, path: str):
    """Build the texts to embed for one file.

    Each symbol yields one document — `path kind name signature` followed by
    the file summary for context (there are no per-symbol summaries); the
    symbol name stands in when the signature is empty. A file without
    indexable symbols yields a single whole-file document with name/line/kind
    set to None, so it stays searchable.

    Returns a list of (name, line, kind, doc).
    """
    summary_row = conn.execute(
        "SELECT summary_en FROM summaries WHERE path=?", (path,)
    ).fetchone()
    summary = summary_row["summary_en"] if summary_row else ""

    symbols = db.symbols_for(conn, path)
    if not symbols:
        return [(None, None, None, f"{path}\n{summary}".strip())]

    documents = []
    for sym in symbols:
        label = sym["signature"] or sym["name"]
        text = f"{path} {sym['kind']} {sym['name']} {label}\n{summary}".strip()
        documents.append((sym["name"], sym["line"], sym["kind"], text))
    return documents
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def has_index(conn) -> bool:
    """Return True when numpy is available and at least one symbol embedding is stored."""
    if np is None:
        return False
    count_row = conn.execute(
        "SELECT COUNT(*) AS n FROM symbol_embeddings"
    ).fetchone()
    return count_row["n"] > 0
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def build(config, conn, only_missing: bool = True) -> dict:
    """Embed each file's symbols and store the vectors in ``symbol_embeddings``.

    There is one vector per symbol, plus one whole-file vector for files with
    no symbols. doc_hash is a per-file fingerprint, identical on every row of
    a file: with only_missing=True a file whose fingerprint is unchanged is
    skipped wholesale, and a changed (or new) file has all its rows replaced.
    Only files with a non-NULL language are considered.

    Returns {"ok": False, "reason": ...} when the semantic extra is not
    installed, otherwise {"ok": True, "embedded": <files re-embedded>,
    "total": <files considered>}. Commits on success.
    """
    if not model_available():
        return {"ok": False, "reason": "semantic extra not installed (uv sync --extra semantic)"}
    model = _model()
    files = [
        r["path"]
        for r in conn.execute("SELECT path FROM files WHERE lang IS NOT NULL")
    ]
    have: dict[str, str] = {}
    for r in conn.execute("SELECT path, doc_hash FROM symbol_embeddings"):
        # Every row of a file carries the same hash, so the first one is enough.
        have.setdefault(r["path"], r["doc_hash"])

    stale_paths = []  # files whose rows must be replaced
    targets, docs = [], []  # targets: (path, name, line, kind, fhash)
    for p in files:
        file_docs = _docs_for(conn, p)
        fhash = hashlib.sha1(
            "\x00".join(d for (_, _, _, d) in file_docs).encode("utf-8")
        ).hexdigest()
        if only_missing and have.get(p) == fhash:
            continue
        stale_paths.append(p)
        for (name, line, kind, doc) in file_docs:
            targets.append((p, name, line, kind, fhash))
            docs.append(doc)
    if not docs:
        conn.commit()
        return {"ok": True, "embedded": 0, "total": len(files)}

    # Run the model BEFORE touching the table. If encoding fails, no delete
    # is left pending in the open transaction for a later commit on this
    # connection to persist.
    vecs = np.asarray(model.encode(docs), dtype="float32")
    dim = int(vecs.shape[1])
    now = summaries.now_iso()

    # Changed (or new) files: drop their stale rows, then insert the fresh ones.
    conn.executemany(
        "DELETE FROM symbol_embeddings WHERE path=?", [(p,) for p in stale_paths]
    )
    conn.executemany(
        "INSERT INTO symbol_embeddings"
        "(path, name, line, kind, dim, vec, doc_hash, updated_at) "
        "VALUES(?,?,?,?,?,?,?,?)",
        [
            (p, name, line, kind, dim, v.tobytes(), fhash, now)
            for (p, name, line, kind, fhash), v in zip(targets, vecs)
        ],
    )
    conn.commit()
    return {
        "ok": True,
        "embedded": len({t[0] for t in targets}),
        "total": len(files),
    }
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def search(config, conn, query: str, limit: int = 10):
    """Rank stored symbol vectors by cosine similarity to *query*.

    Returns [(path, name, line, cosine_score), ...] best-first, at most
    *limit* long; name/line are None for a whole-file hit. Returns [] when the
    model is unavailable or nothing has been embedded.
    """
    if not (model_available() and has_index(conn)):
        return []

    stored = conn.execute(
        "SELECT path, name, line, vec FROM symbol_embeddings"
    ).fetchall()
    if not stored:
        return []

    # Unit-normalise both sides so the dot product below is the cosine score;
    # the small epsilon guards against a zero-length vector.
    q = np.asarray(_model().encode([query])[0], dtype="float32")
    q /= np.linalg.norm(q) + 1e-9

    blob = b"".join(record["vec"] for record in stored)
    matrix = np.frombuffer(blob, dtype="float32").reshape(len(stored), -1)
    row_norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    matrix = matrix / (row_norms + 1e-9)

    scores = matrix @ q
    best = np.argsort(-scores)[:limit]

    hits = []
    for idx in best:
        record = stored[idx]
        hits.append((record["path"], record["name"], record["line"], float(scores[idx])))
    return hits
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def main():  # `cerebro-embed` entry point
    """Build the embeddings for the configured project and print the result as JSON.

    The printed object is build()'s result plus a "root" key holding the
    project root path.
    """
    import json

    from . import config as cfg

    settings = cfg.Config.load()
    connection = db.connect(settings.db_path)

    report = build(settings, connection)
    report["root"] = str(settings.root)
    print(json.dumps(report))
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
if __name__ == "__main__":
|
|
175
|
+
main()
|