cerebro-code-memory 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cerebro/db.py ADDED
@@ -0,0 +1,245 @@
1
+ """SQLite storage: schema, connection, and low-level write/query helpers.
2
+
3
+ A single file, `.cerebro/brain.db`, holds every "trace": the structural index
4
+ (files, symbols, edges), the cached English summaries, the decision notes
5
+ (reserved for v2), and an FTS5 index used by keyword search.
6
+ """
7
+ from __future__ import annotations
8
+
9
+ import sqlite3
10
+ from pathlib import Path
11
+
12
+ SCHEMA = """
13
+ CREATE TABLE IF NOT EXISTS files (
14
+ path TEXT PRIMARY KEY,
15
+ lang TEXT,
16
+ hash TEXT NOT NULL,
17
+ mtime REAL,
18
+ size INTEGER,
19
+ indexed_at TEXT NOT NULL
20
+ );
21
+
22
+ CREATE TABLE IF NOT EXISTS symbols (
23
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
24
+ file_path TEXT NOT NULL,
25
+ kind TEXT NOT NULL,
26
+ name TEXT NOT NULL,
27
+ line INTEGER,
28
+ signature TEXT
29
+ );
30
+ CREATE INDEX IF NOT EXISTS idx_symbols_file ON symbols(file_path);
31
+ CREATE INDEX IF NOT EXISTS idx_symbols_name ON symbols(name);
32
+
33
+ CREATE TABLE IF NOT EXISTS edges (
34
+ src_path TEXT NOT NULL,
35
+ dst_path TEXT NOT NULL,
36
+ kind TEXT NOT NULL DEFAULT 'import',
37
+ PRIMARY KEY (src_path, dst_path, kind)
38
+ );
39
+ CREATE INDEX IF NOT EXISTS idx_edges_dst ON edges(dst_path);
40
+
41
+ -- Symbol-level call graph (tree-sitter, name-resolved): src_symbol in src_path
42
+ -- calls/references a symbol named dst_name. src_symbol is NULL at module scope.
43
+ CREATE TABLE IF NOT EXISTS calls (
44
+ src_path TEXT NOT NULL,
45
+ src_symbol TEXT,
46
+ dst_name TEXT NOT NULL,
47
+ line INTEGER
48
+ );
49
+ CREATE INDEX IF NOT EXISTS idx_calls_dst ON calls(dst_name);
50
+ CREATE INDEX IF NOT EXISTS idx_calls_src ON calls(src_path);
51
+
52
+ -- Identifier references: a distinct (path, name) for every name USED in a file
53
+ -- (calls, JSX, type annotations, value reads), EXCLUDING the file's own
54
+ -- definition sites. Lets dead_symbols() ask "is this symbol's name referenced
55
+ -- anywhere?" — a name absent from refs entirely is an unused-export candidate.
56
+ CREATE TABLE IF NOT EXISTS refs (
57
+ path TEXT NOT NULL,
58
+ name TEXT NOT NULL,
59
+ PRIMARY KEY (path, name)
60
+ );
61
+ CREATE INDEX IF NOT EXISTS idx_refs_name ON refs(name);
62
+
63
+ CREATE TABLE IF NOT EXISTS summaries (
64
+ path TEXT PRIMARY KEY,
65
+ summary_en TEXT NOT NULL,
66
+ model TEXT,
67
+ source_hash TEXT,
68
+ updated_at TEXT NOT NULL
69
+ );
70
+
71
+ -- Reserved for v2 (decision log). Created now so the schema is stable.
72
+ CREATE TABLE IF NOT EXISTS notes (
73
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
74
+ topic TEXT,
75
+ content TEXT NOT NULL,
76
+ created_at TEXT NOT NULL
77
+ );
78
+
79
+ CREATE TABLE IF NOT EXISTS meta (
80
+ key TEXT PRIMARY KEY,
81
+ value TEXT
82
+ );
83
+
84
+ -- Optional semantic layer (legacy, file-level): one vector per file. Superseded
85
+ -- by symbol_embeddings below; kept so older brains still open cleanly.
86
+ CREATE TABLE IF NOT EXISTS embeddings (
87
+ path TEXT PRIMARY KEY,
88
+ dim INTEGER,
89
+ vec BLOB NOT NULL,
90
+ doc_hash TEXT,
91
+ updated_at TEXT NOT NULL
92
+ );
93
+
94
+ -- Symbol-level semantic vectors: one row per symbol (function/class), plus one
95
+ -- whole-file row (name IS NULL) for files with no indexable symbols. Lets search
96
+ -- land on the exact symbol + line, not just the file. doc_hash is a per-file
97
+ -- fingerprint (identical on every row of a file) so build() skips unchanged files.
98
+ CREATE TABLE IF NOT EXISTS symbol_embeddings (
99
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
100
+ path TEXT NOT NULL,
101
+ name TEXT,
102
+ line INTEGER,
103
+ kind TEXT,
104
+ dim INTEGER,
105
+ vec BLOB NOT NULL,
106
+ doc_hash TEXT,
107
+ updated_at TEXT NOT NULL
108
+ );
109
+ CREATE INDEX IF NOT EXISTS idx_symemb_path ON symbol_embeddings(path);
110
+
111
+ -- One row per (path, kind). kind='symbol' aggregates a file's symbol names;
112
+ -- kind='summary' holds the file's English summary. Maintained manually.
113
+ CREATE VIRTUAL TABLE IF NOT EXISTS fts USING fts5(
114
+ path UNINDEXED,
115
+ kind UNINDEXED,
116
+ text
117
+ );
118
+ """
119
+
120
+
121
def connect(db_path: Path) -> sqlite3.Connection:
    """Open the database at *db_path* and make sure the schema exists.

    The parent directory is created when missing. Rows come back as
    ``sqlite3.Row`` so callers can index columns by name.
    """
    db_path.parent.mkdir(parents=True, exist_ok=True)
    connection = sqlite3.connect(str(db_path))
    connection.row_factory = sqlite3.Row
    # Order matters: busy_timeout goes FIRST because switching to WAL itself
    # takes a write lock, so a concurrent connection (e.g. the post-edit hook's
    # reindex while the MCP server connects) must wait here rather than fail
    # with "database is locked".
    for pragma in ("PRAGMA busy_timeout=5000", "PRAGMA journal_mode=WAL"):
        connection.execute(pragma)
    connection.executescript(SCHEMA)
    return connection
132
+
133
+
134
+ # --- structural index writes -------------------------------------------------
135
+
136
def upsert_file(conn, path, lang, file_hash, mtime, size, indexed_at):
    """Insert the `files` row for *path*, or overwrite every other column of
    the existing row when the path is already stored."""
    statement = """INSERT INTO files(path, lang, hash, mtime, size, indexed_at)
           VALUES(?,?,?,?,?,?)
           ON CONFLICT(path) DO UPDATE SET
             lang=excluded.lang, hash=excluded.hash, mtime=excluded.mtime,
             size=excluded.size, indexed_at=excluded.indexed_at"""
    values = (path, lang, file_hash, mtime, size, indexed_at)
    conn.execute(statement, values)
145
+
146
+
147
def replace_symbols(conn, path, symbols):
    """Replace the stored symbols of *path* and refresh its FTS 'symbol' row.

    symbols: iterable of (kind, name, line, signature). It is materialized
    once up front, so a generator or other one-shot iterable is safe: the
    rows are needed twice, once for the `symbols` table and once to build the
    aggregated name text for the `fts` table.

    The FTS row holds the file's distinct symbol names, sorted and joined by
    spaces. When there are no symbols the old FTS row is removed and none is
    written.
    """
    rows = [(path, kind, name, line, sig) for (kind, name, line, sig) in symbols]
    conn.execute("DELETE FROM symbols WHERE file_path=?", (path,))
    conn.executemany(
        "INSERT INTO symbols(file_path, kind, name, line, signature) VALUES(?,?,?,?,?)",
        rows,
    )
    names = " ".join(sorted({row[2] for row in rows}))
    conn.execute("DELETE FROM fts WHERE path=? AND kind='symbol'", (path,))
    if names:
        conn.execute(
            "INSERT INTO fts(path, kind, text) VALUES(?, 'symbol', ?)", (path, names)
        )
160
+
161
+
162
def replace_edges(conn, src_path, edges):
    """Swap the outgoing edges of *src_path* for the given set.

    edges: either an iterable of destination paths (each stored with kind
    'import') or a mapping {dst_path: kind} that carries an explicit kind per
    destination (e.g. type-only TS imports). Duplicates are collapsed and the
    rows are written in sorted order.
    """
    if isinstance(edges, dict):
        pairs = edges.items()
    else:
        pairs = ((dst, "import") for dst in edges)
    unique_rows = {(src_path, dst, kind) for dst, kind in pairs}
    ordered_rows = sorted(unique_rows)
    conn.execute("DELETE FROM edges WHERE src_path=?", (src_path,))
    conn.executemany(
        "INSERT OR IGNORE INTO edges(src_path, dst_path, kind) VALUES(?,?,?)",
        ordered_rows,
    )
172
+
173
+
174
def replace_calls(conn, src_path, calls):
    """Drop every stored call row of *src_path*, then store the given ones.

    calls: iterable of (src_symbol|None, dst_name, line).
    """
    conn.execute("DELETE FROM calls WHERE src_path=?", (src_path,))
    new_rows = []
    for caller, callee, lineno in calls:
        new_rows.append((src_path, caller, callee, lineno))
    conn.executemany(
        "INSERT INTO calls(src_path, src_symbol, dst_name, line) VALUES(?,?,?,?)",
        new_rows,
    )
181
+
182
+
183
def replace_refs(conn, src_path, names):
    """Replace the identifier references recorded for *src_path*.

    names: iterable of identifier names used (referenced) in src_path.
    Duplicates are collapsed and the names are written in sorted order.
    """
    conn.execute("DELETE FROM refs WHERE path=?", (src_path,))
    distinct = sorted(set(names))
    ref_rows = [(src_path, ident) for ident in distinct]
    conn.executemany("INSERT OR IGNORE INTO refs(path, name) VALUES(?,?)", ref_rows)
190
+
191
+
192
def forget_file(conn, path):
    """Remove every trace of a file that no longer exists on disk.

    Clears the file's rows from each table keyed by its path; in `edges` both
    the edges leaving the file and the edges pointing at it are removed.
    """
    only_path = (path,)
    deletions = (
        ("DELETE FROM files WHERE path=?", only_path),
        ("DELETE FROM symbols WHERE file_path=?", only_path),
        ("DELETE FROM edges WHERE src_path=? OR dst_path=?", (path, path)),
        ("DELETE FROM calls WHERE src_path=?", only_path),
        ("DELETE FROM refs WHERE path=?", only_path),
        ("DELETE FROM summaries WHERE path=?", only_path),
        ("DELETE FROM embeddings WHERE path=?", only_path),
        ("DELETE FROM symbol_embeddings WHERE path=?", only_path),
        ("DELETE FROM fts WHERE path=?", only_path),
    )
    for statement, params in deletions:
        conn.execute(statement, params)
203
+
204
+
205
+ # --- reads -------------------------------------------------------------------
206
+
207
def stored_hashes(conn) -> dict[str, str]:
    """Return a {path: hash} mapping covering every row of the `files` table."""
    hashes = {}
    for record in conn.execute("SELECT path, hash FROM files"):
        hashes[record["path"]] = record["hash"]
    return hashes
209
+
210
+
211
def symbols_for(conn, path):
    """Return the (kind, name, line, signature) rows stored for *path*,
    ordered by line."""
    query = "SELECT kind, name, line, signature FROM symbols WHERE file_path=? ORDER BY line"
    cursor = conn.execute(query, (path,))
    return cursor.fetchall()
216
+
217
+
218
def lang_counts(conn):
    """Return (lang, n) rows counting indexed files per language, most
    frequent first; files with a NULL lang are reported as 'other'."""
    query = (
        "SELECT COALESCE(lang,'other') AS lang, COUNT(*) AS n "
        "FROM files GROUP BY lang ORDER BY n DESC"
    )
    cursor = conn.execute(query)
    return cursor.fetchall()
223
+
224
+
225
def search(conn, query: str, limit: int = 15):
    """Keyword search over symbol names + summaries.

    Tries an FTS5 MATCH first and returns those rows when there are any.
    Falls back to a substring LIKE scan when FTS5 rejects the raw query
    syntax or matches nothing. Rows of kind 'note' are excluded either way.

    Returns rows with `path`, `kind` and `snip` columns, at most *limit*.
    """
    try:
        rows = conn.execute(
            # summaries are higher-signal than symbol-name matches → rank them first
            "SELECT path, kind, snippet(fts, 2, '[', ']', '…', 12) AS snip "
            "FROM fts WHERE fts MATCH ? AND kind != 'note' "
            "ORDER BY (kind != 'summary'), rank LIMIT ?",
            (query, limit),
        ).fetchall()
        if rows:
            return rows
    except sqlite3.OperationalError:
        # Deliberate best-effort: the raw query is not valid FTS5 syntax, so
        # fall through to the LIKE scan below.
        pass
    # Escape LIKE metacharacters so the query is matched literally; without
    # this `load_cfg` would also match `loadXcfg`, because `_` is a
    # single-character wildcard.
    escaped = query.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    return conn.execute(
        "SELECT path, kind, substr(text,1,120) AS snip FROM fts "
        "WHERE text LIKE ? ESCAPE '\\' AND kind != 'note' LIMIT ?",
        (f"%{escaped}%", limit),
    ).fetchall()
cerebro/docaudit.py ADDED
@@ -0,0 +1,174 @@
1
+ """Living-docs audit: cross-check a knowledge vault (Markdown notes) against the
2
+ Cerebro code index to find notes whose referenced code has changed or vanished.
3
+
4
+ This automates the rule every wiki states but no wiki enforces — "if the doc
5
+ contradicts the code, the code wins, mark the doc stale". A note is flagged when:
6
+ - a code path it references no longer exists in the index (broken), or
7
+ - a referenced file changed after the note's `ultima_verificacion` / `fecha`, or
8
+ - a referenced symbol (method/class) is no longer defined anywhere (heuristic).
9
+ """
10
+ from __future__ import annotations
11
+
12
+ import re
13
+ from datetime import datetime
14
+ from pathlib import Path
15
+
16
+ from . import db as _db
17
+ from . import graph as _graph
18
+
19
+ _EXT = r"(?:ts|tsx|js|jsx|mjs|cjs|py)"
20
+ # a real code path: at least one '/', ending in a code extension, optional :line
21
+ _PATH_RE = re.compile(r"([\w.@-]+(?:/[\w.@-]+)+\." + _EXT + r")(?::(\d+))?")
22
+ _FM_RE = re.compile(r"^---\s*\n(.*?)\n---", re.S)
23
+ _DATE_RE = re.compile(r"(?:ultima_verificacion|fecha)\s*:\s*(\d{4}-\d{2}-\d{2})")
24
+ _BACKTICK_RE = re.compile(r"`([^`\n]+)`")
25
+
26
+
27
def parse_note(path: Path) -> dict:
    """Extract the verifiable facts from one Markdown note.

    Returns a dict with:
      - "date": the `ultima_verificacion` / `fecha` value found in the
        frontmatter block (YYYY-MM-DD), or None when there is no frontmatter,
        no such key, or the value is not a real calendar date.
      - "files": set of (path, line) tuples for every code path in the text;
        line is the digits after `:` as a string, or None when absent.
      - "symbols": set of identifiers found inside backtick spans: names
        followed by `(`, plus any identifier mixing lower and upper case.
    """
    text = path.read_text(encoding="utf-8", errors="ignore")
    fm = _FM_RE.match(text)
    fm_text = fm.group(1) if fm else ""
    dm = _DATE_RE.search(fm_text)
    date = dm.group(1) if dm else None
    if date is not None:
        # The regex only checks the digit layout; "2024-02-31" would pass it
        # and later make strptime() raise, aborting the audit of every note.
        try:
            datetime.strptime(date, "%Y-%m-%d")
        except ValueError:
            date = None
    files = {(m.group(1), m.group(2)) for m in _PATH_RE.finditer(text)}
    symbols = set()
    for m in _BACKTICK_RE.finditer(text):
        content = m.group(1)
        # foo(...) — an identifier immediately followed by a call paren
        symbols.update(re.findall(r"\b([A-Za-z_]\w{2,})\s*\(", content))
        # mixedCase ident — must contain both a lowercase and an uppercase letter
        for tok in re.findall(r"\b([A-Za-z_]\w{2,})\b", content):
            if re.search(r"[a-z]", tok) and re.search(r"[A-Z]", tok):
                symbols.add(tok)
    return {"date": date, "files": files, "symbols": symbols}
42
+
43
+
44
+ def _normalize_ref(ref: str, aliases: dict[str, str]) -> str:
45
+ """Resolve wiki naming to real repo paths: map logical app aliases
46
+ (backend_app -> fenix-store-backend) and strip ../ / cross-machine prefixes."""
47
+ parts = [p for p in ref.split("/") if p not in ("", ".")]
48
+ for i, p in enumerate(parts): # cut at the first known app-alias segment
49
+ if p in aliases:
50
+ parts = [aliases[p]] + parts[i + 1:]
51
+ return "/".join(parts)
52
+ while parts and parts[0] == "..":
53
+ parts.pop(0)
54
+ return "/".join(parts)
55
+
56
+
57
def _resolve(conn, ref: str, aliases: dict[str, str]) -> str | None:
    """Map a path written in a note to a path stored in the `files` table.

    The reference is normalized first. An exact match wins; otherwise the
    reference is accepted as a path suffix only when exactly one indexed file
    ends with `/<ref>`. Returns None when nothing matches or the suffix is
    ambiguous.
    """
    ref = _normalize_ref(ref, aliases)
    if conn.execute("SELECT 1 FROM files WHERE path=?", (ref,)).fetchone():
        return ref
    # Escape LIKE metacharacters: `_` is a single-character wildcard, so an
    # unescaped `a_b.py` would also match `a-b.py` and turn a unique
    # reference into an ambiguous (reported-as-broken) one.
    escaped = ref.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    rows = conn.execute(
        "SELECT path FROM files WHERE path LIKE ? ESCAPE '\\'", ("%/" + escaped,)
    ).fetchall()
    return rows[0]["path"] if len(rows) == 1 else None
63
+
64
+
65
+ def _epoch(date: str) -> float:
66
+ return datetime.strptime(date, "%Y-%m-%d").timestamp()
67
+
68
+
69
def audit_note(conn, note: dict, known_symbols: set[str], aliases: dict[str, str]) -> list[tuple[str, str]]:
    """Check one parsed note against the index and list what looks outdated.

    Returns (tag, message) tuples:
      - "broken":  a referenced path cannot be resolved in the index;
      - "changed": the resolved file's stored mtime is later than the note's
        verification date (only checked when the note has a date);
      - "symbol?": a referenced identifier is not in *known_symbols*.
    """
    issues: list[tuple[str, str]] = []
    note_epoch = _epoch(note["date"]) if note["date"] else None
    # note["files"] holds (path, line) pairs where line is a str or None.
    # Sorting those tuples directly raises TypeError when one path appears
    # both with and without a line number, and would report that path once
    # per variant. The line is unused here, so audit each path once.
    for ref in sorted({path for path, _line in note["files"]}):
        resolved = _resolve(conn, ref, aliases)
        if resolved is None:
            issues.append(("broken", f"{ref} — not in the index"))
            continue
        if note_epoch is None:
            continue
        row = conn.execute("SELECT mtime FROM files WHERE path=?", (resolved,)).fetchone()
        if row and row["mtime"] and row["mtime"] > note_epoch:
            mod = datetime.fromtimestamp(row["mtime"]).date().isoformat()
            issues.append(("changed", f"{resolved} changed {mod} (note verified {note['date']})"))
    issues.extend(
        ("symbol?", f"`{s}` — not defined anywhere (renamed/removed?)")
        for s in sorted(note["symbols"])
        if s not in known_symbols
    )
    return issues
86
+
87
+
88
def audit_vault(conn, vault: Path, aliases: dict[str, str] | None = None) -> list[dict]:
    """Audit every Markdown note under *vault* against the code index.

    Notes inside dot-directories of the vault (.obsidian etc.) and notes that
    reference no code path or symbol are skipped. Each result is a dict with
    "note" (the path), "status" ("stale" when a reference is broken or
    changed, "hint" when only symbol hints exist, "fresh" otherwise), "date"
    and "issues".
    """
    aliases = aliases or {}
    known = {r["name"] for r in conn.execute("SELECT DISTINCT name FROM symbols")}
    results = []
    for md in sorted(vault.rglob("*.md")):
        # Judge hidden-ness on the path INSIDE the vault only. Testing the
        # full path would skip every note when the vault itself is reached
        # through "..", or lives under a dot-directory such as ~/.notes.
        if any(part.startswith(".") for part in md.relative_to(vault).parts):
            continue
        if not md.is_file():  # a directory that happens to be named *.md
            continue
        note = parse_note(md)
        if not note["files"] and not note["symbols"]:
            continue  # purely conceptual note — nothing to verify against code
        issues = audit_note(conn, note, known, aliases)
        hard = [i for i in issues if i[0] in ("broken", "changed")]
        status = "stale" if hard else ("hint" if issues else "fresh")
        results.append({"note": md, "status": status, "date": note["date"], "issues": issues})
    return results
103
+
104
+
105
def relocate(conn, ref: str) -> list[str]:
    """Where a moved/renamed file likely lives now — same basename in the index.

    Returns every indexed path whose final segment equals the basename of
    *ref*, including a file sitting at the repository root (whose path has no
    `/` and could never match the `%/base` pattern alone).
    """
    base = ref.split("/")[-1]
    # Escape LIKE metacharacters so `_` / `%` in the file name match literally.
    escaped = base.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    rows = conn.execute(
        "SELECT path FROM files WHERE path = ? OR path LIKE ? ESCAPE '\\'",
        (base, "%/" + escaped),
    ).fetchall()
    return [r["path"] for r in rows]
110
+
111
+
112
def refresh_briefing(conn, note_path: Path, aliases: dict[str, str] | None = None) -> dict:
    """Re-audit a stale note against the LIVE code.

    For each code path the note references, collect either the current facts
    (symbols, summary, dependents, last-change date) or, when the path no
    longer resolves, relocation candidates. This is the structured context an
    agent uses to propose the update.

    Returns {"note": str, "date": str | None, "refs": [...]}, one entry per
    distinct referenced path.
    """
    aliases = aliases or {}
    note = parse_note(Path(note_path))
    note_epoch = _epoch(note["date"]) if note["date"] else None
    refs = []
    # note["files"] holds (path, line) pairs where line is a str or None.
    # Sorting them directly raises TypeError when one path appears both with
    # and without a line number, and would emit that path once per variant.
    # The line is unused here, so brief each path once.
    for ref in sorted({path for path, _line in note["files"]}):
        resolved = _resolve(conn, ref, aliases)
        if resolved is None:
            refs.append({"ref": ref, "status": "moved/missing", "candidates": relocate(conn, ref)})
            continue
        row = conn.execute("SELECT mtime FROM files WHERE path=?", (resolved,)).fetchone()
        mtime = row["mtime"] if row else None
        # `is not None`, not truthiness: an epoch of 0.0 is still a date.
        changed = bool(note_epoch is not None and mtime and mtime > note_epoch)
        srow = conn.execute("SELECT summary_en FROM summaries WHERE path=?", (resolved,)).fetchone()
        refs.append({
            "ref": ref, "resolved": resolved,
            "status": "changed" if changed else "current",
            "changed_date": datetime.fromtimestamp(mtime).date().isoformat() if mtime else None,
            "symbols": [f"L{s['line']} {s['kind']} {s['signature'] or s['name']}" for s in _db.symbols_for(conn, resolved)],
            "summary": srow["summary_en"] if srow else None,
            "dependents": _graph.dependents(conn, resolved)[:8],
        })
    return {"note": str(note_path), "date": note["date"], "refs": refs}
137
+
138
+
139
def format_briefing(b: dict) -> str:
    """Render a refresh-briefing dict as Markdown text.

    A title line names the note and its verification date ('n/a' when
    missing), followed by one section per reference: either a MOVED/MISSING
    warning with relocation candidates, or the resolved path with its
    summary, up to 25 symbols and its dependents.
    """
    note_name = Path(b['note']).name
    verified = b['date'] or 'n/a'
    lines = [f"# Refresh briefing — {note_name} (verified {verified})", ""]
    for entry in b["refs"]:
        if entry["status"] == "moved/missing":
            candidates = ", ".join(entry["candidates"])
            if not candidates:
                candidates = "no candidate found"
            lines.append(f"## ⚠ MOVED/MISSING: {entry['ref']}\n likely now → {candidates}\n")
            continue
        if entry["status"] == "changed":
            label = "CHANGED since note"
        else:
            label = "current"
        lines.append(f"## {entry['resolved']} [{label}, last change {entry['changed_date']}]")
        summary = entry["summary"]
        if summary:
            lines.append(f" summary: {summary}")
        symbols = entry["symbols"]
        if symbols:
            lines.append(" symbols now:")
            lines.extend(f" {sym}" for sym in symbols[:25])
        dependents = entry["dependents"]
        if dependents:
            lines.append(" used by: " + ", ".join(dependents))
        lines.append("")
    return "\n".join(lines)
157
+
158
+
159
+ _ESTADO_RE = re.compile(r"^(estado|status)\s*:.*$", re.M)
160
+
161
+
162
def mark_stale(path: Path) -> bool:
    """Patch the note's frontmatter to `estado: revisar` (their convention).

    The first `estado:` / `status:` line of the frontmatter is replaced by
    `estado: revisar`; when there is none, the line is appended to the block.
    Returns False without touching the file when the note has no frontmatter,
    True after rewriting it.
    """
    original = path.read_text(encoding="utf-8", errors="ignore")
    front = _FM_RE.match(original)
    if not front:
        return False
    body = front.group(1)
    if not _ESTADO_RE.search(body):
        patched = body + "\nestado: revisar"
    else:
        patched = _ESTADO_RE.sub("estado: revisar", body, count=1)
    start, end = front.span(1)
    path.write_text(original[:start] + patched + original[end:], encoding="utf-8")
    return True
cerebro/embeddings.py ADDED
@@ -0,0 +1,175 @@
1
+ """Optional semantic search layer.
2
+
3
+ Keyword search (FTS5) misses intent — "where do we validate stock during purchase?"
4
+ shares no keywords with the checkout service. This embeds one vector per SYMBOL
5
+ (function/class: `path kind name signature` + the file summary), plus a whole-file
6
+ vector for files with no indexable symbols, with a small LOCAL model (model2vec, no
7
+ torch, no API key, nothing leaves the machine) and ranks by cosine similarity in
8
+ numpy — so a hit lands on the exact symbol + line, not just the file.
9
+
10
+ It is fully optional: install with `uv sync --extra semantic`. Without the extra,
11
+ every function degrades to a no-op and search stays keyword-only.
12
+ """
13
+ from __future__ import annotations
14
+
15
+ import hashlib
16
+ import logging
17
+ import os
18
+
19
+ from . import db
20
+ from . import summaries
21
+
22
+ # Keep the local model quiet — no progress bars / HTTP chatter (set before any
23
+ # huggingface import, which model2vec does lazily inside _model()).
24
+ os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
25
+ os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
26
+ logging.getLogger("httpx").setLevel(logging.WARNING)
27
+
28
+ try: # optional dependencies — guarded so the module imports either way
29
+ import numpy as np
30
+ except Exception: # pragma: no cover
31
+ np = None
32
+
33
+ _MODEL = None
34
+ _MODEL_NAME = "minishlab/potion-base-8M"
35
+
36
+
37
def model_available() -> bool:
    """Report whether both optional dependencies (numpy and model2vec) can be
    imported; any failure while importing model2vec counts as unavailable."""
    if np is None:
        return False
    try:
        import model2vec  # noqa: F401
    except Exception:
        return False
    else:
        return True
45
+
46
+
47
def _model():
    """Return the shared embedding model, loading it on first use.

    The model2vec import happens here, not at module import time, and the
    loaded model is kept in the module-level `_MODEL` for later calls.
    """
    global _MODEL
    if _MODEL is not None:
        return _MODEL
    from model2vec import StaticModel
    _MODEL = StaticModel.from_pretrained(_MODEL_NAME)
    return _MODEL
53
+
54
+
55
def _docs_for(conn, path: str):
    """Build the texts to embed for one file.

    Each symbol yields one document — `path kind name signature` (the name
    stands in when there is no signature) followed by the file summary for
    context, since there are no per-symbol summaries. A file with no
    indexable symbols gets a single whole-file document whose name/line/kind
    are None, so it stays searchable.

    Returns a list of (name, line, kind, doc).
    """
    summary_row = conn.execute(
        "SELECT summary_en FROM summaries WHERE path=?", (path,)
    ).fetchone()
    summary = summary_row["summary_en"] if summary_row else ""
    symbol_rows = db.symbols_for(conn, path)
    if not symbol_rows:
        return [(None, None, None, f"{path}\n{summary}".strip())]
    documents = []
    for sym in symbol_rows:
        text = f"{path} {sym['kind']} {sym['name']} {sym['signature'] or sym['name']}\n{summary}"
        documents.append((sym["name"], sym["line"], sym["kind"], text.strip()))
    return documents
76
+
77
+
78
def has_index(conn) -> bool:
    """Return True when numpy is importable and `symbol_embeddings` holds at
    least one row."""
    if np is None:
        return False
    count_row = conn.execute("SELECT COUNT(*) AS n FROM symbol_embeddings").fetchone()
    return count_row["n"] > 0
85
+
86
+
87
def build(config, conn, only_missing: bool = True) -> dict:
    """Embed each file's symbols (one vector per symbol, plus a whole-file
    vector for symbol-less files).

    doc_hash is a per-file fingerprint — identical across a file's rows — so
    with `only_missing` an unchanged file is skipped wholesale and a changed
    one has all its rows replaced.

    Vectors are computed BEFORE any row is deleted, and the model is only
    loaded when something actually needs embedding. A failed model load or
    encode therefore leaves the stored index untouched.

    Returns {"ok": False, "reason": ...} when the semantic extra is missing,
    otherwise {"ok": True, "embedded": <files re-embedded>, "total": <files>}.
    """
    if not model_available():
        return {"ok": False, "reason": "semantic extra not installed (uv sync --extra semantic)"}
    files = [
        r["path"]
        for r in conn.execute("SELECT path FROM files WHERE lang IS NOT NULL")
    ]
    # One fingerprint per path; every row of a file carries the same one.
    have: dict[str, str] = {}
    for r in conn.execute("SELECT path, doc_hash FROM symbol_embeddings"):
        have.setdefault(r["path"], r["doc_hash"])

    targets, docs = [], []  # targets: (path, name, line, kind, fhash)
    for p in files:
        file_docs = _docs_for(conn, p)
        fhash = hashlib.sha1(
            "\x00".join(d for (_, _, _, d) in file_docs).encode("utf-8")
        ).hexdigest()
        if only_missing and have.get(p) == fhash:
            continue
        for (name, line, kind, doc) in file_docs:
            targets.append((p, name, line, kind, fhash))
            docs.append(doc)
    if not docs:
        conn.commit()
        return {"ok": True, "embedded": 0, "total": len(files)}

    vecs = np.asarray(_model().encode(docs), dtype="float32")
    dim = int(vecs.shape[1])
    now = summaries.now_iso()
    changed = sorted({t[0] for t in targets})
    # Changed (or new) files: drop their stale rows, then write the fresh ones.
    conn.executemany(
        "DELETE FROM symbol_embeddings WHERE path=?", [(p,) for p in changed]
    )
    conn.executemany(
        "INSERT INTO symbol_embeddings"
        "(path, name, line, kind, dim, vec, doc_hash, updated_at) "
        "VALUES(?,?,?,?,?,?,?,?)",
        [
            (p, name, line, kind, dim, v.tobytes(), fhash, now)
            for (p, name, line, kind, fhash), v in zip(targets, vecs)
        ],
    )
    conn.commit()
    return {
        "ok": True,
        "embedded": len(changed),
        "total": len(files),
    }
136
+
137
+
138
def search(config, conn, query: str, limit: int = 10):
    """Rank stored symbol vectors by cosine similarity to *query*.

    Returns [(path, name, line, cosine_score), ...] best-first — name/line
    are None for a whole-file hit. Returns [] when the optional dependencies
    are missing or nothing has been embedded.
    """
    if not (model_available() and has_index(conn)):
        return []
    stored = conn.execute(
        "SELECT path, name, line, vec FROM symbol_embeddings"
    ).fetchall()
    if not stored:
        return []
    query_vec = np.asarray(_model().encode([query])[0], dtype="float32")
    query_vec /= np.linalg.norm(query_vec) + 1e-9
    # All blobs are concatenated and viewed as one float32 matrix, one row
    # per stored vector.
    blob = b"".join(row["vec"] for row in stored)
    matrix = np.frombuffer(blob, dtype="float32").reshape(len(stored), -1)
    row_norms = np.linalg.norm(matrix, axis=1, keepdims=True) + 1e-9
    matrix = matrix / row_norms
    scores = matrix @ query_vec
    hits = []
    for idx in np.argsort(-scores)[:limit]:
        row = stored[idx]
        hits.append((row["path"], row["name"], row["line"], float(scores[idx])))
    return hits
160
+
161
+
162
def main():  # `cerebro-embed` entry point
    """Build the embedding index for the configured project and print the
    result as one JSON object, with the project root added under "root"."""
    import json

    from . import config as cfg

    config = cfg.Config.load()
    conn = db.connect(config.db_path)
    try:
        result = build(config, conn)
    finally:
        # Release the database handle even when the build raises.
        conn.close()
    result["root"] = str(config.root)
    print(json.dumps(result))
172
+
173
+
174
+ if __name__ == "__main__":
175
+ main()