megabrain 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
megabrain/__init__.py ADDED
@@ -0,0 +1,13 @@
1
+ """megabrain — code-intelligence engine: one-shot retrieval of all code related
2
+ to a feature, as a view-ready map.
3
+
4
+ Validated configuration (experiments phases 0-5, June 2026):
5
+ - chunking: cAST split-then-merge, 4000 nws chars, breadcrumb headers
6
+ - embeddings: pplx-embed-v1-0.6b (1024d, int8 wire format, L2-normalized)
7
+ - scoring: dense chunk cosine + 0.5 * file-skeleton cosine
8
+ - graph: import+call edges; used for bundle candidates and map annotations,
9
+ NOT for ranking (PageRank rejected by experiment)
10
+ - pruning: OFF by default (LLM pruning costs completeness); --prune optional
11
+ """
12
+
# Version string of the megabrain package.
__version__ = "0.1.0"
megabrain/ask.py ADDED
@@ -0,0 +1,345 @@
1
+ """megabrain ask — agent-style explained answer with cherry-picked REAL code.
2
+
3
+ The LLM explains the answer like an agent walking through the codebase, but
4
+ it cannot paste code: it cites chunks as [[3]] or [[3:705-731]] and the engine
5
+ REPLACES each citation with the real code block (file header + fenced code,
6
+ true line numbers). Explanation = LLM; every line of code = verbatim from
7
+ disk. Streamed, ~1-3s. Fail-open: no citations / API error -> full bundle.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import re
14
+ import sys
15
+ import time
16
+ import urllib.request
17
+ from pathlib import Path
18
+
19
+ from .query import lang_of, render, search
20
+ from .rerank import _key
21
+ from .strategies import MarkdownStrategy
22
+
# ask is a CODE walkthrough: docs (markdown) are excluded from its candidates so a
# code explanation isn't diluted with prose. docs_only flips it to a docs-only
# walkthrough. Docs stay retrievable via `query` regardless.
# NOTE(review): _candidates passes this to str.endswith(), which needs a str or a
# tuple of str — confirm MarkdownStrategy.exts has that type.
DOC_EXTS = MarkdownStrategy.exts

# Model name sent in the request body built by _build_body.
MODEL = "claude-haiku-4-5"
# Soft budget for chunk text in the prompt; chunks past it are cut to 2000 chars.
MAX_CTX_CHARS = 200_000  # ~50K tokens of candidate code; Haiku window is 200K
# double-bracket so the model can still mention [n] in prose without collision.
# Tolerate an "L" prefix and stray spaces on the line range: the chunk headers in
# the prompt read "L1-172", so the model often mirrors that as [[0:L1-172]] — accept
# it (and [[3:705-731]], [[3]]) instead of leaking the citation as raw text.
# Groups: 1 = chunk index, 2/3 = optional first/last line of the cited sub-range.
_SEL = re.compile(r"\[\[(\d+)(?::\s*[Ll]?(\d+)\s*-\s*[Ll]?(\d+))?\s*\]\]")
35
+
36
+
def _candidates(res: dict, docs_only: bool = False) -> list[dict]:
    """Flatten a search result into the numbered candidate list for the walkthrough.

    Every chunk of each tier-1 file is taken, followed by the `best_chunk` (when
    present) of each tier-2 file. Files whose name ends with DOC_EXTS are dropped
    by default; with docs_only=True only those files are kept instead.

    Each returned dict carries "file" plus the chunk's name, kind, start_line,
    end_line and text. The position in the list is the citation index.
    """
    fields = ("name", "kind", "start_line", "end_line", "text")
    want_docs = bool(docs_only)

    def wanted(path: str) -> bool:
        # keep docs in docs-only mode, keep everything else otherwise
        return path.endswith(DOC_EXTS) == want_docs

    def entry(path: str, chunk: dict) -> dict:
        return {"file": path, **{name: chunk[name] for name in fields}}

    picked = []
    for item in res["tier1"]:
        if wanted(item["file"]):
            picked.extend(entry(item["file"], chunk) for chunk in item["chunks"])
    for item in res["tier2"]:
        if not wanted(item["file"]):
            continue
        best = item.get("best_chunk")
        if best:
            picked.append(entry(item["file"], best))
    return picked
60
+
61
+
# Citation rules interpolated verbatim into the prompt by _build_body. The
# [[k]] / [[k:lo-hi]] syntax described here is what the _SEL pattern matches.
_RULES = """- NEVER paste or quote code. Cite it with DOUBLE brackets: [[3]] (whole chunk) or [[3:705-731]] (file lines 705-731 of chunk 3). Each such citation is REPLACED by the real code block in your answer, so explain AROUND the code, not the code itself. (If you ever need to mention the citation syntax itself in prose, use single brackets — only [[...]] gets replaced.)
- Put each [[...]] citation on its own line, right after the sentence that introduces it.
- Show GENEROUS, COMPLETE code: cite whole [[k]] chunks (a full function/class/block) by default so the reader sees the complete implementation, not a fragment. Only use a [[k:lo-hi]] sub-range when a chunk is very large and only one section is relevant — and then take the WHOLE enclosing function, not a few lines. Never cite the same span twice.
- Structure it: use ## section headings for each phase of the flow, 1-3 sentences of explanation per citation. Be thorough — the reader must understand everything perfectly from the code shown, without opening any file.
- Finish the thought: end with a short "## Summary" of the flow in 2-3 sentences. Never end mid-sentence."""
67
+
68
+
def _build_body(question: str, cands: list[dict]) -> dict:
    """Anthropic request body: the cite-only walkthrough prompt over numbered chunks.

    Each candidate becomes a block headed "[i] file Lstart-end (name)" followed by
    its text. Once the running total of chunk text would exceed MAX_CTX_CHARS, a
    chunk is cut to its first 2000 characters and marked as truncated.

    Returns a dict with the model, token limit, temperature, streaming flag and a
    single user message holding the prompt.
    """
    head_limit = 2000
    blocks, used = [], 0
    for i, c in enumerate(cands):
        head = f'[{i}] {c["file"]} L{c["start_line"]}-{c["end_line"]}'
        if c["name"]:
            head += f' ({c["name"]})'
        body = c["text"]
        # Only label a chunk as truncated when something was actually cut: a short
        # over-budget chunk used to gain the marker although it was sent in full.
        if used + len(body) > MAX_CTX_CHARS and len(body) > head_limit:
            body = body[:head_limit] + "\n# ...truncated...\n"
        used += len(body)
        blocks.append(f"{head}\n{body}")
    # chr(10) is "\n"; it avoids a backslash inside the f-string expression.
    prompt = f"""You are a senior engineer giving a complete code walkthrough that answers the developer's query. Cover the ENTIRE relevant flow end to end — do not stop early, do not leave a thread dangling.

STRICT RULES:
{_RULES}

QUERY: {question}

RETRIEVED CHUNKS:

{chr(10).join(blocks)}"""
    return {"model": MODEL, "max_tokens": 2400, "temperature": 0, "stream": True,
            "messages": [{"role": "user", "content": prompt}]}
92
+
93
+
def _explain_stream(question: str, cands: list[dict], key: str) -> str:
    """ONE streamed Haiku call -> explanation text with [[k]]/[[k:lo-hi]] citations.

    When the model stopped because it hit max_tokens, the text is cut back to the
    last paragraph break or sentence end (whichever comes later) and a truncation
    notice is appended. Exceptions from the underlying call propagate.
    """
    text, stop = _stream_with_retry(_build_body(question, cands), key)
    if stop == "max_tokens" and text:
        cut = max(text.rfind("\n\n"), text.rfind(". "))
        if cut > 0:
            # keep the period (or first newline) at the cut; rstrip drops the rest
            text = text[:cut + 1]
        # Always tell the reader the answer is incomplete. The notice used to be
        # skipped when no boundary was found, unlike the streaming path.
        text = text.rstrip() + "\n\n_(walkthrough truncated — ask a narrower question for the rest)_"
    return text
102
+
103
+
104
+ def _stream_with_retry(body: dict, key: str, retries: int = 4,
105
+ on_delta=None) -> tuple[str, str]:
106
+ """Streamed Anthropic call with backoff on 429/5xx/overloaded. Returns (text, stop).
107
+ If on_delta is given it's called with each text delta (live rendering); once any
108
+ delta has been emitted we stop retrying, so the terminal never sees duplicate text."""
109
+ import time as _t
110
+ last = None
111
+ emitted = False
112
+ for attempt in range(retries):
113
+ req = urllib.request.Request(
114
+ "https://api.anthropic.com/v1/messages", data=json.dumps(body).encode(),
115
+ headers={"x-api-key": key, "anthropic-version": "2023-06-01",
116
+ "content-type": "application/json"})
117
+ text, stop = "", ""
118
+ try:
119
+ with urllib.request.urlopen(req, timeout=90) as r:
120
+ for raw in r:
121
+ line = raw.decode("utf-8", "replace").strip()
122
+ if not line.startswith("data: "):
123
+ continue
124
+ try:
125
+ ev = json.loads(line[6:])
126
+ except json.JSONDecodeError:
127
+ continue
128
+ t = ev.get("type")
129
+ if t == "content_block_delta":
130
+ d = ev["delta"].get("text", "")
131
+ text += d
132
+ if d and on_delta is not None:
133
+ on_delta(d)
134
+ emitted = True
135
+ elif t == "message_delta":
136
+ stop = ev.get("delta", {}).get("stop_reason") or stop
137
+ elif t == "error": # mid-stream overloaded_error etc.
138
+ raise urllib.error.HTTPError(req.full_url, 529, "stream error", None, None)
139
+ return text, stop
140
+ except urllib.error.HTTPError as e:
141
+ last = e
142
+ if emitted: # already streamed live: a retry would double-print
143
+ raise
144
+ if e.code in (429, 500, 502, 503, 529) and attempt < retries - 1:
145
+ _t.sleep(2 ** attempt)
146
+ continue
147
+ raise
148
+ except (urllib.error.URLError, TimeoutError) as e:
149
+ last = e
150
+ if emitted:
151
+ raise
152
+ if attempt < retries - 1:
153
+ _t.sleep(2 ** attempt)
154
+ continue
155
+ raise
156
+ raise last if last else RuntimeError("unreachable")
157
+
158
+
159
+ def _code_block(c: dict, lo: int | None, hi: int | None, seen: set,
160
+ file_syms: dict[str, list[dict]]) -> str:
161
+ cs, ce = c["start_line"], c["end_line"]
162
+ s, e = cs, ce
163
+ if lo is not None and hi is not None and not (hi < cs or lo > ce):
164
+ s, e = max(lo, cs), min(hi, ce)
165
+ _FN = ("function", "async_function", "method", "async_method", "class")
166
+ syms = [y for y in file_syms.get(c["file"], []) if y["kind"] in _FN]
167
+ if (s, e) != (cs, ce):
168
+ # snap to enclosing symbol edges when close (readable boundaries)
169
+ encl = [y for y in syms if y["line"] <= e and y["end_line"] >= s]
170
+ if encl:
171
+ best = min(encl, key=lambda y: y["end_line"] - y["line"])
172
+ if 0 < s - best["line"] <= 8:
173
+ s = max(best["line"], cs)
174
+ if 0 < best["end_line"] - e <= 8:
175
+ e = min(best["end_line"], ce)
176
+ # trim orphan tail of a previous symbol at the head of the range
177
+ nexts = sorted(y["line"] for y in syms if s < y["line"] <= min(s + 8, e))
178
+ if nexts:
179
+ owner = [y for y in syms if y["line"] < s <= y["end_line"]
180
+ and y["end_line"] < nexts[0]]
181
+ if owner:
182
+ s = nexts[0]
183
+ lines = c["text"].splitlines(keepends=True)
184
+ text = "".join(lines[s - cs:e - cs + 1])
185
+ key = (c["file"], s, e)
186
+ if key in seen:
187
+ return f'*(see `{c["file"]}:L{s}-{e}` above)*'
188
+ seen.add(key)
189
+ # label = most specific symbols overlapping the emitted range
190
+ inside = [y for y in syms if not (y["end_line"] < s or y["line"] > e)]
191
+ inside.sort(key=lambda y: y["end_line"] - y["line"])
192
+ tight = [y for y in inside if (y["end_line"] - y["line"]) <= 3 * (e - s + 1)]
193
+ label = ", ".join(dict.fromkeys(y["name"] for y in (tight or inside)[:2])) \
194
+ or (c["name"] or c["kind"])
195
+ return (f'\n**`{c["file"]}` L{s}-{e}** — {label}\n'
196
+ f'```{lang_of(c["file"])}\n{text.rstrip(chr(10))}\n```\n')
197
+
198
+
def ask(root: Path, question: str, rerank: bool = False,
        docs_only: bool = False) -> dict:
    """Retrieve candidates for `question` and, when possible, explain them.

    Runs search(), builds the candidate list, and makes one explanation call if
    an API key and at least one candidate are available. Any failure of that call
    leaves the text empty (render_ask then falls back to the plain bundle).

    Returns a dict with the search result, candidates, explanation text, the
    symbols of every candidate file, both timings in milliseconds, the query and
    the repo.
    """
    repo_root = Path(root)

    started = time.time()
    res = search(repo_root, question, rerank=rerank)
    retrieval_ms = int((time.time() - started) * 1000)

    cands = _candidates(res, docs_only)
    key = _key()

    text = ""
    llm_ms = 0
    if key and cands:
        llm_started = time.time()
        try:
            text = _explain_stream(question, cands, key)
        except Exception:
            # fail-open: an empty explanation is handled by the renderer
            text = ""
        llm_ms = int((time.time() - llm_started) * 1000)

    from .store import Store
    store = Store(repo_root)
    file_syms = {}
    for path in {c["file"] for c in cands}:
        file_syms[path] = store.symbols_for(path)

    return {"result": res, "cands": cands, "text": text, "file_syms": file_syms,
            "retrieval_ms": retrieval_ms, "llm_ms": llm_ms,
            "query": question, "repo": res["repo"]}
220
+
221
+
def cited_files(out: dict) -> list[str]:
    """Files cited in the explanation, in first-mention order (for eval).

    Citations whose index is outside the candidate list are ignored; each file
    appears once.
    """
    cands = out["cands"]
    indices = (int(m.group(1)) for m in _SEL.finditer(out["text"] or ""))
    # dict keys keep insertion order, giving an ordered de-duplication
    ordered = dict.fromkeys(
        cands[k]["file"] for k in indices if 0 <= k < len(cands))
    return list(ordered)
233
+
234
+
def render_ask(out: dict) -> str:
    """Turn the dict returned by ask() into the final markdown answer.

    Every valid [[k]] / [[k:lo-hi]] citation in the explanation is replaced by the
    real code block; out-of-range citations are left as written. Without any text
    or any citation, the plain render() of the search result is returned instead.
    The answer ends with a list of up to 12 uncited candidates and a usage hint.
    """
    cands = out["cands"]
    text = out["text"]
    if not text or not _SEL.search(text):
        return render(out["result"])  # fail-open: unfiltered bundle

    emitted_spans: set = set()
    cited_idx: set = set()

    def expand(match):
        idx = int(match.group(1))
        if idx < 0 or idx >= len(cands):
            return match.group(0)
        cited_idx.add(idx)
        first, last = match.group(2), match.group(3)
        lo = int(first) if first else None
        hi = int(last) if last else None
        return _code_block(cands[idx], lo, hi, emitted_spans,
                           out.get("file_syms", {}))

    body = _SEL.sub(expand, text).strip()
    n_files = len({cands[k]["file"] for k in cited_idx})

    parts = [
        f'# megabrain — "{out["query"]}"',
        f'repo `{out["repo"]}` · {len(emitted_spans)} code spans · {n_files} files · '
        f'{out["retrieval_ms"]}ms retrieval + {out["llm_ms"]}ms explain\n',
        body,
    ]

    uncited = [c for i, c in enumerate(cands) if i not in cited_idx]
    if uncited:
        names = ", ".join(f'{c["file"].rsplit("/", 1)[-1]}:{c["start_line"]}'
                          for c in uncited[:12])
        parts.append(f'\n— not cited ({len(uncited)}): {names}')
    parts.append('— full bundle: `megabrain query` · any file: `megabrain get <file>`')
    return "\n".join(parts)
264
+
265
+
def stream_ask(root: Path, question: str, out=None, rerank: bool = False,
               show_map: bool = True, docs_only: bool = False) -> None:
    """Live-streaming `ask` for the terminal: prose appears token by token and each
    [[k]]/[[k:lo-hi]] citation is spliced into its real code block as soon as its line
    completes (citations are emitted on their own line). Same grounding + fail-open as
    render_ask, but the reader sees output immediately instead of waiting for the whole
    walkthrough. Programmatic/eval/MCP callers keep using ask()/render_ask().

    Args:
        root: repository root handed to search() and Store.
        question: the developer's query.
        out: writable stream with write()/flush(); sys.stdout when not given.
        rerank: forwarded to search().
        show_map: when true, finish with the "not cited" list and usage hint.
        docs_only: forwarded to _candidates().
    """
    out = out or sys.stdout

    def write(s: str):
        # flush after every write so streamed text shows up immediately
        out.write(s)
        out.flush()

    t0 = time.time()
    res = search(Path(root), question, rerank=rerank)
    retrieval_ms = int((time.time() - t0) * 1000)
    cands = _candidates(res, docs_only)
    key = _key()
    if not key or not cands:  # no LLM available / nothing retrieved
        write(render(res) + "\n")
        return

    from .store import Store
    st = Store(Path(root))
    file_syms = {f: st.symbols_for(f) for f in {c["file"] for c in cands}}

    write(f'# megabrain — "{question}"\n')
    write(f'repo `{res["repo"]}` · {retrieval_ms}ms retrieval · streaming {MODEL}…\n\n')

    seen: set = set()   # (file, start, end) spans already printed, filled by _code_block
    cited: set = set()  # indices of candidates cited so far

    def sub(m):
        # replace one citation match with its code block; leave bad indices as-is
        k = int(m.group(1))
        if not (0 <= k < len(cands)):
            return m.group(0)
        cited.add(k)
        lo = int(m.group(2)) if m.group(2) else None
        hi = int(m.group(3)) if m.group(3) else None
        return _code_block(cands[k], lo, hi, seen, file_syms)

    # one-element list so the nested on_delta can replace its content
    pending = [""]  # hold the in-progress line; citations live on their own line

    def on_delta(d: str):
        pending[0] += d
        nl = pending[0].rfind("\n")
        if nl != -1:
            # emit everything up to the last newline; keep the unfinished tail
            ready, pending[0] = pending[0][:nl + 1], pending[0][nl + 1:]
            write(_SEL.sub(sub, ready))

    t1 = time.time()
    interrupted = False
    stop = ""
    try:
        _, stop = _stream_with_retry(_build_body(question, cands), key, on_delta=on_delta)
    except Exception:
        # fail-open: keep whatever was streamed and decide below what to show
        interrupted = True
    if pending[0]:  # flush the trailing partial line
        write(_SEL.sub(sub, pending[0]))
        pending[0] = ""
    llm_ms = int((time.time() - t1) * 1000)

    if not cited:  # fail-open: ungrounded prose -> show the bundle
        note = "_(explanation unavailable — full bundle below)_" if interrupted \
            else "_(no code cited — full bundle below)_"
        write(f"\n\n{note}\n\n{render(res)}\n")
        return
    if stop == "max_tokens":
        write("\n\n_(walkthrough truncated — ask a narrower question for the rest)_")

    n_files = len({cands[k]["file"] for k in cited})
    write(f'\n\n— {len(seen)} code spans · {n_files} files · '
          f'{retrieval_ms}ms retrieval + {llm_ms}ms explain\n')
    if show_map:
        dropped = [c for i, c in enumerate(cands) if i not in cited]
        if dropped:
            # list at most 12 uncited candidates as basename:start_line
            items = ", ".join(f'{c["file"].rsplit("/", 1)[-1]}:{c["start_line"]}'
                              for c in dropped[:12])
            write(f'— not cited ({len(dropped)}): {items}\n')
        write('— full bundle: `megabrain query` · any file: `megabrain get <file>`\n')
megabrain/bm25.py ADDED
@@ -0,0 +1,52 @@
1
+ """Sparse lexical channel over entity-IDs (LocAgent T4) — pure python, no deps.
2
+
3
+ Each file's document = its path + all symbol qualified names + signatures,
4
+ tokenized identifier-aware (split camelCase/snake_case). Catches issues that
5
+ mention a symbol descriptively when the dense embedding misses it.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import math
11
+ import re
12
+ from collections import Counter
13
+
14
+
def tokenize(text: str) -> list[str]:
    """Split text into lowercase, identifier-aware tokens.

    Every identifier or digit run is emitted whole (lowercased). It is then split
    on underscores and camelCase boundaries, and each resulting piece longer than
    one character is emitted as well. Duplicates are kept.
    """
    tokens: list[str] = []
    for word in re.findall(r"[A-Za-z_][A-Za-z0-9_]*|\d+", text):
        tokens.append(word.lower())
        for part in re.split(r"_+", word):
            pieces = re.findall(
                r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+", part)
            tokens.extend(piece.lower() for piece in pieces if len(piece) > 1)
    return tokens
25
+
26
+
class BM25:
    """BM25 scorer over pre-tokenized documents.

    Term frequencies, document lengths, the average length and a per-term idf
    table are computed once in the constructor. scores() ranks every document
    against a query string.
    """

    def __init__(self, docs: list[list[str]], k1: float = 1.2, b: float = 0.75):
        """Index `docs` (one token list per document) with parameters k1 and b."""
        self.k1 = k1
        self.b = b
        self.N = len(docs)
        self.tf = [Counter(tokens) for tokens in docs]
        self.dl = [len(tokens) for tokens in docs]
        self.avgdl = sum(self.dl) / self.N if self.N else 0.0
        # document frequency: each term counts once per document containing it
        doc_freq: Counter = Counter()
        for tokens in docs:
            doc_freq.update(set(tokens))
        self.idf = {}
        for term, n in doc_freq.items():
            self.idf[term] = math.log(1 + (self.N - n + 0.5) / (n + 0.5))

    def scores(self, query: str):
        """Return a numpy array with one BM25 score per indexed document.

        The query is tokenized with tokenize(); terms absent from the index are
        ignored. All scores are zero when no query term is known or the index
        has no tokens.
        """
        import numpy as np
        terms = [t for t in set(tokenize(query)) if t in self.idf]
        result = np.zeros(self.N)
        if not terms or not self.avgdl:
            return result
        for term in terms:
            weight = self.idf[term]
            for i, (counts, length) in enumerate(zip(self.tf, self.dl)):
                f = counts.get(term, 0)
                if not f:
                    continue
                result[i] += weight * f * (self.k1 + 1) / (
                    f + self.k1 * (1 - self.b + self.b * length / self.avgdl))
        return result