megabrain 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. megabrain-0.1.0/LICENSE +21 -0
  2. megabrain-0.1.0/PKG-INFO +136 -0
  3. megabrain-0.1.0/README.md +110 -0
  4. megabrain-0.1.0/megabrain/__init__.py +13 -0
  5. megabrain-0.1.0/megabrain/ask.py +345 -0
  6. megabrain-0.1.0/megabrain/bm25.py +52 -0
  7. megabrain-0.1.0/megabrain/chunker.py +449 -0
  8. megabrain-0.1.0/megabrain/chunker_ts.py +378 -0
  9. megabrain-0.1.0/megabrain/cli.py +88 -0
  10. megabrain-0.1.0/megabrain/embeddings.py +92 -0
  11. megabrain-0.1.0/megabrain/graph.py +108 -0
  12. megabrain-0.1.0/megabrain/indexer.py +100 -0
  13. megabrain-0.1.0/megabrain/issue.py +120 -0
  14. megabrain-0.1.0/megabrain/markdown.py +214 -0
  15. megabrain-0.1.0/megabrain/mcp_server.py +156 -0
  16. megabrain-0.1.0/megabrain/query.py +355 -0
  17. megabrain-0.1.0/megabrain/rerank.py +69 -0
  18. megabrain-0.1.0/megabrain/rerank2.py +86 -0
  19. megabrain-0.1.0/megabrain/serve.py +282 -0
  20. megabrain-0.1.0/megabrain/store.py +141 -0
  21. megabrain-0.1.0/megabrain/strategies.py +144 -0
  22. megabrain-0.1.0/megabrain.egg-info/PKG-INFO +136 -0
  23. megabrain-0.1.0/megabrain.egg-info/SOURCES.txt +34 -0
  24. megabrain-0.1.0/megabrain.egg-info/dependency_links.txt +1 -0
  25. megabrain-0.1.0/megabrain.egg-info/entry_points.txt +2 -0
  26. megabrain-0.1.0/megabrain.egg-info/requires.txt +7 -0
  27. megabrain-0.1.0/megabrain.egg-info/top_level.txt +1 -0
  28. megabrain-0.1.0/pyproject.toml +48 -0
  29. megabrain-0.1.0/setup.cfg +4 -0
  30. megabrain-0.1.0/tests/test_ask_citation.py +61 -0
  31. megabrain-0.1.0/tests/test_cast_chunker.py +187 -0
  32. megabrain-0.1.0/tests/test_chunker_ts.py +67 -0
  33. megabrain-0.1.0/tests/test_engine_golden.py +55 -0
  34. megabrain-0.1.0/tests/test_markdown_chunker.py +104 -0
  35. megabrain-0.1.0/tests/test_multi_repo.py +48 -0
  36. megabrain-0.1.0/tests/test_scale.py +55 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Berna Castro
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,136 @@
1
+ Metadata-Version: 2.4
2
+ Name: megabrain
3
+ Version: 0.1.0
4
+ Summary: Local code-intelligence engine: one call returns all the code related to a question, explained with the real code spliced in.
5
+ Author-email: Berna Castro <bernacas@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/pinecall/megabrain
8
+ Project-URL: Repository, https://github.com/pinecall/megabrain
9
+ Keywords: code-intelligence,retrieval,rag,embeddings,code-search,mcp,ast,tree-sitter,developer-tools
10
+ Classifier: Programming Language :: Python :: 3.11
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Topic :: Software Development :: Libraries
15
+ Classifier: Topic :: Software Development :: Documentation
16
+ Requires-Python: >=3.11
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: numpy>=1.24
20
+ Requires-Dist: tree_sitter>=0.21
21
+ Requires-Dist: tree_sitter_typescript>=0.23
22
+ Provides-Extra: languages
23
+ Requires-Dist: tree_sitter_ruby>=0.23; extra == "languages"
24
+ Requires-Dist: tree_sitter_go>=0.23; extra == "languages"
25
+ Dynamic: license-file
26
+
27
+ <p align="center">
28
+ <img src="https://raw.githubusercontent.com/pinecall/megabrain/master/assets/megabrain.png" alt="megabrain" width="180">
29
+ </p>
30
+
31
+ <h1 align="center">megabrain</h1>
32
+
33
+ <p align="center">
34
+ <b>One call returns all the code related to a question</b><br>
35
+ — explained like a senior engineer, with the real code spliced in.
36
+ </p>
37
+
38
+ <p align="center">
39
+ <img src="https://img.shields.io/badge/python-3.11+-3776AB?style=flat-square&logo=python&logoColor=white" alt="Python 3.11+">
40
+ <img src="https://img.shields.io/badge/retrieval-no%20LLM%20·%20~200ms-2ea44f?style=flat-square" alt="No LLM in the retrieval path">
41
+ <img src="https://img.shields.io/badge/code-zero%20hallucination-6f42c1?style=flat-square" alt="Zero code hallucination">
42
+ <img src="https://img.shields.io/badge/MCP-ready-000000?style=flat-square" alt="MCP ready">
43
+ </p>
44
+
45
+ ---
46
+
47
+ **megabrain** is a local code-intelligence engine. It replaces minutes of file-by-file
48
+ crawling — grep, read, explore-agent chains — with a single grounded answer. Index a repo
49
+ once; every later question retrieves *all* the related code and stitches it into a
50
+ walkthrough narrated by an LLM that can **only point at code, never rewrite it** — so
51
+ nothing is hallucinated.
52
+
53
+ ## Install
54
+
55
+ No packaging step — runs straight from a clone:
56
+
57
+ ```bash
58
+ git clone https://github.com/pinecall/megabrain.git
59
+ cd megabrain
60
+ pip install numpy # core (Python indexing)
61
+ pip install tree_sitter tree_sitter_typescript # TS/JS (+ tree_sitter_ruby tree_sitter_go for Ruby/Go)
62
+ alias megabrain='python3 -m megabrain.cli' # optional: clean invocation
63
+ ```
64
+
65
+ Keys are read from the environment (with a `~/.zshrc` fallback):
66
+
67
+ ```bash
68
+ export PERPLEXITY_API_KEY=... # required — embeddings
69
+ export ANTHROPIC_API_KEY=... # only for `ask` and `--best`
70
+ ```
71
+
72
+ ## Usage
73
+
74
+ ```bash
75
+ megabrain index ~/repo # incremental (sha256), no daemon
76
+ megabrain ask ~/repo "how does auth work end to end" # walkthrough + real code (~6–20s)
77
+ megabrain ask ~/repo "how do I configure X" --docs # explain the docs instead of code
78
+ megabrain query ~/repo "request retry logic" # raw code map, no LLM (~200ms)
79
+ megabrain get ~/repo src/x.py --symbol Class.method # one file or symbol
80
+ ```
81
+
82
+ Indexes code (`.py` · `.ts` · `.tsx` · `.js` · `.jsx` · `.mjs` · `.cjs` · Ruby · Go) and
83
+ markdown (`.md` · `.markdown` · `.mdx`) through a **strategy registry** — adding a language
84
+ or content type is a config entry, not a branch in the indexer.
85
+
86
+ ## How it works
87
+
88
+ A three-stage pipeline. **Only `ask` calls an LLM — and only to narrate.**
89
+
90
+ | stage | what it does |
91
+ |---|---|
92
+ | **index** | cAST chunk → Perplexity embed (int8, L2-normalized) → SQLite. Incremental by `sha256`, no watcher. |
93
+ | **query** | No-LLM retrieval (~200ms): dense-chunk + file-skeleton fusion, with import/call-graph candidates. Returns a map — **CORE** (full code of the top files) + **RELATED** (every connected file with its best chunk). |
94
+ | **ask** | One streamed Haiku call writes the walkthrough and cites code as `[[k]]`; the engine **replaces each citation with the verbatim block** (real file, real line numbers). Non-cited related files are listed at the end. Fail-open: any API error falls back to the full `query` bundle. |
95
+
96
+ Because the model only emits citations and the engine splices code from disk, **code cannot
97
+ be hallucinated or rewritten.**
98
+
99
+ ## MCP
100
+
101
+ Use it from Claude Code or any MCP client:
102
+
103
+ ```bash
104
+ claude mcp add megabrain -- python3 -m megabrain.mcp_server
105
+ ```
106
+
107
+ Tools: `megabrain_ask` (primary), `megabrain_query`, `megabrain_get`, `megabrain_index`.
108
+ The server auto-refreshes a stale index before answering, so results always match disk.
109
+
110
+ ## Design
111
+
112
+ Every choice below is backed by an internal golden set (30 verified queries):
113
+
114
+ | decision | evidence |
115
+ |---|---|
116
+ | cAST chunking (4K nws chars, breadcrumbs, partition-guaranteed) | unit-tested; every line lands in exactly one chunk — no gaps, no overlaps |
117
+ | `pplx-embed-v1` (1024-d, int8 wire, **L2-normalized**) | beats `openai-3-large` on code; ~$0.0016/repo |
118
+ | dense chunk + 0.5 × file-skeleton score | dual-granularity; precision up, no downside |
119
+ | graph (import + call edges) for candidates only | PageRank-as-ranking **rejected** by data (Acc@1 0.91 → 0.73) |
120
+ | **no LLM in the retrieval path** | every LLM *prune* variant cost completeness; `ask` explains, it never prunes |
121
+
122
+ **Engine retrieval** (internal golden set): R@1 **0.86** · bundle\_full **1.00** · p50 **8 ms** warm.
123
+ **SWE-bench Lite** localization (no training): retrieval Acc@1 ≈ 0.52 / @5 ≈ 0.83 — on par
124
+ with the trained CodeRankEmbed retriever.
125
+
126
+ ## Project layout
127
+
128
+ ```
129
+ megabrain/ engine — chunkers, embeddings, SQLite store, graph, indexer, query, ask, cli, mcp_server
130
+ evals/ golden.json (30 verified queries) + swebench harness
131
+ tests/ engine + chunker gates
132
+ ```
133
+
134
+ ---
135
+
136
+ <p align="center"><sub>github.com/pinecall/megabrain</sub></p>
@@ -0,0 +1,110 @@
1
+ <p align="center">
2
+ <img src="https://raw.githubusercontent.com/pinecall/megabrain/master/assets/megabrain.png" alt="megabrain" width="180">
3
+ </p>
4
+
5
+ <h1 align="center">megabrain</h1>
6
+
7
+ <p align="center">
8
+ <b>One call returns all the code related to a question</b><br>
9
+ — explained like a senior engineer, with the real code spliced in.
10
+ </p>
11
+
12
+ <p align="center">
13
+ <img src="https://img.shields.io/badge/python-3.11+-3776AB?style=flat-square&logo=python&logoColor=white" alt="Python 3.11+">
14
+ <img src="https://img.shields.io/badge/retrieval-no%20LLM%20·%20~200ms-2ea44f?style=flat-square" alt="No LLM in the retrieval path">
15
+ <img src="https://img.shields.io/badge/code-zero%20hallucination-6f42c1?style=flat-square" alt="Zero code hallucination">
16
+ <img src="https://img.shields.io/badge/MCP-ready-000000?style=flat-square" alt="MCP ready">
17
+ </p>
18
+
19
+ ---
20
+
21
+ **megabrain** is a local code-intelligence engine. It replaces minutes of file-by-file
22
+ crawling — grep, read, explore-agent chains — with a single grounded answer. Index a repo
23
+ once; every later question retrieves *all* the related code and stitches it into a
24
+ walkthrough narrated by an LLM that can **only point at code, never rewrite it** — so
25
+ nothing is hallucinated.
26
+
27
+ ## Install
28
+
29
+ No packaging step — runs straight from a clone:
30
+
31
+ ```bash
32
+ git clone https://github.com/pinecall/megabrain.git
33
+ cd megabrain
34
+ pip install numpy # core (Python indexing)
35
+ pip install tree_sitter tree_sitter_typescript # TS/JS (+ tree_sitter_ruby tree_sitter_go for Ruby/Go)
36
+ alias megabrain='python3 -m megabrain.cli' # optional: clean invocation
37
+ ```
38
+
39
+ Keys are read from the environment (with a `~/.zshrc` fallback):
40
+
41
+ ```bash
42
+ export PERPLEXITY_API_KEY=... # required — embeddings
43
+ export ANTHROPIC_API_KEY=... # only for `ask` and `--best`
44
+ ```
45
+
46
+ ## Usage
47
+
48
+ ```bash
49
+ megabrain index ~/repo # incremental (sha256), no daemon
50
+ megabrain ask ~/repo "how does auth work end to end" # walkthrough + real code (~6–20s)
51
+ megabrain ask ~/repo "how do I configure X" --docs # explain the docs instead of code
52
+ megabrain query ~/repo "request retry logic" # raw code map, no LLM (~200ms)
53
+ megabrain get ~/repo src/x.py --symbol Class.method # one file or symbol
54
+ ```
55
+
56
+ Indexes code (`.py` · `.ts` · `.tsx` · `.js` · `.jsx` · `.mjs` · `.cjs` · Ruby · Go) and
57
+ markdown (`.md` · `.markdown` · `.mdx`) through a **strategy registry** — adding a language
58
+ or content type is a config entry, not a branch in the indexer.
59
+
60
+ ## How it works
61
+
62
+ A three-stage pipeline. **Only `ask` calls an LLM — and only to narrate.**
63
+
64
+ | stage | what it does |
65
+ |---|---|
66
+ | **index** | cAST chunk → Perplexity embed (int8, L2-normalized) → SQLite. Incremental by `sha256`, no watcher. |
67
+ | **query** | No-LLM retrieval (~200ms): dense-chunk + file-skeleton fusion, with import/call-graph candidates. Returns a map — **CORE** (full code of the top files) + **RELATED** (every connected file with its best chunk). |
68
+ | **ask** | One streamed Haiku call writes the walkthrough and cites code as `[[k]]`; the engine **replaces each citation with the verbatim block** (real file, real line numbers). Non-cited related files are listed at the end. Fail-open: any API error falls back to the full `query` bundle. |
69
+
70
+ Because the model only emits citations and the engine splices code from disk, **code cannot
71
+ be hallucinated or rewritten.**
72
+
73
+ ## MCP
74
+
75
+ Use it from Claude Code or any MCP client:
76
+
77
+ ```bash
78
+ claude mcp add megabrain -- python3 -m megabrain.mcp_server
79
+ ```
80
+
81
+ Tools: `megabrain_ask` (primary), `megabrain_query`, `megabrain_get`, `megabrain_index`.
82
+ The server auto-refreshes a stale index before answering, so results always match disk.
83
+
84
+ ## Design
85
+
86
+ Every choice below is backed by an internal golden set (30 verified queries):
87
+
88
+ | decision | evidence |
89
+ |---|---|
90
+ | cAST chunking (4K nws chars, breadcrumbs, partition-guaranteed) | unit-tested; every line lands in exactly one chunk — no gaps, no overlaps |
91
+ | `pplx-embed-v1` (1024-d, int8 wire, **L2-normalized**) | beats `openai-3-large` on code; ~$0.0016/repo |
92
+ | dense chunk + 0.5 × file-skeleton score | dual-granularity; precision up, no downside |
93
+ | graph (import + call edges) for candidates only | PageRank-as-ranking **rejected** by data (Acc@1 0.91 → 0.73) |
94
+ | **no LLM in the retrieval path** | every LLM *prune* variant cost completeness; `ask` explains, it never prunes |
95
+
96
+ **Engine retrieval** (internal golden set): R@1 **0.86** · bundle\_full **1.00** · p50 **8 ms** warm.
97
+ **SWE-bench Lite** localization (no training): retrieval Acc@1 ≈ 0.52 / @5 ≈ 0.83 — on par
98
+ with the trained CodeRankEmbed retriever.
99
+
100
+ ## Project layout
101
+
102
+ ```
103
+ megabrain/ engine — chunkers, embeddings, SQLite store, graph, indexer, query, ask, cli, mcp_server
104
+ evals/ golden.json (30 verified queries) + swebench harness
105
+ tests/ engine + chunker gates
106
+ ```
107
+
108
+ ---
109
+
110
+ <p align="center"><sub>github.com/pinecall/megabrain</sub></p>
@@ -0,0 +1,13 @@
1
+ """megabrain — code-intelligence engine: one-shot retrieval of all code related
2
+ to a feature, as a view-ready map.
3
+
4
+ Validated configuration (experiments phases 0-5, June 2026):
5
+ - chunking: cAST split-then-merge, 4000 nws chars, breadcrumb headers
6
+ - embeddings: pplx-embed-v1-0.6b (1024d, int8 wire format, L2-normalized)
7
+ - scoring: dense chunk cosine + 0.5 * file-skeleton cosine
8
+ - graph: import+call edges; used for bundle candidates and map annotations,
9
+ NOT for ranking (PageRank rejected by experiment)
10
+ - pruning: OFF by default (LLM pruning costs completeness); --prune optional
11
+ """
12
+
13
+ __version__ = "0.1.0"
@@ -0,0 +1,345 @@
1
+ """megabrain ask — agent-style explained answer with cherry-picked REAL code.
2
+
3
+ The LLM explains the answer like an agent walking through the codebase, but
4
+ it cannot paste code: it cites chunks as [[3]] or [[3:705-731]] and the engine
5
+ REPLACES each citation with the real code block (file header + fenced code,
6
+ true line numbers). Explanation = LLM; every line of code = verbatim from
7
+ disk. Streamed, ~1-3s. Fail-open: no citations / API error -> full bundle.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import re
14
+ import sys
15
+ import time
16
+ import urllib.request
17
+ from pathlib import Path
18
+
19
+ from .query import lang_of, render, search
20
+ from .rerank import _key
21
+ from .strategies import MarkdownStrategy
22
+
23
# ask is a CODE walkthrough: docs (markdown) are excluded from its candidates so a
# code explanation isn't diluted with prose. docs_only flips it to a docs-only
# walkthrough. Docs stay retrievable via `query` regardless.
# DOC_EXTS is passed straight to str.endswith() in _candidates.
# NOTE(review): str.endswith needs a str or tuple of str — confirm
# MarkdownStrategy.exts is a tuple.
DOC_EXTS = MarkdownStrategy.exts

# Model name sent as the "model" field of every request body built by _build_body.
MODEL = "claude-haiku-4-5"
# Character budget for chunk text placed in the prompt (see _build_body).
MAX_CTX_CHARS = 200_000  # ~50K tokens of candidate code; Haiku window is 200K
# double-bracket so the model can still mention [n] in prose without collision.
# Tolerate an "L" prefix and stray spaces on the line range: the chunk headers in
# the prompt read "L1-172", so the model often mirrors that as [[0:L1-172]] — accept
# it (and [[3:705-731]], [[3]]) instead of leaking the citation as raw text.
# Groups: 1 = candidate index, 2 and 3 = optional first/last line of the cited range.
_SEL = re.compile(r"\[\[(\d+)(?::\s*[Ll]?(\d+)\s*-\s*[Ll]?(\d+))?\s*\]\]")
35
+
36
+
37
def _candidates(res: dict, docs_only: bool = False) -> list[dict]:
    """Flatten a search result into the numbered candidate list used by `ask`.

    Takes every chunk of each CORE file (res["tier1"]) followed by the best chunk
    of each RELATED file (res["tier2"], skipped when it has no "best_chunk").
    Files whose name ends with DOC_EXTS are dropped by default so the walkthrough
    stays about code; docs_only=True inverts the filter and keeps only those files.
    The position of an entry in the returned list is its citation number.
    """
    fields = ("name", "kind", "start_line", "end_line", "text")
    want_docs = bool(docs_only)

    def wanted(path: str) -> bool:
        # keep docs only in docs mode, and code only in code mode
        return path.endswith(DOC_EXTS) == want_docs

    def entry(path: str, chunk: dict) -> dict:
        row = {"file": path}
        for name in fields:
            row[name] = chunk[name]
        return row

    picked: list[dict] = []
    for core in res["tier1"]:
        path = core["file"]
        if wanted(path):
            picked.extend(entry(path, chunk) for chunk in core["chunks"])
    for related in res["tier2"]:
        path = related["file"]
        if not wanted(path):
            continue
        best = related.get("best_chunk")
        if best:
            picked.append(entry(path, best))
    return picked
60
+
61
+
62
# Rule list interpolated under "STRICT RULES:" in the prompt built by _build_body.
# It tells the model to cite chunks as [[k]] / [[k:lo-hi]] rather than paste code;
# those citations are what _SEL matches and what gets replaced by real code blocks.
_RULES = """- NEVER paste or quote code. Cite it with DOUBLE brackets: [[3]] (whole chunk) or [[3:705-731]] (file lines 705-731 of chunk 3). Each such citation is REPLACED by the real code block in your answer, so explain AROUND the code, not the code itself. (If you ever need to mention the citation syntax itself in prose, use single brackets — only [[...]] gets replaced.)
- Put each [[...]] citation on its own line, right after the sentence that introduces it.
- Show GENEROUS, COMPLETE code: cite whole [[k]] chunks (a full function/class/block) by default so the reader sees the complete implementation, not a fragment. Only use a [[k:lo-hi]] sub-range when a chunk is very large and only one section is relevant — and then take the WHOLE enclosing function, not a few lines. Never cite the same span twice.
- Structure it: use ## section headings for each phase of the flow, 1-3 sentences of explanation per citation. Be thorough — the reader must understand everything perfectly from the code shown, without opening any file.
- Finish the thought: end with a short "## Summary" of the flow in 2-3 sentences. Never end mid-sentence."""
67
+
68
+
69
def _build_body(question: str, cands: list[dict]) -> dict:
    """Anthropic request body: the cite-only walkthrough prompt over numbered chunks.

    Each candidate becomes a block headed "[i] <file> L<start>-<end> (<name>)"
    followed by its text; `i` is the number the model must use in [[i]] citations.
    Chunk text is accumulated against MAX_CTX_CHARS: once adding a chunk would
    exceed the budget, that chunk is cut to its first 2000 characters and marked
    as truncated. A chunk that is already 2000 characters or shorter is kept
    verbatim and gets no marker, since nothing was removed from it.

    Returns:
        The JSON-serialisable body for a streamed messages call (model MODEL,
        max_tokens 2400, temperature 0, stream True, a single user message).
    """
    head_chars = 2000
    blocks, used = [], 0
    for i, c in enumerate(cands):
        head = f'[{i}] {c["file"]} L{c["start_line"]}-{c["end_line"]}'
        if c["name"]:
            head += f' ({c["name"]})'
        body = c["text"]
        # Only claim truncation when text is actually dropped; otherwise the marker
        # is misleading to the model and only makes the prompt longer.
        if used + len(body) > MAX_CTX_CHARS and len(body) > head_chars:
            body = body[:head_chars] + "\n# ...truncated...\n"
        used += len(body)
        blocks.append(f"{head}\n{body}")
    prompt = f"""You are a senior engineer giving a complete code walkthrough that answers the developer's query. Cover the ENTIRE relevant flow end to end — do not stop early, do not leave a thread dangling.

STRICT RULES:
{_RULES}

QUERY: {question}

RETRIEVED CHUNKS:

{chr(10).join(blocks)}"""
    return {"model": MODEL, "max_tokens": 2400, "temperature": 0, "stream": True,
            "messages": [{"role": "user", "content": prompt}]}
92
+
93
+
94
def _explain_stream(question: str, cands: list[dict], key: str) -> str:
    """Run the single streamed model call and return its explanation text.

    The text carries [[k]] / [[k:lo-hi]] citations for the caller to expand.
    When the model stopped on "max_tokens", the answer is cut back to the last
    paragraph break or sentence end (whichever comes later) and a truncation
    note is appended; if no such boundary exists the text is returned as is.
    """
    answer, stop_reason = _stream_with_retry(_build_body(question, cands), key)
    if stop_reason != "max_tokens":
        return answer
    boundary = max(answer.rfind("\n\n"), answer.rfind(". "))
    if boundary <= 0:
        return answer
    kept = answer[:boundary + 1].rstrip()
    return kept + "\n\n_(walkthrough truncated — ask a narrower question for the rest)_"
102
+
103
+
104
def _stream_with_retry(body: dict, key: str, retries: int = 4,
                       on_delta=None) -> tuple[str, str]:
    """Streamed Anthropic call with backoff on 429/5xx/overloaded. Returns (text, stop).
    If on_delta is given it's called with each text delta (live rendering); once any
    delta has been emitted we stop retrying, so the terminal never sees duplicate text.

    Args:
        body: request body, JSON-encoded and POSTed to the messages endpoint.
        key: API key sent in the "x-api-key" header.
        retries: maximum number of attempts.
        on_delta: optional callable receiving each non-empty text delta.

    Returns:
        (text, stop): the concatenated text deltas and the last stop_reason seen
        in a "message_delta" event ("" if none arrived).

    Raises:
        urllib.error.HTTPError: non-retryable status, retries exhausted, a
            mid-stream "error" event, or any HTTP error after text was emitted.
        urllib.error.URLError, TimeoutError: retries exhausted, or raised after
            text was emitted.
    """
    import time as _t
    last = None
    emitted = False  # becomes True once on_delta has been handed any text
    for attempt in range(retries):
        req = urllib.request.Request(
            "https://api.anthropic.com/v1/messages", data=json.dumps(body).encode(),
            headers={"x-api-key": key, "anthropic-version": "2023-06-01",
                     "content-type": "application/json"})
        # reset per attempt so a retried call never returns text from a failed one
        text, stop = "", ""
        try:
            with urllib.request.urlopen(req, timeout=90) as r:
                for raw in r:
                    line = raw.decode("utf-8", "replace").strip()
                    # only "data: ..." lines carry a JSON payload; skip everything else
                    if not line.startswith("data: "):
                        continue
                    try:
                        ev = json.loads(line[6:])
                    except json.JSONDecodeError:
                        continue
                    t = ev.get("type")
                    if t == "content_block_delta":
                        d = ev["delta"].get("text", "")
                        text += d
                        if d and on_delta is not None:
                            on_delta(d)
                            emitted = True
                    elif t == "message_delta":
                        stop = ev.get("delta", {}).get("stop_reason") or stop
                    elif t == "error":  # mid-stream overloaded_error etc.
                        # surface it as a retryable 529 so the handler below applies
                        raise urllib.error.HTTPError(req.full_url, 529, "stream error", None, None)
            return text, stop
        except urllib.error.HTTPError as e:
            last = e
            if emitted:  # already streamed live: a retry would double-print
                raise
            if e.code in (429, 500, 502, 503, 529) and attempt < retries - 1:
                _t.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...
                continue
            raise
        except (urllib.error.URLError, TimeoutError) as e:
            last = e
            if emitted:
                raise
            if attempt < retries - 1:
                _t.sleep(2 ** attempt)
                continue
            raise
    # only reached when the loop body never ran (retries <= 0)
    raise last if last else RuntimeError("unreachable")
157
+
158
+
159
+ def _code_block(c: dict, lo: int | None, hi: int | None, seen: set,
160
+ file_syms: dict[str, list[dict]]) -> str:
161
+ cs, ce = c["start_line"], c["end_line"]
162
+ s, e = cs, ce
163
+ if lo is not None and hi is not None and not (hi < cs or lo > ce):
164
+ s, e = max(lo, cs), min(hi, ce)
165
+ _FN = ("function", "async_function", "method", "async_method", "class")
166
+ syms = [y for y in file_syms.get(c["file"], []) if y["kind"] in _FN]
167
+ if (s, e) != (cs, ce):
168
+ # snap to enclosing symbol edges when close (readable boundaries)
169
+ encl = [y for y in syms if y["line"] <= e and y["end_line"] >= s]
170
+ if encl:
171
+ best = min(encl, key=lambda y: y["end_line"] - y["line"])
172
+ if 0 < s - best["line"] <= 8:
173
+ s = max(best["line"], cs)
174
+ if 0 < best["end_line"] - e <= 8:
175
+ e = min(best["end_line"], ce)
176
+ # trim orphan tail of a previous symbol at the head of the range
177
+ nexts = sorted(y["line"] for y in syms if s < y["line"] <= min(s + 8, e))
178
+ if nexts:
179
+ owner = [y for y in syms if y["line"] < s <= y["end_line"]
180
+ and y["end_line"] < nexts[0]]
181
+ if owner:
182
+ s = nexts[0]
183
+ lines = c["text"].splitlines(keepends=True)
184
+ text = "".join(lines[s - cs:e - cs + 1])
185
+ key = (c["file"], s, e)
186
+ if key in seen:
187
+ return f'*(see `{c["file"]}:L{s}-{e}` above)*'
188
+ seen.add(key)
189
+ # label = most specific symbols overlapping the emitted range
190
+ inside = [y for y in syms if not (y["end_line"] < s or y["line"] > e)]
191
+ inside.sort(key=lambda y: y["end_line"] - y["line"])
192
+ tight = [y for y in inside if (y["end_line"] - y["line"]) <= 3 * (e - s + 1)]
193
+ label = ", ".join(dict.fromkeys(y["name"] for y in (tight or inside)[:2])) \
194
+ or (c["name"] or c["kind"])
195
+ return (f'\n**`{c["file"]}` L{s}-{e}** — {label}\n'
196
+ f'```{lang_of(c["file"])}\n{text.rstrip(chr(10))}\n```\n')
197
+
198
+
199
def ask(root: Path, question: str, rerank: bool = False,
        docs_only: bool = False) -> dict:
    """Retrieve candidates for `question` and, when possible, get a cited explanation.

    Runs `search` over the repo at `root`, builds the candidate list, and — only
    if an API key and at least one candidate are available — asks the model for
    a walkthrough. Any exception from the model call is swallowed and leaves the
    text empty (fail-open; render_ask then falls back to the plain bundle).

    Returns:
        A dict with the raw search "result", the numbered "cands", the
        explanation "text", per-file symbols "file_syms", the two timings
        "retrieval_ms" / "llm_ms", the "query" and the "repo".
    """
    repo_root = Path(root)

    search_started = time.time()
    res = search(repo_root, question, rerank=rerank)
    retrieval_ms = int((time.time() - search_started) * 1000)

    cands = _candidates(res, docs_only)
    key = _key()

    text = ""
    llm_ms = 0
    if key and cands:
        llm_started = time.time()
        try:
            text = _explain_stream(question, cands, key)
        except Exception:
            # fail-open: an unavailable explanation must not break retrieval
            text = ""
        llm_ms = int((time.time() - llm_started) * 1000)

    from .store import Store
    store = Store(repo_root)
    file_syms = {}
    for path in {c["file"] for c in cands}:
        file_syms[path] = store.symbols_for(path)

    return {
        "result": res,
        "cands": cands,
        "text": text,
        "file_syms": file_syms,
        "retrieval_ms": retrieval_ms,
        "llm_ms": llm_ms,
        "query": question,
        "repo": res["repo"],
    }
220
+
221
+
222
def cited_files(out: dict) -> list[str]:
    """Files cited in the explanation, in first-mention order (for eval).

    Scans out["text"] for [[k]] / [[k:lo-hi]] citations, ignores indices outside
    out["cands"], and returns each cited candidate's file once.
    """
    cands = out["cands"]
    total = len(cands)
    ordered: dict[str, None] = {}  # dict keeps first-insertion order
    for match in _SEL.finditer(out["text"] or ""):
        idx = int(match.group(1))
        if idx < 0 or idx >= total:
            continue
        ordered.setdefault(cands[idx]["file"], None)
    return list(ordered)
233
+
234
+
235
def render_ask(out: dict) -> str:
    """Render an `ask()` result as markdown with real code spliced into each citation.

    Args:
        out: the dict returned by `ask()`; reads "cands", "text", "result",
            "query", "repo", "retrieval_ms", "llm_ms" and optionally "file_syms".

    Returns:
        The walkthrough with every valid [[k]] / [[k:lo-hi]] replaced by the
        block from _code_block, under a title and stats header, followed by a
        list of up to 12 candidates that were never cited. If there is no text
        or it contains no citation, the plain `render` of the search result is
        returned instead.
    """
    cands, text = out["cands"], out["text"]
    if not text or not _SEL.search(text):
        return render(out["result"])  # fail-open: unfiltered bundle
    seen: set = set()   # (file, start, end) spans emitted so far; filled by _code_block
    cited: set = set()  # candidate indices the explanation actually referenced

    def sub(m):
        k = int(m.group(1))
        # an out-of-range index is left in the text untouched
        if not (0 <= k < len(cands)):
            return m.group(0)
        cited.add(k)
        lo = int(m.group(2)) if m.group(2) else None
        hi = int(m.group(3)) if m.group(3) else None
        return _code_block(cands[k], lo, hi, seen, out.get("file_syms", {}))

    # the substitution must run before the header is built: it populates seen and cited
    body = _SEL.sub(sub, text).strip()
    n_files = len({cands[k]["file"] for k in cited})
    L = [f'# megabrain — "{out["query"]}"',
         f'repo `{out["repo"]}` · {len(seen)} code spans · {n_files} files · '
         f'{out["retrieval_ms"]}ms retrieval + {out["llm_ms"]}ms explain\n',
         body]
    dropped = [c for i, c in enumerate(cands) if i not in cited]
    if dropped:
        # show "<basename>:<start_line>" for the first 12 uncited candidates
        items = ", ".join(f'{c["file"].rsplit("/", 1)[-1]}:{c["start_line"]}'
                          for c in dropped[:12])
        L.append(f'\n— not cited ({len(dropped)}): {items}')
        L.append('— full bundle: `megabrain query` · any file: `megabrain get <file>`')
    return "\n".join(L)
264
+
265
+
266
def stream_ask(root: Path, question: str, out=None, rerank: bool = False,
               show_map: bool = True, docs_only: bool = False) -> None:
    """Live-streaming `ask` for the terminal: prose appears token by token and each
    [[k]]/[[k:lo-hi]] citation is spliced into its real code block as soon as its line
    completes (citations are emitted on their own line). Same grounding + fail-open as
    render_ask, but the reader sees output immediately instead of waiting for the whole
    walkthrough. Programmatic/eval/MCP callers keep using ask()/render_ask().

    Args:
        root: repository root passed to `search` and `Store`.
        question: the developer's query.
        out: writable stream with write()/flush(); defaults to sys.stdout.
        rerank: forwarded to `search`.
        show_map: when true, finish with the list of uncited candidates.
        docs_only: forwarded to `_candidates`.
    """
    out = out or sys.stdout

    def write(s: str):
        # flush on every write so streamed text shows up immediately
        out.write(s)
        out.flush()

    t0 = time.time()
    res = search(Path(root), question, rerank=rerank)
    retrieval_ms = int((time.time() - t0) * 1000)
    cands = _candidates(res, docs_only)
    key = _key()
    if not key or not cands:  # no LLM available / nothing retrieved
        write(render(res) + "\n")
        return

    from .store import Store
    st = Store(Path(root))
    file_syms = {f: st.symbols_for(f) for f in {c["file"] for c in cands}}

    write(f'# megabrain — "{question}"\n')
    write(f'repo `{res["repo"]}` · {retrieval_ms}ms retrieval · streaming {MODEL}…\n\n')

    seen: set = set()   # (file, start, end) spans emitted so far; filled by _code_block
    cited: set = set()  # candidate indices referenced by the streamed text

    def sub(m):
        k = int(m.group(1))
        # an out-of-range index is written through unchanged
        if not (0 <= k < len(cands)):
            return m.group(0)
        cited.add(k)
        lo = int(m.group(2)) if m.group(2) else None
        hi = int(m.group(3)) if m.group(3) else None
        return _code_block(cands[k], lo, hi, seen, file_syms)

    pending = [""]  # hold the in-progress line; citations live on their own line

    def on_delta(d: str):
        # buffer until a newline arrives so a citation split across deltas is
        # only substituted once its line is complete
        pending[0] += d
        nl = pending[0].rfind("\n")
        if nl != -1:
            ready, pending[0] = pending[0][:nl + 1], pending[0][nl + 1:]
            write(_SEL.sub(sub, ready))

    t1 = time.time()
    interrupted = False
    stop = ""
    try:
        _, stop = _stream_with_retry(_build_body(question, cands), key, on_delta=on_delta)
    except Exception:
        # fail-open: keep whatever was streamed and decide below what to show
        interrupted = True
    if pending[0]:  # flush the trailing partial line
        write(_SEL.sub(sub, pending[0]))
        pending[0] = ""
    llm_ms = int((time.time() - t1) * 1000)

    if not cited:  # fail-open: ungrounded prose -> show the bundle
        note = "_(explanation unavailable — full bundle below)_" if interrupted \
            else "_(no code cited — full bundle below)_"
        write(f"\n\n{note}\n\n{render(res)}\n")
        return
    if stop == "max_tokens":
        write("\n\n_(walkthrough truncated — ask a narrower question for the rest)_")

    n_files = len({cands[k]["file"] for k in cited})
    write(f'\n\n— {len(seen)} code spans · {n_files} files · '
          f'{retrieval_ms}ms retrieval + {llm_ms}ms explain\n')
    if show_map:
        dropped = [c for i, c in enumerate(cands) if i not in cited]
        if dropped:
            # show "<basename>:<start_line>" for the first 12 uncited candidates
            items = ", ".join(f'{c["file"].rsplit("/", 1)[-1]}:{c["start_line"]}'
                              for c in dropped[:12])
            write(f'— not cited ({len(dropped)}): {items}\n')
            write('— full bundle: `megabrain query` · any file: `megabrain get <file>`\n')
@@ -0,0 +1,52 @@
1
+ """Sparse lexical channel over entity-IDs (LocAgent T4) — pure python, no deps.
2
+
3
+ Each file's document = its path + all symbol qualified names + signatures,
4
+ tokenized identifier-aware (split camelCase/snake_case). Catches issues that
5
+ mention a symbol descriptively when the dense embedding misses it.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import math
11
+ import re
12
+ from collections import Counter
13
+
14
+
15
def tokenize(text: str) -> list[str]:
    """Split text into lowercase, identifier-aware tokens.

    Every identifier or digit run is emitted whole (lowercased), followed by its
    snake_case / camelCase parts that are longer than one character. A plain
    word therefore appears twice: once whole and once as its own single part.
    """
    tokens: list[str] = []
    for word in re.findall(r"[A-Za-z_][A-Za-z0-9_]*|\d+", text):
        tokens.append(word.lower())
        tokens.extend(
            piece.lower()
            for segment in re.split(r"_+", word)
            for piece in re.findall(
                r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+", segment)
            if len(piece) > 1
        )
    return tokens
25
+
26
+
27
class BM25:
    """BM25 scorer over pre-tokenized documents.

    The constructor stores per-document term frequencies and lengths, the mean
    document length, and an IDF value for every term seen in the corpus.
    """

    def __init__(self, docs: list[list[str]], k1: float = 1.2, b: float = 0.75):
        """Index `docs` (one token list per document) with parameters k1 and b."""
        self.k1 = k1
        self.b = b
        self.N = len(docs)
        self.tf = list(map(Counter, docs))
        self.dl = list(map(len, docs))
        self.avgdl = sum(self.dl) / self.N if self.N else 0.0
        # document frequency: each document counts a term at most once
        doc_freq: Counter = Counter()
        for doc in docs:
            doc_freq.update(set(doc))
        self.idf = {}
        for term, n in doc_freq.items():
            self.idf[term] = math.log(1 + (self.N - n + 0.5) / (n + 0.5))

    def scores(self, query: str):
        """Return a numpy array with one BM25 score per indexed document.

        The query is tokenized with `tokenize`; terms unknown to the corpus are
        ignored. All zeros when no query term is known or the corpus holds no
        tokens.
        """
        import numpy as np
        terms = [t for t in set(tokenize(query)) if t in self.idf]
        totals = np.zeros(self.N)
        if not terms or not self.avgdl:
            return totals
        for term in terms:
            weight = self.idf[term]
            for i, (counts, length) in enumerate(zip(self.tf, self.dl)):
                freq = counts.get(term, 0)
                if not freq:
                    continue
                totals[i] += weight * freq * (self.k1 + 1) / (
                    freq + self.k1 * (1 - self.b + self.b * length / self.avgdl))
        return totals