megabrain 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megabrain-0.1.0/LICENSE +21 -0
- megabrain-0.1.0/PKG-INFO +136 -0
- megabrain-0.1.0/README.md +110 -0
- megabrain-0.1.0/megabrain/__init__.py +13 -0
- megabrain-0.1.0/megabrain/ask.py +345 -0
- megabrain-0.1.0/megabrain/bm25.py +52 -0
- megabrain-0.1.0/megabrain/chunker.py +449 -0
- megabrain-0.1.0/megabrain/chunker_ts.py +378 -0
- megabrain-0.1.0/megabrain/cli.py +88 -0
- megabrain-0.1.0/megabrain/embeddings.py +92 -0
- megabrain-0.1.0/megabrain/graph.py +108 -0
- megabrain-0.1.0/megabrain/indexer.py +100 -0
- megabrain-0.1.0/megabrain/issue.py +120 -0
- megabrain-0.1.0/megabrain/markdown.py +214 -0
- megabrain-0.1.0/megabrain/mcp_server.py +156 -0
- megabrain-0.1.0/megabrain/query.py +355 -0
- megabrain-0.1.0/megabrain/rerank.py +69 -0
- megabrain-0.1.0/megabrain/rerank2.py +86 -0
- megabrain-0.1.0/megabrain/serve.py +282 -0
- megabrain-0.1.0/megabrain/store.py +141 -0
- megabrain-0.1.0/megabrain/strategies.py +144 -0
- megabrain-0.1.0/megabrain.egg-info/PKG-INFO +136 -0
- megabrain-0.1.0/megabrain.egg-info/SOURCES.txt +34 -0
- megabrain-0.1.0/megabrain.egg-info/dependency_links.txt +1 -0
- megabrain-0.1.0/megabrain.egg-info/entry_points.txt +2 -0
- megabrain-0.1.0/megabrain.egg-info/requires.txt +7 -0
- megabrain-0.1.0/megabrain.egg-info/top_level.txt +1 -0
- megabrain-0.1.0/pyproject.toml +48 -0
- megabrain-0.1.0/setup.cfg +4 -0
- megabrain-0.1.0/tests/test_ask_citation.py +61 -0
- megabrain-0.1.0/tests/test_cast_chunker.py +187 -0
- megabrain-0.1.0/tests/test_chunker_ts.py +67 -0
- megabrain-0.1.0/tests/test_engine_golden.py +55 -0
- megabrain-0.1.0/tests/test_markdown_chunker.py +104 -0
- megabrain-0.1.0/tests/test_multi_repo.py +48 -0
- megabrain-0.1.0/tests/test_scale.py +55 -0
megabrain-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Berna Castro
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
megabrain-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: megabrain
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Local code-intelligence engine: one call returns all the code related to a question, explained with the real code spliced in.
|
|
5
|
+
Author-email: Berna Castro <bernacas@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/pinecall/megabrain
|
|
8
|
+
Project-URL: Repository, https://github.com/pinecall/megabrain
|
|
9
|
+
Keywords: code-intelligence,retrieval,rag,embeddings,code-search,mcp,ast,tree-sitter,developer-tools
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
15
|
+
Classifier: Topic :: Software Development :: Documentation
|
|
16
|
+
Requires-Python: >=3.11
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: numpy>=1.24
|
|
20
|
+
Requires-Dist: tree_sitter>=0.21
|
|
21
|
+
Requires-Dist: tree_sitter_typescript>=0.23
|
|
22
|
+
Provides-Extra: languages
|
|
23
|
+
Requires-Dist: tree_sitter_ruby>=0.23; extra == "languages"
|
|
24
|
+
Requires-Dist: tree_sitter_go>=0.23; extra == "languages"
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
<p align="center">
|
|
28
|
+
<img src="https://raw.githubusercontent.com/pinecall/megabrain/master/assets/megabrain.png" alt="megabrain" width="180">
|
|
29
|
+
</p>
|
|
30
|
+
|
|
31
|
+
<h1 align="center">megabrain</h1>
|
|
32
|
+
|
|
33
|
+
<p align="center">
|
|
34
|
+
<b>One call returns all the code related to a question</b><br>
|
|
35
|
+
— explained like a senior engineer, with the real code spliced in.
|
|
36
|
+
</p>
|
|
37
|
+
|
|
38
|
+
<p align="center">
|
|
39
|
+
<img src="https://img.shields.io/badge/python-3.11+-3776AB?style=flat-square&logo=python&logoColor=white" alt="Python 3.11+">
|
|
40
|
+
<img src="https://img.shields.io/badge/retrieval-no%20LLM%20·%20~200ms-2ea44f?style=flat-square" alt="No LLM in the retrieval path">
|
|
41
|
+
<img src="https://img.shields.io/badge/code-zero%20hallucination-6f42c1?style=flat-square" alt="Zero code hallucination">
|
|
42
|
+
<img src="https://img.shields.io/badge/MCP-ready-000000?style=flat-square" alt="MCP ready">
|
|
43
|
+
</p>
|
|
44
|
+
|
|
45
|
+
---
|
|
46
|
+
|
|
47
|
+
**megabrain** is a local code-intelligence engine. It replaces minutes of file-by-file
|
|
48
|
+
crawling — grep, read, explore-agent chains — with a single grounded answer. Index a repo
|
|
49
|
+
once; every later question retrieves *all* the related code and stitches it into a
|
|
50
|
+
walkthrough narrated by an LLM that can **only point at code, never rewrite it** — so
|
|
51
|
+
nothing is hallucinated.
|
|
52
|
+
|
|
53
|
+
## Install
|
|
54
|
+
|
|
55
|
+
No packaging step — runs straight from a clone:
|
|
56
|
+
|
|
57
|
+
```bash
|
|
58
|
+
git clone https://github.com/pinecall/megabrain.git
|
|
59
|
+
cd megabrain
|
|
60
|
+
pip install numpy # core (Python indexing)
|
|
61
|
+
pip install tree_sitter tree_sitter_typescript # TS/JS (+ tree_sitter_ruby tree_sitter_go for Ruby/Go)
|
|
62
|
+
alias megabrain='python3 -m megabrain.cli' # optional: clean invocation
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
Keys are read from the environment (with a `~/.zshrc` fallback):
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
export PERPLEXITY_API_KEY=... # required — embeddings
|
|
69
|
+
export ANTHROPIC_API_KEY=... # only for `ask` and `--best`
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Usage
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
megabrain index ~/repo # incremental (sha256), no daemon
|
|
76
|
+
megabrain ask ~/repo "how does auth work end to end" # walkthrough + real code (~6–20s)
|
|
77
|
+
megabrain ask ~/repo "how do I configure X" --docs # explain the docs instead of code
|
|
78
|
+
megabrain query ~/repo "request retry logic" # raw code map, no LLM (~200ms)
|
|
79
|
+
megabrain get ~/repo src/x.py --symbol Class.method # one file or symbol
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Indexes code (`.py` · `.ts` · `.tsx` · `.js` · `.jsx` · `.mjs` · `.cjs` · Ruby · Go) and
|
|
83
|
+
markdown (`.md` · `.markdown` · `.mdx`) through a **strategy registry** — adding a language
|
|
84
|
+
or content type is a config entry, not a branch in the indexer.
|
|
85
|
+
|
|
86
|
+
## How it works
|
|
87
|
+
|
|
88
|
+
A three-stage pipeline. **Only `ask` calls an LLM — and only to narrate.**
|
|
89
|
+
|
|
90
|
+
| stage | what it does |
|
|
91
|
+
|---|---|
|
|
92
|
+
| **index** | cAST chunk → Perplexity embed (int8, L2-normalized) → SQLite. Incremental by `sha256`, no watcher. |
|
|
93
|
+
| **query** | No-LLM retrieval (~200ms): dense-chunk + file-skeleton fusion, with import/call-graph candidates. Returns a map — **CORE** (full code of the top files) + **RELATED** (every connected file with its best chunk). |
|
|
94
|
+
| **ask** | One streamed Haiku call writes the walkthrough and cites code as `[[k]]`; the engine **replaces each citation with the verbatim block** (real file, real line numbers). Non-cited related files are listed at the end. Fail-open: any API error falls back to the full `query` bundle. |
|
|
95
|
+
|
|
96
|
+
Because the model only emits citations and the engine splices code from disk, **code cannot
|
|
97
|
+
be hallucinated or rewritten.**
|
|
98
|
+
|
|
99
|
+
## MCP
|
|
100
|
+
|
|
101
|
+
Use it from Claude Code or any MCP client:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
claude mcp add megabrain -- python3 -m megabrain.mcp_server
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Tools: `megabrain_ask` (primary), `megabrain_query`, `megabrain_get`, `megabrain_index`.
|
|
108
|
+
The server auto-refreshes a stale index before answering, so results always match disk.
|
|
109
|
+
|
|
110
|
+
## Design
|
|
111
|
+
|
|
112
|
+
Every choice below is backed by an internal golden set (30 verified queries):
|
|
113
|
+
|
|
114
|
+
| decision | evidence |
|
|
115
|
+
|---|---|
|
|
116
|
+
| cAST chunking (4K nws chars, breadcrumbs, partition-guaranteed) | unit-tested; every line lands in exactly one chunk — no gaps, no overlaps |
|
|
117
|
+
| `pplx-embed-v1` (1024-d, int8 wire, **L2-normalized**) | beats `openai-3-large` on code; ~$0.0016/repo |
|
|
118
|
+
| dense chunk + 0.5 × file-skeleton score | dual-granularity; precision up, no downside |
|
|
119
|
+
| graph (import + call edges) for candidates only | PageRank-as-ranking **rejected** by data (Acc@1 0.91 → 0.73) |
|
|
120
|
+
| **no LLM in the retrieval path** | every LLM *prune* variant cost completeness; `ask` explains, it never prunes |
|
|
121
|
+
|
|
122
|
+
**Engine retrieval** (internal golden set): R@1 **0.86** · bundle\_full **1.00** · p50 **8 ms** warm.
|
|
123
|
+
**SWE-bench Lite** localization (no training): retrieval Acc@1 ≈ 0.52 / @5 ≈ 0.83 — on par
|
|
124
|
+
with the trained CodeRankEmbed retriever.
|
|
125
|
+
|
|
126
|
+
## Project layout
|
|
127
|
+
|
|
128
|
+
```
|
|
129
|
+
megabrain/ engine — chunkers, embeddings, SQLite store, graph, indexer, query, ask, cli, mcp_server
|
|
130
|
+
evals/ golden.json (30 verified queries) + swebench harness
|
|
131
|
+
tests/ engine + chunker gates
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
---
|
|
135
|
+
|
|
136
|
+
<p align="center"><sub>github.com/pinecall/megabrain</sub></p>
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://raw.githubusercontent.com/pinecall/megabrain/master/assets/megabrain.png" alt="megabrain" width="180">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<h1 align="center">megabrain</h1>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<b>One call returns all the code related to a question</b><br>
|
|
9
|
+
— explained like a senior engineer, with the real code spliced in.
|
|
10
|
+
</p>
|
|
11
|
+
|
|
12
|
+
<p align="center">
|
|
13
|
+
<img src="https://img.shields.io/badge/python-3.11+-3776AB?style=flat-square&logo=python&logoColor=white" alt="Python 3.11+">
|
|
14
|
+
<img src="https://img.shields.io/badge/retrieval-no%20LLM%20·%20~200ms-2ea44f?style=flat-square" alt="No LLM in the retrieval path">
|
|
15
|
+
<img src="https://img.shields.io/badge/code-zero%20hallucination-6f42c1?style=flat-square" alt="Zero code hallucination">
|
|
16
|
+
<img src="https://img.shields.io/badge/MCP-ready-000000?style=flat-square" alt="MCP ready">
|
|
17
|
+
</p>
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
**megabrain** is a local code-intelligence engine. It replaces minutes of file-by-file
|
|
22
|
+
crawling — grep, read, explore-agent chains — with a single grounded answer. Index a repo
|
|
23
|
+
once; every later question retrieves *all* the related code and stitches it into a
|
|
24
|
+
walkthrough narrated by an LLM that can **only point at code, never rewrite it** — so
|
|
25
|
+
nothing is hallucinated.
|
|
26
|
+
|
|
27
|
+
## Install
|
|
28
|
+
|
|
29
|
+
No packaging step — runs straight from a clone:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
git clone https://github.com/pinecall/megabrain.git
|
|
33
|
+
cd megabrain
|
|
34
|
+
pip install numpy # core (Python indexing)
|
|
35
|
+
pip install tree_sitter tree_sitter_typescript # TS/JS (+ tree_sitter_ruby tree_sitter_go for Ruby/Go)
|
|
36
|
+
alias megabrain='python3 -m megabrain.cli' # optional: clean invocation
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Keys are read from the environment (with a `~/.zshrc` fallback):
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
export PERPLEXITY_API_KEY=... # required — embeddings
|
|
43
|
+
export ANTHROPIC_API_KEY=... # only for `ask` and `--best`
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Usage
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
megabrain index ~/repo # incremental (sha256), no daemon
|
|
50
|
+
megabrain ask ~/repo "how does auth work end to end" # walkthrough + real code (~6–20s)
|
|
51
|
+
megabrain ask ~/repo "how do I configure X" --docs # explain the docs instead of code
|
|
52
|
+
megabrain query ~/repo "request retry logic" # raw code map, no LLM (~200ms)
|
|
53
|
+
megabrain get ~/repo src/x.py --symbol Class.method # one file or symbol
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Indexes code (`.py` · `.ts` · `.tsx` · `.js` · `.jsx` · `.mjs` · `.cjs` · Ruby · Go) and
|
|
57
|
+
markdown (`.md` · `.markdown` · `.mdx`) through a **strategy registry** — adding a language
|
|
58
|
+
or content type is a config entry, not a branch in the indexer.
|
|
59
|
+
|
|
60
|
+
## How it works
|
|
61
|
+
|
|
62
|
+
A three-stage pipeline. **Only `ask` calls an LLM — and only to narrate.**
|
|
63
|
+
|
|
64
|
+
| stage | what it does |
|
|
65
|
+
|---|---|
|
|
66
|
+
| **index** | cAST chunk → Perplexity embed (int8, L2-normalized) → SQLite. Incremental by `sha256`, no watcher. |
|
|
67
|
+
| **query** | No-LLM retrieval (~200ms): dense-chunk + file-skeleton fusion, with import/call-graph candidates. Returns a map — **CORE** (full code of the top files) + **RELATED** (every connected file with its best chunk). |
|
|
68
|
+
| **ask** | One streamed Haiku call writes the walkthrough and cites code as `[[k]]`; the engine **replaces each citation with the verbatim block** (real file, real line numbers). Non-cited related files are listed at the end. Fail-open: any API error falls back to the full `query` bundle. |
|
|
69
|
+
|
|
70
|
+
Because the model only emits citations and the engine splices code from disk, **code cannot
|
|
71
|
+
be hallucinated or rewritten.**
|
|
72
|
+
|
|
73
|
+
## MCP
|
|
74
|
+
|
|
75
|
+
Use it from Claude Code or any MCP client:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
claude mcp add megabrain -- python3 -m megabrain.mcp_server
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
Tools: `megabrain_ask` (primary), `megabrain_query`, `megabrain_get`, `megabrain_index`.
|
|
82
|
+
The server auto-refreshes a stale index before answering, so results always match disk.
|
|
83
|
+
|
|
84
|
+
## Design
|
|
85
|
+
|
|
86
|
+
Every choice below is backed by an internal golden set (30 verified queries):
|
|
87
|
+
|
|
88
|
+
| decision | evidence |
|
|
89
|
+
|---|---|
|
|
90
|
+
| cAST chunking (4K nws chars, breadcrumbs, partition-guaranteed) | unit-tested; every line lands in exactly one chunk — no gaps, no overlaps |
|
|
91
|
+
| `pplx-embed-v1` (1024-d, int8 wire, **L2-normalized**) | beats `openai-3-large` on code; ~$0.0016/repo |
|
|
92
|
+
| dense chunk + 0.5 × file-skeleton score | dual-granularity; precision up, no downside |
|
|
93
|
+
| graph (import + call edges) for candidates only | PageRank-as-ranking **rejected** by data (Acc@1 0.91 → 0.73) |
|
|
94
|
+
| **no LLM in the retrieval path** | every LLM *prune* variant cost completeness; `ask` explains, it never prunes |
|
|
95
|
+
|
|
96
|
+
**Engine retrieval** (internal golden set): R@1 **0.86** · bundle\_full **1.00** · p50 **8 ms** warm.
|
|
97
|
+
**SWE-bench Lite** localization (no training): retrieval Acc@1 ≈ 0.52 / @5 ≈ 0.83 — on par
|
|
98
|
+
with the trained CodeRankEmbed retriever.
|
|
99
|
+
|
|
100
|
+
## Project layout
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
megabrain/ engine — chunkers, embeddings, SQLite store, graph, indexer, query, ask, cli, mcp_server
|
|
104
|
+
evals/ golden.json (30 verified queries) + swebench harness
|
|
105
|
+
tests/ engine + chunker gates
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
<p align="center"><sub>github.com/pinecall/megabrain</sub></p>
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""megabrain — code-intelligence engine: one-shot retrieval of all code related
|
|
2
|
+
to a feature, as a view-ready map.
|
|
3
|
+
|
|
4
|
+
Validated configuration (experiments phases 0-5, June 2026):
|
|
5
|
+
- chunking: cAST split-then-merge, 4000 nws chars, breadcrumb headers
|
|
6
|
+
- embeddings: pplx-embed-v1-0.6b (1024d, int8 wire format, L2-normalized)
|
|
7
|
+
- scoring: dense chunk cosine + 0.5 * file-skeleton cosine
|
|
8
|
+
- graph: import+call edges; used for bundle candidates and map annotations,
|
|
9
|
+
NOT for ranking (PageRank rejected by experiment)
|
|
10
|
+
- pruning: OFF by default (LLM pruning costs completeness); --prune optional
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
"""megabrain ask — agent-style explained answer with cherry-picked REAL code.
|
|
2
|
+
|
|
3
|
+
The LLM explains the answer like an agent walking through the codebase, but
|
|
4
|
+
it cannot paste code: it cites chunks as [[3]] or [[3:705-731]] and the engine
|
|
5
|
+
REPLACES each citation with the real code block (file header + fenced code,
|
|
6
|
+
true line numbers). Explanation = LLM; every line of code = verbatim from
|
|
7
|
+
disk. Streamed, ~1-3s. Fail-open: no citations / API error -> full bundle.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import re
|
|
14
|
+
import sys
|
|
15
|
+
import time
|
|
16
|
+
import urllib.request
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
from .query import lang_of, render, search
|
|
20
|
+
from .rerank import _key
|
|
21
|
+
from .strategies import MarkdownStrategy
|
|
22
|
+
|
|
23
|
+
# ask is a CODE walkthrough: docs (markdown) are excluded from its candidates so a
# code explanation isn't diluted with prose. docs_only flips it to a docs-only
# walkthrough. Docs stay retrievable via `query` regardless.
DOC_EXTS = MarkdownStrategy.exts

# Model id placed in the request body built by _build_body.
MODEL = "claude-haiku-4-5"
MAX_CTX_CHARS = 200_000  # ~50K tokens of candidate code; Haiku window is 200K
# double-bracket so the model can still mention [n] in prose without collision.
# Tolerate an "L" prefix and stray spaces on the line range: the chunk headers in
# the prompt read "L1-172", so the model often mirrors that as [[0:L1-172]] — accept
# it (and [[3:705-731]], [[3]]) instead of leaking the citation as raw text.
# Groups: 1 = candidate index, 2 and 3 = optional first/last line of a sub-range
# (both are None when the citation is a bare [[k]]).
_SEL = re.compile(r"\[\[(\d+)(?::\s*[Ll]?(\d+)\s*-\s*[Ll]?(\d+))?\s*\]\]")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _candidates(res: dict, docs_only: bool = False) -> list[dict]:
|
|
38
|
+
"""Retrieved chunks for the walkthrough: CORE chunks + RELATED best chunks,
|
|
39
|
+
numbered. By default docs (markdown) are excluded — ask is a code walkthrough and
|
|
40
|
+
citing doc prose pollutes it. docs_only=True flips it to a docs-only walkthrough.
|
|
41
|
+
`query` surfaces both regardless of this setting."""
|
|
42
|
+
def keep(f: str) -> bool:
|
|
43
|
+
is_doc = f.endswith(DOC_EXTS)
|
|
44
|
+
return is_doc if docs_only else not is_doc
|
|
45
|
+
out = []
|
|
46
|
+
for t in res["tier1"]:
|
|
47
|
+
if not keep(t["file"]):
|
|
48
|
+
continue
|
|
49
|
+
for c in t["chunks"]:
|
|
50
|
+
out.append({"file": t["file"], **{k: c[k] for k in
|
|
51
|
+
("name", "kind", "start_line", "end_line", "text")}})
|
|
52
|
+
for t in res["tier2"]:
|
|
53
|
+
if not keep(t["file"]):
|
|
54
|
+
continue
|
|
55
|
+
bc = t.get("best_chunk")
|
|
56
|
+
if bc:
|
|
57
|
+
out.append({"file": t["file"], **{k: bc[k] for k in
|
|
58
|
+
("name", "kind", "start_line", "end_line", "text")}})
|
|
59
|
+
return out
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
_RULES = """- NEVER paste or quote code. Cite it with DOUBLE brackets: [[3]] (whole chunk) or [[3:705-731]] (file lines 705-731 of chunk 3). Each such citation is REPLACED by the real code block in your answer, so explain AROUND the code, not the code itself. (If you ever need to mention the citation syntax itself in prose, use single brackets — only [[...]] gets replaced.)
|
|
63
|
+
- Put each [[...]] citation on its own line, right after the sentence that introduces it.
|
|
64
|
+
- Show GENEROUS, COMPLETE code: cite whole [[k]] chunks (a full function/class/block) by default so the reader sees the complete implementation, not a fragment. Only use a [[k:lo-hi]] sub-range when a chunk is very large and only one section is relevant — and then take the WHOLE enclosing function, not a few lines. Never cite the same span twice.
|
|
65
|
+
- Structure it: use ## section headings for each phase of the flow, 1-3 sentences of explanation per citation. Be thorough — the reader must understand everything perfectly from the code shown, without opening any file.
|
|
66
|
+
- Finish the thought: end with a short "## Summary" of the flow in 2-3 sentences. Never end mid-sentence."""
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _build_body(question: str, cands: list[dict]) -> dict:
    """Build the Anthropic request body for the cite-only walkthrough prompt.

    Each candidate becomes a numbered block: a ``[i] file Lstart-end (name)``
    header line followed by the chunk text. The number ``i`` is the index the
    model must use in its ``[[i]]`` citations.

    A running character count is kept against ``MAX_CTX_CHARS``. A chunk that
    would push the total over the budget is cut to its first 2000 characters
    and tagged with a "...truncated..." marker. Chunks that are already no
    longer than that are passed through whole, so the model is never told that
    complete code was cut.

    Args:
        question: the developer's query, inserted verbatim into the prompt.
        cands: candidate chunk dicts with ``file``, ``start_line``,
            ``end_line``, ``name`` and ``text`` keys.

    Returns:
        A dict with ``model``, ``max_tokens``, ``temperature``, ``stream`` and a
        single user message carrying the prompt.
    """
    head_chars = 2000
    blocks, used = [], 0
    for i, c in enumerate(cands):
        head = f'[{i}] {c["file"]} L{c["start_line"]}-{c["end_line"]}' + \
            (f' ({c["name"]})' if c["name"] else "")
        body = c["text"]
        # Only mark a chunk as truncated when characters are actually dropped.
        if used + len(body) > MAX_CTX_CHARS and len(body) > head_chars:
            body = body[:head_chars] + "\n# ...truncated...\n"
        used += len(body)
        blocks.append(f"{head}\n{body}")
    prompt = f"""You are a senior engineer giving a complete code walkthrough that answers the developer's query. Cover the ENTIRE relevant flow end to end — do not stop early, do not leave a thread dangling.

STRICT RULES:
{_RULES}

QUERY: {question}

RETRIEVED CHUNKS:

{chr(10).join(blocks)}"""
    return {"model": MODEL, "max_tokens": 2400, "temperature": 0, "stream": True,
            "messages": [{"role": "user", "content": prompt}]}
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _explain_stream(question: str, cands: list[dict], key: str) -> str:
    """Run one streamed model call and return the walkthrough text.

    The returned prose still contains the raw ``[[k]]`` / ``[[k:lo-hi]]``
    citations; splicing in the code happens elsewhere. When the model stopped
    because it hit ``max_tokens``, the text is cut back to the last paragraph
    break or sentence end (whichever is later) and a truncation note is
    appended. If no such boundary exists the text is returned untouched.
    """
    request = _build_body(question, cands)
    answer, stop_reason = _stream_with_retry(request, key)
    if stop_reason != "max_tokens":
        return answer

    # Back up to the last clean boundary so the output never ends mid-sentence.
    boundary = max(answer.rfind("\n\n"), answer.rfind(". "))
    if boundary <= 0:
        return answer
    kept = answer[:boundary + 1].rstrip()
    return kept + "\n\n_(walkthrough truncated — ask a narrower question for the rest)_"
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _stream_with_retry(body: dict, key: str, retries: int = 4,
|
|
105
|
+
on_delta=None) -> tuple[str, str]:
|
|
106
|
+
"""Streamed Anthropic call with backoff on 429/5xx/overloaded. Returns (text, stop).
|
|
107
|
+
If on_delta is given it's called with each text delta (live rendering); once any
|
|
108
|
+
delta has been emitted we stop retrying, so the terminal never sees duplicate text."""
|
|
109
|
+
import time as _t
|
|
110
|
+
last = None
|
|
111
|
+
emitted = False
|
|
112
|
+
for attempt in range(retries):
|
|
113
|
+
req = urllib.request.Request(
|
|
114
|
+
"https://api.anthropic.com/v1/messages", data=json.dumps(body).encode(),
|
|
115
|
+
headers={"x-api-key": key, "anthropic-version": "2023-06-01",
|
|
116
|
+
"content-type": "application/json"})
|
|
117
|
+
text, stop = "", ""
|
|
118
|
+
try:
|
|
119
|
+
with urllib.request.urlopen(req, timeout=90) as r:
|
|
120
|
+
for raw in r:
|
|
121
|
+
line = raw.decode("utf-8", "replace").strip()
|
|
122
|
+
if not line.startswith("data: "):
|
|
123
|
+
continue
|
|
124
|
+
try:
|
|
125
|
+
ev = json.loads(line[6:])
|
|
126
|
+
except json.JSONDecodeError:
|
|
127
|
+
continue
|
|
128
|
+
t = ev.get("type")
|
|
129
|
+
if t == "content_block_delta":
|
|
130
|
+
d = ev["delta"].get("text", "")
|
|
131
|
+
text += d
|
|
132
|
+
if d and on_delta is not None:
|
|
133
|
+
on_delta(d)
|
|
134
|
+
emitted = True
|
|
135
|
+
elif t == "message_delta":
|
|
136
|
+
stop = ev.get("delta", {}).get("stop_reason") or stop
|
|
137
|
+
elif t == "error": # mid-stream overloaded_error etc.
|
|
138
|
+
raise urllib.error.HTTPError(req.full_url, 529, "stream error", None, None)
|
|
139
|
+
return text, stop
|
|
140
|
+
except urllib.error.HTTPError as e:
|
|
141
|
+
last = e
|
|
142
|
+
if emitted: # already streamed live: a retry would double-print
|
|
143
|
+
raise
|
|
144
|
+
if e.code in (429, 500, 502, 503, 529) and attempt < retries - 1:
|
|
145
|
+
_t.sleep(2 ** attempt)
|
|
146
|
+
continue
|
|
147
|
+
raise
|
|
148
|
+
except (urllib.error.URLError, TimeoutError) as e:
|
|
149
|
+
last = e
|
|
150
|
+
if emitted:
|
|
151
|
+
raise
|
|
152
|
+
if attempt < retries - 1:
|
|
153
|
+
_t.sleep(2 ** attempt)
|
|
154
|
+
continue
|
|
155
|
+
raise
|
|
156
|
+
raise last if last else RuntimeError("unreachable")
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _code_block(c: dict, lo: int | None, hi: int | None, seen: set,
|
|
160
|
+
file_syms: dict[str, list[dict]]) -> str:
|
|
161
|
+
cs, ce = c["start_line"], c["end_line"]
|
|
162
|
+
s, e = cs, ce
|
|
163
|
+
if lo is not None and hi is not None and not (hi < cs or lo > ce):
|
|
164
|
+
s, e = max(lo, cs), min(hi, ce)
|
|
165
|
+
_FN = ("function", "async_function", "method", "async_method", "class")
|
|
166
|
+
syms = [y for y in file_syms.get(c["file"], []) if y["kind"] in _FN]
|
|
167
|
+
if (s, e) != (cs, ce):
|
|
168
|
+
# snap to enclosing symbol edges when close (readable boundaries)
|
|
169
|
+
encl = [y for y in syms if y["line"] <= e and y["end_line"] >= s]
|
|
170
|
+
if encl:
|
|
171
|
+
best = min(encl, key=lambda y: y["end_line"] - y["line"])
|
|
172
|
+
if 0 < s - best["line"] <= 8:
|
|
173
|
+
s = max(best["line"], cs)
|
|
174
|
+
if 0 < best["end_line"] - e <= 8:
|
|
175
|
+
e = min(best["end_line"], ce)
|
|
176
|
+
# trim orphan tail of a previous symbol at the head of the range
|
|
177
|
+
nexts = sorted(y["line"] for y in syms if s < y["line"] <= min(s + 8, e))
|
|
178
|
+
if nexts:
|
|
179
|
+
owner = [y for y in syms if y["line"] < s <= y["end_line"]
|
|
180
|
+
and y["end_line"] < nexts[0]]
|
|
181
|
+
if owner:
|
|
182
|
+
s = nexts[0]
|
|
183
|
+
lines = c["text"].splitlines(keepends=True)
|
|
184
|
+
text = "".join(lines[s - cs:e - cs + 1])
|
|
185
|
+
key = (c["file"], s, e)
|
|
186
|
+
if key in seen:
|
|
187
|
+
return f'*(see `{c["file"]}:L{s}-{e}` above)*'
|
|
188
|
+
seen.add(key)
|
|
189
|
+
# label = most specific symbols overlapping the emitted range
|
|
190
|
+
inside = [y for y in syms if not (y["end_line"] < s or y["line"] > e)]
|
|
191
|
+
inside.sort(key=lambda y: y["end_line"] - y["line"])
|
|
192
|
+
tight = [y for y in inside if (y["end_line"] - y["line"]) <= 3 * (e - s + 1)]
|
|
193
|
+
label = ", ".join(dict.fromkeys(y["name"] for y in (tight or inside)[:2])) \
|
|
194
|
+
or (c["name"] or c["kind"])
|
|
195
|
+
return (f'\n**`{c["file"]}` L{s}-{e}** — {label}\n'
|
|
196
|
+
f'```{lang_of(c["file"])}\n{text.rstrip(chr(10))}\n```\n')
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def ask(root: Path, question: str, rerank: bool = False,
        docs_only: bool = False) -> dict:
    """Retrieve candidates for ``question`` and have the model narrate them.

    Runs the search, derives the numbered candidate list, and, when an API key
    and at least one candidate are available, requests the walkthrough text.
    Any exception from the model call is swallowed on purpose and leaves
    ``text`` empty, which makes ``render_ask`` fall back to the full bundle.

    Returns a dict with the raw search ``result``, the ``cands`` list, the
    walkthrough ``text`` (citations not yet expanded), per-file symbols in
    ``file_syms``, the two timings in milliseconds, the ``query`` and ``repo``.
    """
    repo_root = Path(root)

    started = time.time()
    res = search(repo_root, question, rerank=rerank)
    retrieval_ms = int((time.time() - started) * 1000)

    cands = _candidates(res, docs_only)
    api_key = _key()

    text = ""
    llm_ms = 0
    if api_key and cands:
        llm_started = time.time()
        try:
            text = _explain_stream(question, cands, api_key)
        except Exception:
            # fail-open: an empty text triggers the bundle fallback downstream
            text = ""
        llm_ms = int((time.time() - llm_started) * 1000)

    from .store import Store
    store = Store(repo_root)
    file_syms = {}
    for path in {c["file"] for c in cands}:
        file_syms[path] = store.symbols_for(path)

    return {
        "result": res,
        "cands": cands,
        "text": text,
        "file_syms": file_syms,
        "retrieval_ms": retrieval_ms,
        "llm_ms": llm_ms,
        "query": question,
        "repo": res["repo"],
    }
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def cited_files(out: dict) -> list[str]:
    """Return the files the explanation cites, ordered by first mention (for eval).

    Citations whose index falls outside the candidate list are ignored, and a
    missing or empty ``text`` yields an empty list.
    """
    cands = out["cands"]
    total = len(cands)
    # A dict keeps insertion order and gives de-duplication for free.
    ordered: dict[str, None] = {}
    for match in _SEL.finditer(out["text"] or ""):
        idx = int(match.group(1))
        if 0 <= idx < total:
            ordered.setdefault(cands[idx]["file"], None)
    return list(ordered)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def render_ask(out: dict) -> str:
    """Turn the result of ``ask`` into the final markdown answer.

    Every ``[[k]]`` / ``[[k:lo-hi]]`` citation in the walkthrough text is
    replaced by the corresponding code block. Citations whose index is outside
    the candidate list are left as raw text.

    Fail-open: the plain ``render`` bundle is returned instead when there is no
    text, the text contains no citation at all, or none of its citations point
    at a real candidate. Prose that cites nothing valid is ungrounded.

    The output is a header line, a stats line, the spliced body, an optional
    "not cited" list (first 12 uncited candidates) and a footer hint.
    """
    cands, text = out["cands"], out["text"]
    if not text or not _SEL.search(text):
        return render(out["result"])    # fail-open: unfiltered bundle
    seen: set = set()    # (file, start, end) spans already emitted
    cited: set = set()   # candidate indices with at least one valid citation

    def sub(m):
        k = int(m.group(1))
        if not (0 <= k < len(cands)):
            return m.group(0)
        cited.add(k)
        lo = int(m.group(2)) if m.group(2) else None
        hi = int(m.group(3)) if m.group(3) else None
        return _code_block(cands[k], lo, hi, seen, out.get("file_syms", {}))

    body = _SEL.sub(sub, text).strip()
    if not cited:
        # Every citation was out of range, so no code was spliced in.
        return render(out["result"])
    n_files = len({cands[k]["file"] for k in cited})
    parts = [f'# megabrain — "{out["query"]}"',
             f'repo `{out["repo"]}` · {len(seen)} code spans · {n_files} files · '
             f'{out["retrieval_ms"]}ms retrieval + {out["llm_ms"]}ms explain\n',
             body]
    dropped = [c for i, c in enumerate(cands) if i not in cited]
    if dropped:
        items = ", ".join(f'{c["file"].rsplit("/", 1)[-1]}:{c["start_line"]}'
                          for c in dropped[:12])
        parts.append(f'\n— not cited ({len(dropped)}): {items}')
    parts.append('— full bundle: `megabrain query` · any file: `megabrain get <file>`')
    return "\n".join(parts)
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def stream_ask(root: Path, question: str, out=None, rerank: bool = False,
               show_map: bool = True, docs_only: bool = False) -> None:
    """Live-streaming `ask` for the terminal.

    Prose appears token by token and each [[k]]/[[k:lo-hi]] citation is spliced
    into its real code block as soon as its line completes (citations are emitted
    on their own line). Same grounding + fail-open as render_ask, but the reader
    sees output immediately instead of waiting for the whole walkthrough.
    Programmatic/eval/MCP callers keep using ask()/render_ask().

    Args:
        root: Repository root; handed to ``search()`` and ``Store()``.
        question: The user's question; also echoed in the header line.
        out: Object exposing ``write()`` and ``flush()``. ``None`` (the default)
            selects ``sys.stdout``.
        rerank: Forwarded unchanged to ``search()``.
        show_map: When true, append the "not cited" list and the pointer footer.
        docs_only: Forwarded unchanged to ``_candidates()``.

    Returns:
        None. Everything is written to ``out``.
    """
    # Test identity, not truthiness: a writer that happens to be falsy (e.g. one
    # defining __len__ that is currently 0) must not be swapped for stdout.
    out = sys.stdout if out is None else out

    def write(s: str):
        # Flush after every write so streamed text shows up immediately.
        out.write(s)
        out.flush()

    t0 = time.time()
    res = search(Path(root), question, rerank=rerank)
    retrieval_ms = int((time.time() - t0) * 1000)
    cands = _candidates(res, docs_only)
    key = _key()
    if not key or not cands:  # no LLM available / nothing retrieved
        write(render(res) + "\n")
        return

    from .store import Store
    st = Store(Path(root))
    # Look up symbols once per distinct file among the candidates.
    file_syms = {f: st.symbols_for(f) for f in {c["file"] for c in cands}}

    write(f'# megabrain — "{question}"\n')
    write(f'repo `{res["repo"]}` · {retrieval_ms}ms retrieval · streaming {MODEL}…\n\n')

    seen: set = set()   # passed to _code_block; its size is reported as "code spans"
    cited: set = set()  # candidate indices that were actually cited

    def sub(m):
        """Replace one citation match with its code block, or leave it untouched."""
        k = int(m.group(1))
        if not (0 <= k < len(cands)):
            return m.group(0)  # out-of-range index: keep the raw marker text
        cited.add(k)
        lo = int(m.group(2)) if m.group(2) else None
        hi = int(m.group(3)) if m.group(3) else None
        return _code_block(cands[k], lo, hi, seen, file_syms)

    pending = ""  # hold the in-progress line; citations live on their own line

    def on_delta(d: str):
        """Buffer streamed text and emit everything up to the last full line."""
        nonlocal pending
        pending += d
        nl = pending.rfind("\n")
        if nl != -1:
            ready, pending = pending[:nl + 1], pending[nl + 1:]
            write(_SEL.sub(sub, ready))

    t1 = time.time()
    interrupted = False
    stop = ""
    try:
        _, stop = _stream_with_retry(_build_body(question, cands), key, on_delta=on_delta)
    except Exception:
        # Deliberate fail-open: whatever already streamed stays on screen and the
        # failure is reported to the reader below instead of raising.
        interrupted = True
    if pending:  # flush the trailing partial line
        write(_SEL.sub(sub, pending))
        pending = ""
    llm_ms = int((time.time() - t1) * 1000)

    if not cited:  # fail-open: ungrounded prose -> show the bundle
        note = "_(explanation unavailable — full bundle below)_" if interrupted \
            else "_(no code cited — full bundle below)_"
        write(f"\n\n{note}\n\n{render(res)}\n")
        return
    if interrupted:
        # The stream died after some grounded output was shown; say so rather
        # than presenting a partial walkthrough as if it were complete.
        write("\n\n_(explanation interrupted — walkthrough above is incomplete)_")
    elif stop == "max_tokens":
        write("\n\n_(walkthrough truncated — ask a narrower question for the rest)_")

    n_files = len({cands[k]["file"] for k in cited})
    write(f'\n\n— {len(seen)} code spans · {n_files} files · '
          f'{retrieval_ms}ms retrieval + {llm_ms}ms explain\n')
    if show_map:
        dropped = [c for i, c in enumerate(cands) if i not in cited]
        if dropped:
            # Only the first 12 uncited candidates are listed, as basename:start_line.
            items = ", ".join(f'{c["file"].rsplit("/", 1)[-1]}:{c["start_line"]}'
                              for c in dropped[:12])
            write(f'— not cited ({len(dropped)}): {items}\n')
        # NOTE(review): indentation was lost in the reviewed view; this pointer line
        # is assumed to belong to the show_map footer (outside `if dropped`) — confirm.
        write('— full bundle: `megabrain query` · any file: `megabrain get <file>`\n')
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Sparse lexical channel over entity-IDs (LocAgent T4) — pure python, no deps.
|
|
2
|
+
|
|
3
|
+
Each file's document = its path + all symbol qualified names + signatures,
|
|
4
|
+
tokenized identifier-aware (split camelCase/snake_case). Catches issues that
|
|
5
|
+
mention a symbol descriptively when the dense embedding misses it.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import math
|
|
11
|
+
import re
|
|
12
|
+
from collections import Counter
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def tokenize(text: str) -> list[str]:
    """Split *text* into lowercase, identifier-aware tokens.

    Every identifier-like word or digit run contributes its whole lowercased
    form, followed by its snake_case / camelCase sub-parts that are longer than
    one character. A word made of a single part (e.g. ``foo``) is therefore
    emitted twice: once as the whole word and once as its only sub-part.
    """
    tokens: list[str] = []
    for match in re.finditer(r"[A-Za-z_][A-Za-z0-9_]*|\d+", text):
        word = match.group(0)
        tokens.append(word.lower())
        # Underscores separate snake_case pieces; each piece is then cut at
        # camelCase / acronym / digit boundaries.
        for piece in re.split(r"_+", word):
            subwords = re.findall(
                r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+", piece)
            tokens.extend(sub.lower() for sub in subwords if len(sub) > 1)
    return tokens
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class BM25:
    """BM25 scorer over a fixed corpus of pre-tokenized documents."""

    def __init__(self, docs: list[list[str]], k1: float = 1.2, b: float = 0.75):
        """Precompute per-document term counts, lengths and per-term idf.

        Args:
            docs: One token list per document.
            k1: Term-frequency saturation parameter.
            b: Length-normalization parameter.
        """
        self.k1 = k1
        self.b = b
        self.N = len(docs)
        self.tf = [Counter(doc) for doc in docs]
        self.dl = [len(doc) for doc in docs]
        # An empty corpus gets avgdl 0.0, which scores() treats as "no scores".
        self.avgdl = sum(self.dl) / self.N if self.N else 0.0
        # Document frequency: each term counts once per document containing it.
        doc_freq: Counter = Counter(
            term for counts in self.tf for term in counts)
        # The "1 +" inside the log keeps every idf strictly positive.
        self.idf = {
            term: math.log(1 + (self.N - n_docs + 0.5) / (n_docs + 0.5))
            for term, n_docs in doc_freq.items()
        }

    def scores(self, query: str):
        """Return a NumPy array holding one BM25 score per document.

        The query is tokenized with ``tokenize``; duplicate query terms and
        terms absent from the corpus are ignored. The array is all zeros when
        no query term is known or the corpus has zero average length.
        """
        import numpy as np  # imported lazily, only when scoring

        terms = [t for t in set(tokenize(query)) if t in self.idf]
        result = np.zeros(self.N)
        if not (terms and self.avgdl):
            return result
        for term in terms:
            weight = self.idf[term]
            for idx, counts in enumerate(self.tf):
                freq = counts.get(term, 0)
                if not freq:
                    continue
                # Length-normalized saturation term of the BM25 denominator.
                norm = self.k1 * (1 - self.b + self.b * self.dl[idx] / self.avgdl)
                result[idx] += weight * freq * (self.k1 + 1) / (freq + norm)
        return result
|