megabrain 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- megabrain/__init__.py +13 -0
- megabrain/ask.py +345 -0
- megabrain/bm25.py +52 -0
- megabrain/chunker.py +449 -0
- megabrain/chunker_ts.py +378 -0
- megabrain/cli.py +88 -0
- megabrain/embeddings.py +92 -0
- megabrain/graph.py +108 -0
- megabrain/indexer.py +100 -0
- megabrain/issue.py +120 -0
- megabrain/markdown.py +214 -0
- megabrain/mcp_server.py +156 -0
- megabrain/query.py +355 -0
- megabrain/rerank.py +69 -0
- megabrain/rerank2.py +86 -0
- megabrain/serve.py +282 -0
- megabrain/store.py +141 -0
- megabrain/strategies.py +144 -0
- megabrain-0.1.0.dist-info/METADATA +136 -0
- megabrain-0.1.0.dist-info/RECORD +24 -0
- megabrain-0.1.0.dist-info/WHEEL +5 -0
- megabrain-0.1.0.dist-info/entry_points.txt +2 -0
- megabrain-0.1.0.dist-info/licenses/LICENSE +21 -0
- megabrain-0.1.0.dist-info/top_level.txt +1 -0
megabrain/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""megabrain — code-intelligence engine: one-shot retrieval of all code related
|
|
2
|
+
to a feature, as a view-ready map.
|
|
3
|
+
|
|
4
|
+
Validated configuration (experiments phases 0-5, June 2026):
|
|
5
|
+
- chunking: cAST split-then-merge, 4000 nws chars, breadcrumb headers
|
|
6
|
+
- embeddings: pplx-embed-v1-0.6b (1024d, int8 wire format, L2-normalized)
|
|
7
|
+
- scoring: dense chunk cosine + 0.5 * file-skeleton cosine
|
|
8
|
+
- graph: import+call edges; used for bundle candidates and map annotations,
|
|
9
|
+
NOT for ranking (PageRank rejected by experiment)
|
|
10
|
+
- pruning: OFF by default (LLM pruning costs completeness); --prune optional
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
__version__ = "0.1.0"
|
megabrain/ask.py
ADDED
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
"""megabrain ask — agent-style explained answer with cherry-picked REAL code.
|
|
2
|
+
|
|
3
|
+
The LLM explains the answer like an agent walking through the codebase, but
|
|
4
|
+
it cannot paste code: it cites chunks as [[3]] or [[3:705-731]] and the engine
|
|
5
|
+
REPLACES each citation with the real code block (file header + fenced code,
|
|
6
|
+
true line numbers). Explanation = LLM; every line of code = verbatim from
|
|
7
|
+
disk. Streamed, ~1-3s. Fail-open: no citations / API error -> full bundle.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import re
|
|
14
|
+
import sys
|
|
15
|
+
import time
|
|
16
|
+
import urllib.request
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
|
|
19
|
+
from .query import lang_of, render, search
|
|
20
|
+
from .rerank import _key
|
|
21
|
+
from .strategies import MarkdownStrategy
|
|
22
|
+
|
|
23
|
+
# ask is a CODE walkthrough: docs (markdown) are excluded from its candidates so a
|
|
24
|
+
# code explanation isn't diluted with prose. docs_only flips it to a docs-only
|
|
25
|
+
# walkthrough. Docs stay retrievable via `query` regardless.
|
|
26
|
+
DOC_EXTS = MarkdownStrategy.exts
|
|
27
|
+
|
|
28
|
+
MODEL = "claude-haiku-4-5"
|
|
29
|
+
MAX_CTX_CHARS = 200_000 # ~50K tokens of candidate code; Haiku window is 200K
|
|
30
|
+
# double-bracket so the model can still mention [n] in prose without collision.
|
|
31
|
+
# Tolerate an "L" prefix and stray spaces on the line range: the chunk headers in
|
|
32
|
+
# the prompt read "L1-172", so the model often mirrors that as [[0:L1-172]] — accept
|
|
33
|
+
# it (and [[3:705-731]], [[3]]) instead of leaking the citation as raw text.
|
|
34
|
+
_SEL = re.compile(r"\[\[(\d+)(?::\s*[Ll]?(\d+)\s*-\s*[Ll]?(\d+))?\s*\]\]")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _candidates(res: dict, docs_only: bool = False) -> list[dict]:
    """Flatten a search result into the ordered chunk list that `ask` numbers.

    Every chunk of each `res["tier1"]` entry comes first, followed by the
    `best_chunk` of each `res["tier2"]` entry that has one. Entries whose file
    path ends with one of DOC_EXTS are dropped by default; with
    docs_only=True the filter is inverted and only those entries are kept.

    Each returned dict carries "file" plus the chunk's "name", "kind",
    "start_line", "end_line" and "text".
    """
    fields = ("name", "kind", "start_line", "end_line", "text")

    def wanted(path: str) -> bool:
        # Keep docs only in docs_only mode, keep non-docs otherwise.
        return path.endswith(DOC_EXTS) == bool(docs_only)

    def project(path: str, chunk: dict) -> dict:
        return {"file": path, **{name: chunk[name] for name in fields}}

    picked = [project(entry["file"], chunk)
              for entry in res["tier1"] if wanted(entry["file"])
              for chunk in entry["chunks"]]
    for entry in res["tier2"]:
        if wanted(entry["file"]):
            best = entry.get("best_chunk")
            if best:
                picked.append(project(entry["file"], best))
    return picked
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
_RULES = """- NEVER paste or quote code. Cite it with DOUBLE brackets: [[3]] (whole chunk) or [[3:705-731]] (file lines 705-731 of chunk 3). Each such citation is REPLACED by the real code block in your answer, so explain AROUND the code, not the code itself. (If you ever need to mention the citation syntax itself in prose, use single brackets — only [[...]] gets replaced.)
|
|
63
|
+
- Put each [[...]] citation on its own line, right after the sentence that introduces it.
|
|
64
|
+
- Show GENEROUS, COMPLETE code: cite whole [[k]] chunks (a full function/class/block) by default so the reader sees the complete implementation, not a fragment. Only use a [[k:lo-hi]] sub-range when a chunk is very large and only one section is relevant — and then take the WHOLE enclosing function, not a few lines. Never cite the same span twice.
|
|
65
|
+
- Structure it: use ## section headings for each phase of the flow, 1-3 sentences of explanation per citation. Be thorough — the reader must understand everything perfectly from the code shown, without opening any file.
|
|
66
|
+
- Finish the thought: end with a short "## Summary" of the flow in 2-3 sentences. Never end mid-sentence."""
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _build_body(question: str, cands: list[dict]) -> dict:
    """Build the Anthropic request body for the cite-only walkthrough prompt.

    Each candidate becomes a block headed `[i] <file> L<start>-<end> (<name>)`
    followed by its text; `i` is the index the model cites as [[i]].

    Once the running total of chunk text would exceed MAX_CTX_CHARS, further
    chunks are cut to their first 2000 characters and tagged with a
    "# ...truncated..." marker. A chunk that already fits in 2000 characters is
    passed through unchanged: marking it as truncated would tell the model a
    complete chunk is incomplete.

    Returns a dict with "model", "max_tokens", "temperature", "stream" and a
    single user message holding the prompt.
    """
    head_limit = 2000
    blocks, used = [], 0
    for i, c in enumerate(cands):
        head = f'[{i}] {c["file"]} L{c["start_line"]}-{c["end_line"]}'
        if c["name"]:
            head += f' ({c["name"]})'
        body = c["text"]
        over_budget = used + len(body) > MAX_CTX_CHARS
        if over_budget and len(body) > head_limit:
            body = body[:head_limit] + "\n# ...truncated...\n"
        used += len(body)
        blocks.append(f"{head}\n{body}")
    prompt = f"""You are a senior engineer giving a complete code walkthrough that answers the developer's query. Cover the ENTIRE relevant flow end to end — do not stop early, do not leave a thread dangling.

STRICT RULES:
{_RULES}

QUERY: {question}

RETRIEVED CHUNKS:

{chr(10).join(blocks)}"""
    return {"model": MODEL, "max_tokens": 2400, "temperature": 0, "stream": True,
            "messages": [{"role": "user", "content": prompt}]}
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _explain_stream(question: str, cands: list[dict], key: str) -> str:
    """Run one streamed model call and return the explanation text.

    The text contains [[k]] / [[k:lo-hi]] citations for the caller to expand.
    When the model stopped on "max_tokens", the text is cut back to the last
    paragraph break or sentence end (whichever is later) and a truncation
    note is appended; if no such cut point exists the text is returned as-is.
    """
    text, stop = _stream_with_retry(_build_body(question, cands), key)
    if stop != "max_tokens":
        return text
    cut = max(text.rfind("\n\n"), text.rfind(". "))
    if cut <= 0:
        return text
    kept = text[:cut + 1].rstrip()
    return kept + "\n\n_(walkthrough truncated — ask a narrower question for the rest)_"
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _stream_with_retry(body: dict, key: str, retries: int = 4,
|
|
105
|
+
on_delta=None) -> tuple[str, str]:
|
|
106
|
+
"""Streamed Anthropic call with backoff on 429/5xx/overloaded. Returns (text, stop).
|
|
107
|
+
If on_delta is given it's called with each text delta (live rendering); once any
|
|
108
|
+
delta has been emitted we stop retrying, so the terminal never sees duplicate text."""
|
|
109
|
+
import time as _t
|
|
110
|
+
last = None
|
|
111
|
+
emitted = False
|
|
112
|
+
for attempt in range(retries):
|
|
113
|
+
req = urllib.request.Request(
|
|
114
|
+
"https://api.anthropic.com/v1/messages", data=json.dumps(body).encode(),
|
|
115
|
+
headers={"x-api-key": key, "anthropic-version": "2023-06-01",
|
|
116
|
+
"content-type": "application/json"})
|
|
117
|
+
text, stop = "", ""
|
|
118
|
+
try:
|
|
119
|
+
with urllib.request.urlopen(req, timeout=90) as r:
|
|
120
|
+
for raw in r:
|
|
121
|
+
line = raw.decode("utf-8", "replace").strip()
|
|
122
|
+
if not line.startswith("data: "):
|
|
123
|
+
continue
|
|
124
|
+
try:
|
|
125
|
+
ev = json.loads(line[6:])
|
|
126
|
+
except json.JSONDecodeError:
|
|
127
|
+
continue
|
|
128
|
+
t = ev.get("type")
|
|
129
|
+
if t == "content_block_delta":
|
|
130
|
+
d = ev["delta"].get("text", "")
|
|
131
|
+
text += d
|
|
132
|
+
if d and on_delta is not None:
|
|
133
|
+
on_delta(d)
|
|
134
|
+
emitted = True
|
|
135
|
+
elif t == "message_delta":
|
|
136
|
+
stop = ev.get("delta", {}).get("stop_reason") or stop
|
|
137
|
+
elif t == "error": # mid-stream overloaded_error etc.
|
|
138
|
+
raise urllib.error.HTTPError(req.full_url, 529, "stream error", None, None)
|
|
139
|
+
return text, stop
|
|
140
|
+
except urllib.error.HTTPError as e:
|
|
141
|
+
last = e
|
|
142
|
+
if emitted: # already streamed live: a retry would double-print
|
|
143
|
+
raise
|
|
144
|
+
if e.code in (429, 500, 502, 503, 529) and attempt < retries - 1:
|
|
145
|
+
_t.sleep(2 ** attempt)
|
|
146
|
+
continue
|
|
147
|
+
raise
|
|
148
|
+
except (urllib.error.URLError, TimeoutError) as e:
|
|
149
|
+
last = e
|
|
150
|
+
if emitted:
|
|
151
|
+
raise
|
|
152
|
+
if attempt < retries - 1:
|
|
153
|
+
_t.sleep(2 ** attempt)
|
|
154
|
+
continue
|
|
155
|
+
raise
|
|
156
|
+
raise last if last else RuntimeError("unreachable")
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def _code_block(c: dict, lo: int | None, hi: int | None, seen: set,
|
|
160
|
+
file_syms: dict[str, list[dict]]) -> str:
|
|
161
|
+
cs, ce = c["start_line"], c["end_line"]
|
|
162
|
+
s, e = cs, ce
|
|
163
|
+
if lo is not None and hi is not None and not (hi < cs or lo > ce):
|
|
164
|
+
s, e = max(lo, cs), min(hi, ce)
|
|
165
|
+
_FN = ("function", "async_function", "method", "async_method", "class")
|
|
166
|
+
syms = [y for y in file_syms.get(c["file"], []) if y["kind"] in _FN]
|
|
167
|
+
if (s, e) != (cs, ce):
|
|
168
|
+
# snap to enclosing symbol edges when close (readable boundaries)
|
|
169
|
+
encl = [y for y in syms if y["line"] <= e and y["end_line"] >= s]
|
|
170
|
+
if encl:
|
|
171
|
+
best = min(encl, key=lambda y: y["end_line"] - y["line"])
|
|
172
|
+
if 0 < s - best["line"] <= 8:
|
|
173
|
+
s = max(best["line"], cs)
|
|
174
|
+
if 0 < best["end_line"] - e <= 8:
|
|
175
|
+
e = min(best["end_line"], ce)
|
|
176
|
+
# trim orphan tail of a previous symbol at the head of the range
|
|
177
|
+
nexts = sorted(y["line"] for y in syms if s < y["line"] <= min(s + 8, e))
|
|
178
|
+
if nexts:
|
|
179
|
+
owner = [y for y in syms if y["line"] < s <= y["end_line"]
|
|
180
|
+
and y["end_line"] < nexts[0]]
|
|
181
|
+
if owner:
|
|
182
|
+
s = nexts[0]
|
|
183
|
+
lines = c["text"].splitlines(keepends=True)
|
|
184
|
+
text = "".join(lines[s - cs:e - cs + 1])
|
|
185
|
+
key = (c["file"], s, e)
|
|
186
|
+
if key in seen:
|
|
187
|
+
return f'*(see `{c["file"]}:L{s}-{e}` above)*'
|
|
188
|
+
seen.add(key)
|
|
189
|
+
# label = most specific symbols overlapping the emitted range
|
|
190
|
+
inside = [y for y in syms if not (y["end_line"] < s or y["line"] > e)]
|
|
191
|
+
inside.sort(key=lambda y: y["end_line"] - y["line"])
|
|
192
|
+
tight = [y for y in inside if (y["end_line"] - y["line"]) <= 3 * (e - s + 1)]
|
|
193
|
+
label = ", ".join(dict.fromkeys(y["name"] for y in (tight or inside)[:2])) \
|
|
194
|
+
or (c["name"] or c["kind"])
|
|
195
|
+
return (f'\n**`{c["file"]}` L{s}-{e}** — {label}\n'
|
|
196
|
+
f'```{lang_of(c["file"])}\n{text.rstrip(chr(10))}\n```\n')
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def ask(root: Path, question: str, rerank: bool = False,
        docs_only: bool = False) -> dict:
    """Retrieve candidates for `question` and get a cited explanation.

    Runs `search`, builds the candidate list, and calls the model when both an
    API key and at least one candidate are available. Any exception from the
    model call is swallowed on purpose (fail-open): "text" is then empty.

    Returns a dict with "result", "cands", "text", "file_syms" (symbols per
    candidate file), "retrieval_ms", "llm_ms", "query" and "repo".
    """
    repo_root = Path(root)

    started = time.time()
    res = search(repo_root, question, rerank=rerank)
    retrieval_ms = int((time.time() - started) * 1000)

    cands = _candidates(res, docs_only)
    key = _key()
    text, llm_ms = "", 0
    if key and cands:
        llm_started = time.time()
        try:
            text = _explain_stream(question, cands, key)
        except Exception:
            # Fail-open: the caller falls back to the full bundle.
            text = ""
        llm_ms = int((time.time() - llm_started) * 1000)

    from .store import Store
    store = Store(repo_root)
    cand_files = {c["file"] for c in cands}
    file_syms = {f: store.symbols_for(f) for f in cand_files}
    return {
        "result": res,
        "cands": cands,
        "text": text,
        "file_syms": file_syms,
        "retrieval_ms": retrieval_ms,
        "llm_ms": llm_ms,
        "query": question,
        "repo": res["repo"],
    }
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def cited_files(out: dict) -> list[str]:
    """Return the files cited in `out["text"]`, in first-mention order (for eval).

    Citations whose index falls outside `out["cands"]` are ignored, and each
    file appears once.
    """
    cands = out["cands"]
    ordered: dict[str, None] = {}  # insertion-ordered de-duplication
    for match in _SEL.finditer(out["text"] or ""):
        idx = int(match.group(1))
        if 0 <= idx < len(cands):
            ordered.setdefault(cands[idx]["file"], None)
    return list(ordered)
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def render_ask(out: dict) -> str:
    """Render the result of `ask` as markdown with citations expanded.

    Every [[k]] / [[k:lo-hi]] citation with a valid index is replaced by the
    real code block; citations with an out-of-range index are left as written.
    When there is no explanation text, or it contains no citation at all, the
    plain `render` of the search result is returned instead (fail-open).
    """
    cands, text = out["cands"], out["text"]
    if not text or not _SEL.search(text):
        return render(out["result"])  # fail-open: unfiltered bundle

    seen: set = set()    # (file, start, end) spans already emitted
    cited: set = set()   # candidate indexes the explanation referred to
    file_syms = out.get("file_syms", {})

    def splice(match):
        idx = int(match.group(1))
        if idx < 0 or idx >= len(cands):
            return match.group(0)
        cited.add(idx)
        raw_lo, raw_hi = match.group(2), match.group(3)
        lo = int(raw_lo) if raw_lo else None
        hi = int(raw_hi) if raw_hi else None
        return _code_block(cands[idx], lo, hi, seen, file_syms)

    body = _SEL.sub(splice, text).strip()
    n_files = len({cands[idx]["file"] for idx in cited})
    parts = [
        f'# megabrain — "{out["query"]}"',
        f'repo `{out["repo"]}` · {len(seen)} code spans · {n_files} files · '
        f'{out["retrieval_ms"]}ms retrieval + {out["llm_ms"]}ms explain\n',
        body,
    ]
    dropped = [c for idx, c in enumerate(cands) if idx not in cited]
    if dropped:
        # List at most 12 uncited candidates as basename:start_line.
        items = ", ".join(f'{c["file"].rsplit("/", 1)[-1]}:{c["start_line"]}'
                          for c in dropped[:12])
        parts.append(f'\n— not cited ({len(dropped)}): {items}')
    parts.append('— full bundle: `megabrain query` · any file: `megabrain get <file>`')
    return "\n".join(parts)
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def stream_ask(root: Path, question: str, out=None, rerank: bool = False,
               show_map: bool = True, docs_only: bool = False) -> None:
    """Live-streaming `ask` for the terminal.

    Model output is buffered up to the last newline received; each completed
    stretch is written to `out` (default: sys.stdout) with its
    [[k]] / [[k:lo-hi]] citations replaced by real code blocks. Without an API
    key or candidates the plain `render` output is written instead. If the
    stream ends without a single valid citation, a note and the full bundle
    are appended (fail-open). With show_map=True a footer lists up to 12
    uncited candidates.
    """
    sink = out or sys.stdout

    def emit(s: str):
        sink.write(s)
        sink.flush()

    started = time.time()
    res = search(Path(root), question, rerank=rerank)
    retrieval_ms = int((time.time() - started) * 1000)
    cands = _candidates(res, docs_only)
    key = _key()
    if not (key and cands):  # no LLM available / nothing retrieved
        emit(render(res) + "\n")
        return

    from .store import Store
    store = Store(Path(root))
    file_syms = {f: store.symbols_for(f) for f in {c["file"] for c in cands}}

    emit(f'# megabrain — "{question}"\n')
    emit(f'repo `{res["repo"]}` · {retrieval_ms}ms retrieval · streaming {MODEL}…\n\n')

    seen: set = set()    # (file, start, end) spans already emitted
    cited: set = set()   # candidate indexes the explanation referred to

    def splice(match):
        idx = int(match.group(1))
        if idx < 0 or idx >= len(cands):
            return match.group(0)
        cited.add(idx)
        raw_lo, raw_hi = match.group(2), match.group(3)
        lo = int(raw_lo) if raw_lo else None
        hi = int(raw_hi) if raw_hi else None
        return _code_block(cands[idx], lo, hi, seen, file_syms)

    buffered = ""  # the in-progress line; citations live on their own line

    def on_delta(delta: str):
        nonlocal buffered
        buffered += delta
        complete, newline, rest = buffered.rpartition("\n")
        if newline:
            buffered = rest
            emit(_SEL.sub(splice, complete + newline))

    llm_started = time.time()
    interrupted = False
    stop = ""
    try:
        _, stop = _stream_with_retry(_build_body(question, cands), key,
                                     on_delta=on_delta)
    except Exception:
        interrupted = True
    if buffered:  # flush the trailing partial line
        emit(_SEL.sub(splice, buffered))
        buffered = ""
    llm_ms = int((time.time() - llm_started) * 1000)

    if not cited:  # fail-open: ungrounded prose -> show the bundle
        if interrupted:
            note = "_(explanation unavailable — full bundle below)_"
        else:
            note = "_(no code cited — full bundle below)_"
        emit(f"\n\n{note}\n\n{render(res)}\n")
        return
    if stop == "max_tokens":
        emit("\n\n_(walkthrough truncated — ask a narrower question for the rest)_")

    n_files = len({cands[idx]["file"] for idx in cited})
    emit(f'\n\n— {len(seen)} code spans · {n_files} files · '
         f'{retrieval_ms}ms retrieval + {llm_ms}ms explain\n')
    if not show_map:
        return
    dropped = [c for idx, c in enumerate(cands) if idx not in cited]
    if dropped:
        items = ", ".join(f'{c["file"].rsplit("/", 1)[-1]}:{c["start_line"]}'
                          for c in dropped[:12])
        emit(f'— not cited ({len(dropped)}): {items}\n')
    emit('— full bundle: `megabrain query` · any file: `megabrain get <file>`\n')
|
megabrain/bm25.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Sparse lexical channel over entity-IDs (LocAgent T4) — pure python, no deps.
|
|
2
|
+
|
|
3
|
+
Each file's document = its path + all symbol qualified names + signatures,
|
|
4
|
+
tokenized identifier-aware (split camelCase/snake_case). Catches issues that
|
|
5
|
+
mention a symbol descriptively when the dense embedding misses it.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import math
|
|
11
|
+
import re
|
|
12
|
+
from collections import Counter
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def tokenize(text: str) -> list[str]:
    """Split `text` into lowercase, identifier-aware tokens.

    Every identifier or digit run is emitted whole (lowercased), followed by
    its sub-words obtained by splitting on underscores and camelCase / acronym
    boundaries. Sub-words of a single character are dropped. Duplicates are
    kept, so a plain word such as "foo" appears twice.
    """
    tokens: list[str] = []
    for word in re.findall(r"[A-Za-z_][A-Za-z0-9_]*|\d+", text):
        tokens.append(word.lower())
        for part in re.split(r"_+", word):
            pieces = re.findall(
                r"[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+", part)
            tokens.extend(piece.lower() for piece in pieces if len(piece) > 1)
    return tokens
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class BM25:
    """BM25 scorer over pre-tokenized documents.

    `docs` is a list of token lists. The constructor stores per-document term
    counts (`tf`), document lengths (`dl`), the average length (`avgdl`, 0.0
    for an empty corpus) and an IDF table computed as
    log(1 + (N - n + 0.5) / (n + 0.5)), with n the number of documents
    containing the term.
    """

    def __init__(self, docs: list[list[str]], k1: float = 1.2, b: float = 0.75):
        self.k1 = k1
        self.b = b
        self.N = len(docs)
        self.tf = [Counter(doc) for doc in docs]
        self.dl = [len(doc) for doc in docs]
        self.avgdl = sum(self.dl) / self.N if self.N else 0.0
        # Document frequency: each document counts a term at most once.
        doc_freq: Counter = Counter(term for doc in docs for term in set(doc))
        self.idf = {term: math.log(1 + (self.N - n + 0.5) / (n + 0.5))
                    for term, n in doc_freq.items()}

    def scores(self, query: str):
        """Return a numpy array holding one BM25 score per document.

        The query is tokenized with `tokenize`; repeated query terms count
        once and terms absent from the corpus are ignored. The result is all
        zeros when no query term is known or the corpus has no tokens.
        """
        import numpy as np
        terms = [t for t in set(tokenize(query)) if t in self.idf]
        result = np.zeros(self.N)
        if not (terms and self.avgdl):
            return result
        for term in terms:
            weight = self.idf[term]
            for i, counts in enumerate(self.tf):
                f = counts.get(term, 0)
                if not f:
                    continue
                result[i] += weight * f * (self.k1 + 1) / (
                    f + self.k1 * (1 - self.b + self.b * self.dl[i] / self.avgdl))
        return result
|