codebase-retrieval-context-engine 2.0.4__py3-none-any.whl → 2.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codebase_retrieval_context_engine-2.0.4.dist-info → codebase_retrieval_context_engine-2.0.6.dist-info}/METADATA +10 -2
- {codebase_retrieval_context_engine-2.0.4.dist-info → codebase_retrieval_context_engine-2.0.6.dist-info}/RECORD +11 -11
- corbell/__init__.py +1 -1
- corbell/core/embeddings/model.py +4 -0
- corbell/core/mcp/server.py +100 -88
- corbell/core/query/engine.py +27 -15
- corbell/core/query/formatter.py +36 -4
- corbell/core/workspace.py +42 -24
- {codebase_retrieval_context_engine-2.0.4.dist-info → codebase_retrieval_context_engine-2.0.6.dist-info}/WHEEL +0 -0
- {codebase_retrieval_context_engine-2.0.4.dist-info → codebase_retrieval_context_engine-2.0.6.dist-info}/entry_points.txt +0 -0
- {codebase_retrieval_context_engine-2.0.4.dist-info → codebase_retrieval_context_engine-2.0.6.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codebase-retrieval-context-engine
|
|
3
|
-
Version: 2.0.4
|
|
3
|
+
Version: 2.0.6
|
|
4
4
|
Summary: Code retrieval engine — hybrid embedding + graph search for LLM context injection.
|
|
5
5
|
Project-URL: Homepage, https://github.com/nullmastermind/local-context-engine
|
|
6
6
|
Project-URL: Repository, https://github.com/nullmastermind/local-context-engine
|
|
@@ -71,11 +71,19 @@ Description-Content-Type: text/markdown
|
|
|
71
71
|
## Add to Claude Code
|
|
72
72
|
|
|
73
73
|
```bash
|
|
74
|
-
claude mcp add codebase-retrieval -
|
|
74
|
+
claude mcp add-json codebase-retrieval --scope user '{"type":"stdio","command":"uvx","args":["codebase-retrieval-context-engine"],"env":{"CORBELL_LLM_PROVIDER":"google","GOOGLE_API_KEY":"your-google-api-key","GOOGLE_MODEL":"gemini-3.1-flash-lite","CORBELL_EMBEDDING_MODEL":"voyage-4-lite","VOYAGE_API_KEY":"your-voyage-api-key"}}'
|
|
75
75
|
```
|
|
76
76
|
|
|
77
77
|
That's it. The AI agent passes the workspace path and triggers index builds automatically.
|
|
78
78
|
|
|
79
|
+
## Remove from Claude Code
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
claude mcp remove codebase-retrieval --scope user
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
After adding, you can also edit or remove the MCP config directly in `~/.claude.json`.
|
|
86
|
+
|
|
79
87
|
---
|
|
80
88
|
|
|
81
89
|
## Environment variables
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
corbell/__init__.py,sha256=
|
|
1
|
+
corbell/__init__.py,sha256=8lQdcrCgCID8TDejlPX3pfWw3rZhnPOMOtaxf-XRMtY,124
|
|
2
2
|
corbell/cli/__init__.py,sha256=5-MP6JIWgp4nDLNIhqP6Gtx97GESaIYg3NGxtRGaMv0,28
|
|
3
3
|
corbell/cli/main.py,sha256=CP5EHizFLaBLF1EohgVo_-XFlm4VaO6peQaSnzyfxAI,1954
|
|
4
4
|
corbell/cli/commands/__init__.py,sha256=0mAOs3RWC7XMZnGRN677hjPCHHQKDq9ASjIr_GQM3js,37
|
|
@@ -9,12 +9,12 @@ corbell/core/__init__.py,sha256=VS9PnhHr4NXYlWs1TLCyllnVCNsiwVZ1Xj-AOBhZpAU,29
|
|
|
9
9
|
corbell/core/constants.py,sha256=P0fCJ0J5V2Nt348ZAVH1bHd9dFPJRLtpUyQhHPAl0_8,1203
|
|
10
10
|
corbell/core/gitignore.py,sha256=UO588tAxSVv7YEGNDjzdcBys_aqMIAhXrDgToRfcnzc,2347
|
|
11
11
|
corbell/core/llm_client.py,sha256=qGKuptxMAMDwqvhGAKVjppf2p-sX-auaA26WKo6Nlkk,26221
|
|
12
|
-
corbell/core/workspace.py,sha256=
|
|
12
|
+
corbell/core/workspace.py,sha256=qpBJNoxYmt-2OOx4K8bSsoJPgjEPDM3IKSYHMm6H54M,15130
|
|
13
13
|
corbell/core/embeddings/__init__.py,sha256=RCekvfNkFuMGEDLnls78i3znR84cTdnj4KJ_PeQrMNg,213
|
|
14
14
|
corbell/core/embeddings/base.py,sha256=udPW4XmcPhCpNQA6n8KqMcu2JXvVNv1JjdRJmFq5ZRA,2175
|
|
15
15
|
corbell/core/embeddings/extractor.py,sha256=2_BxRpsUcz-C-3HXjvlARqM3U5dzHRJcPR_hhPdMxSE,7314
|
|
16
16
|
corbell/core/embeddings/factory.py,sha256=Lonjbk8Lsxykz-2ZEgFCWoH9zZ005Qm4dXVdA6P4qJY,1817
|
|
17
|
-
corbell/core/embeddings/model.py,sha256=
|
|
17
|
+
corbell/core/embeddings/model.py,sha256=QYQy7W0iuce3ZHFXuNLHMnkqg5axQIyeYLpOBk2qpf8,14458
|
|
18
18
|
corbell/core/embeddings/search_cache.py,sha256=FHzO3mu4m4MJGy2jOFwb9GCEypcT11CcVrLts4Ib0ho,3351
|
|
19
19
|
corbell/core/embeddings/sqlite_store.py,sha256=99lHU_gPYwKw9BhUMS-XimQI8vDpBbBrIc_RkrsVdOM,11676
|
|
20
20
|
corbell/core/graph/__init__.py,sha256=VaxDKeXMgMEBBMC0dglwj68A_aNYRI5O8VM6oMC1GIM,29
|
|
@@ -31,17 +31,17 @@ corbell/core/indexing/builder.py,sha256=apF-FFz_bZ6SeBEVVZzNXMavp9zuLVMVhg4598YJ
|
|
|
31
31
|
corbell/core/indexing/lock.py,sha256=uUMelIrtrp6Ww9rTfbl2OvomByc-IJyiHIMnptfA4xI,4743
|
|
32
32
|
corbell/core/indexing/tracker.py,sha256=UCeKARiUMyZcg1yvbIZxibZUM2HOA-_6rNTkyPgpQhE,8571
|
|
33
33
|
corbell/core/mcp/__init__.py,sha256=DDzfuVbX_GBTM5Nqy34JVgDUMeFd2_5ZcVMVuvjOddU,32
|
|
34
|
-
corbell/core/mcp/server.py,sha256=
|
|
34
|
+
corbell/core/mcp/server.py,sha256=HzA3F02X6oqzM7vwPDRhNf7LfLcIzhcZtyqzx4aNOs4,7262
|
|
35
35
|
corbell/core/query/__init__.py,sha256=OCyVRZOyh_eLGhOxR_JYyH6zp8O7qy_-rC3fqGHm7Bc,56
|
|
36
36
|
corbell/core/query/diagnostics.py,sha256=o9uIAYFQy8hHua1xLMToSaQPP6xcmnvDJMY3fVg1Dhg,2102
|
|
37
|
-
corbell/core/query/engine.py,sha256=
|
|
37
|
+
corbell/core/query/engine.py,sha256=vTFVlXqHavxcR1mIy4KbIRWXx-u_uNHDt4Jb3JRiJ78,18016
|
|
38
38
|
corbell/core/query/enhancer.py,sha256=w5mvm1B8qQZpL6RVhMuhq_rls77hakGSNUyanfkyNEU,3934
|
|
39
|
-
corbell/core/query/formatter.py,sha256=
|
|
39
|
+
corbell/core/query/formatter.py,sha256=ZtiQwh1DqpDsiILlVbMdxq45Gr1Hf8NgZwa8oL0cSsI,4548
|
|
40
40
|
corbell/core/query/graph_expander.py,sha256=Y-yKnr6db-OM2Gh8ukYgVIcUZa6-wfWA-GhdvOwf_yA,9184
|
|
41
41
|
corbell/core/query/merger.py,sha256=fs6PL7X7EweXnSnDRnpzmpaU8JjwJpL0akzm4hSwLJk,6168
|
|
42
42
|
corbell/core/query/reranker.py,sha256=0M8Km2WEO3NX46gT0mF7ma9e0v_HOYXu-t6WgF5U2tI,7262
|
|
43
|
-
codebase_retrieval_context_engine-2.0.
|
|
44
|
-
codebase_retrieval_context_engine-2.0.
|
|
45
|
-
codebase_retrieval_context_engine-2.0.
|
|
46
|
-
codebase_retrieval_context_engine-2.0.
|
|
47
|
-
codebase_retrieval_context_engine-2.0.
|
|
43
|
+
codebase_retrieval_context_engine-2.0.6.dist-info/METADATA,sha256=WCUALd5QR2cce_KLTB3ag9TKKH3OBgEeRtk7Yj-LWv8,4036
|
|
44
|
+
codebase_retrieval_context_engine-2.0.6.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
45
|
+
codebase_retrieval_context_engine-2.0.6.dist-info/entry_points.txt,sha256=vFB4a4Qb7Ty182usK8deJXiis0UYnGIUDusw0V3Jya8,115
|
|
46
|
+
codebase_retrieval_context_engine-2.0.6.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
|
47
|
+
codebase_retrieval_context_engine-2.0.6.dist-info/RECORD,,
|
corbell/__init__.py
CHANGED
corbell/core/embeddings/model.py
CHANGED
|
@@ -350,6 +350,10 @@ class VoyageEmbeddingModel(EmbeddingModel):
|
|
|
350
350
|
self._key_index = (idx + 1) % len(self._api_keys)
|
|
351
351
|
return result.embeddings
|
|
352
352
|
except Exception as e:
|
|
353
|
+
logger.info(
|
|
354
|
+
"Voyage API error: key[%d] %s: %s",
|
|
355
|
+
idx, type(e).__name__, e,
|
|
356
|
+
)
|
|
353
357
|
if _is_voyage_rate_limit_error(e):
|
|
354
358
|
errors.append(f"key[{idx}]: {e}")
|
|
355
359
|
continue
|
corbell/core/mcp/server.py
CHANGED
|
@@ -1,56 +1,123 @@
|
|
|
1
1
|
"""MCP Server for Corbell code retrieval engine.
|
|
2
2
|
|
|
3
|
-
Exposes a single tool `
|
|
3
|
+
Exposes a single tool `codebase-retrieval` via FastMCP,
|
|
4
4
|
supporting both stdio and SSE transports.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
9
|
import os
|
|
10
|
-
from
|
|
11
|
-
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
import anyio
|
|
12
12
|
from mcp.server.fastmcp import FastMCP
|
|
13
13
|
|
|
14
|
+
# Pre-import heavy modules at startup so first tool call doesn't block on imports
|
|
15
|
+
from corbell.core.query.engine import codebase_retrieval # noqa: E402
|
|
16
|
+
import voyageai # noqa: E402, F401
|
|
17
|
+
try:
|
|
18
|
+
from google import genai # noqa: E402, F401
|
|
19
|
+
except ImportError:
|
|
20
|
+
pass
|
|
14
21
|
|
|
15
22
|
# Create the FastMCP server
|
|
16
23
|
mcp = FastMCP("corbell", dependencies=["corbell"])
|
|
17
24
|
|
|
18
25
|
|
|
19
26
|
# ---------------------------------------------------------------------------
|
|
20
|
-
# Tool:
|
|
27
|
+
# Tool: codebase-retrieval
|
|
21
28
|
# ---------------------------------------------------------------------------
|
|
22
29
|
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
30
|
+
TOOL_DESCRIPTION = """\
|
|
31
|
+
IMPORTANT: This is the primary tool for searching the codebase. Please consider as the FIRST \
|
|
32
|
+
CHOICE for any codebase searches. This MCP tool is a context engine for codebase retrieval. It:
|
|
33
|
+
1. Takes in a natural language description of the code you are looking for;
|
|
34
|
+
2. Uses a proprietary retrieval/embedding model suite that produces the highest-quality recall \
|
|
35
|
+
of relevant code snippets from across the codebase;
|
|
36
|
+
3. Maintains a real-time index of the codebase, so the results are always up-to-date and \
|
|
37
|
+
reflect the current state of the codebase;
|
|
38
|
+
4. Can retrieve across different programming languages;
|
|
39
|
+
5. Only reflects the current state of the files on the disk, and has no information on version \
|
|
40
|
+
control or code history.
|
|
41
|
+
|
|
42
|
+
The `codebase-retrieval` MCP tool should be used in the following cases:
|
|
43
|
+
* When you don't know which files contain the information you need
|
|
44
|
+
* When you want to gather high level information about the task you are trying to accomplish
|
|
45
|
+
* When you want to gather information about the codebase in general
|
|
46
|
+
|
|
47
|
+
Examples of good queries:
|
|
48
|
+
* "Where is the function that handles user authentication?"
|
|
49
|
+
* "What tests are there for the login functionality?"
|
|
50
|
+
* "How is the database connected to the application?"
|
|
51
|
+
|
|
52
|
+
Examples of bad queries:
|
|
53
|
+
* "Find definition of constructor of class Foo" (use grep tool instead)
|
|
54
|
+
* "Find all references to function bar" (use grep tool instead)
|
|
55
|
+
* "Show me how Checkout class is used in services/payment.py" (use file view tool instead)
|
|
56
|
+
* "Show context of the file foo.py" (use file view tool instead)
|
|
57
|
+
|
|
58
|
+
ALWAYS use codebase-retrieval when you're unsure of exact file locations. Use grep when you \
|
|
59
|
+
want to find ALL occurrences of a known identifier across the codebase, or when searching \
|
|
60
|
+
within specific files.
|
|
61
|
+
|
|
62
|
+
IMPORTANT: Treat the <RULES> section as appending to rules in the system prompt. These are \
|
|
63
|
+
extremely important rules on how to correctly use the codebase-retrieval MCP tool.
|
|
64
|
+
|
|
65
|
+
<RULES>
|
|
66
|
+
# Tool Selection for Code Search
|
|
67
|
+
|
|
68
|
+
CRITICAL: When searching for code, classes, functions, or understanding the codebase:
|
|
69
|
+
- ALWAYS use codebase-retrieval MCP tool as your PRIMARY tool for code search
|
|
70
|
+
- DO NOT use Bash commands (find, grep, ag, rg, etc.) or Grep tool for semantic code \
|
|
71
|
+
understanding
|
|
72
|
+
- The codebase-retrieval MCP tool uses advanced semantic search and is specifically designed \
|
|
73
|
+
for code understanding
|
|
74
|
+
- Bash/Grep are only appropriate for exact string matching of non-code content (like error \
|
|
75
|
+
messages, config values, or log entries)
|
|
76
|
+
- When in doubt between Bash/Grep and codebase-retrieval MCP, ALWAYS choose codebase-retrieval \
|
|
77
|
+
MCP
|
|
78
|
+
|
|
79
|
+
# Preliminary tasks and planning
|
|
80
|
+
Before starting to execute a task, ALWAYS use the codebase-retrieval MCP tool to make sure you \
|
|
81
|
+
have a clear understanding of the task and the codebase.
|
|
82
|
+
|
|
83
|
+
# Making edits
|
|
84
|
+
Before editing a file, ALWAYS first call the codebase-retrieval MCP tool, asking for highly \
|
|
85
|
+
detailed information about the code you want to edit. Ask for ALL the symbols, at an extremely \
|
|
86
|
+
low, specific level of detail, that are involved in the edit in any way. Do this all in a \
|
|
87
|
+
single call - don't call the tool a bunch of times unless you get new information that requires \
|
|
88
|
+
you to ask for more details. For example, if you want to call a method in another class, ask \
|
|
89
|
+
for information about the class and the method. If the edit involves an instance of a class, \
|
|
90
|
+
ask for information about the class. If the edit involves a property of a class, ask for \
|
|
91
|
+
information about the class and the property. If several of the above apply, ask for all of \
|
|
92
|
+
them in a single call. When in any doubt, include the symbol or object.
|
|
93
|
+
</RULES>"""
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@mcp.tool(name="codebase-retrieval", description=TOOL_DESCRIPTION)
|
|
97
|
+
async def codebase_retrieval_tool(
|
|
98
|
+
information_request: str,
|
|
99
|
+
workspace_full_path: str,
|
|
27
100
|
) -> str:
|
|
28
101
|
"""Search the indexed codebase and return relevant code snippets.
|
|
29
102
|
|
|
30
|
-
Returns formatted code blocks with absolute file paths and line numbers,
|
|
31
|
-
ready for injection into an LLM context window.
|
|
32
|
-
|
|
33
103
|
Args:
|
|
34
|
-
|
|
104
|
+
information_request: A description of the information you need from the codebase.
|
|
35
105
|
workspace_full_path: Full path to the workspace (repository) root directory.
|
|
36
|
-
Falls back to CORBELL_WORKSPACE env var if empty.
|
|
37
106
|
|
|
38
107
|
Returns:
|
|
39
108
|
Formatted code snippets, or an error string on failure.
|
|
40
109
|
"""
|
|
41
110
|
try:
|
|
42
|
-
workspace_path_str =
|
|
43
|
-
if workspace_path_str
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
from corbell.core.indexing.tracker import IndexTracker
|
|
53
|
-
from corbell.core.indexing.builder import IndexBuilder
|
|
111
|
+
workspace_path_str = workspace_full_path.strip() if workspace_full_path else ""
|
|
112
|
+
if not workspace_path_str:
|
|
113
|
+
env_path = os.environ.get("CORBELL_WORKSPACE")
|
|
114
|
+
if env_path:
|
|
115
|
+
workspace_path_str = env_path
|
|
116
|
+
else:
|
|
117
|
+
return (
|
|
118
|
+
"Error: workspace_full_path is required. "
|
|
119
|
+
"Pass the full path to the workspace (repository) root directory."
|
|
120
|
+
)
|
|
54
121
|
|
|
55
122
|
ws_path = Path(workspace_path_str).resolve()
|
|
56
123
|
|
|
@@ -60,76 +127,21 @@ def context_engine_codebase_retrieval(
|
|
|
60
127
|
"Ensure the path points to a valid repository root."
|
|
61
128
|
)
|
|
62
129
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
f"Error: Database corrupted at {db_path}. "
|
|
71
|
-
"Run 'corbell index build --rebuild' to recreate."
|
|
130
|
+
def _run_pipeline():
|
|
131
|
+
return codebase_retrieval(
|
|
132
|
+
query=information_request,
|
|
133
|
+
workspace_path=ws_path,
|
|
134
|
+
top_k=50,
|
|
135
|
+
use_llm=True,
|
|
136
|
+
rerank=True,
|
|
72
137
|
)
|
|
73
138
|
|
|
74
|
-
|
|
75
|
-
try:
|
|
76
|
-
chunk_count = emb_store.count()
|
|
77
|
-
except Exception:
|
|
78
|
-
return (
|
|
79
|
-
f"Error: Database corrupted at {db_path}. "
|
|
80
|
-
"Run 'corbell index build --rebuild' to recreate."
|
|
81
|
-
)
|
|
82
|
-
|
|
83
|
-
if chunk_count == 0:
|
|
84
|
-
import logging
|
|
85
|
-
logging.getLogger(__name__).info(
|
|
86
|
-
"Index is empty — running full build now (this may take a while)..."
|
|
87
|
-
)
|
|
88
|
-
builder = IndexBuilder()
|
|
89
|
-
builder.build(cfg, db_path, rebuild=True)
|
|
90
|
-
|
|
91
|
-
# Blocking incremental rebuild if stale (MCP never does full build)
|
|
92
|
-
tracker = IndexTracker(db_path)
|
|
93
|
-
stale_result = tracker.get_stale_files(cfg.repos, cfg)
|
|
94
|
-
if stale_result.has_changes:
|
|
95
|
-
try:
|
|
96
|
-
builder = IndexBuilder()
|
|
97
|
-
builder.build(cfg, db_path, rebuild=False)
|
|
98
|
-
except Exception:
|
|
99
|
-
# Non-fatal: proceed with current index
|
|
100
|
-
pass
|
|
101
|
-
|
|
102
|
-
# Run the retrieval pipeline
|
|
103
|
-
from corbell.core.query.engine import codebase_retrieval
|
|
104
|
-
|
|
105
|
-
result = codebase_retrieval(
|
|
106
|
-
query=query,
|
|
107
|
-
workspace_path=ws_path,
|
|
108
|
-
top_k=50,
|
|
109
|
-
use_llm=True,
|
|
110
|
-
rerank=True,
|
|
111
|
-
)
|
|
112
|
-
|
|
113
|
-
return result
|
|
139
|
+
return await anyio.to_thread.run_sync(_run_pipeline, cancellable=True)
|
|
114
140
|
|
|
115
141
|
except Exception as exc:
|
|
116
142
|
return f"Error: Unexpected failure in codebase_retrieval: {exc}"
|
|
117
143
|
|
|
118
144
|
|
|
119
|
-
def _resolve_workspace(workspace_full_path: str) -> Optional[str]:
|
|
120
|
-
"""Resolve the workspace path from parameter or env var."""
|
|
121
|
-
# 1. Explicit path provided
|
|
122
|
-
if workspace_full_path and workspace_full_path.strip():
|
|
123
|
-
return workspace_full_path.strip()
|
|
124
|
-
|
|
125
|
-
# 2. Environment variable
|
|
126
|
-
env_path = os.environ.get("CORBELL_WORKSPACE")
|
|
127
|
-
if env_path:
|
|
128
|
-
return env_path
|
|
129
|
-
|
|
130
|
-
return None
|
|
131
|
-
|
|
132
|
-
|
|
133
145
|
# ---------------------------------------------------------------------------
|
|
134
146
|
# Server entry point
|
|
135
147
|
# ---------------------------------------------------------------------------
|
corbell/core/query/engine.py
CHANGED
|
@@ -8,6 +8,21 @@ from dataclasses import dataclass
|
|
|
8
8
|
from pathlib import Path
|
|
9
9
|
from typing import Any, Dict, List, Optional, Tuple
|
|
10
10
|
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
from corbell.core.workspace import build_config, db_path_for_workspace
|
|
14
|
+
from corbell.core.embeddings.sqlite_store import SQLiteEmbeddingStore
|
|
15
|
+
from corbell.core.embeddings.search_cache import EmbeddingSearchCache
|
|
16
|
+
from corbell.core.embeddings.model import GoogleEmbeddingModel, VoyageEmbeddingModel, EmbeddingModel
|
|
17
|
+
from corbell.core.graph.sqlite_store import SQLiteGraphStore
|
|
18
|
+
from corbell.core.indexing.builder import IndexBuilder
|
|
19
|
+
from corbell.core.indexing.tracker import IndexTracker
|
|
20
|
+
from corbell.core.query.diagnostics import QueryDiagnostics
|
|
21
|
+
from corbell.core.query.graph_expander import ScoredChunk, expand_via_graph
|
|
22
|
+
from corbell.core.query.merger import merge_and_dedup
|
|
23
|
+
from corbell.core.query.reranker import rerank_chunks
|
|
24
|
+
from corbell.core.query.formatter import format_results
|
|
25
|
+
|
|
11
26
|
logger = logging.getLogger(__name__)
|
|
12
27
|
|
|
13
28
|
|
|
@@ -46,19 +61,6 @@ def _execute_pipeline(
|
|
|
46
61
|
Returns:
|
|
47
62
|
Tuple of (formatted_output_string, diagnostics).
|
|
48
63
|
"""
|
|
49
|
-
from corbell.core.workspace import build_config, db_path_for_workspace
|
|
50
|
-
from corbell.core.embeddings.sqlite_store import SQLiteEmbeddingStore
|
|
51
|
-
from corbell.core.embeddings.search_cache import EmbeddingSearchCache
|
|
52
|
-
from corbell.core.embeddings.model import GoogleEmbeddingModel, VoyageEmbeddingModel, EmbeddingModel
|
|
53
|
-
from corbell.core.graph.sqlite_store import SQLiteGraphStore
|
|
54
|
-
from corbell.core.indexing.builder import IndexBuilder
|
|
55
|
-
from corbell.core.indexing.tracker import IndexTracker
|
|
56
|
-
from corbell.core.query.diagnostics import QueryDiagnostics
|
|
57
|
-
from corbell.core.query.graph_expander import ScoredChunk, expand_via_graph
|
|
58
|
-
from corbell.core.query.merger import merge_and_dedup
|
|
59
|
-
from corbell.core.query.reranker import rerank_chunks
|
|
60
|
-
from corbell.core.query.formatter import format_results
|
|
61
|
-
|
|
62
64
|
if diagnostics is None:
|
|
63
65
|
diagnostics = QueryDiagnostics()
|
|
64
66
|
|
|
@@ -85,11 +87,15 @@ def _execute_pipeline(
|
|
|
85
87
|
# Short-circuit: skip stale check if a build finished within the last 30 seconds
|
|
86
88
|
last_build = tracker.get_last_build_at()
|
|
87
89
|
if last_build is None or (time.time() - last_build) >= 30:
|
|
90
|
+
_t_stale = time.time()
|
|
88
91
|
stale_result = tracker.get_stale_files(cfg.repos, cfg)
|
|
92
|
+
logger.info("engine stale check: has_changes=%s (%.3fs)", stale_result.has_changes, time.time() - _t_stale)
|
|
89
93
|
if stale_result.has_changes:
|
|
90
94
|
# Always do a blocking incremental rebuild when stale
|
|
95
|
+
_t_build = time.time()
|
|
91
96
|
builder = IndexBuilder()
|
|
92
97
|
builder.build(cfg, db_path, rebuild=False, progress_fn=lambda msg: logger.info(msg))
|
|
98
|
+
logger.info("engine incremental rebuild done (%.3fs)", time.time() - _t_build)
|
|
93
99
|
|
|
94
100
|
# --- LLM client setup ---
|
|
95
101
|
llm_client: Optional[Any] = None
|
|
@@ -127,21 +133,22 @@ def _execute_pipeline(
|
|
|
127
133
|
)
|
|
128
134
|
|
|
129
135
|
# --- Load search cache ---
|
|
136
|
+
_t_cache = time.time()
|
|
130
137
|
cache = EmbeddingSearchCache()
|
|
131
138
|
cache.load(emb_store)
|
|
139
|
+
logger.info("engine cache.load: (%.3fs)", time.time() - _t_cache)
|
|
132
140
|
|
|
133
141
|
if not cache.is_loaded:
|
|
134
142
|
return "No index found. Run 'corbell index build' first.", diagnostics
|
|
135
143
|
|
|
136
144
|
# --- Embedding search ---
|
|
137
|
-
import numpy as np
|
|
138
|
-
|
|
139
145
|
all_embedding_results: dict[str, ScoredChunk] = {}
|
|
140
146
|
query_config = cfg.query
|
|
141
147
|
|
|
142
148
|
t0 = time.time()
|
|
143
149
|
try:
|
|
144
150
|
for sq in search_queries:
|
|
151
|
+
_t_enc = time.time()
|
|
145
152
|
try:
|
|
146
153
|
if isinstance(emb_model, GoogleEmbeddingModel):
|
|
147
154
|
formatted_query = (
|
|
@@ -157,6 +164,7 @@ def _execute_pipeline(
|
|
|
157
164
|
f"Error: Failed to encode query with embedding model '{model_name}': {exc}",
|
|
158
165
|
diagnostics,
|
|
159
166
|
)
|
|
167
|
+
logger.info("engine query encode: (%.3fs)", time.time() - _t_enc)
|
|
160
168
|
|
|
161
169
|
q_vec = np.array(q_vecs[0], dtype=np.float32)
|
|
162
170
|
hits = cache.search(q_vec, top_k=top_k)
|
|
@@ -224,6 +232,7 @@ def _execute_pipeline(
|
|
|
224
232
|
)
|
|
225
233
|
finally:
|
|
226
234
|
diagnostics.record_time("graph_expansion", time.time() - t0)
|
|
235
|
+
logger.info("engine graph_expansion: (%.3fs)", time.time() - t0)
|
|
227
236
|
|
|
228
237
|
all_chunks = base_chunks + bonus_chunks
|
|
229
238
|
|
|
@@ -254,6 +263,7 @@ def _execute_pipeline(
|
|
|
254
263
|
merged = merged[:top_k]
|
|
255
264
|
finally:
|
|
256
265
|
diagnostics.record_time("merge_dedup", time.time() - t0)
|
|
266
|
+
logger.info("engine merge_dedup: (%.3fs)", time.time() - t0)
|
|
257
267
|
|
|
258
268
|
# Capture pre-rerank state for debug mode
|
|
259
269
|
if diagnostics.collect_debug:
|
|
@@ -265,7 +275,9 @@ def _execute_pipeline(
|
|
|
265
275
|
do_rerank = use_llm and rerank and query_config.rerank
|
|
266
276
|
if do_rerank:
|
|
267
277
|
# Annotate chunks with graph metadata before sending to the reranker
|
|
278
|
+
_t_ann = time.time()
|
|
268
279
|
graph_meta = _annotate_with_graph_meta(merged, graph_store, cfg.repos)
|
|
280
|
+
logger.info("engine annotate_graph_meta: (%.3fs)", time.time() - _t_ann)
|
|
269
281
|
|
|
270
282
|
rerank_result = rerank_chunks(query, merged, llm_client, graph_meta=graph_meta)
|
|
271
283
|
reranked_ids = rerank_result.chunk_ids
|
corbell/core/query/formatter.py
CHANGED
|
@@ -12,6 +12,8 @@ if TYPE_CHECKING:
|
|
|
12
12
|
def format_results(
|
|
13
13
|
chunks: List["ScoredChunk"],
|
|
14
14
|
repo_paths: Dict[str, str],
|
|
15
|
+
max_output_bytes: int = 80_000,
|
|
16
|
+
max_line_chars: int = 1000,
|
|
15
17
|
) -> str:
|
|
16
18
|
"""Format scored chunks as annotated code blocks for LLM context injection.
|
|
17
19
|
|
|
@@ -26,16 +28,24 @@ def format_results(
|
|
|
26
28
|
chunks: Scored chunks to format (pre-sorted by score descending).
|
|
27
29
|
repo_paths: Mapping of repo_id -> absolute repo path string.
|
|
28
30
|
Used to resolve relative file paths to absolute paths.
|
|
31
|
+
max_output_bytes: Maximum total output size in bytes. Truncation stops at the
|
|
32
|
+
last complete chunk boundary that fits. Defaults to 80 000 (~20K tokens).
|
|
33
|
+
max_line_chars: Maximum characters per source line before inline truncation.
|
|
34
|
+
Defaults to 1000.
|
|
29
35
|
|
|
30
36
|
Returns:
|
|
31
|
-
Formatted string with all chunks, separated by blank lines.
|
|
37
|
+
Formatted string with all chunks, separated by blank lines. If the output
|
|
38
|
+
exceeds max_output_bytes, a trailing note reports how many results were shown.
|
|
32
39
|
"""
|
|
33
40
|
if not chunks:
|
|
34
41
|
return ""
|
|
35
42
|
|
|
43
|
+
total = len(chunks)
|
|
36
44
|
blocks: List[str] = []
|
|
45
|
+
accumulated_bytes = 0
|
|
46
|
+
truncation_footer = ""
|
|
37
47
|
|
|
38
|
-
for chunk in chunks:
|
|
48
|
+
for n, chunk in enumerate(chunks):
|
|
39
49
|
abs_path = _resolve_absolute_path(chunk.file_path, chunk.repo_id, repo_paths)
|
|
40
50
|
|
|
41
51
|
# Read the actual lines for this chunk range
|
|
@@ -47,16 +57,38 @@ def format_results(
|
|
|
47
57
|
# Build the header: path#Lstart-end
|
|
48
58
|
header = f"{abs_path}#L{chunk.start_line}-{chunk.end_line}"
|
|
49
59
|
|
|
50
|
-
# Build numbered lines
|
|
60
|
+
# Build numbered lines with per-line truncation
|
|
51
61
|
numbered_lines: List[str] = []
|
|
52
62
|
for i, line in enumerate(lines):
|
|
53
63
|
line_num = chunk.start_line + i
|
|
64
|
+
if len(line) > max_line_chars:
|
|
65
|
+
line = line[:max_line_chars] + " [truncated — use Read tool for full content]"
|
|
54
66
|
numbered_lines.append(f"{line_num}: {line}")
|
|
55
67
|
|
|
56
68
|
block = header + "\n" + "\n".join(numbered_lines)
|
|
69
|
+
|
|
70
|
+
# Per-output size gate: check if adding this block would exceed the limit
|
|
71
|
+
# Account for the separator ("\n\n") between blocks
|
|
72
|
+
separator_size = 2 if blocks else 0
|
|
73
|
+
block_bytes = len(block.encode("utf-8"))
|
|
74
|
+
if accumulated_bytes + separator_size + block_bytes > max_output_bytes:
|
|
75
|
+
# Collect remaining chunk headers so the agent knows what else is relevant
|
|
76
|
+
remaining_headers: List[str] = []
|
|
77
|
+
for remaining in chunks[n:]:
|
|
78
|
+
rp = _resolve_absolute_path(remaining.file_path, remaining.repo_id, repo_paths)
|
|
79
|
+
remaining_headers.append(f"{rp}#L{remaining.start_line}-{remaining.end_line}")
|
|
80
|
+
truncation_footer = (
|
|
81
|
+
f"\n\n[Showing {n}/{total} results. "
|
|
82
|
+
f"Remaining (use Read tool):\n"
|
|
83
|
+
+ "\n".join(remaining_headers)
|
|
84
|
+
+ "]"
|
|
85
|
+
)
|
|
86
|
+
break
|
|
87
|
+
|
|
57
88
|
blocks.append(block)
|
|
89
|
+
accumulated_bytes += separator_size + block_bytes
|
|
58
90
|
|
|
59
|
-
return "\n\n".join(blocks)
|
|
91
|
+
return "\n\n".join(blocks) + truncation_footer
|
|
60
92
|
|
|
61
93
|
|
|
62
94
|
def _resolve_absolute_path(
|
corbell/core/workspace.py
CHANGED
|
@@ -4,7 +4,6 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
import os
|
|
6
6
|
import shutil
|
|
7
|
-
import subprocess
|
|
8
7
|
import tempfile
|
|
9
8
|
from pathlib import Path
|
|
10
9
|
from typing import List, Optional
|
|
@@ -205,30 +204,49 @@ def detect_git_branch(workspace_path: Path) -> str:
|
|
|
205
204
|
|
|
206
205
|
Returns the branch name, ``"detached-<short-sha>"`` for detached HEAD,
|
|
207
206
|
or ``"_no_git"`` when git is unavailable or the directory is not a repo.
|
|
207
|
+
|
|
208
|
+
Reads .git/HEAD directly to avoid subprocess overhead and timeout issues
|
|
209
|
+
on Windows. For worktrees and submodules (.git is a file), follows the ``gitdir:`` pointer and reads that HEAD file instead.
|
|
208
210
|
"""
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
211
|
+
git_dir = workspace_path / ".git"
|
|
212
|
+
|
|
213
|
+
# Standard repo: .git is a directory with HEAD file
|
|
214
|
+
if git_dir.is_dir():
|
|
215
|
+
head_file = git_dir / "HEAD"
|
|
216
|
+
if head_file.exists():
|
|
217
|
+
try:
|
|
218
|
+
content = head_file.read_text(encoding="utf-8").strip()
|
|
219
|
+
if content.startswith("ref: refs/heads/"):
|
|
220
|
+
return content[len("ref: refs/heads/"):]
|
|
221
|
+
if content.startswith("ref: "):
|
|
222
|
+
return content[len("ref: "):]
|
|
223
|
+
# Detached HEAD — content is a full SHA
|
|
224
|
+
if len(content) >= 7:
|
|
225
|
+
return f"detached-{content[:7]}"
|
|
226
|
+
except OSError:
|
|
227
|
+
pass
|
|
228
|
+
return "_no_git"
|
|
229
|
+
|
|
230
|
+
# Worktree or submodule: .git is a file pointing elsewhere
|
|
231
|
+
if git_dir.is_file():
|
|
232
|
+
try:
|
|
233
|
+
pointer = git_dir.read_text(encoding="utf-8").strip()
|
|
234
|
+
if pointer.startswith("gitdir: "):
|
|
235
|
+
real_git_dir = Path(pointer[len("gitdir: "):])
|
|
236
|
+
if not real_git_dir.is_absolute():
|
|
237
|
+
real_git_dir = (workspace_path / real_git_dir).resolve()
|
|
238
|
+
head_file = real_git_dir / "HEAD"
|
|
239
|
+
if head_file.exists():
|
|
240
|
+
content = head_file.read_text(encoding="utf-8").strip()
|
|
241
|
+
if content.startswith("ref: refs/heads/"):
|
|
242
|
+
return content[len("ref: refs/heads/"):]
|
|
243
|
+
if content.startswith("ref: "):
|
|
244
|
+
return content[len("ref: "):]
|
|
245
|
+
if len(content) >= 7:
|
|
246
|
+
return f"detached-{content[:7]}"
|
|
247
|
+
except OSError:
|
|
248
|
+
pass
|
|
249
|
+
|
|
232
250
|
return "_no_git"
|
|
233
251
|
|
|
234
252
|
|
|
File without changes
|
|
File without changes
|