codebase-retrieval-context-engine 2.0.1__py3-none-any.whl → 2.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codebase_retrieval_context_engine-2.0.3.dist-info/METADATA +107 -0
- {codebase_retrieval_context_engine-2.0.1.dist-info → codebase_retrieval_context_engine-2.0.3.dist-info}/RECORD +25 -24
- corbell/__init__.py +1 -1
- corbell/cli/commands/debug.py +305 -0
- corbell/cli/commands/index.py +13 -0
- corbell/cli/main.py +2 -0
- corbell/core/constants.py +8 -0
- corbell/core/embeddings/extractor.py +4 -1
- corbell/core/embeddings/model.py +13 -39
- corbell/core/embeddings/sqlite_store.py +71 -26
- corbell/core/gitignore.py +2 -0
- corbell/core/graph/builder.py +57 -15
- corbell/core/graph/method_graph.py +240 -39
- corbell/core/graph/sqlite_store.py +25 -0
- corbell/core/indexing/builder.py +854 -608
- corbell/core/indexing/tracker.py +2 -0
- corbell/core/llm_client.py +1 -1
- corbell/core/mcp/server.py +3 -54
- corbell/core/query/diagnostics.py +18 -1
- corbell/core/query/engine.py +473 -321
- corbell/core/query/reranker.py +98 -22
- corbell/core/workspace.py +7 -11
- codebase_retrieval_context_engine-2.0.1.dist-info/METADATA +0 -506
- {codebase_retrieval_context_engine-2.0.1.dist-info → codebase_retrieval_context_engine-2.0.3.dist-info}/WHEEL +0 -0
- {codebase_retrieval_context_engine-2.0.1.dist-info → codebase_retrieval_context_engine-2.0.3.dist-info}/entry_points.txt +0 -0
- {codebase_retrieval_context_engine-2.0.1.dist-info → codebase_retrieval_context_engine-2.0.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: codebase-retrieval-context-engine
|
|
3
|
+
Version: 2.0.3
|
|
4
|
+
Summary: Code retrieval engine — hybrid embedding + graph search for LLM context injection.
|
|
5
|
+
Project-URL: Homepage, https://github.com/nullmastermind/local-context-engine
|
|
6
|
+
Project-URL: Repository, https://github.com/nullmastermind/local-context-engine
|
|
7
|
+
Project-URL: Issues, https://github.com/nullmastermind/local-context-engine/issues
|
|
8
|
+
Author: nullmastermind
|
|
9
|
+
License: Apache-2.0
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: LLM,MCP,code-search,codebase-retrieval,context,embeddings,retrieval
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
17
|
+
Requires-Python: >=3.11
|
|
18
|
+
Requires-Dist: mcp>=1.1.2
|
|
19
|
+
Requires-Dist: numpy>=2.0
|
|
20
|
+
Requires-Dist: pathspec>=0.11
|
|
21
|
+
Requires-Dist: pydantic>=2.0
|
|
22
|
+
Requires-Dist: python-dotenv>=1.0
|
|
23
|
+
Requires-Dist: rich>=13.0
|
|
24
|
+
Requires-Dist: typer>=0.12
|
|
25
|
+
Provides-Extra: anthropic
|
|
26
|
+
Requires-Dist: anthropic>=0.25; extra == 'anthropic'
|
|
27
|
+
Provides-Extra: aws
|
|
28
|
+
Requires-Dist: boto3>=1.34; extra == 'aws'
|
|
29
|
+
Provides-Extra: azure
|
|
30
|
+
Requires-Dist: openai>=1.0; extra == 'azure'
|
|
31
|
+
Provides-Extra: debug
|
|
32
|
+
Requires-Dist: gradio>=4.0; extra == 'debug'
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: httpx; extra == 'dev'
|
|
35
|
+
Requires-Dist: mypy; extra == 'dev'
|
|
36
|
+
Requires-Dist: pytest-asyncio; extra == 'dev'
|
|
37
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
38
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
39
|
+
Requires-Dist: respx; extra == 'dev'
|
|
40
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
41
|
+
Provides-Extra: gcp
|
|
42
|
+
Requires-Dist: anthropic[vertex]>=0.25; extra == 'gcp'
|
|
43
|
+
Requires-Dist: google-cloud-aiplatform>=1.38; extra == 'gcp'
|
|
44
|
+
Provides-Extra: google
|
|
45
|
+
Requires-Dist: google-genai>=2.7.0; extra == 'google'
|
|
46
|
+
Provides-Extra: openai
|
|
47
|
+
Requires-Dist: openai>=1.0; extra == 'openai'
|
|
48
|
+
Provides-Extra: treesitter
|
|
49
|
+
Requires-Dist: tree-sitter-c-sharp>=0.21; extra == 'treesitter'
|
|
50
|
+
Requires-Dist: tree-sitter-go>=0.21; extra == 'treesitter'
|
|
51
|
+
Requires-Dist: tree-sitter-java>=0.21; extra == 'treesitter'
|
|
52
|
+
Requires-Dist: tree-sitter-javascript>=0.21; extra == 'treesitter'
|
|
53
|
+
Requires-Dist: tree-sitter-php>=0.21; extra == 'treesitter'
|
|
54
|
+
Requires-Dist: tree-sitter-python>=0.21; extra == 'treesitter'
|
|
55
|
+
Requires-Dist: tree-sitter-ruby>=0.21; extra == 'treesitter'
|
|
56
|
+
Requires-Dist: tree-sitter-rust>=0.21; extra == 'treesitter'
|
|
57
|
+
Requires-Dist: tree-sitter-typescript>=0.21; extra == 'treesitter'
|
|
58
|
+
Requires-Dist: tree-sitter>=0.21; extra == 'treesitter'
|
|
59
|
+
Provides-Extra: voyage
|
|
60
|
+
Requires-Dist: voyageai>=0.3; extra == 'voyage'
|
|
61
|
+
Description-Content-Type: text/markdown
|
|
62
|
+
|
|
63
|
+
<div align="center">
|
|
64
|
+
<h1>codebase-retrieval-context-engine</h1>
|
|
65
|
+
<p><strong>Code retrieval engine for LLM context via MCP.</strong></p>
|
|
66
|
+
<p>
|
|
67
|
+
<a href="LICENSE"><img src="https://img.shields.io/badge/License-Apache_2.0-blue.svg" alt="License"/></a>
|
|
68
|
+
</p>
|
|
69
|
+
</div>
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## Add to Claude Code
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
claude mcp add codebase-retrieval -e CORBELL_LLM_PROVIDER=google -e GOOGLE_API_KEY=your-google-api-key -e GOOGLE_MODEL=gemini-3.1-flash-lite -e CORBELL_EMBEDDING_MODEL=voyage-4-lite -e VOYAGE_API_KEY=your-voyage-api-key -- uvx codebase-retrieval-context-engine
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
That's it. The AI agent passes the workspace path and triggers index builds automatically.
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## Build index manually (optional)
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
uvx codebase-retrieval-context-engine index build
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Run from your project root. Env vars (`CORBELL_LLM_PROVIDER`, `GOOGLE_API_KEY`, etc.) must be set in your shell.
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## Environment variables
|
|
94
|
+
|
|
95
|
+
| Variable | Description |
|
|
96
|
+
|---|---|
|
|
97
|
+
| `CORBELL_LLM_PROVIDER` | LLM provider for reranking (`google`, `anthropic`, `openai`) |
|
|
98
|
+
| `GOOGLE_API_KEY` | Google AI API key (supports multiple: `key1,key2,key3`) |
|
|
99
|
+
| `GOOGLE_MODEL` | e.g. `gemini-3.1-flash-lite` |
|
|
100
|
+
| `CORBELL_EMBEDDING_MODEL` | `voyage-4-lite`, `voyage-code-3`, or `gemini-embedding-001` |
|
|
101
|
+
| `VOYAGE_API_KEY` | Voyage AI API key (supports multiple: `key1,key2,key3`). Add a payment card to your Voyage AI billing settings to unlock higher rate limits. |
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## License
|
|
106
|
+
|
|
107
|
+
Apache 2.0
|
|
@@ -1,46 +1,47 @@
|
|
|
1
|
-
corbell/__init__.py,sha256=
|
|
1
|
+
corbell/__init__.py,sha256=BaFS4Y0-zDapO7abzb83G5HBc7cB1xLntwgXWJbDixk,124
|
|
2
2
|
corbell/cli/__init__.py,sha256=5-MP6JIWgp4nDLNIhqP6Gtx97GESaIYg3NGxtRGaMv0,28
|
|
3
|
-
corbell/cli/main.py,sha256=
|
|
3
|
+
corbell/cli/main.py,sha256=CP5EHizFLaBLF1EohgVo_-XFlm4VaO6peQaSnzyfxAI,1954
|
|
4
4
|
corbell/cli/commands/__init__.py,sha256=0mAOs3RWC7XMZnGRN677hjPCHHQKDq9ASjIr_GQM3js,37
|
|
5
|
-
corbell/cli/commands/
|
|
5
|
+
corbell/cli/commands/debug.py,sha256=wdwveCeQSgcQbNg5-R5ekU_smEQKMq8WfH0obBbq3i8,10764
|
|
6
|
+
corbell/cli/commands/index.py,sha256=_nv5TC2O1xusX2gY8s2p00xPLN3wQrHEiFmc0EL6oHY,3432
|
|
6
7
|
corbell/cli/commands/query.py,sha256=Sh-xnVj4n3zAI2hTxVyMTqFEPsq3vkWucfljnCEaGyU,2310
|
|
7
8
|
corbell/core/__init__.py,sha256=VS9PnhHr4NXYlWs1TLCyllnVCNsiwVZ1Xj-AOBhZpAU,29
|
|
8
|
-
corbell/core/constants.py,sha256=
|
|
9
|
-
corbell/core/gitignore.py,sha256=
|
|
10
|
-
corbell/core/llm_client.py,sha256=
|
|
11
|
-
corbell/core/workspace.py,sha256=
|
|
9
|
+
corbell/core/constants.py,sha256=P0fCJ0J5V2Nt348ZAVH1bHd9dFPJRLtpUyQhHPAl0_8,1203
|
|
10
|
+
corbell/core/gitignore.py,sha256=UO588tAxSVv7YEGNDjzdcBys_aqMIAhXrDgToRfcnzc,2347
|
|
11
|
+
corbell/core/llm_client.py,sha256=qGKuptxMAMDwqvhGAKVjppf2p-sX-auaA26WKo6Nlkk,26221
|
|
12
|
+
corbell/core/workspace.py,sha256=p24p_yJss7B3UPbv7Qx7XCUagJ2YKTrsBxDhFLCfqd4,14118
|
|
12
13
|
corbell/core/embeddings/__init__.py,sha256=RCekvfNkFuMGEDLnls78i3znR84cTdnj4KJ_PeQrMNg,213
|
|
13
14
|
corbell/core/embeddings/base.py,sha256=udPW4XmcPhCpNQA6n8KqMcu2JXvVNv1JjdRJmFq5ZRA,2175
|
|
14
|
-
corbell/core/embeddings/extractor.py,sha256=
|
|
15
|
+
corbell/core/embeddings/extractor.py,sha256=2_BxRpsUcz-C-3HXjvlARqM3U5dzHRJcPR_hhPdMxSE,7314
|
|
15
16
|
corbell/core/embeddings/factory.py,sha256=Lonjbk8Lsxykz-2ZEgFCWoH9zZ005Qm4dXVdA6P4qJY,1817
|
|
16
|
-
corbell/core/embeddings/model.py,sha256=
|
|
17
|
+
corbell/core/embeddings/model.py,sha256=hU-SyW7YM9jGv9-_-bfxxOUh1ZZdc-8fpDK7o5j5s88,14289
|
|
17
18
|
corbell/core/embeddings/search_cache.py,sha256=FHzO3mu4m4MJGy2jOFwb9GCEypcT11CcVrLts4Ib0ho,3351
|
|
18
|
-
corbell/core/embeddings/sqlite_store.py,sha256=
|
|
19
|
+
corbell/core/embeddings/sqlite_store.py,sha256=99lHU_gPYwKw9BhUMS-XimQI8vDpBbBrIc_RkrsVdOM,11676
|
|
19
20
|
corbell/core/graph/__init__.py,sha256=VaxDKeXMgMEBBMC0dglwj68A_aNYRI5O8VM6oMC1GIM,29
|
|
20
|
-
corbell/core/graph/builder.py,sha256=
|
|
21
|
-
corbell/core/graph/method_graph.py,sha256=
|
|
21
|
+
corbell/core/graph/builder.py,sha256=dXUdAhuZ4t-wuW4dFZHz6k9-wBXdYkY6dysjQIkvl3Q,32214
|
|
22
|
+
corbell/core/graph/method_graph.py,sha256=fwmkSZXiGGYZIc2iC-6hbTrb26fAwielOrJBlqaz8Oc,57594
|
|
22
23
|
corbell/core/graph/schema.py,sha256=swy1VZZpL88LPEj6zihl5bglQLrGD-ohOYjFeNC31a0,5253
|
|
23
|
-
corbell/core/graph/sqlite_store.py,sha256=
|
|
24
|
+
corbell/core/graph/sqlite_store.py,sha256=B1ObNit7MXbQpst6dpuloTcFAmUim_MoP3PSCATf_4A,21116
|
|
24
25
|
corbell/core/graph/providers/__init__.py,sha256=__ZVe1uwIHSyFh_t-V4MyT5MsM5hooTOrxxkm9Txt7o,268
|
|
25
26
|
corbell/core/graph/providers/aws_patterns.py,sha256=w2iF5qQJcV7S6J64ZYb3IzGPdXjCc37YX5sNnHz8mXY,2818
|
|
26
27
|
corbell/core/graph/providers/azure_patterns.py,sha256=tJ9AQQXW2xYzJ36wNOxTHHhaivaCv3RYEMJUjw8WjeQ,3515
|
|
27
28
|
corbell/core/graph/providers/gcp_patterns.py,sha256=vIofjanvRWGhFftuGdzt9YgTIGZRJz7lLG0abUNjFdA,2789
|
|
28
29
|
corbell/core/indexing/__init__.py,sha256=VczeSHUfKR3YVowGCleFjo2pIpDHfl9kl-OkEl8szow,47
|
|
29
|
-
corbell/core/indexing/builder.py,sha256=
|
|
30
|
+
corbell/core/indexing/builder.py,sha256=apF-FFz_bZ6SeBEVVZzNXMavp9zuLVMVhg4598YJfMs,33333
|
|
30
31
|
corbell/core/indexing/lock.py,sha256=uUMelIrtrp6Ww9rTfbl2OvomByc-IJyiHIMnptfA4xI,4743
|
|
31
|
-
corbell/core/indexing/tracker.py,sha256=
|
|
32
|
+
corbell/core/indexing/tracker.py,sha256=UCeKARiUMyZcg1yvbIZxibZUM2HOA-_6rNTkyPgpQhE,8571
|
|
32
33
|
corbell/core/mcp/__init__.py,sha256=DDzfuVbX_GBTM5Nqy34JVgDUMeFd2_5ZcVMVuvjOddU,32
|
|
33
|
-
corbell/core/mcp/server.py,sha256=
|
|
34
|
+
corbell/core/mcp/server.py,sha256=CmkqS2EYx4eRzquaJNdPPAx_G07_sJUaK1v_u_aXhTc,5380
|
|
34
35
|
corbell/core/query/__init__.py,sha256=OCyVRZOyh_eLGhOxR_JYyH6zp8O7qy_-rC3fqGHm7Bc,56
|
|
35
|
-
corbell/core/query/diagnostics.py,sha256=
|
|
36
|
-
corbell/core/query/engine.py,sha256=
|
|
36
|
+
corbell/core/query/diagnostics.py,sha256=o9uIAYFQy8hHua1xLMToSaQPP6xcmnvDJMY3fVg1Dhg,2102
|
|
37
|
+
corbell/core/query/engine.py,sha256=wqaZy-ACZQhLua9mlgad4boowDsRFW1TQtQpP5dTReU,17374
|
|
37
38
|
corbell/core/query/enhancer.py,sha256=w5mvm1B8qQZpL6RVhMuhq_rls77hakGSNUyanfkyNEU,3934
|
|
38
39
|
corbell/core/query/formatter.py,sha256=xMr8HE-oxBSEKb514aixY7aoUWGeYoK1w5wnaIlCYEc,2813
|
|
39
40
|
corbell/core/query/graph_expander.py,sha256=Y-yKnr6db-OM2Gh8ukYgVIcUZa6-wfWA-GhdvOwf_yA,9184
|
|
40
41
|
corbell/core/query/merger.py,sha256=fs6PL7X7EweXnSnDRnpzmpaU8JjwJpL0akzm4hSwLJk,6168
|
|
41
|
-
corbell/core/query/reranker.py,sha256=
|
|
42
|
-
codebase_retrieval_context_engine-2.0.
|
|
43
|
-
codebase_retrieval_context_engine-2.0.
|
|
44
|
-
codebase_retrieval_context_engine-2.0.
|
|
45
|
-
codebase_retrieval_context_engine-2.0.
|
|
46
|
-
codebase_retrieval_context_engine-2.0.
|
|
42
|
+
corbell/core/query/reranker.py,sha256=0M8Km2WEO3NX46gT0mF7ma9e0v_HOYXu-t6WgF5U2tI,7262
|
|
43
|
+
codebase_retrieval_context_engine-2.0.3.dist-info/METADATA,sha256=yb84Ich965QFp98h1XcO_uk9uhRn-OaYGG8zsZtEWp0,4089
|
|
44
|
+
codebase_retrieval_context_engine-2.0.3.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
45
|
+
codebase_retrieval_context_engine-2.0.3.dist-info/entry_points.txt,sha256=vFB4a4Qb7Ty182usK8deJXiis0UYnGIUDusw0V3Jya8,115
|
|
46
|
+
codebase_retrieval_context_engine-2.0.3.dist-info/licenses/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
|
|
47
|
+
codebase_retrieval_context_engine-2.0.3.dist-info/RECORD,,
|
corbell/__init__.py
CHANGED
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
"""CLI: corbell debug — launch a Gradio UI for inspecting query pipeline internals."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
|
|
11
|
+
app = typer.Typer(no_args_is_help=False, help="Query debug UI commands.")
|
|
12
|
+
console = Console()
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@app.callback(invoke_without_command=True)
def debug(
    ctx: typer.Context,
    workspace: str = typer.Option(
        "",
        "--workspace",
        "-w",
        help="Path to the workspace root (default: current directory).",
    ),
    port: int = typer.Option(7860, "--port", "-p", help="Port for the Gradio server."),
    share: bool = typer.Option(False, "--share", help="Create a public Gradio share link."),
) -> None:
    """Launch the Gradio debug UI for inspecting the query pipeline.

    The UI lets you run a query against a workspace and inspect:
    - Per-phase timing
    - Final formatted results
    - Pre-rerank chunk table (file, lines, score, symbol, type)
    - LLM rerank prompts and raw response
    """
    # NOTE: the docstring above is rendered by Typer as the command's help
    # text, so implementation notes live in comments instead.
    #
    # Flow: bail out if a subcommand was invoked, lazily import Gradio (it is
    # an optional extra), resolve the default workspace (flag, then the
    # CORBELL_WORKSPACE env var, then cwd), define the two button callbacks,
    # build the Blocks layout, and finally launch the server.
    if ctx.invoked_subcommand is not None:
        return

    try:
        import gradio as gr  # type: ignore[import-untyped]
    except ImportError:
        console.print(
            "[red]Gradio is not installed. Install it with:[/red]\n"
            " pip install 'codebase-retrieval-context-engine[debug]'"
        )
        raise typer.Exit(1)

    default_workspace = workspace or os.environ.get("CORBELL_WORKSPACE") or str(Path.cwd())

    def run_mcp_tool(
        env_vars_text: str,
        mcp_workspace: str,
        mcp_query: str,
    ):  # type: ignore[no-untyped-def]
        """Invoke context_engine_codebase_retrieval directly and return results.

        Args:
            env_vars_text: ``KEY=VALUE`` lines applied to ``os.environ`` for the
                duration of this call. Blank lines, ``#`` comments, lines
                without ``=`` and lines with an empty key are ignored.
            mcp_workspace: Value passed as ``workspace_full_path``.
            mcp_query: Value passed as ``query``; a blank query is a no-op.

        Returns:
            An ``(error_text, tool_response)`` pair; exactly one of the two is
            non-empty unless the query was blank.
        """
        if not mcp_query.strip():
            return "", ""

        # Maps each overridden key to its value *before* this call
        # (None means the variable was not set at all).
        env_backup: dict[str, str | None] = {}

        # The overrides are applied inside the try so the finally block always
        # undoes whatever was set, even if setting a later variable fails.
        try:
            for line in env_vars_text.splitlines():
                line = line.strip()
                if not line or line.startswith("#") or "=" not in line:
                    continue
                key, _, value = line.partition("=")
                key, value = key.strip(), value.strip()
                if not key:
                    # "=value" has no variable name; os.environ rejects it.
                    continue
                # setdefault: when a key is listed twice, keep the ORIGINAL
                # value as the backup rather than the first override.
                env_backup.setdefault(key, os.environ.get(key))
                os.environ[key] = value

            from corbell.core.mcp.server import context_engine_codebase_retrieval

            result = context_engine_codebase_retrieval(
                query=mcp_query.strip(),
                workspace_full_path=mcp_workspace.strip(),
            )

            # The tool signals failure in-band with an "Error:" prefix.
            if result.startswith("Error:"):
                return result, ""
            return "", result
        except Exception as exc:
            # UI boundary: surface any failure in the error box instead of
            # crashing the Gradio worker.
            return f"Error: {exc}", ""
        finally:
            for key, original in env_backup.items():
                if original is None:
                    os.environ.pop(key, None)
                else:
                    os.environ[key] = original

    def run_query(workspace_path: str, query: str):  # type: ignore[no-untyped-def]
        """Run the debug pipeline and return Gradio component values.

        Returns a 7-tuple in the order the click handler's outputs are wired:
        error text, timing markdown, final output, pre-rerank rows, rerank
        system prompt, rerank user prompt, raw rerank response.
        """
        from corbell.core.query.engine import codebase_retrieval_debug

        if not query.strip():
            return (
                "",  # error_box
                "",  # timing_md
                "",  # final_results
                [],  # pre_rerank_table
                "",  # rerank_system
                "",  # rerank_user
                "",  # rerank_response
            )

        ws = workspace_path.strip() or default_workspace
        result = codebase_retrieval_debug(query=query, workspace_path=ws)

        # --- Error banner ---
        error_text = result.error or ""

        # --- Timing table ---
        timing = result.diagnostics.timing if result.diagnostics else {}
        if timing:
            rows = "".join(
                f"| {phase} | {elapsed:.3f}s |\n"
                for phase, elapsed in timing.items()
            )
            timing_md = (
                "| Phase | Elapsed |\n"
                "|---|---|\n"
                + rows
            )
        else:
            timing_md = "_No timing data available._"

        # --- Final results ---
        final_results = result.final_output or ""

        # --- Pre-rerank table ---
        pre_rerank_rows = []
        graph_ids = set()
        if result.diagnostics and result.diagnostics.graph_chunk_ids:
            graph_ids = result.diagnostics.graph_chunk_ids
        for chunk in result.pre_rerank_chunks:
            chunk_id = getattr(chunk, "chunk_id", "")
            # A chunk id may be several ids joined with "+"; each part is
            # checked against the graph-sourced ids to label the row's source.
            parts = chunk_id.split("+") if chunk_id else []
            has_graph = any(p in graph_ids for p in parts) if graph_ids else False
            has_embedding = any(p not in graph_ids for p in parts) if graph_ids else True
            if has_graph and has_embedding and len(parts) > 1:
                source = "embedding+graph"
            elif has_graph:
                source = "graph"
            else:
                source = "embedding"
            pre_rerank_rows.append([
                getattr(chunk, "file_path", ""),
                f"{getattr(chunk, 'start_line', '')}-{getattr(chunk, 'end_line', '')}",
                f"{getattr(chunk, 'score', 0.0):.4f}",
                getattr(chunk, "symbol", "") or "",
                getattr(chunk, "chunk_type", "") or "",
                source,
                getattr(chunk, "content", "") or "",
            ])

        # --- Rerank prompts ---
        detail = result.rerank_detail
        if detail is None or not detail.system_prompt:
            rerank_system = "_LLM not configured — reranking skipped_"
            rerank_user = ""
            rerank_response = ""
        else:
            rerank_system = detail.system_prompt
            rerank_user = detail.user_prompt
            rerank_response = detail.raw_response or "_No response (LLM call failed)_"

        return (
            error_text,
            timing_md,
            final_results,
            pre_rerank_rows,
            rerank_system,
            rerank_user,
            rerank_response,
        )

    with gr.Blocks(title="Corbell Query Debugger") as demo:
        gr.Markdown("# Corbell Query Debugger")
        gr.Markdown("Inspect query pipeline internals: timing, pre-rerank chunks, and LLM rerank prompts.")

        with gr.Row():
            workspace_input = gr.Textbox(
                label="Workspace Path",
                value=default_workspace,
                placeholder="Path to repository root",
                scale=2,
            )
            query_input = gr.Textbox(
                label="Query",
                placeholder="e.g. authentication middleware",
                scale=3,
            )

        run_btn = gr.Button("Run Query", variant="primary")

        error_box = gr.Textbox(
            label="Error",
            visible=True,
            interactive=False,
            lines=2,
        )

        timing_md = gr.Markdown(label="Timing")

        with gr.Tabs():
            with gr.Tab("Final Results"):
                final_output = gr.Code(label="Formatted Output", language=None)

            with gr.Tab("Pre-Rerank Chunks"):
                pre_rerank_table = gr.Dataframe(
                    headers=["File", "Lines", "Score", "Symbol", "Type", "Source", "Content"],
                    datatype=["str", "str", "str", "str", "str", "str", "str"],
                    label="Chunks before reranking",
                    wrap=False,
                )

            with gr.Tab("LLM Rerank"):
                rerank_system_box = gr.Textbox(
                    label="System Prompt",
                    lines=6,
                    interactive=False,
                )
                rerank_user_box = gr.Textbox(
                    label="User Prompt",
                    lines=12,
                    interactive=False,
                )
                rerank_response_box = gr.Textbox(
                    label="Raw LLM Response",
                    lines=4,
                    interactive=False,
                )

            with gr.Tab("MCP Debug"):
                gr.Markdown(
                    "### MCP Tool Tester\n"
                    "Configure environment and invoke "
                    "`context_engine_codebase_retrieval` directly."
                )

                with gr.Accordion("Environment Configuration", open=False):
                    mcp_env_vars = gr.Textbox(
                        label="Environment Variables (one per line, KEY=VALUE)",
                        placeholder=(
                            "# Example:\n"
                            "CORBELL_LLM_PROVIDER=anthropic\n"
                            "CORBELL_RERANK=true\n"
                            "ANTHROPIC_API_KEY=sk-..."
                        ),
                        lines=6,
                    )

                gr.Markdown("#### Tool Parameters")
                with gr.Row():
                    mcp_workspace_input = gr.Textbox(
                        label="workspace_full_path",
                        value=default_workspace,
                        placeholder="Path to repository root",
                        scale=3,
                    )
                    mcp_query_input = gr.Textbox(
                        label="query",
                        placeholder="e.g. authentication middleware",
                    )

                mcp_run_btn = gr.Button("Invoke MCP Tool", variant="primary")

                mcp_error_box = gr.Textbox(
                    label="Error",
                    visible=True,
                    interactive=False,
                    lines=2,
                )
                mcp_result_box = gr.Code(
                    label="Tool Response",
                    language=None,
                )

        # Output order must match the tuple returned by run_query.
        run_btn.click(
            fn=run_query,
            inputs=[workspace_input, query_input],
            outputs=[
                error_box,
                timing_md,
                final_output,
                pre_rerank_table,
                rerank_system_box,
                rerank_user_box,
                rerank_response_box,
            ],
        )

        # Output order must match the (error, response) pair from run_mcp_tool.
        mcp_run_btn.click(
            fn=run_mcp_tool,
            inputs=[
                mcp_env_vars,
                mcp_workspace_input,
                mcp_query_input,
            ],
            outputs=[mcp_error_box, mcp_result_box],
        )

    console.print(f"[green]Starting Corbell debug UI on port {port}...[/green]")
    # NOTE(review): with --share the UI (which can set process environment
    # variables and query arbitrary workspace paths) becomes publicly
    # reachable; confirm that is acceptable before enabling it.
    demo.launch(server_port=port, share=share)
|
corbell/cli/commands/index.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
import logging
|
|
5
6
|
import os
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
from typing import Optional
|
|
@@ -26,6 +27,9 @@ def build(
|
|
|
26
27
|
repo: Optional[str] = typer.Option(
|
|
27
28
|
None, "--repo", help="Only index a specific repo by ID."
|
|
28
29
|
),
|
|
30
|
+
verbose: bool = typer.Option(
|
|
31
|
+
False, "--verbose", "-v", help="Enable detailed performance logging."
|
|
32
|
+
),
|
|
29
33
|
) -> None:
|
|
30
34
|
"""Build (or incrementally update) the code search index.
|
|
31
35
|
|
|
@@ -39,6 +43,15 @@ def build(
|
|
|
39
43
|
2. CORBELL_WORKSPACE environment variable
|
|
40
44
|
3. Current working directory
|
|
41
45
|
"""
|
|
46
|
+
if verbose or os.environ.get("CORBELL_VERBOSE", ""):
|
|
47
|
+
logging.basicConfig(
|
|
48
|
+
level=logging.INFO,
|
|
49
|
+
format="%(asctime)s %(name)s %(message)s",
|
|
50
|
+
datefmt="%H:%M:%S",
|
|
51
|
+
)
|
|
52
|
+
else:
|
|
53
|
+
logging.basicConfig(level=logging.WARNING)
|
|
54
|
+
|
|
42
55
|
from corbell.core.workspace import build_config, db_path_for_workspace
|
|
43
56
|
|
|
44
57
|
# Resolve workspace path: flag → env var → cwd
|
corbell/cli/main.py
CHANGED
|
@@ -8,6 +8,7 @@ import typer
|
|
|
8
8
|
from dotenv import load_dotenv
|
|
9
9
|
from rich.console import Console
|
|
10
10
|
|
|
11
|
+
from corbell.cli.commands.debug import app as debug_app
|
|
11
12
|
from corbell.cli.commands.index import app as index_app
|
|
12
13
|
from corbell.cli.commands.query import app as query_app
|
|
13
14
|
|
|
@@ -32,6 +33,7 @@ console = Console()
|
|
|
32
33
|
|
|
33
34
|
app.add_typer(index_app, name="index", help="Code index commands.")
|
|
34
35
|
app.add_typer(query_app, name="query", help="Code search commands.")
|
|
36
|
+
app.add_typer(debug_app, name="debug", help="Query debug UI.")
|
|
35
37
|
|
|
36
38
|
|
|
37
39
|
# ---------------------------------------------------------------------------
|
corbell/core/constants.py
CHANGED
|
@@ -45,6 +45,14 @@ EXTENSION_LANG: dict[str, str] = {
|
|
|
45
45
|
".php": "php",
|
|
46
46
|
".cs": "csharp",
|
|
47
47
|
".rs": "rust",
|
|
48
|
+
".c": "c",
|
|
49
|
+
".cc": "cpp",
|
|
50
|
+
".cpp": "cpp",
|
|
51
|
+
".cxx": "cpp",
|
|
52
|
+
".h": "c",
|
|
53
|
+
".hh": "cpp",
|
|
54
|
+
".hpp": "cpp",
|
|
55
|
+
".hxx": "cpp",
|
|
48
56
|
".md": "markdown",
|
|
49
57
|
".yml": "yaml",
|
|
50
58
|
".yaml": "yaml",
|
|
@@ -83,7 +83,10 @@ class CodeChunkExtractor:
|
|
|
83
83
|
lang = _SUPPORTED.get(fp.suffix)
|
|
84
84
|
if not lang:
|
|
85
85
|
continue
|
|
86
|
-
|
|
86
|
+
rel_path = fp.relative_to(repo_path)
|
|
87
|
+
if any(part.startswith(".") for part in rel_path.parts[:-1]):
|
|
88
|
+
continue
|
|
89
|
+
rel = str(rel_path)
|
|
87
90
|
if gitignore_spec.match_file(rel.replace("\\", "/")):
|
|
88
91
|
continue
|
|
89
92
|
chunks = self._extract_file(fp, rel, lang, service_id, str(repo_path))
|
corbell/core/embeddings/model.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""Embedding model interface +
|
|
1
|
+
"""Embedding model interface + cloud provider implementations (Google, Voyage)."""
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
@@ -33,32 +33,6 @@ class EmbeddingModel(ABC):
|
|
|
33
33
|
...
|
|
34
34
|
|
|
35
35
|
|
|
36
|
-
class SentenceTransformerModel(EmbeddingModel):
|
|
37
|
-
"""Wraps ``sentence-transformers`` with lazy loading.
|
|
38
|
-
|
|
39
|
-
Uses ``all-MiniLM-L6-v2`` by default (384-dim, fast, no API key).
|
|
40
|
-
"""
|
|
41
|
-
|
|
42
|
-
def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
|
|
43
|
-
self.model_name = model_name
|
|
44
|
-
self._model = None # lazy-loaded
|
|
45
|
-
|
|
46
|
-
def _get_model(self):
|
|
47
|
-
if self._model is None:
|
|
48
|
-
from sentence_transformers import SentenceTransformer
|
|
49
|
-
self._model = SentenceTransformer(f"sentence-transformers/{self.model_name}")
|
|
50
|
-
return self._model
|
|
51
|
-
|
|
52
|
-
def encode(self, texts: List[str]) -> List[List[float]]:
|
|
53
|
-
model = self._get_model()
|
|
54
|
-
vecs = model.encode(texts, show_progress_bar=False)
|
|
55
|
-
return [v.tolist() for v in vecs]
|
|
56
|
-
|
|
57
|
-
@property
|
|
58
|
-
def dimension(self) -> int:
|
|
59
|
-
return self._get_model().get_sentence_embedding_dimension()
|
|
60
|
-
|
|
61
|
-
|
|
62
36
|
def _is_voyage_rate_limit_error(e: Exception) -> bool:
|
|
63
37
|
"""Return True when a Voyage API error is a 429 rate limit."""
|
|
64
38
|
status = getattr(e, "status_code", None)
|
|
@@ -130,9 +104,8 @@ class GoogleEmbeddingModel(EmbeddingModel):
|
|
|
130
104
|
if not self._api_keys:
|
|
131
105
|
raise ValueError(
|
|
132
106
|
"GOOGLE_API_KEY is not set. "
|
|
133
|
-
"Set it in your environment or
|
|
134
|
-
" export GOOGLE_API_KEY=AIza
|
|
135
|
-
"Or use a local embedding model (e.g. all-MiniLM-L6-v2) in storage.model."
|
|
107
|
+
"Set it in your environment or .env file:\n"
|
|
108
|
+
" export GOOGLE_API_KEY=AIza..."
|
|
136
109
|
)
|
|
137
110
|
self._key_index: int = random.randrange(len(self._api_keys))
|
|
138
111
|
# kept for backwards-compat with tests that read _api_key directly
|
|
@@ -302,9 +275,8 @@ class VoyageEmbeddingModel(EmbeddingModel):
|
|
|
302
275
|
if not self._api_keys:
|
|
303
276
|
raise ValueError(
|
|
304
277
|
"VOYAGE_API_KEY is not set. "
|
|
305
|
-
"Set it in your environment or
|
|
306
|
-
" export VOYAGE_API_KEY=pa
|
|
307
|
-
"Or use a local embedding model (e.g. all-MiniLM-L6-v2) in storage.model."
|
|
278
|
+
"Set it in your environment or .env file:\n"
|
|
279
|
+
" export VOYAGE_API_KEY=pa-..."
|
|
308
280
|
)
|
|
309
281
|
self._key_index: int = random.randrange(len(self._api_keys))
|
|
310
282
|
# kept for backwards-compat with tests that read _api_key directly
|
|
@@ -367,12 +339,14 @@ class VoyageEmbeddingModel(EmbeddingModel):
|
|
|
367
339
|
key = self._api_keys[idx]
|
|
368
340
|
try:
|
|
369
341
|
vo = voyageai.Client(api_key=key)
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
)
|
|
342
|
+
kwargs: dict = {
|
|
343
|
+
"model": self.model_name,
|
|
344
|
+
"input_type": input_type,
|
|
345
|
+
}
|
|
346
|
+
import inspect
|
|
347
|
+
if "output_dimension" in inspect.signature(vo.embed).parameters:
|
|
348
|
+
kwargs["output_dimension"] = self.dimension
|
|
349
|
+
result = vo.embed(batch, **kwargs)
|
|
376
350
|
self._key_index = (idx + 1) % len(self._api_keys)
|
|
377
351
|
return result.embeddings
|
|
378
352
|
except Exception as e:
|