java-codebase-rag 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- java_codebase_rag-0.2.0/PKG-INFO +228 -0
- java_codebase_rag-0.2.0/README.md +195 -0
- java_codebase_rag-0.2.0/java_codebase_rag.egg-info/PKG-INFO +228 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/java_codebase_rag.egg-info/SOURCES.txt +1 -0
- java_codebase_rag-0.2.0/mcp_hints.py +932 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/mcp_v2.py +57 -28
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/pyproject.toml +1 -1
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/server.py +5 -6
- java_codebase_rag-0.2.0/tests/test_agent_skills_static.py +318 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_bank_chat_brownfield_integration.py +0 -1
- java_codebase_rag-0.2.0/tests/test_mcp_hints.py +1299 -0
- java_codebase_rag-0.1.0/PKG-INFO +0 -818
- java_codebase_rag-0.1.0/README.md +0 -785
- java_codebase_rag-0.1.0/java_codebase_rag.egg-info/PKG-INFO +0 -818
- java_codebase_rag-0.1.0/mcp_hints.py +0 -748
- java_codebase_rag-0.1.0/tests/test_mcp_hints.py +0 -1764
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/LICENSE +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/ast_java.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/brownfield_events.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/build_ast_graph.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/chunk_heuristics.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/graph_enrich.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/index_common.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/java_codebase_rag/__init__.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/java_codebase_rag/cli.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/java_codebase_rag/cli_progress.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/java_codebase_rag/config.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/java_codebase_rag/pipeline.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/java_codebase_rag.egg-info/dependency_links.txt +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/java_codebase_rag.egg-info/entry_points.txt +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/java_codebase_rag.egg-info/requires.txt +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/java_codebase_rag.egg-info/top_level.txt +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/java_index_flow_lancedb.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/java_index_v1_common.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/java_ontology.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/kuzu_queries.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/path_filtering.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/pr_analysis.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/search_lancedb.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/setup.cfg +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_assign_endpoint_client_extraction.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_ast_graph_build.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_ast_java_calls.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_ast_java_capabilities.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_brownfield_clients.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_brownfield_events.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_brownfield_overrides.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_brownfield_routes.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_call_edge_matching.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_call_edges_e2e.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_call_graph_receiver_resolution.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_call_graph_smoke_roundtrip.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_call_invariant.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_cli_progress_stdout_invariant.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_cli_quiet_parity.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_client_hint_recovery.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_client_node_extraction.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_client_role_rename.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_cross_service_resolution_flag.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_edge_navigation_doc.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_feign_not_exposer.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_graph_enrich.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_java_codebase_rag_cli.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_kuzu_queries.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_lancedb_e2e.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_mcp_tools.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_mcp_v2.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_mcp_v2_compose.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_meta_chain_core.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_outgoing_call_extraction.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_path_filtering.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_pr_analysis.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_resolve_routes_messaging_layer_c.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_route_extraction.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_schema_consistency.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_search_lancedb.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_search_lancedb_capability.py +0 -0
- {java_codebase_rag-0.1.0 → java_codebase_rag-0.2.0}/tests/test_string_value_atoms.py +0 -0
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: java-codebase-rag
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: MCP server for semantic + structural search over Java codebases
|
|
5
|
+
Author: HumanBean17
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/HumanBean17/java-codebase-rag
|
|
8
|
+
Project-URL: Repository, https://github.com/HumanBean17/java-codebase-rag
|
|
9
|
+
Project-URL: Issues, https://github.com/HumanBean17/java-codebase-rag/issues
|
|
10
|
+
Keywords: mcp,java,rag,code-search,graph,lancedb,kuzu
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
18
|
+
Requires-Python: >=3.11
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: kuzu<0.12,>=0.11.3
|
|
22
|
+
Requires-Dist: lancedb<0.31,>=0.25.3
|
|
23
|
+
Requires-Dist: mcp<2,>=1.27.0
|
|
24
|
+
Requires-Dist: numpy<2.5,>=1.26.4
|
|
25
|
+
Requires-Dist: pathspec<2,>=1.0.4
|
|
26
|
+
Requires-Dist: pyarrow<24,>=23.0.1
|
|
27
|
+
Requires-Dist: PyYAML<7,>=6.0.3
|
|
28
|
+
Requires-Dist: sentence-transformers<6,>=5.4.0
|
|
29
|
+
Requires-Dist: tree-sitter<0.26,>=0.25.2
|
|
30
|
+
Requires-Dist: tree-sitter-java<0.24,>=0.23.5
|
|
31
|
+
Requires-Dist: unidiff<1,>=0.7.3
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
# java-codebase-rag
|
|
35
|
+
|
|
36
|
+
A graph-native code intelligence layer for Java microservice estates, exposed to LLM agents via the **Model Context Protocol (MCP)**.
|
|
37
|
+
|
|
38
|
+
The system extracts a deterministic property graph from Java source (tree-sitter), stores it in **Kuzu** (graph) alongside a **LanceDB** vector index (chunks), and exposes a deliberately small MCP surface — **five tools**: `search`, `find`, `describe`, `neighbors`, `resolve` — that collapse onto three primitive agent operations: **locate**, **inspect**, **walk**.
|
|
39
|
+
|
|
40
|
+
> **What this MCP is:** a **GPS for code navigation**, not a reasoning engine.
|
|
41
|
+
> Agents use a simple loop:
|
|
42
|
+
>
|
|
43
|
+
> 1. **Locate** entry nodes (`search` / `find`, or identifier-shaped **`resolve`**)
|
|
44
|
+
> 2. **Inspect** what a node is (`describe`)
|
|
45
|
+
> 3. **Walk** one hop at a time (`neighbors`) until enough evidence is gathered
|
|
46
|
+
>
|
|
47
|
+
> The MCP exposes structure and adjacency; the agent owns multi-hop reasoning and stop conditions.
|
|
48
|
+
|
|
49
|
+
For the design rationale, the GPS metaphor, and the full ontology, see [`docs/paper/paper.pdf`](./docs/paper/paper.pdf) (architecture report).
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Install
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install java-codebase-rag
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Python **3.11+** required. After install, `java-codebase-rag --help` should print the CLI groups.
|
|
60
|
+
|
|
61
|
+
> **Stability disclaimer.** This package does **not** promise backward compatibility. MCP tool contracts, env vars, Lance/Kuzu schemas, config files, and Python APIs may change without a deprecation period. Track `main` and rebuild indexes when ontology or embedding settings change.
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## 5-minute walkthrough — index this repo's bank-chat fixture
|
|
66
|
+
|
|
67
|
+
This repo ships a small multi-module Spring fixture under [`tests/bank-chat-system/`](./tests/bank-chat-system/) (`chat-core` + `chat-assign`) that the test suite uses for calibration. You can index it and confirm the install works end-to-end in under five minutes — no agent host required.
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# 1. Clone the repo to get the fixture (the published package doesn't include tests/)
|
|
71
|
+
git clone https://github.com/HumanBean17/java-codebase-rag
|
|
72
|
+
cd java-codebase-rag
|
|
73
|
+
|
|
74
|
+
# 2. Build the index (Lance vectors + Kuzu graph). First run downloads the
|
|
75
|
+
# embedding model (~90 MB) and takes ~30-60s on the fixture.
|
|
76
|
+
java-codebase-rag init --source-root tests/bank-chat-system --index-dir /tmp/bank-chat-index
|
|
77
|
+
|
|
78
|
+
# 3. Inspect what landed (resolved config, edge counts, ontology version)
|
|
79
|
+
java-codebase-rag meta --source-root tests/bank-chat-system --index-dir /tmp/bank-chat-index
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Smoke-test the index with two checks (`search_lancedb` ships with the package):
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
# Vector search — proves the LanceDB side works
|
|
86
|
+
JAVA_CODEBASE_RAG_INDEX_DIR=/tmp/bank-chat-index \
|
|
87
|
+
python -m search_lancedb "chat ingress controller" --table java --limit 3
|
|
88
|
+
|
|
89
|
+
# Vector + graph expansion — proves Kuzu is wired in
|
|
90
|
+
JAVA_CODEBASE_RAG_INDEX_DIR=/tmp/bank-chat-index \
|
|
91
|
+
python -m search_lancedb "chat ingress controller" --table java --limit 3 \
|
|
92
|
+
--graph-expand --expand-depth 2
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
If vector hits come back and graph expansion adds neighbor symbols, the install works end-to-end. Wire it into your agent next — the five MCP tools (`search`, `find`, `describe`, `neighbors`, `resolve`) are reachable over stdio.
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Wire into an MCP host
|
|
100
|
+
|
|
101
|
+
### Claude Code
|
|
102
|
+
|
|
103
|
+
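With the package installed, the console script `java-codebase-rag-mcp` is on your `PATH`. Register it with Claude Code (the default scope is local to you; add `--scope project` before the server name to write the entry to the project's shared `.mcp.json`):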
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
claude mcp add --transport stdio java-codebase-rag -- java-codebase-rag-mcp
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Then set env vars (`JAVA_CODEBASE_RAG_INDEX_DIR`, `JAVA_CODEBASE_RAG_SOURCE_ROOT`, `SBERT_MODEL`, …) in `.mcp.json` or your shell profile. For a project-scoped `.mcp.json` template, see [`mcp.json.example`](./mcp.json.example). Official docs: [Claude Code settings](https://docs.anthropic.com/en/docs/claude-code/settings).
|
|
110
|
+
|
|
111
|
+
### Claude Desktop
|
|
112
|
+
|
|
113
|
+
Edit `claude_desktop_config.json` (macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`) and add under `mcpServers`:
|
|
114
|
+
|
|
115
|
+
```json
|
|
116
|
+
{
|
|
117
|
+
"mcpServers": {
|
|
118
|
+
"java-codebase-rag": {
|
|
119
|
+
"command": "java-codebase-rag-mcp",
|
|
120
|
+
"env": {
|
|
121
|
+
"JAVA_CODEBASE_RAG_INDEX_DIR": "/ABSOLUTE/PATH/TO/.java-codebase-rag",
|
|
122
|
+
"JAVA_CODEBASE_RAG_SOURCE_ROOT": "/ABSOLUTE/PATH/TO/your-java-project"
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
See [`mcp.json.example`](./mcp.json.example) for the same shape in `.mcp.json` (Claude Code project-scoped) form.
|
|
130
|
+
|
|
131
|
+
### Driving the MCP from an agent
|
|
132
|
+
|
|
133
|
+
Pick **one** of two options (not both — they cover the same navigation intents):
|
|
134
|
+
|
|
135
|
+
1. **[`docs/AGENT-GUIDE.md`](./docs/AGENT-GUIDE.md)** (recommended for most) — standalone MCP operating manual. Copy-paste the `BEGIN`/`END` block into your project's `QWEN.md`, `CLAUDE.md`, or `AGENTS.md`. Contains: five-tool reference, `NodeFilter` / edge taxonomy, ontology glossary, recovery playbook, and inline slash-style aliases (`/callers`, `/callees`, `/routes`, etc.) as prompt templates. Self-contained — no external file dependencies.
|
|
136
|
+
|
|
137
|
+
2. **[`skills/`](./skills/)** (for hosts with skill discovery) — 15 shipped `SKILL.md` files. If your MCP host supports skill discovery (Claude Code, Qwen Code, Cursor), the same navigation intents are available as discoverable `/` commands. Tier 1 = deterministic MCP chains (`/callers`, `/callees`, `/routes`, `/controllers`, `/clients`, `/producers`, `/handlers`, `/who-hits-route`, `/implements`, `/injects`, `/nl`). Tier 2 = bounded workflows (`/explain-feature`, `/impact-of`, `/trace-request-flow`, `/mini-map`). See [`skills/README.md`](./skills/README.md) for the full index.
|
|
138
|
+
|
|
139
|
+
Also: **[`docs/MANUAL-VERIFICATION-CHECKLIST.md`](./docs/MANUAL-VERIFICATION-CHECKLIST.md)** — 7-phase agent-driven verification you run after indexing your real project.
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## The five tools, at a glance
|
|
144
|
+
|
|
145
|
+
| Tool | Purpose | Required args |
|
|
146
|
+
|---|---|---|
|
|
147
|
+
| `search` | Locate nodes by NL / code text. | `query` |
|
|
148
|
+
| `find` | Locate nodes by structured filter. | `kind`, `filter` |
|
|
149
|
+
| `describe` | Full record + edge counts for one node. | `id` |
|
|
150
|
+
| `resolve` | Identifier-shaped lookup (FQN-collision-safe). Returns `one` / `many` / `none`. | `identifier` |
|
|
151
|
+
| `neighbors` | Graph walk, one hop. | `ids`, `direction`, `edge_types` |
|
|
152
|
+
|
|
153
|
+
Full schemas, `NodeFilter` / `EdgeFilter` semantics, and the hints contract live in [`docs/AGENT-GUIDE.md`](./docs/AGENT-GUIDE.md). Edge types and traversal directions are listed in [`docs/EDGE-NAVIGATION.md`](./docs/EDGE-NAVIGATION.md).
|
|
154
|
+
|
|
155
|
+
### Three-layer architecture
|
|
156
|
+
|
|
157
|
+
Layer 1 (storage) → Layer 2 (5 MCP tools) → Layer 3 (skills). Navigation skills in [`skills/`](./skills/) wrap the MCP tools into deterministic chains (Tier 1) and bounded workflows (Tier 2). See the [architecture diagram in `skills/README.md`](./skills/README.md#three-layer-architecture).
|
|
158
|
+
|
|
159
|
+
---
|
|
160
|
+
|
|
161
|
+
## Configuration
|
|
162
|
+
|
|
163
|
+
The operator-facing surface is small: pick an index dir, pick an embedding model, optionally drop a `.java-codebase-rag.yml` at your project root for microservice layout and brownfield overrides.
|
|
164
|
+
|
|
165
|
+
| If you want to… | See |
|
|
166
|
+
|---|---|
|
|
167
|
+
| Set env vars and override precedence | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) §1 |
|
|
168
|
+
| Configure microservice roots and embeddings via YAML | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) §2 |
|
|
169
|
+
| Understand the graph (nodes, edges, capabilities, ranking) | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) §3 |
|
|
170
|
+
| Steer a brownfield Java tree (custom stereotypes, non-Spring stacks) | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) §4 |
|
|
171
|
+
| Control which files the indexer walks | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) §5 |
|
|
172
|
+
| Check whether your repo fits this tool's assumptions | [`docs/CODEBASE_REQUIREMENTS.md`](./docs/CODEBASE_REQUIREMENTS.md) |
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## CLI cheat sheet
|
|
177
|
+
|
|
178
|
+
Run `java-codebase-rag --help` to list the grouped subcommands. The operator playbook — workflows, exit codes, and env alignment — lives in [`docs/JAVA-CODEBASE-RAG-CLI.md`](./docs/JAVA-CODEBASE-RAG-CLI.md).
|
|
179
|
+
|
|
180
|
+
| Group | Subcommand | What it does |
|
|
181
|
+
|---|---|---|
|
|
182
|
+
| Lifecycle | `init` | First-time index. Refuses if artifacts already exist. |
|
|
183
|
+
| Lifecycle | `increment` | CocoIndex catch-up (Lance only); Kuzu stays stale until `reprocess`. |
|
|
184
|
+
| Lifecycle | `reprocess` | Full Lance + Kuzu rebuild. `--vectors-only` / `--graph-only` for a single phase. |
|
|
185
|
+
| Lifecycle | `erase` | Delete index artifacts. Requires `--yes` or TTY confirm. |
|
|
186
|
+
| Introspection | `meta`, `tables`, `diagnose-ignore`, `unresolved-calls` | Health, table listing, ignore-layer diagnostics, receiver-failure call sites. |
|
|
187
|
+
| Analysis | `analyze-pr` | Blast-radius / risk from a unified diff. |
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
## Further reading
|
|
192
|
+
|
|
193
|
+
| Document | What's in it |
|
|
194
|
+
|---|---|
|
|
195
|
+
| [`docs/paper/paper.pdf`](./docs/paper/paper.pdf) | Architecture report — design rationale, GPS metaphor, three-layer architecture, design principles, future work. |
|
|
196
|
+
| [`docs/AGENT-GUIDE.md`](./docs/AGENT-GUIDE.md) | Agent-facing guide. Copy-paste into `QWEN.md` / `CLAUDE.md` / `AGENTS.md`. |
|
|
197
|
+
| [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) | Environment variables, project YAML, graph ontology, brownfield overrides, ignore patterns. |
|
|
198
|
+
| [`docs/JAVA-CODEBASE-RAG-CLI.md`](./docs/JAVA-CODEBASE-RAG-CLI.md) | CLI operator playbook: workflows, exit codes, env alignment. |
|
|
199
|
+
| [`docs/EDGE-NAVIGATION.md`](./docs/EDGE-NAVIGATION.md) | MCP-traversable edges, directions, dot-key composition. |
|
|
200
|
+
| [`skills/`](./skills/) | 15 navigation and workflow skills for hosts with skill discovery (alternative to copy-pasting AGENT-GUIDE). See [`skills/README.md`](./skills/README.md). |
|
|
201
|
+
| [`docs/MANUAL-VERIFICATION-CHECKLIST.md`](./docs/MANUAL-VERIFICATION-CHECKLIST.md) | 7-phase agent-driven verification after indexing your project. |
|
|
202
|
+
| [`docs/CODEBASE_REQUIREMENTS.md`](./docs/CODEBASE_REQUIREMENTS.md) | Assumptions about your Java repo + per-file edit map for non-conforming codebases. |
|
|
203
|
+
| [`automation/cursor_propose_only/README.md`](./automation/cursor_propose_only/README.md) | Optional proposal orchestration workflow (single-command autopilot, planning bundles, automated execution/review loops). |
|
|
204
|
+
| [`docs/PRODUCT-VISION.md`](./docs/PRODUCT-VISION.md) | Long-term product direction. |
|
|
205
|
+
|
|
206
|
+
---
|
|
207
|
+
|
|
208
|
+
## Install from source (contributors)
|
|
209
|
+
|
|
210
|
+
```bash
|
|
211
|
+
git clone https://github.com/HumanBean17/java-codebase-rag
|
|
212
|
+
cd java-codebase-rag
|
|
213
|
+
python3 -m venv .venv
|
|
214
|
+
.venv/bin/pip install -r requirements.txt
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
The `cocoindex` package is **only** needed for lifecycle commands that run the indexer (`init`, `increment`, `reprocess`, `erase`). Search and MCP navigation work without it.
|
|
218
|
+
|
|
219
|
+
The default embedding model is `sentence-transformers/all-MiniLM-L6-v2` (downloaded on first `init`). Override via the `EMBEDDING_MODEL` env var — see [`docs/CONFIGURATION.md` §1](./docs/CONFIGURATION.md#1-environment-variables).
|
|
220
|
+
|
|
221
|
+
---
|
|
222
|
+
|
|
223
|
+
## Roadmap (graph layer)
|
|
224
|
+
|
|
225
|
+
- `get_service_topology` — microservice-level summary aggregating `HTTP_CALLS` / `ASYNC_CALLS`.
|
|
226
|
+
- Agentic routing layer (query classifier → vector / graph / both).
|
|
227
|
+
- Incremental Kuzu updates (per-changed-file) — see [`propose/TIER2-INCREMENTAL-REBUILD-PROPOSE.md`](./propose/TIER2-INCREMENTAL-REBUILD-PROPOSE.md) and [`propose/INDEX-AUTO-MODE-PROPOSE.md`](./propose/INDEX-AUTO-MODE-PROPOSE.md).
|
|
228
|
+
- Optional `codegraph_nodes` LanceDB table embedding symbol summaries so the graph itself is vector-searchable.
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# java-codebase-rag
|
|
2
|
+
|
|
3
|
+
A graph-native code intelligence layer for Java microservice estates, exposed to LLM agents via the **Model Context Protocol (MCP)**.
|
|
4
|
+
|
|
5
|
+
The system extracts a deterministic property graph from Java source (tree-sitter), stores it in **Kuzu** (graph) alongside a **LanceDB** vector index (chunks), and exposes a deliberately small MCP surface — **five tools**: `search`, `find`, `describe`, `neighbors`, `resolve` — that collapse onto three primitive agent operations: **locate**, **inspect**, **walk**.
|
|
6
|
+
|
|
7
|
+
> **What this MCP is:** a **GPS for code navigation**, not a reasoning engine.
|
|
8
|
+
> Agents use a simple loop:
|
|
9
|
+
>
|
|
10
|
+
> 1. **Locate** entry nodes (`search` / `find`, or identifier-shaped **`resolve`**)
|
|
11
|
+
> 2. **Inspect** what a node is (`describe`)
|
|
12
|
+
> 3. **Walk** one hop at a time (`neighbors`) until enough evidence is gathered
|
|
13
|
+
>
|
|
14
|
+
> The MCP exposes structure and adjacency; the agent owns multi-hop reasoning and stop conditions.
|
|
15
|
+
|
|
16
|
+
For the design rationale, the GPS metaphor, and the full ontology, see [`docs/paper/paper.pdf`](./docs/paper/paper.pdf) (architecture report).
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## Install
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
pip install java-codebase-rag
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Python **3.11+** required. After install, `java-codebase-rag --help` should print the CLI groups.
|
|
27
|
+
|
|
28
|
+
> **Stability disclaimer.** This package does **not** promise backward compatibility. MCP tool contracts, env vars, Lance/Kuzu schemas, config files, and Python APIs may change without a deprecation period. Track `main` and rebuild indexes when ontology or embedding settings change.
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## 5-minute walkthrough — index this repo's bank-chat fixture
|
|
33
|
+
|
|
34
|
+
This repo ships a small multi-module Spring fixture under [`tests/bank-chat-system/`](./tests/bank-chat-system/) (`chat-core` + `chat-assign`) that the test suite uses for calibration. You can index it and confirm the install works end-to-end in under five minutes — no agent host required.
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
# 1. Clone the repo to get the fixture (the published package doesn't include tests/)
|
|
38
|
+
git clone https://github.com/HumanBean17/java-codebase-rag
|
|
39
|
+
cd java-codebase-rag
|
|
40
|
+
|
|
41
|
+
# 2. Build the index (Lance vectors + Kuzu graph). First run downloads the
|
|
42
|
+
# embedding model (~90 MB) and takes ~30-60s on the fixture.
|
|
43
|
+
java-codebase-rag init --source-root tests/bank-chat-system --index-dir /tmp/bank-chat-index
|
|
44
|
+
|
|
45
|
+
# 3. Inspect what landed (resolved config, edge counts, ontology version)
|
|
46
|
+
java-codebase-rag meta --source-root tests/bank-chat-system --index-dir /tmp/bank-chat-index
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Smoke-test the index with two checks (`search_lancedb` ships with the package):
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
# Vector search — proves the LanceDB side works
|
|
53
|
+
JAVA_CODEBASE_RAG_INDEX_DIR=/tmp/bank-chat-index \
|
|
54
|
+
python -m search_lancedb "chat ingress controller" --table java --limit 3
|
|
55
|
+
|
|
56
|
+
# Vector + graph expansion — proves Kuzu is wired in
|
|
57
|
+
JAVA_CODEBASE_RAG_INDEX_DIR=/tmp/bank-chat-index \
|
|
58
|
+
python -m search_lancedb "chat ingress controller" --table java --limit 3 \
|
|
59
|
+
--graph-expand --expand-depth 2
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
If vector hits come back and graph expansion adds neighbor symbols, the install works end-to-end. Wire it into your agent next — the five MCP tools (`search`, `find`, `describe`, `neighbors`, `resolve`) are reachable over stdio.
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## Wire into an MCP host
|
|
67
|
+
|
|
68
|
+
### Claude Code
|
|
69
|
+
|
|
70
|
+
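With the package installed, the console script `java-codebase-rag-mcp` is on your `PATH`. Register it with Claude Code (the default scope is local to you; add `--scope project` before the server name to write the entry to the project's shared `.mcp.json`):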
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
claude mcp add --transport stdio java-codebase-rag -- java-codebase-rag-mcp
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Then set env vars (`JAVA_CODEBASE_RAG_INDEX_DIR`, `JAVA_CODEBASE_RAG_SOURCE_ROOT`, `SBERT_MODEL`, …) in `.mcp.json` or your shell profile. For a project-scoped `.mcp.json` template, see [`mcp.json.example`](./mcp.json.example). Official docs: [Claude Code settings](https://docs.anthropic.com/en/docs/claude-code/settings).
|
|
77
|
+
|
|
78
|
+
### Claude Desktop
|
|
79
|
+
|
|
80
|
+
Edit `claude_desktop_config.json` (macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`) and add under `mcpServers`:
|
|
81
|
+
|
|
82
|
+
```json
|
|
83
|
+
{
|
|
84
|
+
"mcpServers": {
|
|
85
|
+
"java-codebase-rag": {
|
|
86
|
+
"command": "java-codebase-rag-mcp",
|
|
87
|
+
"env": {
|
|
88
|
+
"JAVA_CODEBASE_RAG_INDEX_DIR": "/ABSOLUTE/PATH/TO/.java-codebase-rag",
|
|
89
|
+
"JAVA_CODEBASE_RAG_SOURCE_ROOT": "/ABSOLUTE/PATH/TO/your-java-project"
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
See [`mcp.json.example`](./mcp.json.example) for the same shape in `.mcp.json` (Claude Code project-scoped) form.
|
|
97
|
+
|
|
98
|
+
### Driving the MCP from an agent
|
|
99
|
+
|
|
100
|
+
Pick **one** of two options (not both — they cover the same navigation intents):
|
|
101
|
+
|
|
102
|
+
1. **[`docs/AGENT-GUIDE.md`](./docs/AGENT-GUIDE.md)** (recommended for most) — standalone MCP operating manual. Copy-paste the `BEGIN`/`END` block into your project's `QWEN.md`, `CLAUDE.md`, or `AGENTS.md`. Contains: five-tool reference, `NodeFilter` / edge taxonomy, ontology glossary, recovery playbook, and inline slash-style aliases (`/callers`, `/callees`, `/routes`, etc.) as prompt templates. Self-contained — no external file dependencies.
|
|
103
|
+
|
|
104
|
+
2. **[`skills/`](./skills/)** (for hosts with skill discovery) — 15 shipped `SKILL.md` files. If your MCP host supports skill discovery (Claude Code, Qwen Code, Cursor), the same navigation intents are available as discoverable `/` commands. Tier 1 = deterministic MCP chains (`/callers`, `/callees`, `/routes`, `/controllers`, `/clients`, `/producers`, `/handlers`, `/who-hits-route`, `/implements`, `/injects`, `/nl`). Tier 2 = bounded workflows (`/explain-feature`, `/impact-of`, `/trace-request-flow`, `/mini-map`). See [`skills/README.md`](./skills/README.md) for the full index.
|
|
105
|
+
|
|
106
|
+
Also: **[`docs/MANUAL-VERIFICATION-CHECKLIST.md`](./docs/MANUAL-VERIFICATION-CHECKLIST.md)** — 7-phase agent-driven verification you run after indexing your real project.
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## The five tools, at a glance
|
|
111
|
+
|
|
112
|
+
| Tool | Purpose | Required args |
|
|
113
|
+
|---|---|---|
|
|
114
|
+
| `search` | Locate nodes by NL / code text. | `query` |
|
|
115
|
+
| `find` | Locate nodes by structured filter. | `kind`, `filter` |
|
|
116
|
+
| `describe` | Full record + edge counts for one node. | `id` |
|
|
117
|
+
| `resolve` | Identifier-shaped lookup (FQN-collision-safe). Returns `one` / `many` / `none`. | `identifier` |
|
|
118
|
+
| `neighbors` | Graph walk, one hop. | `ids`, `direction`, `edge_types` |
|
|
119
|
+
|
|
120
|
+
Full schemas, `NodeFilter` / `EdgeFilter` semantics, and the hints contract live in [`docs/AGENT-GUIDE.md`](./docs/AGENT-GUIDE.md). Edge types and traversal directions are listed in [`docs/EDGE-NAVIGATION.md`](./docs/EDGE-NAVIGATION.md).
|
|
121
|
+
|
|
122
|
+
### Three-layer architecture
|
|
123
|
+
|
|
124
|
+
Layer 1 (storage) → Layer 2 (5 MCP tools) → Layer 3 (skills). Navigation skills in [`skills/`](./skills/) wrap the MCP tools into deterministic chains (Tier 1) and bounded workflows (Tier 2). See the [architecture diagram in `skills/README.md`](./skills/README.md#three-layer-architecture).
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
## Configuration
|
|
129
|
+
|
|
130
|
+
The operator-facing surface is small: pick an index dir, pick an embedding model, optionally drop a `.java-codebase-rag.yml` at your project root for microservice layout and brownfield overrides.
|
|
131
|
+
|
|
132
|
+
| If you want to… | See |
|
|
133
|
+
|---|---|
|
|
134
|
+
| Set env vars and override precedence | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) §1 |
|
|
135
|
+
| Configure microservice roots and embeddings via YAML | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) §2 |
|
|
136
|
+
| Understand the graph (nodes, edges, capabilities, ranking) | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) §3 |
|
|
137
|
+
| Steer a brownfield Java tree (custom stereotypes, non-Spring stacks) | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) §4 |
|
|
138
|
+
| Control which files the indexer walks | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) §5 |
|
|
139
|
+
| Check whether your repo fits this tool's assumptions | [`docs/CODEBASE_REQUIREMENTS.md`](./docs/CODEBASE_REQUIREMENTS.md) |
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## CLI cheat sheet
|
|
144
|
+
|
|
145
|
+
Run `java-codebase-rag --help` to list the grouped subcommands. The operator playbook — workflows, exit codes, and env alignment — lives in [`docs/JAVA-CODEBASE-RAG-CLI.md`](./docs/JAVA-CODEBASE-RAG-CLI.md).
|
|
146
|
+
|
|
147
|
+
| Group | Subcommand | What it does |
|
|
148
|
+
|---|---|---|
|
|
149
|
+
| Lifecycle | `init` | First-time index. Refuses if artifacts already exist. |
|
|
150
|
+
| Lifecycle | `increment` | CocoIndex catch-up (Lance only); Kuzu stays stale until `reprocess`. |
|
|
151
|
+
| Lifecycle | `reprocess` | Full Lance + Kuzu rebuild. `--vectors-only` / `--graph-only` for a single phase. |
|
|
152
|
+
| Lifecycle | `erase` | Delete index artifacts. Requires `--yes` or TTY confirm. |
|
|
153
|
+
| Introspection | `meta`, `tables`, `diagnose-ignore`, `unresolved-calls` | Health, table listing, ignore-layer diagnostics, receiver-failure call sites. |
|
|
154
|
+
| Analysis | `analyze-pr` | Blast-radius / risk from a unified diff. |
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
## Further reading
|
|
159
|
+
|
|
160
|
+
| Document | What's in it |
|
|
161
|
+
|---|---|
|
|
162
|
+
| [`docs/paper/paper.pdf`](./docs/paper/paper.pdf) | Architecture report — design rationale, GPS metaphor, three-layer architecture, design principles, future work. |
|
|
163
|
+
| [`docs/AGENT-GUIDE.md`](./docs/AGENT-GUIDE.md) | Agent-facing guide. Copy-paste into `QWEN.md` / `CLAUDE.md` / `AGENTS.md`. |
|
|
164
|
+
| [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) | Environment variables, project YAML, graph ontology, brownfield overrides, ignore patterns. |
|
|
165
|
+
| [`docs/JAVA-CODEBASE-RAG-CLI.md`](./docs/JAVA-CODEBASE-RAG-CLI.md) | CLI operator playbook: workflows, exit codes, env alignment. |
|
|
166
|
+
| [`docs/EDGE-NAVIGATION.md`](./docs/EDGE-NAVIGATION.md) | MCP-traversable edges, directions, dot-key composition. |
|
|
167
|
+
| [`skills/`](./skills/) | 15 navigation and workflow skills for hosts with skill discovery (alternative to copy-pasting AGENT-GUIDE). See [`skills/README.md`](./skills/README.md). |
|
|
168
|
+
| [`docs/MANUAL-VERIFICATION-CHECKLIST.md`](./docs/MANUAL-VERIFICATION-CHECKLIST.md) | 7-phase agent-driven verification after indexing your project. |
|
|
169
|
+
| [`docs/CODEBASE_REQUIREMENTS.md`](./docs/CODEBASE_REQUIREMENTS.md) | Assumptions about your Java repo + per-file edit map for non-conforming codebases. |
|
|
170
|
+
| [`automation/cursor_propose_only/README.md`](./automation/cursor_propose_only/README.md) | Optional proposal orchestration workflow (single-command autopilot, planning bundles, automated execution/review loops). |
|
|
171
|
+
| [`docs/PRODUCT-VISION.md`](./docs/PRODUCT-VISION.md) | Long-term product direction. |
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## Install from source (contributors)
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
git clone https://github.com/HumanBean17/java-codebase-rag
|
|
179
|
+
cd java-codebase-rag
|
|
180
|
+
python3 -m venv .venv
|
|
181
|
+
.venv/bin/pip install -r requirements.txt
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
The `cocoindex` package is **only** needed for lifecycle commands that run the indexer (`init`, `increment`, `reprocess`, `erase`). Search and MCP navigation work without it.
|
|
185
|
+
|
|
186
|
+
The default embedding model is `sentence-transformers/all-MiniLM-L6-v2` (downloaded on first `init`). Override via the `EMBEDDING_MODEL` env var — see [`docs/CONFIGURATION.md` §1](./docs/CONFIGURATION.md#1-environment-variables).
|
|
187
|
+
|
|
188
|
+
---
|
|
189
|
+
|
|
190
|
+
## Roadmap (graph layer)
|
|
191
|
+
|
|
192
|
+
- `get_service_topology` — microservice-level summary aggregating `HTTP_CALLS` / `ASYNC_CALLS`.
|
|
193
|
+
- Agentic routing layer (query classifier → vector / graph / both).
|
|
194
|
+
- Incremental Kuzu updates (per-changed-file) — see [`propose/TIER2-INCREMENTAL-REBUILD-PROPOSE.md`](./propose/TIER2-INCREMENTAL-REBUILD-PROPOSE.md) and [`propose/INDEX-AUTO-MODE-PROPOSE.md`](./propose/INDEX-AUTO-MODE-PROPOSE.md).
|
|
195
|
+
- Optional `codegraph_nodes` LanceDB table embedding symbol summaries so the graph itself is vector-searchable.
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: java-codebase-rag
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: MCP server for semantic + structural search over Java codebases
|
|
5
|
+
Author: HumanBean17
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/HumanBean17/java-codebase-rag
|
|
8
|
+
Project-URL: Repository, https://github.com/HumanBean17/java-codebase-rag
|
|
9
|
+
Project-URL: Issues, https://github.com/HumanBean17/java-codebase-rag/issues
|
|
10
|
+
Keywords: mcp,java,rag,code-search,graph,lancedb,kuzu
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
18
|
+
Requires-Python: >=3.11
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: kuzu<0.12,>=0.11.3
|
|
22
|
+
Requires-Dist: lancedb<0.31,>=0.25.3
|
|
23
|
+
Requires-Dist: mcp<2,>=1.27.0
|
|
24
|
+
Requires-Dist: numpy<2.5,>=1.26.4
|
|
25
|
+
Requires-Dist: pathspec<2,>=1.0.4
|
|
26
|
+
Requires-Dist: pyarrow<24,>=23.0.1
|
|
27
|
+
Requires-Dist: PyYAML<7,>=6.0.3
|
|
28
|
+
Requires-Dist: sentence-transformers<6,>=5.4.0
|
|
29
|
+
Requires-Dist: tree-sitter<0.26,>=0.25.2
|
|
30
|
+
Requires-Dist: tree-sitter-java<0.24,>=0.23.5
|
|
31
|
+
Requires-Dist: unidiff<1,>=0.7.3
|
|
32
|
+
Dynamic: license-file
|
|
33
|
+
|
|
34
|
+
# java-codebase-rag
|
|
35
|
+
|
|
36
|
+
A graph-native code intelligence layer for Java microservice estates, exposed to LLM agents via the **Model Context Protocol (MCP)**.
|
|
37
|
+
|
|
38
|
+
The system extracts a deterministic property graph from Java source (tree-sitter), stores it in **Kuzu** (graph) alongside a **LanceDB** vector index (chunks), and exposes a deliberately small MCP surface — **five tools**: `search`, `find`, `describe`, `neighbors`, `resolve` — that collapse onto three primitive agent operations: **locate**, **inspect**, **walk**.
|
|
39
|
+
|
|
40
|
+
> **What this MCP is:** a **GPS for code navigation**, not a reasoning engine.
|
|
41
|
+
> Agents use a simple loop:
|
|
42
|
+
>
|
|
43
|
+
> 1. **Locate** entry nodes (`search` / `find`, or identifier-shaped **`resolve`**)
|
|
44
|
+
> 2. **Inspect** what a node is (`describe`)
|
|
45
|
+
> 3. **Walk** one hop at a time (`neighbors`) until enough evidence is gathered
|
|
46
|
+
>
|
|
47
|
+
> The MCP exposes structure and adjacency; the agent owns multi-hop reasoning and stop conditions.
|
|
48
|
+
|
|
49
|
+
For the design rationale, the GPS metaphor, and the full ontology, see [`docs/paper/paper.pdf`](./docs/paper/paper.pdf) (architecture report).
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Install
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install java-codebase-rag
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Python **3.11+** required. After install, `java-codebase-rag --help` should print the CLI groups.
|
|
60
|
+
|
|
61
|
+
> **Stability disclaimer.** This package does **not** promise backward compatibility. MCP tool contracts, env vars, Lance/Kuzu schemas, config files, and Python APIs may change without a deprecation period. Track `main` and rebuild indexes when ontology or embedding settings change.
|
|
62
|
+
|
|
63
|
+
---
|
|
64
|
+
|
|
65
|
+
## 5-minute walkthrough — index this repo's bank-chat fixture
|
|
66
|
+
|
|
67
|
+
This repo ships a small multi-module Spring fixture under [`tests/bank-chat-system/`](./tests/bank-chat-system/) (`chat-core` + `chat-assign`) that the test suite uses for calibration. You can index it and confirm the install works end-to-end in under five minutes — no agent host required.
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# 1. Clone the repo to get the fixture (the published package doesn't include tests/)
|
|
71
|
+
git clone https://github.com/HumanBean17/java-codebase-rag
|
|
72
|
+
cd java-codebase-rag
|
|
73
|
+
|
|
74
|
+
# 2. Build the index (Lance vectors + Kuzu graph). First run downloads the
|
|
75
|
+
# embedding model (~90 MB) and takes ~30-60s on the fixture.
|
|
76
|
+
java-codebase-rag init --source-root tests/bank-chat-system --index-dir /tmp/bank-chat-index
|
|
77
|
+
|
|
78
|
+
# 3. Inspect what landed (resolved config, edge counts, ontology version)
|
|
79
|
+
java-codebase-rag meta --source-root tests/bank-chat-system --index-dir /tmp/bank-chat-index
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
Smoke-test the index with two checks (`search_lancedb` ships with the package):
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
# Vector search — proves the LanceDB side works
|
|
86
|
+
JAVA_CODEBASE_RAG_INDEX_DIR=/tmp/bank-chat-index \
|
|
87
|
+
python -m search_lancedb "chat ingress controller" --table java --limit 3
|
|
88
|
+
|
|
89
|
+
# Vector + graph expansion — proves Kuzu is wired in
|
|
90
|
+
JAVA_CODEBASE_RAG_INDEX_DIR=/tmp/bank-chat-index \
|
|
91
|
+
python -m search_lancedb "chat ingress controller" --table java --limit 3 \
|
|
92
|
+
--graph-expand --expand-depth 2
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
If vector hits come back and graph expansion adds neighbor symbols, the install works end-to-end. Wire it into your agent next — the five MCP tools (`search`, `find`, `describe`, `neighbors`, `resolve`) are reachable over stdio.
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## Wire into an MCP host
|
|
100
|
+
|
|
101
|
+
### Claude Code
|
|
102
|
+
|
|
103
|
+
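With the package installed, the console script `java-codebase-rag-mcp` is on your `PATH`. Register it with Claude Code (the command below uses the default `local` scope; add `--scope project` to register it project-scoped in a shared `.mcp.json` instead):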
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
claude mcp add --transport stdio java-codebase-rag -- java-codebase-rag-mcp
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Then set env vars (`JAVA_CODEBASE_RAG_INDEX_DIR`, `JAVA_CODEBASE_RAG_SOURCE_ROOT`, `SBERT_MODEL`, …) in `.mcp.json` or your shell profile. For a project-scoped `.mcp.json` template, see [`mcp.json.example`](./mcp.json.example). Official docs: [Claude Code settings](https://docs.anthropic.com/en/docs/claude-code/settings).
|
|
110
|
+
|
|
111
|
+
### Claude Desktop
|
|
112
|
+
|
|
113
|
+
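Edit `claude_desktop_config.json` (macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`) and add the `java-codebase-rag` entry under `mcpServers` (if the file already has an `mcpServers` object, merge the entry into it rather than adding a second key):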
|
|
114
|
+
|
|
115
|
+
```json
|
|
116
|
+
{
|
|
117
|
+
"mcpServers": {
|
|
118
|
+
"java-codebase-rag": {
|
|
119
|
+
"command": "java-codebase-rag-mcp",
|
|
120
|
+
"env": {
|
|
121
|
+
"JAVA_CODEBASE_RAG_INDEX_DIR": "/ABSOLUTE/PATH/TO/.java-codebase-rag",
|
|
122
|
+
"JAVA_CODEBASE_RAG_SOURCE_ROOT": "/ABSOLUTE/PATH/TO/your-java-project"
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
See [`mcp.json.example`](./mcp.json.example) for the same shape in `.mcp.json` (Claude Code project-scoped) form.
|
|
130
|
+
|
|
131
|
+
### Driving the MCP from an agent
|
|
132
|
+
|
|
133
|
+
Pick **one** of two options (not both — they cover the same navigation intents):
|
|
134
|
+
|
|
135
|
+
1. **[`docs/AGENT-GUIDE.md`](./docs/AGENT-GUIDE.md)** (recommended for most) — standalone MCP operating manual. Copy-paste the `BEGIN`/`END` block into your project's `QWEN.md`, `CLAUDE.md`, or `AGENTS.md`. Contains: five-tool reference, `NodeFilter` / edge taxonomy, ontology glossary, recovery playbook, and inline slash-style aliases (`/callers`, `/callees`, `/routes`, etc.) as prompt templates. Self-contained — no external file dependencies.
|
|
136
|
+
|
|
137
|
+
2. **[`skills/`](./skills/)** (for hosts with skill discovery) — 15 shipped `SKILL.md` files. If your MCP host supports skill discovery (Claude Code, Qwen Code, Cursor), the same navigation intents are available as discoverable `/` commands. Tier 1 = deterministic MCP chains (`/callers`, `/callees`, `/routes`, `/controllers`, `/clients`, `/producers`, `/handlers`, `/who-hits-route`, `/implements`, `/injects`, `/nl`). Tier 2 = bounded workflows (`/explain-feature`, `/impact-of`, `/trace-request-flow`, `/mini-map`). See [`skills/README.md`](./skills/README.md) for the full index.
|
|
138
|
+
|
|
139
|
+
Also: **[`docs/MANUAL-VERIFICATION-CHECKLIST.md`](./docs/MANUAL-VERIFICATION-CHECKLIST.md)** — 7-phase agent-driven verification you run after indexing your real project.
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## The five tools, at a glance
|
|
144
|
+
|
|
145
|
+
| Tool | Purpose | Required args |
|
|
146
|
+
|---|---|---|
|
|
147
|
+
| `search` | Locate nodes by NL / code text. | `query` |
|
|
148
|
+
| `find` | Locate nodes by structured filter. | `kind`, `filter` |
|
|
149
|
+
| `describe` | Full record + edge counts for one node. | `id` |
|
|
150
|
+
| `resolve` | Identifier-shaped lookup (FQN-collision-safe). Returns `one` / `many` / `none`. | `identifier` |
|
|
151
|
+
| `neighbors` | Graph walk, one hop. | `ids`, `direction`, `edge_types` |
|
|
152
|
+
|
|
153
|
+
Full schemas, `NodeFilter` / `EdgeFilter` semantics, and the hints contract live in [`docs/AGENT-GUIDE.md`](./docs/AGENT-GUIDE.md). Edge types and traversal directions are listed in [`docs/EDGE-NAVIGATION.md`](./docs/EDGE-NAVIGATION.md).
|
|
154
|
+
|
|
155
|
+
### Three-layer architecture
|
|
156
|
+
|
|
157
|
+
Layer 1 (storage) → Layer 2 (5 MCP tools) → Layer 3 (skills). Navigation skills in [`skills/`](./skills/) wrap the MCP tools into deterministic chains (Tier 1) and bounded workflows (Tier 2). See the [architecture diagram in `skills/README.md`](./skills/README.md#three-layer-architecture).
|
|
158
|
+
|
|
159
|
+
---
|
|
160
|
+
|
|
161
|
+
## Configuration
|
|
162
|
+
|
|
163
|
+
The operator-facing surface is small: pick an index dir, pick an embedding model, optionally drop a `.java-codebase-rag.yml` at your project root for microservice layout and brownfield overrides.
|
|
164
|
+
|
|
165
|
+
| If you want to… | See |
|
|
166
|
+
|---|---|
|
|
167
|
+
| Set env vars and override precedence | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) §1 |
|
|
168
|
+
| Configure microservice roots and embeddings via YAML | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) §2 |
|
|
169
|
+
| Understand the graph (nodes, edges, capabilities, ranking) | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) §3 |
|
|
170
|
+
| Steer a brownfield Java tree (custom stereotypes, non-Spring stacks) | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) §4 |
|
|
171
|
+
| Control which files the indexer walks | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) §5 |
|
|
172
|
+
| Check whether your repo fits this tool's assumptions | [`docs/CODEBASE_REQUIREMENTS.md`](./docs/CODEBASE_REQUIREMENTS.md) |
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## CLI cheat sheet
|
|
177
|
+
|
|
178
|
+
Run `java-codebase-rag --help` to list grouped subcommands. Operator playbook with workflows, exit codes, and env alignment lives in [`docs/JAVA-CODEBASE-RAG-CLI.md`](./docs/JAVA-CODEBASE-RAG-CLI.md).
|
|
179
|
+
|
|
180
|
+
| Group | Subcommand | What it does |
|
|
181
|
+
|---|---|---|
|
|
182
|
+
| Lifecycle | `init` | First-time index. Refuses if artifacts already exist. |
|
|
183
|
+
| Lifecycle | `increment` | CocoIndex catch-up (Lance only); Kuzu stays stale until `reprocess`. |
|
|
184
|
+
| Lifecycle | `reprocess` | Full Lance + Kuzu rebuild. `--vectors-only` / `--graph-only` for a single phase. |
|
|
185
|
+
| Lifecycle | `erase` | Delete index artifacts. Requires `--yes` or TTY confirm. |
|
|
186
|
+
| Introspection | `meta`, `tables`, `diagnose-ignore`, `unresolved-calls` | Health, table listing, ignore-layer diagnostics, receiver-failure call sites. |
|
|
187
|
+
| Analysis | `analyze-pr` | Blast-radius / risk from a unified diff. |
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
## Further reading
|
|
192
|
+
|
|
193
|
+
| Document | What's in it |
|
|
194
|
+
|---|---|
|
|
195
|
+
| [`docs/paper/paper.pdf`](./docs/paper/paper.pdf) | Architecture report — design rationale, GPS metaphor, three-layer architecture, design principles, future work. |
|
|
196
|
+
| [`docs/AGENT-GUIDE.md`](./docs/AGENT-GUIDE.md) | Agent-facing guide. Copy-paste into `QWEN.md` / `CLAUDE.md` / `AGENTS.md`. |
|
|
197
|
+
| [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) | Environment variables, project YAML, graph ontology, brownfield overrides, ignore patterns. |
|
|
198
|
+
| [`docs/JAVA-CODEBASE-RAG-CLI.md`](./docs/JAVA-CODEBASE-RAG-CLI.md) | CLI operator playbook: workflows, exit codes, env alignment. |
|
|
199
|
+
| [`docs/EDGE-NAVIGATION.md`](./docs/EDGE-NAVIGATION.md) | MCP-traversable edges, directions, dot-key composition. |
|
|
200
|
+
| [`skills/`](./skills/) | 15 navigation and workflow skills for hosts with skill discovery (alternative to copy-pasting AGENT-GUIDE). See [`skills/README.md`](./skills/README.md). |
|
|
201
|
+
| [`docs/MANUAL-VERIFICATION-CHECKLIST.md`](./docs/MANUAL-VERIFICATION-CHECKLIST.md) | 7-phase agent-driven verification after indexing your project. |
|
|
202
|
+
| [`docs/CODEBASE_REQUIREMENTS.md`](./docs/CODEBASE_REQUIREMENTS.md) | Assumptions about your Java repo + per-file edit map for non-conforming codebases. |
|
|
203
|
+
| [`automation/cursor_propose_only/README.md`](./automation/cursor_propose_only/README.md) | Optional proposal orchestration workflow (single-command autopilot, planning bundles, automated execution/review loops). |
|
|
204
|
+
| [`docs/PRODUCT-VISION.md`](./docs/PRODUCT-VISION.md) | Long-term product direction. |
|
|
205
|
+
|
|
206
|
+
---
|
|
207
|
+
|
|
208
|
+
## Install from source (contributors)
|
|
209
|
+
|
|
210
|
+
```bash
|
|
211
|
+
git clone https://github.com/HumanBean17/java-codebase-rag
|
|
212
|
+
cd java-codebase-rag
|
|
213
|
+
python3 -m venv .venv
|
|
214
|
+
.venv/bin/pip install -r requirements.txt
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
The `cocoindex` package is **only** needed for lifecycle commands that run the indexer (`init`, `increment`, `reprocess`, `erase`). Search and MCP navigation work without it.
|
|
218
|
+
|
|
219
|
+
The default embedding model is `sentence-transformers/all-MiniLM-L6-v2` (downloaded on first `init`). Override via the `EMBEDDING_MODEL` env var — see [`docs/CONFIGURATION.md` §1](./docs/CONFIGURATION.md#1-environment-variables).
|
|
220
|
+
|
|
221
|
+
---
|
|
222
|
+
|
|
223
|
+
## Roadmap (graph layer)
|
|
224
|
+
|
|
225
|
+
- `get_service_topology` — microservice-level summary aggregating `HTTP_CALLS` / `ASYNC_CALLS`.
|
|
226
|
+
- Agentic routing layer (query classifier → vector / graph / both).
|
|
227
|
+
- Incremental Kuzu updates (per-changed-file) — see [`propose/TIER2-INCREMENTAL-REBUILD-PROPOSE.md`](./propose/TIER2-INCREMENTAL-REBUILD-PROPOSE.md) and [`propose/INDEX-AUTO-MODE-PROPOSE.md`](./propose/INDEX-AUTO-MODE-PROPOSE.md).
|
|
228
|
+
- Optional `codegraph_nodes` LanceDB table embedding symbol summaries so the graph itself is vector-searchable.
|
|
@@ -28,6 +28,7 @@ java_codebase_rag.egg-info/dependency_links.txt
|
|
|
28
28
|
java_codebase_rag.egg-info/entry_points.txt
|
|
29
29
|
java_codebase_rag.egg-info/requires.txt
|
|
30
30
|
java_codebase_rag.egg-info/top_level.txt
|
|
31
|
+
tests/test_agent_skills_static.py
|
|
31
32
|
tests/test_assign_endpoint_client_extraction.py
|
|
32
33
|
tests/test_ast_graph_build.py
|
|
33
34
|
tests/test_ast_java_calls.py
|