java-codebase-rag 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. {java_codebase_rag-0.2.0/java_codebase_rag.egg-info → java_codebase_rag-0.2.1}/PKG-INFO +26 -6
  2. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/README.md +24 -5
  3. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1/java_codebase_rag.egg-info}/PKG-INFO +26 -6
  4. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/java_codebase_rag.egg-info/SOURCES.txt +1 -0
  5. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/java_codebase_rag.egg-info/requires.txt +1 -0
  6. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/pyproject.toml +2 -1
  7. java_codebase_rag-0.2.1/tests/test_agent_skills_static.py +251 -0
  8. java_codebase_rag-0.2.1/tests/test_packaging_metadata.py +14 -0
  9. java_codebase_rag-0.2.0/tests/test_agent_skills_static.py +0 -318
  10. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/LICENSE +0 -0
  11. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/ast_java.py +0 -0
  12. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/brownfield_events.py +0 -0
  13. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/build_ast_graph.py +0 -0
  14. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/chunk_heuristics.py +0 -0
  15. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/graph_enrich.py +0 -0
  16. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/index_common.py +0 -0
  17. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/java_codebase_rag/__init__.py +0 -0
  18. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/java_codebase_rag/cli.py +0 -0
  19. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/java_codebase_rag/cli_progress.py +0 -0
  20. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/java_codebase_rag/config.py +0 -0
  21. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/java_codebase_rag/pipeline.py +0 -0
  22. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/java_codebase_rag.egg-info/dependency_links.txt +0 -0
  23. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/java_codebase_rag.egg-info/entry_points.txt +0 -0
  24. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/java_codebase_rag.egg-info/top_level.txt +0 -0
  25. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/java_index_flow_lancedb.py +0 -0
  26. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/java_index_v1_common.py +0 -0
  27. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/java_ontology.py +0 -0
  28. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/kuzu_queries.py +0 -0
  29. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/mcp_hints.py +0 -0
  30. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/mcp_v2.py +0 -0
  31. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/path_filtering.py +0 -0
  32. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/pr_analysis.py +0 -0
  33. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/search_lancedb.py +0 -0
  34. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/server.py +0 -0
  35. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/setup.cfg +0 -0
  36. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_assign_endpoint_client_extraction.py +0 -0
  37. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_ast_graph_build.py +0 -0
  38. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_ast_java_calls.py +0 -0
  39. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_ast_java_capabilities.py +0 -0
  40. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_bank_chat_brownfield_integration.py +0 -0
  41. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_brownfield_clients.py +0 -0
  42. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_brownfield_events.py +0 -0
  43. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_brownfield_overrides.py +0 -0
  44. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_brownfield_routes.py +0 -0
  45. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_call_edge_matching.py +0 -0
  46. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_call_edges_e2e.py +0 -0
  47. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_call_graph_receiver_resolution.py +0 -0
  48. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_call_graph_smoke_roundtrip.py +0 -0
  49. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_call_invariant.py +0 -0
  50. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_cli_progress_stdout_invariant.py +0 -0
  51. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_cli_quiet_parity.py +0 -0
  52. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_client_hint_recovery.py +0 -0
  53. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_client_node_extraction.py +0 -0
  54. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_client_role_rename.py +0 -0
  55. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_cross_service_resolution_flag.py +0 -0
  56. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_edge_navigation_doc.py +0 -0
  57. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_feign_not_exposer.py +0 -0
  58. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_graph_enrich.py +0 -0
  59. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_java_codebase_rag_cli.py +0 -0
  60. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_kuzu_queries.py +0 -0
  61. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_lancedb_e2e.py +0 -0
  62. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_mcp_hints.py +0 -0
  63. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_mcp_tools.py +0 -0
  64. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_mcp_v2.py +0 -0
  65. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_mcp_v2_compose.py +0 -0
  66. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_meta_chain_core.py +0 -0
  67. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_outgoing_call_extraction.py +0 -0
  68. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_path_filtering.py +0 -0
  69. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_pr_analysis.py +0 -0
  70. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_resolve_routes_messaging_layer_c.py +0 -0
  71. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_route_extraction.py +0 -0
  72. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_schema_consistency.py +0 -0
  73. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_search_lancedb.py +0 -0
  74. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_search_lancedb_capability.py +0 -0
  75. {java_codebase_rag-0.2.0 → java_codebase_rag-0.2.1}/tests/test_string_value_atoms.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: java-codebase-rag
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: MCP server for semantic + structural search over Java codebases
5
5
  Author: HumanBean17
6
6
  License-Expression: MIT
@@ -18,6 +18,7 @@ Classifier: Topic :: Software Development :: Libraries
18
18
  Requires-Python: >=3.11
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
+ Requires-Dist: cocoindex[lancedb]<2,>=1.0.0a43
21
22
  Requires-Dist: kuzu<0.12,>=0.11.3
22
23
  Requires-Dist: lancedb<0.31,>=0.25.3
23
24
  Requires-Dist: mcp<2,>=1.27.0
@@ -50,6 +51,24 @@ For the design rationale, the GPS metaphor, and the full ontology, see [`docs/pa
50
51
 
51
52
  ---
52
53
 
54
+ ## Why this exists
55
+
56
+ Generic code-search tools (grep, ctags, vector-only RAG) hit a ceiling on real Java microservice estates: they find files but lose the structure that makes a Spring/JAX-RS system navigable. This project is built around five choices that target that gap.
57
+
58
+ - **Hybrid RAG + GraphRAG, not either-or.** Semantic recall (LanceDB chunk vectors) and structural navigation (Kuzu property graph) are composed in one surface. `search` finds candidate nodes by meaning; `neighbors` walks the exact edge you care about (`CALLS`, `IMPLEMENTS`, `INJECTS`, `DECLARES_ROUTE`, …). The agent picks the right primitive per step instead of being forced into pure-vector or pure-symbol search.
59
+
60
+ - **A Java-tuned role model.** Symbols are labelled with stereotypes inferred from Spring and JAX-RS conventions — `CONTROLLER`, `SERVICE`, `REPOSITORY`, `CLIENT`, `PRODUCER`, `MAPPER`, `DTO`. Agents can ask "list controllers" or "who injects this repository" directly, instead of grep-ing for `@RestController` and hoping for the best. Roles drive both filtering (`find` with a `NodeFilter`) and ranking.
61
+
62
+ - **Ranking specialized for Java codebases.** The composite ranker is aware of role, microservice, and FQN structure — not a generic BM25. A search for `"chat ingress"` surfaces controllers before utility classes; a search scoped to one microservice doesn't drown in matches from the other 19. Defaults are tuned on the bank-chat fixture and exposed in `docs/CONFIGURATION.md` for per-repo overrides.
63
+
64
+ - **Cross-service resolution + system-level navigation.** `HTTP_CALLS` and `ASYNC_CALLS` edges connect Clients and Producers in one microservice to Routes and Handlers in another, resolved at index time from URL/topic strings + Spring `@FeignClient` / `RestTemplate` conventions. `/who-hits-route`, `/trace-request-flow`, and `/impact-of` use these to answer questions a single-service tool fundamentally can't — "who calls this REST endpoint from outside this service", "trace this Kafka message end-to-end", "if I change this DTO, which services break".
65
+
66
+ - **Brownfield annotations as a first-class override.** Real Java estates have hand-rolled HTTP clients, dynamic topic names, reflection-heavy routing. `@CodebaseHttpRoute`, `@CodebaseAsyncRoute`, `@CodebaseHttpClient`, and `@CodebaseProducer` let you pin the truth in source. They have **exclusive priority** — when a symbol is annotated, framework-convention inference is skipped entirely. You get a correct graph on legacy code without rewriting it.
67
+
68
+ The rest of this README is the install, walkthrough, and tool cheat sheet for putting that to work.
69
+
70
+ ---
71
+
53
72
  ## Install
54
73
 
55
74
  ```bash
@@ -57,6 +76,7 @@ pip install java-codebase-rag
57
76
  ```
58
77
 
59
78
  Python **3.11+** required. After install, `java-codebase-rag --help` should print the CLI groups.
79
+ The package includes the CocoIndex lifecycle dependency used by `init`, `increment`, `reprocess`, and `erase`.
60
80
 
61
81
  > **Stability disclaimer.** This package does **not** promise backward compatibility. MCP tool contracts, env vars, Lance/Kuzu schemas, config files, and Python APIs may change without a deprecation period. Track `main` and rebuild indexes when ontology or embedding settings change.
62
82
 
@@ -132,9 +152,9 @@ See [`mcp.json.example`](./mcp.json.example) for the same shape in `.mcp.json` (
132
152
 
133
153
  Pick **one** of two options (not both — they cover the same navigation intents):
134
154
 
135
- 1. **[`docs/AGENT-GUIDE.md`](./docs/AGENT-GUIDE.md)** (recommended for most) — standalone MCP operating manual. Copy-paste the `BEGIN`/`END` block into your project's `QWEN.md`, `CLAUDE.md`, or `AGENTS.md`. Contains: five-tool reference, `NodeFilter` / edge taxonomy, ontology glossary, recovery playbook, and inline slash-style aliases (`/callers`, `/callees`, `/routes`, etc.) as prompt templates. Self-contained — no external file dependencies.
155
+ 1. **[`docs/AGENT-GUIDE.md`](./docs/AGENT-GUIDE.md)** (recommended for most) — standalone MCP operating manual. Copy-paste the `BEGIN`/`END` block into your project's `QWEN.md`, `CLAUDE.md`, or `AGENTS.md`. Contains: five-tool reference, `NodeFilter` / edge taxonomy, ontology glossary, recovery playbook, and navigation patterns. Self-contained — no external file dependencies.
136
156
 
137
- 2. **[`skills/`](./skills/)** (for hosts with skill discovery) — 15 shipped `SKILL.md` files. If your MCP host supports skill discovery (Claude Code, Qwen Code, Cursor), the same navigation intents are available as discoverable `/` commands. Tier 1 = deterministic MCP chains (`/callers`, `/callees`, `/routes`, `/controllers`, `/clients`, `/producers`, `/handlers`, `/who-hits-route`, `/implements`, `/injects`, `/nl`). Tier 2 = bounded workflows (`/explain-feature`, `/impact-of`, `/trace-request-flow`, `/mini-map`). See [`skills/README.md`](./skills/README.md) for the full index.
157
+ 2. **[`/explore-codebase`](./skills/explore-codebase/SKILL.md)** (for hosts with skill discovery) — single self-contained skill with the complete operating manual. If your MCP host supports skill discovery (Claude Code, Qwen Code, Cursor), load `/explore-codebase` to get the full tool reference, edge taxonomy, decision tree, and recovery playbook in one shot.
138
158
 
139
159
  Also: **[`docs/MANUAL-VERIFICATION-CHECKLIST.md`](./docs/MANUAL-VERIFICATION-CHECKLIST.md)** — 7-phase agent-driven verification you run after indexing your real project.
140
160
 
@@ -154,7 +174,7 @@ Full schemas, `NodeFilter` / `EdgeFilter` semantics, and the hints contract live
154
174
 
155
175
  ### Three-layer architecture
156
176
 
157
- Layer 1 (storage) → Layer 2 (5 MCP tools) → Layer 3 (skills). Navigation skills in [`skills/`](./skills/) wrap the MCP tools into deterministic chains (Tier 1) and bounded workflows (Tier 2). See the [architecture diagram in `skills/README.md`](./skills/README.md#three-layer-architecture).
177
+ Layer 1 (storage) → Layer 2 (5 MCP tools) → Layer 3 (skill). The [`/explore-codebase`](./skills/explore-codebase/SKILL.md) skill provides the full operating manual for Layer 2. See the [architecture diagram in `skills/README.md`](./skills/README.md#three-layer-architecture).
158
178
 
159
179
  ---
160
180
 
@@ -197,7 +217,7 @@ Run `java-codebase-rag --help` to list grouped subcommands. Operator playbook wi
197
217
  | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) | Environment variables, project YAML, graph ontology, brownfield overrides, ignore patterns. |
198
218
  | [`docs/JAVA-CODEBASE-RAG-CLI.md`](./docs/JAVA-CODEBASE-RAG-CLI.md) | CLI operator playbook: workflows, exit codes, env alignment. |
199
219
  | [`docs/EDGE-NAVIGATION.md`](./docs/EDGE-NAVIGATION.md) | MCP-traversable edges, directions, dot-key composition. |
200
- | [`skills/`](./skills/) | 15 navigation and workflow skills for hosts with skill discovery (alternative to copy-pasting AGENT-GUIDE). See [`skills/README.md`](./skills/README.md). |
220
+ | [`skills/`](./skills/) | Single `/explore-codebase` skill — complete MCP operating manual for hosts with skill discovery (alternative to copy-pasting AGENT-GUIDE). See [`skills/README.md`](./skills/README.md). |
201
221
  | [`docs/MANUAL-VERIFICATION-CHECKLIST.md`](./docs/MANUAL-VERIFICATION-CHECKLIST.md) | 7-phase agent-driven verification after indexing your project. |
202
222
  | [`docs/CODEBASE_REQUIREMENTS.md`](./docs/CODEBASE_REQUIREMENTS.md) | Assumptions about your Java repo + per-file edit map for non-conforming codebases. |
203
223
  | [`automation/cursor_propose_only/README.md`](./automation/cursor_propose_only/README.md) | Optional proposal orchestration workflow (single-command autopilot, planning bundles, automated execution/review loops). |
@@ -214,7 +234,7 @@ python3 -m venv .venv
214
234
  .venv/bin/pip install -r requirements.txt
215
235
  ```
216
236
 
217
- The `cocoindex` package is **only** needed for lifecycle commands that run the indexer (`init`, `increment`, `reprocess`, `erase`). Search and MCP navigation work without it.
237
+ The `cocoindex` package powers lifecycle commands that run the indexer (`init`, `increment`, `reprocess`, `erase`). Search and MCP navigation do not invoke it directly.
218
238
 
219
239
  The default embedding model is `sentence-transformers/all-MiniLM-L6-v2` (downloaded on first `init`). Override via the `EMBEDDING_MODEL` env var — see [`docs/CONFIGURATION.md` §1](./docs/CONFIGURATION.md#1-environment-variables).
220
240
 
@@ -17,6 +17,24 @@ For the design rationale, the GPS metaphor, and the full ontology, see [`docs/pa
17
17
 
18
18
  ---
19
19
 
20
+ ## Why this exists
21
+
22
+ Generic code-search tools (grep, ctags, vector-only RAG) hit a ceiling on real Java microservice estates: they find files but lose the structure that makes a Spring/JAX-RS system navigable. This project is built around five choices that target that gap.
23
+
24
+ - **Hybrid RAG + GraphRAG, not either-or.** Semantic recall (LanceDB chunk vectors) and structural navigation (Kuzu property graph) are composed in one surface. `search` finds candidate nodes by meaning; `neighbors` walks the exact edge you care about (`CALLS`, `IMPLEMENTS`, `INJECTS`, `DECLARES_ROUTE`, …). The agent picks the right primitive per step instead of being forced into pure-vector or pure-symbol search.
25
+
26
+ - **A Java-tuned role model.** Symbols are labelled with stereotypes inferred from Spring and JAX-RS conventions — `CONTROLLER`, `SERVICE`, `REPOSITORY`, `CLIENT`, `PRODUCER`, `MAPPER`, `DTO`. Agents can ask "list controllers" or "who injects this repository" directly, instead of grep-ing for `@RestController` and hoping for the best. Roles drive both filtering (`find` with a `NodeFilter`) and ranking.
27
+
28
+ - **Ranking specialized for Java codebases.** The composite ranker is aware of role, microservice, and FQN structure — not a generic BM25. A search for `"chat ingress"` surfaces controllers before utility classes; a search scoped to one microservice doesn't drown in matches from the other 19. Defaults are tuned on the bank-chat fixture and exposed in `docs/CONFIGURATION.md` for per-repo overrides.
29
+
30
+ - **Cross-service resolution + system-level navigation.** `HTTP_CALLS` and `ASYNC_CALLS` edges connect Clients and Producers in one microservice to Routes and Handlers in another, resolved at index time from URL/topic strings + Spring `@FeignClient` / `RestTemplate` conventions. `/who-hits-route`, `/trace-request-flow`, and `/impact-of` use these to answer questions a single-service tool fundamentally can't — "who calls this REST endpoint from outside this service", "trace this Kafka message end-to-end", "if I change this DTO, which services break".
31
+
32
+ - **Brownfield annotations as a first-class override.** Real Java estates have hand-rolled HTTP clients, dynamic topic names, reflection-heavy routing. `@CodebaseHttpRoute`, `@CodebaseAsyncRoute`, `@CodebaseHttpClient`, and `@CodebaseProducer` let you pin the truth in source. They have **exclusive priority** — when a symbol is annotated, framework-convention inference is skipped entirely. You get a correct graph on legacy code without rewriting it.
33
+
34
+ The rest of this README is the install, walkthrough, and tool cheat sheet for putting that to work.
35
+
36
+ ---
37
+
20
38
  ## Install
21
39
 
22
40
  ```bash
@@ -24,6 +42,7 @@ pip install java-codebase-rag
24
42
  ```
25
43
 
26
44
  Python **3.11+** required. After install, `java-codebase-rag --help` should print the CLI groups.
45
+ The package includes the CocoIndex lifecycle dependency used by `init`, `increment`, `reprocess`, and `erase`.
27
46
 
28
47
  > **Stability disclaimer.** This package does **not** promise backward compatibility. MCP tool contracts, env vars, Lance/Kuzu schemas, config files, and Python APIs may change without a deprecation period. Track `main` and rebuild indexes when ontology or embedding settings change.
29
48
 
@@ -99,9 +118,9 @@ See [`mcp.json.example`](./mcp.json.example) for the same shape in `.mcp.json` (
99
118
 
100
119
  Pick **one** of two options (not both — they cover the same navigation intents):
101
120
 
102
- 1. **[`docs/AGENT-GUIDE.md`](./docs/AGENT-GUIDE.md)** (recommended for most) — standalone MCP operating manual. Copy-paste the `BEGIN`/`END` block into your project's `QWEN.md`, `CLAUDE.md`, or `AGENTS.md`. Contains: five-tool reference, `NodeFilter` / edge taxonomy, ontology glossary, recovery playbook, and inline slash-style aliases (`/callers`, `/callees`, `/routes`, etc.) as prompt templates. Self-contained — no external file dependencies.
121
+ 1. **[`docs/AGENT-GUIDE.md`](./docs/AGENT-GUIDE.md)** (recommended for most) — standalone MCP operating manual. Copy-paste the `BEGIN`/`END` block into your project's `QWEN.md`, `CLAUDE.md`, or `AGENTS.md`. Contains: five-tool reference, `NodeFilter` / edge taxonomy, ontology glossary, recovery playbook, and navigation patterns. Self-contained — no external file dependencies.
103
122
 
104
- 2. **[`skills/`](./skills/)** (for hosts with skill discovery) — 15 shipped `SKILL.md` files. If your MCP host supports skill discovery (Claude Code, Qwen Code, Cursor), the same navigation intents are available as discoverable `/` commands. Tier 1 = deterministic MCP chains (`/callers`, `/callees`, `/routes`, `/controllers`, `/clients`, `/producers`, `/handlers`, `/who-hits-route`, `/implements`, `/injects`, `/nl`). Tier 2 = bounded workflows (`/explain-feature`, `/impact-of`, `/trace-request-flow`, `/mini-map`). See [`skills/README.md`](./skills/README.md) for the full index.
123
+ 2. **[`/explore-codebase`](./skills/explore-codebase/SKILL.md)** (for hosts with skill discovery) — single self-contained skill with the complete operating manual. If your MCP host supports skill discovery (Claude Code, Qwen Code, Cursor), load `/explore-codebase` to get the full tool reference, edge taxonomy, decision tree, and recovery playbook in one shot.
105
124
 
106
125
  Also: **[`docs/MANUAL-VERIFICATION-CHECKLIST.md`](./docs/MANUAL-VERIFICATION-CHECKLIST.md)** — 7-phase agent-driven verification you run after indexing your real project.
107
126
 
@@ -121,7 +140,7 @@ Full schemas, `NodeFilter` / `EdgeFilter` semantics, and the hints contract live
121
140
 
122
141
  ### Three-layer architecture
123
142
 
124
- Layer 1 (storage) → Layer 2 (5 MCP tools) → Layer 3 (skills). Navigation skills in [`skills/`](./skills/) wrap the MCP tools into deterministic chains (Tier 1) and bounded workflows (Tier 2). See the [architecture diagram in `skills/README.md`](./skills/README.md#three-layer-architecture).
143
+ Layer 1 (storage) → Layer 2 (5 MCP tools) → Layer 3 (skill). The [`/explore-codebase`](./skills/explore-codebase/SKILL.md) skill provides the full operating manual for Layer 2. See the [architecture diagram in `skills/README.md`](./skills/README.md#three-layer-architecture).
125
144
 
126
145
  ---
127
146
 
@@ -164,7 +183,7 @@ Run `java-codebase-rag --help` to list grouped subcommands. Operator playbook wi
164
183
  | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) | Environment variables, project YAML, graph ontology, brownfield overrides, ignore patterns. |
165
184
  | [`docs/JAVA-CODEBASE-RAG-CLI.md`](./docs/JAVA-CODEBASE-RAG-CLI.md) | CLI operator playbook: workflows, exit codes, env alignment. |
166
185
  | [`docs/EDGE-NAVIGATION.md`](./docs/EDGE-NAVIGATION.md) | MCP-traversable edges, directions, dot-key composition. |
167
- | [`skills/`](./skills/) | 15 navigation and workflow skills for hosts with skill discovery (alternative to copy-pasting AGENT-GUIDE). See [`skills/README.md`](./skills/README.md). |
186
+ | [`skills/`](./skills/) | Single `/explore-codebase` skill — complete MCP operating manual for hosts with skill discovery (alternative to copy-pasting AGENT-GUIDE). See [`skills/README.md`](./skills/README.md). |
168
187
  | [`docs/MANUAL-VERIFICATION-CHECKLIST.md`](./docs/MANUAL-VERIFICATION-CHECKLIST.md) | 7-phase agent-driven verification after indexing your project. |
169
188
  | [`docs/CODEBASE_REQUIREMENTS.md`](./docs/CODEBASE_REQUIREMENTS.md) | Assumptions about your Java repo + per-file edit map for non-conforming codebases. |
170
189
  | [`automation/cursor_propose_only/README.md`](./automation/cursor_propose_only/README.md) | Optional proposal orchestration workflow (single-command autopilot, planning bundles, automated execution/review loops). |
@@ -181,7 +200,7 @@ python3 -m venv .venv
181
200
  .venv/bin/pip install -r requirements.txt
182
201
  ```
183
202
 
184
- The `cocoindex` package is **only** needed for lifecycle commands that run the indexer (`init`, `increment`, `reprocess`, `erase`). Search and MCP navigation work without it.
203
+ The `cocoindex` package powers lifecycle commands that run the indexer (`init`, `increment`, `reprocess`, `erase`). Search and MCP navigation do not invoke it directly.
185
204
 
186
205
  The default embedding model is `sentence-transformers/all-MiniLM-L6-v2` (downloaded on first `init`). Override via the `EMBEDDING_MODEL` env var — see [`docs/CONFIGURATION.md` §1](./docs/CONFIGURATION.md#1-environment-variables).
187
206
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: java-codebase-rag
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: MCP server for semantic + structural search over Java codebases
5
5
  Author: HumanBean17
6
6
  License-Expression: MIT
@@ -18,6 +18,7 @@ Classifier: Topic :: Software Development :: Libraries
18
18
  Requires-Python: >=3.11
19
19
  Description-Content-Type: text/markdown
20
20
  License-File: LICENSE
21
+ Requires-Dist: cocoindex[lancedb]<2,>=1.0.0a43
21
22
  Requires-Dist: kuzu<0.12,>=0.11.3
22
23
  Requires-Dist: lancedb<0.31,>=0.25.3
23
24
  Requires-Dist: mcp<2,>=1.27.0
@@ -50,6 +51,24 @@ For the design rationale, the GPS metaphor, and the full ontology, see [`docs/pa
50
51
 
51
52
  ---
52
53
 
54
+ ## Why this exists
55
+
56
+ Generic code-search tools (grep, ctags, vector-only RAG) hit a ceiling on real Java microservice estates: they find files but lose the structure that makes a Spring/JAX-RS system navigable. This project is built around five choices that target that gap.
57
+
58
+ - **Hybrid RAG + GraphRAG, not either-or.** Semantic recall (LanceDB chunk vectors) and structural navigation (Kuzu property graph) are composed in one surface. `search` finds candidate nodes by meaning; `neighbors` walks the exact edge you care about (`CALLS`, `IMPLEMENTS`, `INJECTS`, `DECLARES_ROUTE`, …). The agent picks the right primitive per step instead of being forced into pure-vector or pure-symbol search.
59
+
60
+ - **A Java-tuned role model.** Symbols are labelled with stereotypes inferred from Spring and JAX-RS conventions — `CONTROLLER`, `SERVICE`, `REPOSITORY`, `CLIENT`, `PRODUCER`, `MAPPER`, `DTO`. Agents can ask "list controllers" or "who injects this repository" directly, instead of grep-ing for `@RestController` and hoping for the best. Roles drive both filtering (`find` with a `NodeFilter`) and ranking.
61
+
62
+ - **Ranking specialized for Java codebases.** The composite ranker is aware of role, microservice, and FQN structure — not a generic BM25. A search for `"chat ingress"` surfaces controllers before utility classes; a search scoped to one microservice doesn't drown in matches from the other 19. Defaults are tuned on the bank-chat fixture and exposed in `docs/CONFIGURATION.md` for per-repo overrides.
63
+
64
+ - **Cross-service resolution + system-level navigation.** `HTTP_CALLS` and `ASYNC_CALLS` edges connect Clients and Producers in one microservice to Routes and Handlers in another, resolved at index time from URL/topic strings + Spring `@FeignClient` / `RestTemplate` conventions. `/who-hits-route`, `/trace-request-flow`, and `/impact-of` use these to answer questions a single-service tool fundamentally can't — "who calls this REST endpoint from outside this service", "trace this Kafka message end-to-end", "if I change this DTO, which services break".
65
+
66
+ - **Brownfield annotations as a first-class override.** Real Java estates have hand-rolled HTTP clients, dynamic topic names, reflection-heavy routing. `@CodebaseHttpRoute`, `@CodebaseAsyncRoute`, `@CodebaseHttpClient`, and `@CodebaseProducer` let you pin the truth in source. They have **exclusive priority** — when a symbol is annotated, framework-convention inference is skipped entirely. You get a correct graph on legacy code without rewriting it.
67
+
68
+ The rest of this README is the install, walkthrough, and tool cheat sheet for putting that to work.
69
+
70
+ ---
71
+
53
72
  ## Install
54
73
 
55
74
  ```bash
@@ -57,6 +76,7 @@ pip install java-codebase-rag
57
76
  ```
58
77
 
59
78
  Python **3.11+** required. After install, `java-codebase-rag --help` should print the CLI groups.
79
+ The package includes the CocoIndex lifecycle dependency used by `init`, `increment`, `reprocess`, and `erase`.
60
80
 
61
81
  > **Stability disclaimer.** This package does **not** promise backward compatibility. MCP tool contracts, env vars, Lance/Kuzu schemas, config files, and Python APIs may change without a deprecation period. Track `main` and rebuild indexes when ontology or embedding settings change.
62
82
 
@@ -132,9 +152,9 @@ See [`mcp.json.example`](./mcp.json.example) for the same shape in `.mcp.json` (
132
152
 
133
153
  Pick **one** of two options (not both — they cover the same navigation intents):
134
154
 
135
- 1. **[`docs/AGENT-GUIDE.md`](./docs/AGENT-GUIDE.md)** (recommended for most) — standalone MCP operating manual. Copy-paste the `BEGIN`/`END` block into your project's `QWEN.md`, `CLAUDE.md`, or `AGENTS.md`. Contains: five-tool reference, `NodeFilter` / edge taxonomy, ontology glossary, recovery playbook, and inline slash-style aliases (`/callers`, `/callees`, `/routes`, etc.) as prompt templates. Self-contained — no external file dependencies.
155
+ 1. **[`docs/AGENT-GUIDE.md`](./docs/AGENT-GUIDE.md)** (recommended for most) — standalone MCP operating manual. Copy-paste the `BEGIN`/`END` block into your project's `QWEN.md`, `CLAUDE.md`, or `AGENTS.md`. Contains: five-tool reference, `NodeFilter` / edge taxonomy, ontology glossary, recovery playbook, and navigation patterns. Self-contained — no external file dependencies.
136
156
 
137
- 2. **[`skills/`](./skills/)** (for hosts with skill discovery) — 15 shipped `SKILL.md` files. If your MCP host supports skill discovery (Claude Code, Qwen Code, Cursor), the same navigation intents are available as discoverable `/` commands. Tier 1 = deterministic MCP chains (`/callers`, `/callees`, `/routes`, `/controllers`, `/clients`, `/producers`, `/handlers`, `/who-hits-route`, `/implements`, `/injects`, `/nl`). Tier 2 = bounded workflows (`/explain-feature`, `/impact-of`, `/trace-request-flow`, `/mini-map`). See [`skills/README.md`](./skills/README.md) for the full index.
157
+ 2. **[`/explore-codebase`](./skills/explore-codebase/SKILL.md)** (for hosts with skill discovery) — single self-contained skill with the complete operating manual. If your MCP host supports skill discovery (Claude Code, Qwen Code, Cursor), load `/explore-codebase` to get the full tool reference, edge taxonomy, decision tree, and recovery playbook in one shot.
138
158
 
139
159
  Also: **[`docs/MANUAL-VERIFICATION-CHECKLIST.md`](./docs/MANUAL-VERIFICATION-CHECKLIST.md)** — 7-phase agent-driven verification you run after indexing your real project.
140
160
 
@@ -154,7 +174,7 @@ Full schemas, `NodeFilter` / `EdgeFilter` semantics, and the hints contract live
154
174
 
155
175
  ### Three-layer architecture
156
176
 
157
- Layer 1 (storage) → Layer 2 (5 MCP tools) → Layer 3 (skills). Navigation skills in [`skills/`](./skills/) wrap the MCP tools into deterministic chains (Tier 1) and bounded workflows (Tier 2). See the [architecture diagram in `skills/README.md`](./skills/README.md#three-layer-architecture).
177
+ Layer 1 (storage) → Layer 2 (5 MCP tools) → Layer 3 (skill). The [`/explore-codebase`](./skills/explore-codebase/SKILL.md) skill provides the full operating manual for Layer 2. See the [architecture diagram in `skills/README.md`](./skills/README.md#three-layer-architecture).
158
178
 
159
179
  ---
160
180
 
@@ -197,7 +217,7 @@ Run `java-codebase-rag --help` to list grouped subcommands. Operator playbook wi
197
217
  | [`docs/CONFIGURATION.md`](./docs/CONFIGURATION.md) | Environment variables, project YAML, graph ontology, brownfield overrides, ignore patterns. |
198
218
  | [`docs/JAVA-CODEBASE-RAG-CLI.md`](./docs/JAVA-CODEBASE-RAG-CLI.md) | CLI operator playbook: workflows, exit codes, env alignment. |
199
219
  | [`docs/EDGE-NAVIGATION.md`](./docs/EDGE-NAVIGATION.md) | MCP-traversable edges, directions, dot-key composition. |
200
- | [`skills/`](./skills/) | 15 navigation and workflow skills for hosts with skill discovery (alternative to copy-pasting AGENT-GUIDE). See [`skills/README.md`](./skills/README.md). |
220
+ | [`skills/`](./skills/) | Single `/explore-codebase` skill — the complete MCP operating manual for hosts with skill discovery (alternative to copy-pasting AGENT-GUIDE). See [`skills/README.md`](./skills/README.md). |
201
221
  | [`docs/MANUAL-VERIFICATION-CHECKLIST.md`](./docs/MANUAL-VERIFICATION-CHECKLIST.md) | 7-phase agent-driven verification after indexing your project. |
202
222
  | [`docs/CODEBASE_REQUIREMENTS.md`](./docs/CODEBASE_REQUIREMENTS.md) | Assumptions about your Java repo + per-file edit map for non-conforming codebases. |
203
223
  | [`automation/cursor_propose_only/README.md`](./automation/cursor_propose_only/README.md) | Optional proposal orchestration workflow (single-command autopilot, planning bundles, automated execution/review loops). |
@@ -214,7 +234,7 @@ python3 -m venv .venv
214
234
  .venv/bin/pip install -r requirements.txt
215
235
  ```
216
236
 
217
- The `cocoindex` package is **only** needed for lifecycle commands that run the indexer (`init`, `increment`, `reprocess`, `erase`). Search and MCP navigation work without it.
237
+ The `cocoindex` package powers lifecycle commands that run the indexer (`init`, `increment`, `reprocess`, `erase`). Search and MCP navigation do not invoke it directly.
218
238
 
219
239
  The default embedding model is `sentence-transformers/all-MiniLM-L6-v2` (downloaded on first `init`). Override via the `EMBEDDING_MODEL` env var — see [`docs/CONFIGURATION.md` §1](./docs/CONFIGURATION.md#1-environment-variables).
220
240
 
@@ -61,6 +61,7 @@ tests/test_mcp_v2.py
61
61
  tests/test_mcp_v2_compose.py
62
62
  tests/test_meta_chain_core.py
63
63
  tests/test_outgoing_call_extraction.py
64
+ tests/test_packaging_metadata.py
64
65
  tests/test_path_filtering.py
65
66
  tests/test_pr_analysis.py
66
67
  tests/test_resolve_routes_messaging_layer_c.py
@@ -1,3 +1,4 @@
1
+ cocoindex[lancedb]<2,>=1.0.0a43
1
2
  kuzu<0.12,>=0.11.3
2
3
  lancedb<0.31,>=0.25.3
3
4
  mcp<2,>=1.27.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "java-codebase-rag"
7
- version = "0.2.0"
7
+ version = "0.2.1"
8
8
  description = "MCP server for semantic + structural search over Java codebases"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.11"
@@ -23,6 +23,7 @@ classifiers = [
23
23
  "Topic :: Software Development :: Libraries",
24
24
  ]
25
25
  dependencies = [
26
+ "cocoindex[lancedb]>=1.0.0a43,<2",
26
27
  "kuzu>=0.11.3,<0.12",
27
28
  "lancedb>=0.25.3,<0.31",
28
29
  "mcp>=1.27.0,<2",
@@ -0,0 +1,251 @@
1
+ """Static validation for the skills/explore-codebase/SKILL.md file.
2
+
3
+ Imports the kind and edge-type allowlists from production code (java_ontology,
4
+ mcp_v2); tool names and directions are fixed literals in this file. Validates:
5
+ - frontmatter (name + description present)
6
+ - MCP tool names referenced in skill body
7
+ - find kind values
8
+ - direction values
9
+ - edge_types values
10
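+ - required body sections present (worked example, decision tree, recovery playbook, edge taxonomy, navigation patterns, reasoning preamble)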
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import re
16
+ from pathlib import Path
17
+ from typing import get_args
18
+
19
+ import pytest
20
+
21
+ from java_ontology import NodeKind
22
+ from mcp_v2 import ComposedEdgeType, EdgeType
23
+
24
+ # ---------------------------------------------------------------------------
25
+ # Allowlists sourced from production code
26
+ # ---------------------------------------------------------------------------
27
+
28
+ _VALID_TOOLS: frozenset[str] = frozenset(["search", "find", "describe", "neighbors", "resolve"])
29
+
30
+ _VALID_KINDS: frozenset[str] = frozenset(k.lower() for k in get_args(NodeKind))
31
+
32
+ _VALID_DIRECTIONS: frozenset[str] = frozenset(["in", "out"])
33
+
34
+ _ALL_EDGE_TYPES: frozenset[str] = frozenset(get_args(EdgeType)) | frozenset(get_args(ComposedEdgeType))
35
+
36
+ # ---------------------------------------------------------------------------
37
+ # Helpers
38
+ # ---------------------------------------------------------------------------
39
+
40
+ SKILLS_DIR = Path(__file__).resolve().parent.parent / "skills"
41
+ SKILL_NAME = "explore-codebase"
42
+ SKILL_PATH = SKILLS_DIR / SKILL_NAME / "SKILL.md"
43
+
44
+
45
+ def _parse_frontmatter(text: str) -> dict[str, str]:
46
+ """Parse simple YAML frontmatter (key: value pairs only)."""
47
+ m = re.match(r"^---\n(.*?)\n---", text, re.DOTALL)
48
+ if not m:
49
+ return {}
50
+ result: dict[str, str] = {}
51
+ for line in m.group(1).splitlines():
52
+ if ":" in line:
53
+ key, _, value = line.partition(":")
54
+ result[key.strip()] = value.strip()
55
+ return result
56
+
57
+
58
+ def _read_skill() -> tuple[dict[str, str], str]:
59
+ """Read the explore-codebase SKILL.md and return (frontmatter, body)."""
60
+ text = SKILL_PATH.read_text(encoding="utf-8")
61
+ fm = _parse_frontmatter(text)
62
+ body = re.sub(r"^---\n.*?\n---\n*", "", text, count=1, flags=re.DOTALL)
63
+ return fm, body
64
+
65
+
66
+ def _extract_tool_refs(body: str) -> set[str]:
67
+ """Extract tool names referenced in MCP call patterns."""
68
+ refs: set[str] = set()
69
+ for m in re.finditer(r"`(search|find|describe|neighbors|resolve)\b", body):
70
+ refs.add(m.group(1))
71
+ for m in re.finditer(r"\b(search|find|describe|neighbors|resolve)\s*[\(\{]", body):
72
+ refs.add(m.group(1))
73
+ return refs
74
+
75
+
76
+ def _extract_kind_refs(body: str) -> set[str]:
77
+ """Extract find kind values from skill body."""
78
+ refs: set[str] = set()
79
+ for m in re.finditer(r'kind\s*=\s*["\']?(\w+)["\']?', body):
80
+ val = m.group(1).lower()
81
+ if val in _VALID_KINDS:
82
+ refs.add(val)
83
+ return refs
84
+
85
+
86
+ def _extract_direction_refs(body: str) -> set[str]:
87
+ """Extract direction values from skill body."""
88
+ refs: set[str] = set()
89
+ for m in re.finditer(r'direction\s*:\s*["\']?(in|out)["\']?', body):
90
+ refs.add(m.group(1))
91
+ return refs
92
+
93
+
94
+ def _extract_edge_type_refs(body: str) -> set[str]:
95
+ """Extract edge_types values referenced in skill body."""
96
+ refs: set[str] = set()
97
+ for m in re.finditer(r'edge_types\s*:\s*\[([^\]]+)\]', body):
98
+ inner = m.group(1)
99
+ for val in re.findall(r'"(\w[\w.]*)"', inner):
100
+ if val in _ALL_EDGE_TYPES:
101
+ refs.add(val)
102
+ for m in re.finditer(r'\["(\w[\w.]*)"', body):
103
+ val = m.group(1)
104
+ if val in _ALL_EDGE_TYPES:
105
+ refs.add(val)
106
+ return refs
107
+
108
+
109
+ # ---------------------------------------------------------------------------
110
+ # Tests
111
+ # ---------------------------------------------------------------------------
112
+
113
+
114
+ class TestSkillFrontmatter:
115
+ """SKILL.md must have valid frontmatter."""
116
+
117
+ def test_skill_file_exists(self):
118
+ assert SKILL_PATH.is_file(), f"Missing {SKILL_PATH}"
119
+
120
+ def test_frontmatter_has_name_and_description(self):
121
+ fm, _ = _read_skill()
122
+ assert "name" in fm, "SKILL.md missing frontmatter 'name'"
123
+ assert fm["name"] == SKILL_NAME, f"name={fm['name']!r}, expected {SKILL_NAME!r}"
124
+ assert "description" in fm, "SKILL.md missing frontmatter 'description'"
125
+ assert len(fm["description"]) >= 20, (
126
+ f"description too short ({len(fm['description'])} chars)"
127
+ )
128
+
129
+
130
+ class TestMCPToolReferences:
131
+ """Tool names in skill body must be valid MCP navigation tools."""
132
+
133
+ def test_tool_refs_are_valid(self):
134
+ _, body = _read_skill()
135
+ refs = _extract_tool_refs(body)
136
+ invalid = refs - _VALID_TOOLS
137
+ assert not invalid, f"SKILL.md references invalid tools: {invalid}"
138
+
139
+ def test_skill_references_all_five_tools(self):
140
+ _, body = _read_skill()
141
+ refs = _extract_tool_refs(body)
142
+ missing = _VALID_TOOLS - refs
143
+ assert not missing, f"SKILL.md does not reference all 5 tools, missing: {missing}"
144
+
145
+
146
+ class TestKindAndEdgeReferences:
147
+ """Kind, direction, and edge_type values must match production allowlists."""
148
+
149
+ def test_kind_refs_are_valid(self):
150
+ _, body = _read_skill()
151
+ refs = _extract_kind_refs(body)
152
+ invalid = refs - _VALID_KINDS
153
+ assert not invalid, f"SKILL.md references invalid find kinds: {invalid}"
154
+
155
+ def test_direction_refs_are_valid(self):
156
+ _, body = _read_skill()
157
+ refs = _extract_direction_refs(body)
158
+ invalid = refs - _VALID_DIRECTIONS
159
+ assert not invalid, f"SKILL.md references invalid directions: {invalid}"
160
+
161
+ def test_edge_type_refs_are_valid(self):
162
+ _, body = _read_skill()
163
+ refs = _extract_edge_type_refs(body)
164
+ invalid = refs - _ALL_EDGE_TYPES
165
+ assert not invalid, f"SKILL.md references invalid edge_types: {invalid}"
166
+
167
+
168
+ class TestBodyStructure:
169
+ """Skill body must contain key sections."""
170
+
171
+ def test_has_worked_example(self):
172
+ _, body = _read_skill()
173
+ assert "## Worked example" in body, "SKILL.md missing '## Worked example'"
174
+
175
+ def test_has_decision_tree(self):
176
+ _, body = _read_skill()
177
+ assert "## Decision tree" in body, "SKILL.md missing '## Decision tree'"
178
+
179
+ def test_has_recovery_playbook(self):
180
+ _, body = _read_skill()
181
+ assert "## Recovery playbook" in body, "SKILL.md missing '## Recovery playbook'"
182
+
183
+ def test_has_edge_taxonomy(self):
184
+ _, body = _read_skill()
185
+ assert "## Edge taxonomy" in body, "SKILL.md missing '## Edge taxonomy'"
186
+
187
+ def test_has_navigation_patterns(self):
188
+ _, body = _read_skill()
189
+ assert "## Common navigation patterns" in body, "SKILL.md missing '## Common navigation patterns'"
190
+
191
+ def test_has_reasoning_preamble(self):
192
+ _, body = _read_skill()
193
+ assert "## Forced reasoning preamble" in body, "SKILL.md missing '## Forced reasoning preamble'"
194
+
195
+
196
+ class TestDirectoryIntegrity:
197
+ """skills/ must have expected structure."""
198
+
199
+ def test_skill_dir_exists(self):
200
+ assert (SKILLS_DIR / SKILL_NAME).is_dir(), f"skills/{SKILL_NAME}/ missing"
201
+
202
+ def test_no_tier_dirs(self):
203
+ """Old tier-1/ and tier-2/ directories must not exist."""
204
+ for tier in ("tier-1", "tier-2"):
205
+ assert not (SKILLS_DIR / tier).is_dir(), f"Old skills/{tier}/ still exists — remove it"
206
+
207
+ def test_readme_exists(self):
208
+ assert (SKILLS_DIR / "README.md").is_file(), "skills/README.md missing"
209
+
210
+ def test_no_other_skill_dirs(self):
211
+ """Only explore-codebase/ should exist as a skill directory."""
212
+ skill_dirs = {
213
+ p.name for p in SKILLS_DIR.iterdir()
214
+ if p.is_dir() and (p / "SKILL.md").exists()
215
+ }
216
+ assert skill_dirs == {SKILL_NAME}, (
217
+ f"Expected only skills/{SKILL_NAME}/, found: {skill_dirs}"
218
+ )
219
+
220
+
221
+ class TestAgentGuideConsistency:
222
+ """AGENT-GUIDE.md copy-paste block must be self-contained."""
223
+
224
+ def test_guide_has_navigation_patterns_table(self):
225
+ """The copy-paste block must include a navigation patterns section."""
226
+ guide = Path(__file__).resolve().parent.parent / "docs" / "AGENT-GUIDE.md"
227
+ text = guide.read_text(encoding="utf-8")
228
+ begin = text.find("<!-- BEGIN java-codebase-rag MCP guide -->")
229
+ end = text.find("<!-- END java-codebase-rag MCP guide -->")
230
+ assert begin != -1 and end != -1, "AGENT-GUIDE.md missing BEGIN/END markers"
231
+ block = text[begin:end]
232
+ assert "### Common navigation patterns" in block, (
233
+ "AGENT-GUIDE.md copy-paste block missing '### Common navigation patterns'"
234
+ )
235
+ for pattern in ["CALLS", "EXPOSES", "IMPLEMENTS", "INJECTS"]:
236
+ assert pattern in block, f"AGENT-GUIDE.md copy-paste block missing {pattern} pattern"
237
+
238
+ def test_guide_copy_block_does_not_reference_skills_dir(self):
239
+ """The copy-paste block must not reference skills/ — it won't exist
240
+ in the consumer's project."""
241
+ guide = Path(__file__).resolve().parent.parent / "docs" / "AGENT-GUIDE.md"
242
+ text = guide.read_text(encoding="utf-8")
243
+ begin = text.find("<!-- BEGIN java-codebase-rag MCP guide -->")
244
+ end = text.find("<!-- END java-codebase-rag MCP guide -->")
245
+ assert begin != -1 and end != -1, "AGENT-GUIDE.md missing BEGIN/END markers"
246
+ block = text[begin:end]
247
+ assert "skills/" not in block, (
248
+ "AGENT-GUIDE.md copy-paste block references skills/ — "
249
+ "this path won't resolve in a consumer project. "
250
+ "Keep skills/ references outside the copy-paste block."
251
+ )
@@ -0,0 +1,14 @@
1
+ from __future__ import annotations
2
+
3
+ import tomllib
4
+ from pathlib import Path
5
+
6
+
7
+ def test_published_package_installs_cocoindex_for_lifecycle_commands() -> None:
8
+ data = tomllib.loads((Path(__file__).resolve().parents[1] / "pyproject.toml").read_text())
9
+ deps = data["project"]["dependencies"]
10
+
11
+ cocoindex_deps = [dep for dep in deps if dep.startswith("cocoindex")]
12
+
13
+ assert cocoindex_deps
14
+ assert any("[lancedb]" in dep for dep in cocoindex_deps)
@@ -1,318 +0,0 @@
1
- """Static validation for skills/ directory SKILL.md files.
2
-
3
- Imports allowlists from production code (mcp_v2, java_ontology) — not
4
- hand-maintained lists. Validates:
5
- - frontmatter (name + description present)
6
- - MCP tool names referenced in skill bodies
7
- - find kind values
8
- - direction values
9
- - edge_types values
10
- - Tier 2 body structure (stop conditions, recursion limit)
11
-
12
- Known gap (intentional — see AGENT-SKILLS-AND-COMMANDS-PROPOSE §11):
13
- - edge_filter parameters (callee_declaring_role, min_confidence,
14
- exclude_callee_declaring_roles, dedup_calls, include_unresolved)
15
- referenced in /mini-map are NOT validated against mcp_v2 parameter
16
- definitions. The static validator does not parse edge_filter dicts.
17
- On re-index, manually verify /mini-map against the MCP surface.
18
- """
19
-
20
- from __future__ import annotations
21
-
22
- import re
23
- from pathlib import Path
24
- from typing import get_args
25
-
26
- import pytest
27
-
28
- from java_ontology import NodeKind
29
- from mcp_v2 import ComposedEdgeType, EdgeType
30
-
31
- # ---------------------------------------------------------------------------
32
- # Allowlists sourced from production code
33
- # ---------------------------------------------------------------------------
34
-
35
- _VALID_TOOLS: frozenset[str] = frozenset(["search", "find", "describe", "neighbors", "resolve"])
36
-
37
- _VALID_KINDS: frozenset[str] = frozenset(k.lower() for k in get_args(NodeKind))
38
-
39
- _VALID_DIRECTIONS: frozenset[str] = frozenset(["in", "out"])
40
-
41
- _ALL_EDGE_TYPES: frozenset[str] = frozenset(get_args(EdgeType)) | frozenset(get_args(ComposedEdgeType))
42
-
43
- # ---------------------------------------------------------------------------
44
- # Helpers
45
- # ---------------------------------------------------------------------------
46
-
47
- SKILLS_DIR = Path(__file__).resolve().parent.parent / "skills"
48
-
49
- TIER1_NAMES = [
50
- "nl", "controllers", "routes", "clients", "producers",
51
- "callers", "callees", "handlers", "who-hits-route",
52
- "implements", "injects",
53
- ]
54
-
55
- TIER2_NAMES = [
56
- "explain-feature", "impact-of", "trace-request-flow", "mini-map",
57
- ]
58
-
59
- ALL_SKILL_NAMES = TIER1_NAMES + TIER2_NAMES
60
-
61
-
62
- def _parse_frontmatter(text: str) -> dict[str, str]:
63
- """Parse simple YAML frontmatter (key: value pairs only)."""
64
- m = re.match(r"^---\n(.*?)\n---", text, re.DOTALL)
65
- if not m:
66
- return {}
67
- result: dict[str, str] = {}
68
- for line in m.group(1).splitlines():
69
- if ":" in line:
70
- key, _, value = line.partition(":")
71
- result[key.strip()] = value.strip()
72
- return result
73
-
74
-
75
- def _extract_tool_refs(body: str) -> set[str]:
76
- """Extract tool names referenced in MCP call patterns."""
77
- # Match patterns like `search(...)`, `find(kind=...)`, `describe(id=...)`,
78
- # `neighbors({ids:`, `resolve(identifier=`, also backtick-wrapped names.
79
- refs: set[str] = set()
80
- for m in re.finditer(r"`(search|find|describe|neighbors|resolve)\b", body):
81
- refs.add(m.group(1))
82
- # Also catch patterns like search(query=...) find(kind=...) without backticks
83
- for m in re.finditer(r"\b(search|find|describe|neighbors|resolve)\s*[\(\{]", body):
84
- refs.add(m.group(1))
85
- return refs
86
-
87
-
88
- def _extract_kind_refs(body: str) -> set[str]:
89
- """Extract find kind values from skill body."""
90
- refs: set[str] = set()
91
- for m in re.finditer(r'kind\s*=\s*["\']?(\w+)["\']?', body):
92
- val = m.group(1).lower()
93
- if val in _VALID_KINDS:
94
- refs.add(val)
95
- return refs
96
-
97
-
98
- def _extract_direction_refs(body: str) -> set[str]:
99
- """Extract direction values from skill body."""
100
- refs: set[str] = set()
101
- for m in re.finditer(r'direction\s*:\s*["\']?(in|out)["\']?', body):
102
- refs.add(m.group(1))
103
- return refs
104
-
105
-
106
- def _extract_edge_type_refs(body: str) -> set[str]:
107
- """Extract edge_types values referenced in skill body."""
108
- refs: set[str] = set()
109
- # Match edge_types lists: ["CALLS"] or ["HTTP_CALLS","ASYNC_CALLS","EXPOSES"]
110
- for m in re.finditer(r'edge_types\s*:\s*\[([^\]]+)\]', body):
111
- inner = m.group(1)
112
- for val in re.findall(r'"(\w[\w.]*)"', inner):
113
- if val in _ALL_EDGE_TYPES:
114
- refs.add(val)
115
- # Also match quoted edge names in backticked patterns
116
- for m in re.finditer(r'\["(\w[\w.]*)"', body):
117
- val = m.group(1)
118
- if val in _ALL_EDGE_TYPES:
119
- refs.add(val)
120
- return refs
121
-
122
-
123
- def _read_skill(name: str) -> tuple[dict[str, str], str]:
124
- """Read a skill's SKILL.md and return (frontmatter, body)."""
125
- path = SKILLS_DIR / name / "SKILL.md"
126
- text = path.read_text(encoding="utf-8")
127
- fm = _parse_frontmatter(text)
128
- # Body is everything after the closing ---
129
- body = re.sub(r"^---\n.*?\n---\n*", "", text, count=1, flags=re.DOTALL)
130
- return fm, body
131
-
132
-
133
- # ---------------------------------------------------------------------------
134
- # Parametrized test ids
135
- # ---------------------------------------------------------------------------
136
-
137
- @pytest.fixture(params=ALL_SKILL_NAMES, ids=lambda n: f"skill:{n}")
138
- def skill_name(request):
139
- return request.param
140
-
141
-
142
- # ---------------------------------------------------------------------------
143
- # Tests
144
- # ---------------------------------------------------------------------------
145
-
146
-
147
- class TestSkillFrontmatter:
148
- """Every SKILL.md must have valid frontmatter."""
149
-
150
- @pytest.mark.parametrize("name", ALL_SKILL_NAMES)
151
- def test_frontmatter_has_name_and_description(self, name: str):
152
- fm, _ = _read_skill(name)
153
- assert "name" in fm, f"skills/{name}/SKILL.md missing frontmatter 'name'"
154
- assert fm["name"] == name, f"skills/{name}/SKILL.md: name={fm['name']!r}, expected {name!r}"
155
- assert "description" in fm, f"skills/{name}/SKILL.md missing frontmatter 'description'"
156
- assert len(fm["description"]) >= 20, (
157
- f"skills/{name}/SKILL.md description too short ({len(fm['description'])} chars)"
158
- )
159
-
160
- @pytest.mark.parametrize("name", ALL_SKILL_NAMES)
161
- def test_skill_file_exists(self, name: str):
162
- path = SKILLS_DIR / name / "SKILL.md"
163
- assert path.is_file(), f"Missing skills/{name}/SKILL.md"
164
-
165
-
166
- class TestMCPToolReferences:
167
- """Tool names in skill bodies must be valid MCP navigation tools."""
168
-
169
- @pytest.mark.parametrize("name", ALL_SKILL_NAMES)
170
- def test_tool_refs_are_valid(self, name: str):
171
- _, body = _read_skill(name)
172
- refs = _extract_tool_refs(body)
173
- invalid = refs - _VALID_TOOLS
174
- assert not invalid, f"skills/{name}/SKILL.md references invalid tools: {invalid}"
175
-
176
- @pytest.mark.parametrize("name", ALL_SKILL_NAMES)
177
- def test_skill_references_at_least_one_tool(self, name: str):
178
- _, body = _read_skill(name)
179
- refs = _extract_tool_refs(body)
180
- assert refs, f"skills/{name}/SKILL.md references no MCP tools"
181
-
182
-
183
- class TestKindAndEdgeReferences:
184
- """Kind, direction, and edge_type values must match production allowlists."""
185
-
186
- @pytest.mark.parametrize("name", ALL_SKILL_NAMES)
187
- def test_kind_refs_are_valid(self, name: str):
188
- _, body = _read_skill(name)
189
- refs = _extract_kind_refs(body)
190
- invalid = refs - _VALID_KINDS
191
- assert not invalid, f"skills/{name}/SKILL.md references invalid find kinds: {invalid}"
192
-
193
- @pytest.mark.parametrize("name", ALL_SKILL_NAMES)
194
- def test_direction_refs_are_valid(self, name: str):
195
- _, body = _read_skill(name)
196
- refs = _extract_direction_refs(body)
197
- invalid = refs - _VALID_DIRECTIONS
198
- assert not invalid, f"skills/{name}/SKILL.md references invalid directions: {invalid}"
199
-
200
- @pytest.mark.parametrize("name", ALL_SKILL_NAMES)
201
- def test_edge_type_refs_are_valid(self, name: str):
202
- _, body = _read_skill(name)
203
- refs = _extract_edge_type_refs(body)
204
- invalid = refs - _ALL_EDGE_TYPES
205
- assert not invalid, f"skills/{name}/SKILL.md references invalid edge_types: {invalid}"
206
-
207
-
208
- class TestTier2BodyStructure:
209
- """Tier 2 skills must have stop conditions and recursion limits."""
210
-
211
- @pytest.mark.parametrize("name", TIER2_NAMES)
212
- def test_has_stop_conditions(self, name: str):
213
- _, body = _read_skill(name)
214
- assert "## Stop conditions" in body, f"skills/{name}/SKILL.md missing '## Stop conditions'"
215
-
216
- @pytest.mark.parametrize("name", TIER2_NAMES)
217
- def test_has_recursion_limit(self, name: str):
218
- _, body = _read_skill(name)
219
- assert "## Recursion limit" in body, f"skills/{name}/SKILL.md missing '## Recursion limit'"
220
-
221
- def test_mini_map_has_classification_rules(self):
222
- _, body = _read_skill("mini-map")
223
- assert "### Step 4 — Skill heuristics" in body or "Classification" in body, (
224
- "skills/mini-map/SKILL.md missing classification rules"
225
- )
226
-
227
- def test_mini_map_has_output_shape(self):
228
- _, body = _read_skill("mini-map")
229
- assert "PERSISTS" in body and "DELEGATES" in body, (
230
- "skills/mini-map/SKILL.md missing output shape (PERSISTS/DELEGATES labels)"
231
- )
232
-
233
-
234
- class TestWorkedExamples:
235
- """Every skill must have a worked example section."""
236
-
237
- @pytest.mark.parametrize("name", ALL_SKILL_NAMES)
238
- def test_has_worked_example(self, name: str):
239
- _, body = _read_skill(name)
240
- assert "## Worked example" in body, f"skills/{name}/SKILL.md missing '## Worked example'"
241
-
242
-
243
- class TestDirectoryIntegrity:
244
- """skills/ directory must contain exactly the expected skills."""
245
-
246
- def test_no_extra_skill_dirs(self):
247
- actual = {p.name for p in SKILLS_DIR.iterdir() if p.is_dir() and (p / "SKILL.md").exists()}
248
- expected = set(ALL_SKILL_NAMES)
249
- extra = actual - expected
250
- assert not extra, f"Unexpected skill directories: {extra}"
251
-
252
- def test_no_missing_skill_dirs(self):
253
- actual = {p.name for p in SKILLS_DIR.iterdir() if p.is_dir() and (p / "SKILL.md").exists()}
254
- expected = set(ALL_SKILL_NAMES)
255
- missing = expected - actual
256
- assert not missing, f"Missing skill directories: {missing}"
257
-
258
- def test_readme_exists(self):
259
- assert (SKILLS_DIR / "README.md").is_file(), "skills/README.md missing"
260
-
261
-
262
- class TestAgentGuideConsistency:
263
- """AGENT-GUIDE.md copy-paste block must be self-contained."""
264
-
265
- def test_guide_has_navigation_patterns_table(self):
266
- """The copy-paste block must include a navigation patterns section
267
- (it's standalone — no external file references work in a consumer project)."""
268
- guide = Path(__file__).resolve().parent.parent / "docs" / "AGENT-GUIDE.md"
269
- text = guide.read_text(encoding="utf-8")
270
- # Extract the copy-paste block (marker on its own line)
271
- begin = text.find("<!-- BEGIN java-codebase-rag MCP guide -->")
272
- end = text.find("<!-- END java-codebase-rag MCP guide -->")
273
- assert begin != -1 and end != -1, "AGENT-GUIDE.md missing BEGIN/END markers"
274
- block = text[begin:end]
275
- assert "### Common navigation patterns" in block, (
276
- "AGENT-GUIDE.md copy-paste block missing '### Common navigation patterns'"
277
- )
278
- # Verify key patterns are present
279
- for pattern in ["CALLS", "EXPOSES", "IMPLEMENTS", "INJECTS"]:
280
- assert pattern in block, f"AGENT-GUIDE.md copy-paste block missing {pattern} pattern"
281
-
282
- def test_guide_copy_block_does_not_reference_skills_dir(self):
283
- """The copy-paste block must not reference skills/ — it won't exist
284
- in the consumer's project."""
285
- guide = Path(__file__).resolve().parent.parent / "docs" / "AGENT-GUIDE.md"
286
- text = guide.read_text(encoding="utf-8")
287
- begin = text.find("<!-- BEGIN java-codebase-rag MCP guide -->")
288
- end = text.find("<!-- END java-codebase-rag MCP guide -->")
289
- assert begin != -1 and end != -1, "AGENT-GUIDE.md missing BEGIN/END markers"
290
- block = text[begin:end]
291
- assert "skills/" not in block, (
292
- "AGENT-GUIDE.md copy-paste block references skills/ — "
293
- "this path won't resolve in a consumer project. "
294
- "Keep skills/ references outside the copy-paste block."
295
- )
296
-
297
- def test_guide_copy_block_has_no_slash_command_aliases(self):
298
- """The copy-paste block must not contain slash-command alias bullets
299
- like `/nl <text>` → ... — these imply commands that don't exist
300
- and will mislead the agent. Incidental mentions (e.g. cross-references
301
- in prose) are fine."""
302
- guide = Path(__file__).resolve().parent.parent / "docs" / "AGENT-GUIDE.md"
303
- text = guide.read_text(encoding="utf-8")
304
- begin = text.find("<!-- BEGIN java-codebase-rag MCP guide -->")
305
- end = text.find("<!-- END java-codebase-rag MCP guide -->")
306
- block = text[begin:end]
307
- # Match alias definition lines: - `/skillname ...` → tool(...)
308
- skill_names_pattern = "|".join(re.escape(n) for n in ALL_SKILL_NAMES)
309
- alias_pattern = re.compile(
310
- rf"^- `/(?:{skill_names_pattern})\s",
311
- re.MULTILINE,
312
- )
313
- matches = alias_pattern.findall(block)
314
- assert not matches, (
315
- f"AGENT-GUIDE.md copy-paste block contains slash-command alias bullets: "
316
- f"{alias_pattern.findall(block)}. "
317
- "These are not real commands and will mislead the agent."
318
- )