codespine 1.0.13__tar.gz → 1.0.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. {codespine-1.0.13 → codespine-1.0.14}/PKG-INFO +277 -8
  2. {codespine-1.0.13 → codespine-1.0.14}/README.md +276 -7
  3. {codespine-1.0.13 → codespine-1.0.14}/codespine/__init__.py +1 -1
  4. {codespine-1.0.13 → codespine-1.0.14}/codespine/analysis/community.py +22 -10
  5. codespine-1.0.14/codespine/analysis/context.py +72 -0
  6. {codespine-1.0.13 → codespine-1.0.14}/codespine/analysis/coupling.py +1 -1
  7. {codespine-1.0.13 → codespine-1.0.14}/codespine/analysis/flow.py +65 -28
  8. {codespine-1.0.13 → codespine-1.0.14}/codespine/analysis/impact.py +71 -30
  9. {codespine-1.0.13 → codespine-1.0.14}/codespine/cli.py +86 -2
  10. codespine-1.0.14/codespine/graphrag.py +1336 -0
  11. {codespine-1.0.13 → codespine-1.0.14}/codespine/guide.py +13 -3
  12. {codespine-1.0.13 → codespine-1.0.14}/codespine/health.py +68 -6
  13. {codespine-1.0.13 → codespine-1.0.14}/codespine/mcp/server.py +121 -24
  14. {codespine-1.0.13 → codespine-1.0.14}/codespine/overlay/merge.py +56 -9
  15. {codespine-1.0.13 → codespine-1.0.14}/codespine/search/hybrid.py +180 -25
  16. {codespine-1.0.13 → codespine-1.0.14}/codespine.egg-info/PKG-INFO +277 -8
  17. {codespine-1.0.13 → codespine-1.0.14}/codespine.egg-info/SOURCES.txt +6 -1
  18. {codespine-1.0.13 → codespine-1.0.14}/pyproject.toml +1 -1
  19. codespine-1.0.14/tests/test_coupling.py +25 -0
  20. codespine-1.0.14/tests/test_graphrag.py +778 -0
  21. codespine-1.0.14/tests/test_health.py +193 -0
  22. codespine-1.0.14/tests/test_hybrid_search.py +441 -0
  23. codespine-1.0.14/tests/test_mcp_server_cache.py +77 -0
  24. codespine-1.0.14/tests/test_overlay.py +548 -0
  25. codespine-1.0.14/tests/test_workflow_golden.py +277 -0
  26. codespine-1.0.13/codespine/analysis/context.py +0 -24
  27. codespine-1.0.13/tests/test_health.py +0 -81
  28. codespine-1.0.13/tests/test_hybrid_search.py +0 -89
  29. codespine-1.0.13/tests/test_overlay.py +0 -232
  30. {codespine-1.0.13 → codespine-1.0.14}/LICENSE +0 -0
  31. {codespine-1.0.13 → codespine-1.0.14}/codespine/analysis/__init__.py +0 -0
  32. {codespine-1.0.13 → codespine-1.0.14}/codespine/analysis/crossmodule.py +0 -0
  33. {codespine-1.0.13 → codespine-1.0.14}/codespine/analysis/deadcode.py +0 -0
  34. {codespine-1.0.13 → codespine-1.0.14}/codespine/cache/__init__.py +0 -0
  35. {codespine-1.0.13 → codespine-1.0.14}/codespine/cache/result_cache.py +0 -0
  36. {codespine-1.0.13 → codespine-1.0.14}/codespine/config.py +0 -0
  37. {codespine-1.0.13 → codespine-1.0.14}/codespine/db/__init__.py +0 -0
  38. {codespine-1.0.13 → codespine-1.0.14}/codespine/db/_cypher_compat.py +0 -0
  39. {codespine-1.0.13 → codespine-1.0.14}/codespine/db/duckdb_store.py +0 -0
  40. {codespine-1.0.13 → codespine-1.0.14}/codespine/db/schema.py +0 -0
  41. {codespine-1.0.13 → codespine-1.0.14}/codespine/db/store.py +0 -0
  42. {codespine-1.0.13 → codespine-1.0.14}/codespine/diff/__init__.py +0 -0
  43. {codespine-1.0.13 → codespine-1.0.14}/codespine/diff/branch_diff.py +0 -0
  44. {codespine-1.0.13 → codespine-1.0.14}/codespine/indexer/__init__.py +0 -0
  45. {codespine-1.0.13 → codespine-1.0.14}/codespine/indexer/call_resolver.py +0 -0
  46. {codespine-1.0.13 → codespine-1.0.14}/codespine/indexer/di_resolver.py +0 -0
  47. {codespine-1.0.13 → codespine-1.0.14}/codespine/indexer/engine.py +0 -0
  48. {codespine-1.0.13 → codespine-1.0.14}/codespine/indexer/java_parser.py +0 -0
  49. {codespine-1.0.13 → codespine-1.0.14}/codespine/indexer/symbol_builder.py +0 -0
  50. {codespine-1.0.13 → codespine-1.0.14}/codespine/mcp/__init__.py +0 -0
  51. {codespine-1.0.13 → codespine-1.0.14}/codespine/noise/__init__.py +0 -0
  52. {codespine-1.0.13 → codespine-1.0.14}/codespine/noise/blocklist.py +0 -0
  53. {codespine-1.0.13 → codespine-1.0.14}/codespine/overlay/__init__.py +0 -0
  54. {codespine-1.0.13 → codespine-1.0.14}/codespine/overlay/git_state.py +0 -0
  55. {codespine-1.0.13 → codespine-1.0.14}/codespine/overlay/store.py +0 -0
  56. {codespine-1.0.13 → codespine-1.0.14}/codespine/project_state.py +0 -0
  57. {codespine-1.0.13 → codespine-1.0.14}/codespine/search/__init__.py +0 -0
  58. {codespine-1.0.13 → codespine-1.0.14}/codespine/search/bm25.py +0 -0
  59. {codespine-1.0.13 → codespine-1.0.14}/codespine/search/fuzzy.py +0 -0
  60. {codespine-1.0.13 → codespine-1.0.14}/codespine/search/rrf.py +0 -0
  61. {codespine-1.0.13 → codespine-1.0.14}/codespine/search/vector.py +0 -0
  62. {codespine-1.0.13 → codespine-1.0.14}/codespine/sharding/__init__.py +0 -0
  63. {codespine-1.0.13 → codespine-1.0.14}/codespine/sharding/router.py +0 -0
  64. {codespine-1.0.13 → codespine-1.0.14}/codespine/sharding/store.py +0 -0
  65. {codespine-1.0.13 → codespine-1.0.14}/codespine/tasks.py +0 -0
  66. {codespine-1.0.13 → codespine-1.0.14}/codespine/watch/__init__.py +0 -0
  67. {codespine-1.0.13 → codespine-1.0.14}/codespine/watch/git_hook.py +0 -0
  68. {codespine-1.0.13 → codespine-1.0.14}/codespine/watch/watcher.py +0 -0
  69. {codespine-1.0.13 → codespine-1.0.14}/codespine.egg-info/dependency_links.txt +0 -0
  70. {codespine-1.0.13 → codespine-1.0.14}/codespine.egg-info/entry_points.txt +0 -0
  71. {codespine-1.0.13 → codespine-1.0.14}/codespine.egg-info/requires.txt +0 -0
  72. {codespine-1.0.13 → codespine-1.0.14}/codespine.egg-info/top_level.txt +0 -0
  73. {codespine-1.0.13 → codespine-1.0.14}/gindex.py +0 -0
  74. {codespine-1.0.13 → codespine-1.0.14}/setup.cfg +0 -0
  75. {codespine-1.0.13 → codespine-1.0.14}/tests/test_branch_diff_normalize.py +0 -0
  76. {codespine-1.0.13 → codespine-1.0.14}/tests/test_call_resolver.py +0 -0
  77. {codespine-1.0.13 → codespine-1.0.14}/tests/test_community_detection.py +0 -0
  78. {codespine-1.0.13 → codespine-1.0.14}/tests/test_cypher_compat.py +0 -0
  79. {codespine-1.0.13 → codespine-1.0.14}/tests/test_deadcode.py +0 -0
  80. {codespine-1.0.13 → codespine-1.0.14}/tests/test_duckdb_store.py +0 -0
  81. {codespine-1.0.13 → codespine-1.0.14}/tests/test_index_and_hybrid.py +0 -0
  82. {codespine-1.0.13 → codespine-1.0.14}/tests/test_java_parser.py +0 -0
  83. {codespine-1.0.13 → codespine-1.0.14}/tests/test_multimodule_index.py +0 -0
  84. {codespine-1.0.13 → codespine-1.0.14}/tests/test_parse_resilience.py +0 -0
  85. {codespine-1.0.13 → codespine-1.0.14}/tests/test_result_cache.py +0 -0
  86. {codespine-1.0.13 → codespine-1.0.14}/tests/test_search_ranking.py +0 -0
  87. {codespine-1.0.13 → codespine-1.0.14}/tests/test_sharding.py +0 -0
  88. {codespine-1.0.13 → codespine-1.0.14}/tests/test_store_recovery.py +0 -0
  89. {codespine-1.0.13 → codespine-1.0.14}/tests/test_tasks.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codespine
3
- Version: 1.0.13
3
+ Version: 1.0.14
4
4
  Summary: Local Java code intelligence indexer backed by a graph database
5
5
  Author: CodeSpine contributors
6
6
  License: MIT License
@@ -66,7 +66,7 @@ Dynamic: license-file
66
66
 
67
67
  # CodeSpine
68
68
 
69
- **v1.0.13** — Local Java code intelligence for coding agents, backed by a graph database.
69
+ **v1.0.14** — Local Java code intelligence for coding agents, backed by a graph database.
70
70
 
71
71
  CodeSpine cuts token burn for coding agents working on Java codebases.
72
72
 
@@ -172,6 +172,70 @@ Each analysis phase streams live progress. The final step publishes a read repli
172
172
 
173
173
  ---
174
174
 
175
+ ## GraphRAG Answer Surface
176
+
177
+ CodeSpine includes a full graph-augmented generation (`answer`) tool that synthesizes indexed graph data into grounded, citable answers — no external LLM required for the retrieval/reranking pipeline.
178
+
179
+ When an agent calls `answer(question, project=...)`, the system:
180
+
181
+ 1. **Resolves the focus symbol** via hybrid search (BM25 + vector + fuzzy) scoped to the project.
182
+ 2. **Builds deep context** — impact caller tree, community cluster, and execution flows.
183
+ 3. **Assembles evidence candidates** from search results, impact callers, community matches, and flow entries — each with a quality-adjusted `rerank_score`.
184
+ 4. **Applies graph-aware diversity reranking** — a greedy MMR-style selection that prefers underrepresented evidence kinds, producing a broader evidence mix than pure utility ranking.
185
+ 5. **Generates an evidence subgraph** linking the focus symbol to each selected evidence item with typed edges.
186
+ 6. **Computes confidence** from evidence count, kind diversity, and impact depth.
187
+ 7. **Returns citations** mapping every evidence item back to its source (`hybrid_search`, `analyze_impact`, `symbol_community`, or `trace_execution_flows`).
188
+
189
+ ### Safe Abstention
190
+
191
+ The system **refuses to guess** when it cannot ground an answer:
192
+
193
+ | Abstention trigger | Behaviour |
194
+ |-------------------|-----------|
195
+ | **Ambiguous focus** — multiple symbols match the query closely (lexical similarity + score proximity) | Returns `abstained=True` with the ambiguity details and recommended disambiguation tools (`find_symbol`, `search_hybrid`) |
196
+ | **No focus found** — hybrid search returned zero matches | Returns `abstained=True` with `note` explaining no symbol was matched |
197
+ | **Insufficient evidence** — focus resolved but no evidence candidates passed the quality threshold | Returns `abstained=True` with `note` explaining the grounding failure |
198
+
199
+ In all three cases the response includes:
200
+ - `answer_contract.status = "abstained"`
201
+ - `fallback.recommended_tools` — actionable suggestions for the agent
202
+ - Full `provenance` and `observability` metadata (same shape as a supported answer)
203
+
204
+ ### Evidence Reranking
205
+
206
+ Each evidence candidate receives a `rerank_score` based on:
207
+
208
+ | Evidence kind | Base weight | Additional signals |
209
+ |--------------|-------------|-------------------|
210
+ | `search_result` | 2.7 | BM25/vector/fuzzy score, confidence label, exact focus-anchor match (+1.55) |
211
+ | `impact` | 3.0 | Call depth (direct=+0.35, indirect=+0.2, transitive=+0.1), edge confidence |
212
+ | `community` | 2.2 | Community cohesion (+0.25 per point), **high-cohesion bonus** (+0.15 if cohesion > 0.7) |
213
+ | `flow` | 2.0 | Flow depth (entry level=+0.25, deeper=+0.05 to +0.2) |
214
+
215
+ The **graph-aware diversity reranker** (Tranche 9) then applies a greedy diverse-selection pass: each candidate whose evidence kind is already represented in the selected set receives a diversity penalty (`1.0 - 0.3 × kind_ratio`), encouraging the system to pick from underrepresented kinds. This produces answers with broader architectural coverage.
216
+
217
+ ### Answer Caching
218
+
219
+ GraphRAG answers are cached in memory for 5 minutes (TTL, max 128 entries). Cache keys include:
220
+
221
+ - **Provenance version** — cache invalidates automatically when the response contract changes
222
+ - **Question text** and **project scope**
223
+ - **Snapshot mtime** — base index timestamp
224
+ - **Overlay mtime** — active overlay/dirty-file timestamp
225
+
226
+ A change to either the base snapshot or the overlay immediately produces a cache miss and a fresh answer. Each cached response includes `observability.cache.hit` and the mtime values that produced it.
227
+
228
+ ### Latency Observability
229
+
230
+ Every answer includes `observability.latency_ms` with per-stage breakdown:
231
+ - `context` — `build_symbol_context()` timings (search + impact + community + flows)
232
+ - `evidence_build` — evidence candidate assembly and reranking
233
+ - `cache_lookup` — cache read time
234
+ - `serialization` — JSON serialization
235
+ - `total` — wall-clock elapsed time
236
+
237
+ ---
238
+
175
239
  ## MCP Configuration
176
240
 
177
241
  Foreground server:
@@ -222,7 +286,7 @@ codespine guide --json # structured JSON for tooling
222
286
 
223
287
  ---
224
288
 
225
- ## MCP Tools (45 total)
289
+ ## MCP Tools (46 total)
226
290
 
227
291
  ### Discovery & Status
228
292
 
@@ -239,7 +303,7 @@ codespine guide --json # structured JSON for tooling
239
303
 
240
304
  | Tool | Description |
241
305
  |------|-------------|
242
- | `search_hybrid(query, k, project)` | Ranked symbol search (BM25 + vector + fuzzy via RRF) with `high/medium/low` confidence scores. |
306
+ | `search_hybrid(query, k, project, explain)` | Ranked symbol search (BM25 + vector + fuzzy via RRF) with `high/medium/low` confidence scores; `explain=True` adds versioned provenance, index fingerprint, per-ranker traces, match reasons, confidence explanations, and a retrieval contract (`candidate_pool_size`, `returned`, `supports_rerank`). |
243
307
  | `find_symbol(name, kind, project, limit)` | Exact/prefix name lookup; returns `primary_match` flag and disambiguated overloads. |
244
308
  | `get_symbol_context(query, max_depth, project)` | One-shot deep context: search + impact + community + flows. |
245
309
  | `get_neighborhood(symbol, project)` | Callers (same project), `cross_project_callers` (other projects), callees, siblings, and override/implements links. |
@@ -261,6 +325,8 @@ Higher-level tools designed to answer full agent questions in a single call, wit
261
325
 
262
326
  | Tool | Description |
263
327
  |------|-------------|
328
+ | `answer(question, project)` | **GraphRAG answer surface.** Resolves the focus symbol, builds deep context (impact + community + flows), applies graph-aware diverse evidence reranking, returns evidence subgraph with typed edges, per-item citations, confidence score, per-stage latency, provenance/envelope, index fingerprint, and safe abstention on ambiguity or weak grounding. Answers are cached with overlay-aware invalidation. |
329
+ | `answer-eval(suite, project)` | Run a GraphRAG regression suite: score each answer against expected contracts (availability, abstention, focus, evidence kinds, citations, confidence, term inclusion/exclusion), enforce quality gates (`min_average_score`, `min_case_score`, `min_pass_rate`), and produce a structured JSON report for CI. |
264
330
  | `ask(question, project)` | Keyword-based natural language dispatcher: routes "who calls X", "what breaks if Y", "explain Z", "find methods named …" to the right tool automatically. |
265
331
  | `what_breaks(symbol, project)` | Plain-English blast-radius summary with `risk_level` (low / medium / high). |
266
332
  | `explain(symbol, project)` | What a class or method does and how it fits in the architecture. |
@@ -345,7 +411,15 @@ codespine watch --path . --install-hook # also install post-commit git hook
345
411
  codespine watch --path . --uninstall-hook # remove git hook
346
412
 
347
413
  # Search & Analysis (CLI)
348
- codespine search "query" # hybrid search
414
+ codespine answer "question" --project app # GraphRAG answer (evidence subgraph, citations, confidence, provenance)
415
+ codespine answer-eval --suite suite.json --project app # GraphRAG regression scoring + quality gates
416
+ codespine answer-eval --suite suite.json --project app --json # structured JSON output for CI
417
+ codespine answer-eval --suite suite.json --project app --min-average-score 90 # override suite gates
418
+ codespine answer-eval --suite suite.json --project app --min-case-score 70 # per-case minimum
419
+ codespine answer-eval --suite suite.json --project app --min-pass-rate 1.0 # strict pass rate
420
+ codespine answer-eval --suite suite.json --project app --max-depth 4 --k 3 # override retrieval params
421
+ codespine search "query" --project app # scoped hybrid search
422
+ codespine search "query" --explain # provenance-aware hybrid search with index fingerprint
349
423
  codespine context "symbol" # one-shot deep context
350
424
  codespine impact "symbol" # caller-tree impact (includes DI consumers)
351
425
  codespine deadcode # dead code candidates
@@ -386,6 +460,8 @@ codespine force-reset # emergency: delete all data files
386
460
 
387
461
  `analyse` is trust-first by default: it completes the core graph in the foreground, validates and publishes the read replica, then keeps deep enrichment moving in the background. Use `--fast` only when you want a budgeted partial core index. Use `codespine background`, `codespine ui`, or `codespine repair` to inspect and recover incomplete or degraded work.
388
462
 
463
+ GraphRAG regression suites are JSON objects with `cases` and optional `gates` thresholds. Each case can assert availability, abstention, focus, evidence kinds, citations, confidence, term inclusion/exclusion, and minimum scores. The scorer runs 10+ weighted checks and produces a structured report with per-check pass/fail, deltas, and observed values. Quality gates enforce `min_average_score`, `min_case_score`, and `min_pass_rate` — suite-defined gates take precedence over CLI defaults.
464
+
389
465
  ---
390
466
 
391
467
  ## Workspace and Module Detection
@@ -547,15 +623,178 @@ sg = ShardedGraphStore(backend="duckdb", num_shards=4) # DuckDB (default)
547
623
 
548
624
  ---
549
625
 
626
+ ## Provenance & Traceability
627
+
628
+ Every GraphRAG answer and hybrid search explain response includes a **versioned provenance envelope** with full trace metadata for audit, debugging, and regression analysis.
629
+
630
+ ### Response Metadata
631
+
632
+ ```json
633
+ {
634
+ "provenance": {
635
+ "version": 10,
636
+ "package_version": "1.0.14",
637
+ "retrieval_mode": "graph_rag",
638
+ "question": "what breaks if I change PaymentService?",
639
+ "project": "app",
640
+ "focus_id": "com.example.PaymentService#processPayment",
641
+ "candidate_counts": {
642
+ "search_result": 3,
643
+ "impact": 5,
644
+ "community": 2,
645
+ "flow": 1
646
+ },
647
+ "search_candidate_count": 3,
648
+ "evidence_sources": ["hybrid_search", "analyze_impact", "symbol_community"],
649
+ "context_timings_ms": {
650
+ "search": 12,
651
+ "impact": 45,
652
+ "community": 8,
653
+ "flows": 3,
654
+ "total": 68
655
+ },
656
+ "index_fingerprint": {
657
+ "snapshot_mtime": 1712345678.123,
658
+ "overlay_mtime": 0.0
659
+ }
660
+ }
661
+ }
662
+ ```
663
+
664
+ ### What each field tells you
665
+
666
+ | Field | Purpose |
667
+ |-------|---------|
668
+ | `version` | Provenance schema version; incremented on breaking changes. Cache keys include this field for automatic invalidation. |
669
+ | `package_version` | The CodeSpine version that generated this response. Ties answers to a specific release. |
670
+ | `index_fingerprint` | Snapshot and overlay mtimes at the time the answer was generated. Enables deterministic replay: same fingerprint + same query → same answer (modulo caching). |
671
+ | `candidate_counts` | How many candidates of each evidence kind were available before reranking. |
672
+ | `context_timings_ms` | Per-stage latency for context assembly (search, impact, community, flows). |
673
+ | `evidence_sources` | Unique set of retrieval backends that supplied selected evidence. |
674
+
675
+ The same provenance envelope is exposed in two places:
676
+ - **Top-level `provenance`** — for direct consumer access
677
+ - **`observability.provenance`** — for monitoring and observability pipelines
678
+
679
+ ### Hybrid Search Provenance
680
+
681
+ When `search_hybrid(..., explain=True)` is used, the response includes a parallel provenance structure that carries the same `version`, `package_version`, and `index_fingerprint` fields, plus per-ranker traces:
682
+
683
+ ```json
684
+ {
685
+ "retrieval_contract": {
686
+ "version": 10,
687
+ "fusion": "rrf",
688
+ "rankers": ["bm25", "semantic", "fuzzy"],
689
+ "candidate_pool_size": 142
690
+ },
691
+ "provenance": {
692
+ "version": 10,
693
+ "package_version": "1.0.14",
694
+ "candidate_pool_size": 142,
695
+ "index_fingerprint": {
696
+ "snapshot_mtime": 1712345678.123,
697
+ "overlay_mtime": 0.0
698
+ },
699
+ "rankers": {
700
+ "bm25": {"traces": [...]},
701
+ "semantic": {"traces": [...]},
702
+ "fuzzy": {"traces": [...]}
703
+ }
704
+ }
705
+ }
706
+ ```
707
+
708
+ ---
709
+
710
+ ## Quality Gates & Regression Testing
711
+
712
+ CodeSpine includes a **continuous evaluation framework** (Tranche 7) that scores GraphRAG answers against expected contracts and enforces quality thresholds in CI.
713
+
714
+ ### Scoring
715
+
716
+ Each answer is scored against an expectation object:
717
+
718
+ ```json
719
+ {
720
+ "available": true,
721
+ "abstained": false,
722
+ "focus_id": "com.example.PaymentService#processPayment",
723
+ "min_evidence_count": 2,
724
+ "min_citation_count": 2,
725
+ "requires_evidence_kinds": ["search_result", "impact"],
726
+ "must_include_terms": ["Best match", "Impact"],
727
+ "min_confidence": "medium"
728
+ }
729
+ ```
730
+
731
+ The scorer runs 10+ checks covering availability, abstention, focus match, evidence count, citation count, evidence kind coverage, term inclusion/exclusion, and confidence thresholds. Each check contributes a weighted delta to the final score (0–100).
732
+
733
+ ### Quality Gates
734
+
735
+ Results are validated against configurable gates:
736
+
737
+ | Gate | Default | Description |
738
+ |------|---------|-------------|
739
+ | `min_average_score` | 80.0 | Minimum average score across all cases |
740
+ | `min_case_score` | 70.0 | Minimum score for any single case |
741
+ | `min_pass_rate` | 1.0 | Fraction of cases that must pass |
742
+
743
+ Suites are JSON files with `cases` and optional `gates`:
744
+
745
+ ```json
746
+ {
747
+ "name": "payment-service-regression",
748
+ "gates": { "min_average_score": 85.0, "min_pass_rate": 0.9 },
749
+ "cases": [
750
+ {"name": "processPayment-impact", "question": "what breaks if I change processPayment?", "expect": {"available": true, "focus_id": "...processPayment", "min_evidence_count": 2, "requires_evidence_kinds": ["search_result", "impact"]}},
751
+ {"name": "unknown-symbol", "question": "what breaks if I change NonExistent?", "expect": {"available": false, "abstained": true}}
752
+ ]
753
+ }
754
+ ```
755
+
756
+ Suite-defined gates take precedence over the CLI defaults; a gate flag passed explicitly on the command line overrides the suite's value:
757
+
758
+ ```bash
759
+ codespine answer-eval --suite suite.json --project app --min-average-score 90 --json
760
+ ```
761
+
762
+ ### Programmatic API
763
+
764
+ ```python
765
+ from codespine.graphrag import evaluate_graph_rag_suite, score_graph_rag_answer
766
+
767
+ report = evaluate_graph_rag_suite(store, suite_payload, project="app")
768
+ report["quality_gates"]["passed"] # → True/False
769
+ report["summary"]["average_score"] # → float
770
+ report["summary"]["pass_rate"] # → float
771
+ ```
772
+
773
+ ---
774
+
550
775
  ## Result Caching
551
776
 
552
777
  Expensive analysis tools cache their results for 5 minutes. The cache is keyed by `(tool_name, arguments, snapshot_mtime)` so a new index snapshot automatically invalidates stale entries.
553
778
 
554
- **Cached tools:** `get_impact`, `detect_dead_code`.
779
+ **Cached tools:** `get_impact`, `detect_dead_code`, `answer`.
555
780
 
556
781
  The cache is per MCP server instance (in-memory, not persisted across restarts). It is invalidated automatically when `reindex_file` or `analyse_project` completes.
557
782
 
558
- **Cache stats** are visible via `get_capabilities()`.
783
+ **Cache stats** are visible via `get_capabilities()`. `answer` also exposes per-stage latency timing in its observability payload and cache hit/miss state with snapshot/overlay mtimes for forensic debugging.
784
+
785
+ ### GraphRAG Cache Details
786
+
787
+ The `answer` tool uses a purpose-built cache (`ResultCache` with 128-entry LRU and 300 s TTL) keyed on:
788
+
789
+ - Provenance schema version (bumped on contract changes)
790
+ - Question text and project scope
791
+ - Snapshot mtime (base index timestamp)
792
+ - Overlay mtime (active overlay/dirty-file timestamp)
793
+
794
+ This design ensures:
795
+ - An answer is never served from cache after re-indexing
796
+ - Overlay edits immediately invalidate the corresponding cached answer
797
+ - A provenance version bump invalidates all cached answers in one shot
559
798
 
560
799
  ---
561
800
 
@@ -574,7 +813,7 @@ The deep analysis phase covers four passes that are expensive but optional:
574
813
 
575
814
  **Fast mode:** `codespine analyse --fast` keeps the old budgeted behavior for large repos. If the budget expires before the core graph is complete, CodeSpine publishes a partial snapshot, marks the project as partial, and tracks the background continuation until the core graph is repaired.
576
815
 
577
- **Health checks:** every analyse run now performs a small self-test query suite and reports index anomalies such as large projects with zero call edges. Use `codespine health` for the terminal dashboard or `codespine self-test --json` in CI.
816
+ **Health checks:** every analyse run now performs a small self-test query suite and reports index anomalies such as large projects with zero call edges, plus graph integrity checks for dangling files/classes/methods/symbols. Use `codespine health` for the terminal dashboard or `codespine self-test --json` in CI.
578
817
 
579
818
  **Background visibility:** `codespine background` shows status, result, last phase, progress, and repair hints in the terminal, and `codespine tasks` remains available as the shorter registry view. `codespine ui` serves a local explorer with project state (`ready`, `enriching`, `partial`, `degraded`, `repair_required`), background tasks, and one-click Repair/Reindex actions at `http://127.0.0.1:8765`.
580
819
 
@@ -636,6 +875,7 @@ from codespine.sharding.store import ShardedGraphStore
636
875
  from codespine.indexer.engine import JavaIndexer
637
876
  from codespine.analysis.impact import analyze_impact
638
877
  from codespine.search.hybrid import hybrid_search
878
+ from codespine.graphrag import graph_rag_answer, evaluate_graph_rag_suite, score_graph_rag_answer
639
879
 
640
880
  # Open (or create) the store
641
881
  sg = ShardedGraphStore()
@@ -653,6 +893,31 @@ hits = hybrid_search(store, "payment processor", project="my-project")
653
893
 
654
894
  # Impact analysis
655
895
  impact = analyze_impact(store, "PaymentService", max_depth=4, project="my-project")
896
+
897
+ # GraphRAG answer with full provenance envelope
898
+ answer = graph_rag_answer(store, "what breaks if I change PaymentService?", project="my-project")
899
+ print(answer["answer"]) # → "Best match: … Impact: … Evidence: …"
900
+ print(answer["provenance"]["version"]) # → 10
901
+ print(answer["provenance"]["index_fingerprint"]) # → { snapshot_mtime: …, overlay_mtime: … }
902
+ print(answer["observability"]["latency_ms"]) # → per-stage timings
903
+ print(answer["observability"]["cache"]["hit"]) # → True/False
904
+
905
+ # Score an answer against an expected contract
906
+ report = score_graph_rag_answer(answer, {"available": True, "min_evidence_count": 2})
907
+ print(report["passed"], report["score"]) # → True, 92.5
908
+
909
+ # Run a full regression suite with quality gates
910
+ suite = {
911
+ "name": "payment-regression",
912
+ "gates": {"min_average_score": 85.0},
913
+ "cases": [
914
+ {"question": "what breaks if I change PaymentService?", "expect": {"available": True, "min_evidence_count": 2}},
915
+ {"question": "unknown symbol", "expect": {"available": False, "abstained": True}},
916
+ ],
917
+ }
918
+ eval_report = evaluate_graph_rag_suite(store, suite, project="my-project")
919
+ print(eval_report["quality_gates"]["passed"]) # → True/False
920
+ print(eval_report["summary"]["average_score"]) # → float
656
921
  ```
657
922
 
658
923
  ---
@@ -666,6 +931,10 @@ impact = analyze_impact(store, "PaymentService", max_depth=4, project="my-projec
666
931
  - `codespine force-reset` is the nuclear option — it deletes all data files without going through the DB engine. Use it when `clear-index` fails due to DB corruption (e.g. after an abrupt Ctrl+C mid-write with KùzuDB).
667
932
  - For large Spring or JPA-heavy repos, dead-code results should be reviewed before deletion. The tool is conservative by default; use `strict=True` for a more aggressive audit.
668
933
  - The `CODESPINE_BACKEND` env var must be set consistently across the indexer and the MCP server — mixing backends on the same shard path will produce errors.
934
+ - **GraphRAG answers** are never served from cache after re-indexing or overlay edits. Cache keys include both snapshot and overlay mtimes plus the provenance schema version.
935
+ - **Quality gate failures** in CI produce a non-zero exit code and a detailed JSON report listing every failed check per case. Use `--json` for structured CI integration.
936
+ - **Abstained answers** still return full provenance and observability metadata, so CI pipelines can distinguish "system refused to guess" from "system failed".
937
+ - **Evidence diversity** may reorder results compared to earlier versions: the graph-aware diverse reranker prefers underrepresented evidence kinds over pure utility ranking. This produces broader architectural coverage at the cost of different top-k ordering.
669
938
 
670
939
  ---
671
940