codespine 1.0.12__tar.gz → 1.0.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codespine-1.0.12 → codespine-1.0.14}/PKG-INFO +277 -8
- {codespine-1.0.12 → codespine-1.0.14}/README.md +276 -7
- {codespine-1.0.12 → codespine-1.0.14}/codespine/__init__.py +1 -1
- {codespine-1.0.12 → codespine-1.0.14}/codespine/analysis/community.py +22 -10
- codespine-1.0.14/codespine/analysis/context.py +72 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/analysis/coupling.py +1 -1
- {codespine-1.0.12 → codespine-1.0.14}/codespine/analysis/flow.py +65 -28
- {codespine-1.0.12 → codespine-1.0.14}/codespine/analysis/impact.py +71 -30
- {codespine-1.0.12 → codespine-1.0.14}/codespine/cli.py +86 -2
- codespine-1.0.14/codespine/graphrag.py +1336 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/guide.py +13 -3
- {codespine-1.0.12 → codespine-1.0.14}/codespine/health.py +68 -6
- {codespine-1.0.12 → codespine-1.0.14}/codespine/mcp/server.py +121 -24
- {codespine-1.0.12 → codespine-1.0.14}/codespine/overlay/merge.py +56 -9
- codespine-1.0.14/codespine/search/hybrid.py +364 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine.egg-info/PKG-INFO +277 -8
- {codespine-1.0.12 → codespine-1.0.14}/codespine.egg-info/SOURCES.txt +7 -1
- {codespine-1.0.12 → codespine-1.0.14}/pyproject.toml +1 -1
- codespine-1.0.14/tests/test_coupling.py +25 -0
- codespine-1.0.14/tests/test_graphrag.py +778 -0
- codespine-1.0.14/tests/test_health.py +193 -0
- codespine-1.0.14/tests/test_hybrid_search.py +441 -0
- codespine-1.0.14/tests/test_mcp_server_cache.py +77 -0
- codespine-1.0.14/tests/test_overlay.py +548 -0
- codespine-1.0.14/tests/test_workflow_golden.py +277 -0
- codespine-1.0.12/codespine/analysis/context.py +0 -24
- codespine-1.0.12/codespine/search/hybrid.py +0 -169
- codespine-1.0.12/tests/test_health.py +0 -81
- codespine-1.0.12/tests/test_overlay.py +0 -232
- {codespine-1.0.12 → codespine-1.0.14}/LICENSE +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/analysis/__init__.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/analysis/crossmodule.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/analysis/deadcode.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/cache/__init__.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/cache/result_cache.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/config.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/db/__init__.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/db/_cypher_compat.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/db/duckdb_store.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/db/schema.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/db/store.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/diff/__init__.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/diff/branch_diff.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/indexer/__init__.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/indexer/call_resolver.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/indexer/di_resolver.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/indexer/engine.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/indexer/java_parser.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/indexer/symbol_builder.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/mcp/__init__.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/noise/__init__.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/noise/blocklist.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/overlay/__init__.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/overlay/git_state.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/overlay/store.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/project_state.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/search/__init__.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/search/bm25.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/search/fuzzy.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/search/rrf.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/search/vector.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/sharding/__init__.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/sharding/router.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/sharding/store.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/tasks.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/watch/__init__.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/watch/git_hook.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine/watch/watcher.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine.egg-info/dependency_links.txt +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine.egg-info/entry_points.txt +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine.egg-info/requires.txt +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/codespine.egg-info/top_level.txt +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/gindex.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/setup.cfg +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/tests/test_branch_diff_normalize.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/tests/test_call_resolver.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/tests/test_community_detection.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/tests/test_cypher_compat.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/tests/test_deadcode.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/tests/test_duckdb_store.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/tests/test_index_and_hybrid.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/tests/test_java_parser.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/tests/test_multimodule_index.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/tests/test_parse_resilience.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/tests/test_result_cache.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/tests/test_search_ranking.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/tests/test_sharding.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/tests/test_store_recovery.py +0 -0
- {codespine-1.0.12 → codespine-1.0.14}/tests/test_tasks.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codespine
|
|
3
|
-
Version: 1.0.12
|
|
3
|
+
Version: 1.0.14
|
|
4
4
|
Summary: Local Java code intelligence indexer backed by a graph database
|
|
5
5
|
Author: CodeSpine contributors
|
|
6
6
|
License: MIT License
|
|
@@ -66,7 +66,7 @@ Dynamic: license-file
|
|
|
66
66
|
|
|
67
67
|
# CodeSpine
|
|
68
68
|
|
|
69
|
-
**v1.0.12** — Local Java code intelligence for coding agents, backed by a graph database.
|
|
69
|
+
**v1.0.14** — Local Java code intelligence for coding agents, backed by a graph database.
|
|
70
70
|
|
|
71
71
|
CodeSpine cuts token burn for coding agents working on Java codebases.
|
|
72
72
|
|
|
@@ -172,6 +172,70 @@ Each analysis phase streams live progress. The final step publishes a read repli
|
|
|
172
172
|
|
|
173
173
|
---
|
|
174
174
|
|
|
175
|
+
## GraphRAG Answer Surface
|
|
176
|
+
|
|
177
|
+
CodeSpine includes a full graph-augmented generation (`answer`) tool that synthesizes indexed graph data into grounded, citable answers — no external LLM required for the retrieval/reranking pipeline.
|
|
178
|
+
|
|
179
|
+
When an agent calls `answer(question, project=...)`, the system:
|
|
180
|
+
|
|
181
|
+
1. **Resolves the focus symbol** via hybrid search (BM25 + vector + fuzzy) scoped to the project.
|
|
182
|
+
2. **Builds deep context** — impact caller tree, community cluster, and execution flows.
|
|
183
|
+
3. **Assembles evidence candidates** from search results, impact callers, community matches, and flow entries — each with a quality-adjusted `rerank_score`.
|
|
184
|
+
4. **Applies graph-aware diversity reranking** — a greedy MMR-style selection that prefers underrepresented evidence kinds, producing a broader evidence mix than pure utility ranking.
|
|
185
|
+
5. **Generates an evidence subgraph** linking the focus symbol to each selected evidence item with typed edges.
|
|
186
|
+
6. **Computes confidence** from evidence count, kind diversity, and impact depth.
|
|
187
|
+
7. **Returns citations** mapping every evidence item back to its source (hybrid_search, analyze_impact, symbol_community, or trace_execution_flows).
|
|
188
|
+
|
|
189
|
+
### Safe Abstention
|
|
190
|
+
|
|
191
|
+
The system **refuses to guess** when it cannot ground an answer:
|
|
192
|
+
|
|
193
|
+
| Abstention trigger | Behaviour |
|
|
194
|
+
|-------------------|-----------|
|
|
195
|
+
| **Ambiguous focus** — multiple symbols match the query closely (lexical similarity + score proximity) | Returns `abstained=True` with the ambiguity details and recommended disambiguation tools (`find_symbol`, `search_hybrid`) |
|
|
196
|
+
| **No focus found** — hybrid search returned zero matches | Returns `abstained=True` with `note` explaining no symbol was matched |
|
|
197
|
+
| **Insufficient evidence** — focus resolved but no evidence candidates passed the quality threshold | Returns `abstained=True` with `note` explaining the grounding failure |
|
|
198
|
+
|
|
199
|
+
In all three cases the response includes:
|
|
200
|
+
- `answer_contract.status = "abstained"`
|
|
201
|
+
- `fallback.recommended_tools` — actionable suggestions for the agent
|
|
202
|
+
- Full `provenance` and `observability` metadata (same shape as a supported answer)
|
|
203
|
+
|
|
204
|
+
### Evidence Reranking
|
|
205
|
+
|
|
206
|
+
Each evidence candidate receives a `rerank_score` based on:
|
|
207
|
+
|
|
208
|
+
| Evidence kind | Base weight | Additional signals |
|
|
209
|
+
|--------------|-------------|-------------------|
|
|
210
|
+
| `search_result` | 2.7 | BM25/vector/fuzzy score, confidence label, exact focus-anchor match (+1.55) |
|
|
211
|
+
| `impact` | 3.0 | Call depth (direct=+0.35, indirect=+0.2, transitive=+0.1), edge confidence |
|
|
212
|
+
| `community` | 2.2 | Community cohesion (+0.25 per point), **high-cohesion bonus** (+0.15 if cohesion > 0.7) |
|
|
213
|
+
| `flow` | 2.0 | Flow depth (entry level=+0.25, deeper=+0.05 to +0.2) |
|
|
214
|
+
|
|
215
|
+
The **graph-aware diversity reranker** (Tranche 9) then applies a greedy diverse-selection pass: each evidence kind already represented in the selected set receives a diversity penalty (`1.0 - 0.3 × kind_ratio`), encouraging the system to pick from under-represented kinds. This produces answers with broader architectural coverage.
|
|
216
|
+
|
|
217
|
+
### Answer Caching
|
|
218
|
+
|
|
219
|
+
GraphRAG answers are cached in-memory for 5 minutes (TTL, max 128 entries). Cache keys include:
|
|
220
|
+
|
|
221
|
+
- **Provenance version** — cache invalidates automatically when the response contract changes
|
|
222
|
+
- **Question text** and **project scope**
|
|
223
|
+
- **Snapshot mtime** — base index timestamp
|
|
224
|
+
- **Overlay mtime** — active overlay/dirty-file timestamp
|
|
225
|
+
|
|
226
|
+
A change to either the base snapshot or the overlay immediately produces a cache miss and a fresh answer. Each cached response includes `observability.cache.hit` and the mtime values that produced it.
|
|
227
|
+
|
|
228
|
+
### Latency Observability
|
|
229
|
+
|
|
230
|
+
Every answer includes `observability.latency_ms` with per-stage breakdown:
|
|
231
|
+
- `context` — `build_symbol_context()` timings (search + impact + community + flows)
|
|
232
|
+
- `evidence_build` — evidence candidate assembly and reranking
|
|
233
|
+
- `cache_lookup` — cache read time
|
|
234
|
+
- `serialization` — JSON serialization
|
|
235
|
+
- `total` — wall-clock elapsed time
|
|
236
|
+
|
|
237
|
+
---
|
|
238
|
+
|
|
175
239
|
## MCP Configuration
|
|
176
240
|
|
|
177
241
|
Foreground server:
|
|
@@ -222,7 +286,7 @@ codespine guide --json # structured JSON for tooling
|
|
|
222
286
|
|
|
223
287
|
---
|
|
224
288
|
|
|
225
|
-
## MCP Tools (
|
|
289
|
+
## MCP Tools (46 total)
|
|
226
290
|
|
|
227
291
|
### Discovery & Status
|
|
228
292
|
|
|
@@ -239,7 +303,7 @@ codespine guide --json # structured JSON for tooling
|
|
|
239
303
|
|
|
240
304
|
| Tool | Description |
|
|
241
305
|
|------|-------------|
|
|
242
|
-
| `search_hybrid(query, k, project)` | Ranked symbol search (BM25 + vector + fuzzy via RRF) with `high/medium/low` confidence scores. |
|
|
306
|
+
| `search_hybrid(query, k, project, explain)` | Ranked symbol search (BM25 + vector + fuzzy via RRF) with `high/medium/low` confidence scores; `explain=True` adds versioned provenance, index fingerprint, per-ranker traces, match reasons, confidence explanations, and a retrieval contract (`candidate_pool_size`, `returned`, `supports_rerank`). |
|
|
243
307
|
| `find_symbol(name, kind, project, limit)` | Exact/prefix name lookup; returns `primary_match` flag and disambiguated overloads. |
|
|
244
308
|
| `get_symbol_context(query, max_depth, project)` | One-shot deep context: search + impact + community + flows. |
|
|
245
309
|
| `get_neighborhood(symbol, project)` | Callers (same project), `cross_project_callers` (other projects), callees, siblings, and override/implements links. |
|
|
@@ -261,6 +325,8 @@ Higher-level tools designed to answer full agent questions in a single call, wit
|
|
|
261
325
|
|
|
262
326
|
| Tool | Description |
|
|
263
327
|
|------|-------------|
|
|
328
|
+
| `answer(question, project)` | **GraphRAG answer surface.** Resolves the focus symbol, builds deep context (impact + community + flows), applies graph-aware diverse evidence reranking, returns evidence subgraph with typed edges, per-item citations, confidence score, per-stage latency, provenance/envelope, index fingerprint, and safe abstention on ambiguity or weak grounding. Answers are cached with overlay-aware invalidation. |
|
|
329
|
+
| `answer-eval(suite, project)` | Run a GraphRAG regression suite: score each answer against expected contracts (availability, abstention, focus, evidence kinds, citations, confidence, term inclusion/exclusion), enforce quality gates (`min_average_score`, `min_case_score`, `min_pass_rate`), and produce a structured JSON report for CI. |
|
|
264
330
|
| `ask(question, project)` | Keyword-based natural language dispatcher: routes "who calls X", "what breaks if Y", "explain Z", "find methods named …" to the right tool automatically. |
|
|
265
331
|
| `what_breaks(symbol, project)` | Plain-English blast-radius summary with `risk_level` (low / medium / high). |
|
|
266
332
|
| `explain(symbol, project)` | What a class or method does and how it fits in the architecture. |
|
|
@@ -345,7 +411,15 @@ codespine watch --path . --install-hook # also install post-commit git hook
|
|
|
345
411
|
codespine watch --path . --uninstall-hook # remove git hook
|
|
346
412
|
|
|
347
413
|
# Search & Analysis (CLI)
|
|
348
|
-
codespine
|
|
414
|
+
codespine answer "question" --project app # GraphRAG answer (evidence subgraph, citations, confidence, provenance)
|
|
415
|
+
codespine answer-eval --suite suite.json --project app # GraphRAG regression scoring + quality gates
|
|
416
|
+
codespine answer-eval --suite suite.json --project app --json # structured JSON output for CI
|
|
417
|
+
codespine answer-eval --suite suite.json --project app --min-average-score 90 # override suite gates
|
|
418
|
+
codespine answer-eval --suite suite.json --project app --min-case-score 70 # per-case minimum
|
|
419
|
+
codespine answer-eval --suite suite.json --project app --min-pass-rate 1.0 # strict pass rate
|
|
420
|
+
codespine answer-eval --suite suite.json --project app --max-depth 4 --k 3 # override retrieval params
|
|
421
|
+
codespine search "query" --project app # scoped hybrid search
|
|
422
|
+
codespine search "query" --explain # provenance-aware hybrid search with index fingerprint
|
|
349
423
|
codespine context "symbol" # one-shot deep context
|
|
350
424
|
codespine impact "symbol" # caller-tree impact (includes DI consumers)
|
|
351
425
|
codespine deadcode # dead code candidates
|
|
@@ -386,6 +460,8 @@ codespine force-reset # emergency: delete all data files
|
|
|
386
460
|
|
|
387
461
|
`analyse` is trust-first by default: it completes the core graph in the foreground, validates and publishes the read replica, then keeps deep enrichment moving in the background. Use `--fast` only when you want a budgeted partial core index. Use `codespine background`, `codespine ui`, or `codespine repair` to inspect and recover incomplete or degraded work.
|
|
388
462
|
|
|
463
|
+
GraphRAG regression suites are JSON objects with `cases` and optional `gates` thresholds. Each case can assert availability, abstention, focus, evidence kinds, citations, confidence, term inclusion/exclusion, and minimum scores. The scorer runs 10+ weighted checks and produces a structured report with per-check pass/fail, deltas, and observed values. Quality gates enforce `min_average_score`, `min_case_score`, and `min_pass_rate` — suite-defined gates take precedence over CLI defaults.
|
|
464
|
+
|
|
389
465
|
---
|
|
390
466
|
|
|
391
467
|
## Workspace and Module Detection
|
|
@@ -547,15 +623,178 @@ sg = ShardedGraphStore(backend="duckdb", num_shards=4) # DuckDB (default)
|
|
|
547
623
|
|
|
548
624
|
---
|
|
549
625
|
|
|
626
|
+
## Provenance & Traceability
|
|
627
|
+
|
|
628
|
+
Every GraphRAG answer and hybrid search explain response includes a **versioned provenance envelope** with full trace metadata for audit, debugging, and regression analysis.
|
|
629
|
+
|
|
630
|
+
### Response Metadata
|
|
631
|
+
|
|
632
|
+
```json
|
|
633
|
+
{
|
|
634
|
+
"provenance": {
|
|
635
|
+
"version": 10,
|
|
636
|
+
"package_version": "1.0.14",
|
|
637
|
+
"retrieval_mode": "graph_rag",
|
|
638
|
+
"question": "what breaks if I change PaymentService?",
|
|
639
|
+
"project": "app",
|
|
640
|
+
"focus_id": "com.example.PaymentService#processPayment",
|
|
641
|
+
"candidate_counts": {
|
|
642
|
+
"search_result": 3,
|
|
643
|
+
"impact": 5,
|
|
644
|
+
"community": 2,
|
|
645
|
+
"flow": 1
|
|
646
|
+
},
|
|
647
|
+
"search_candidate_count": 3,
|
|
648
|
+
"evidence_sources": ["hybrid_search", "analyze_impact", "symbol_community"],
|
|
649
|
+
"context_timings_ms": {
|
|
650
|
+
"search": 12,
|
|
651
|
+
"impact": 45,
|
|
652
|
+
"community": 8,
|
|
653
|
+
"flows": 3,
|
|
654
|
+
"total": 68
|
|
655
|
+
},
|
|
656
|
+
"index_fingerprint": {
|
|
657
|
+
"snapshot_mtime": 1712345678.123,
|
|
658
|
+
"overlay_mtime": 0.0
|
|
659
|
+
}
|
|
660
|
+
}
|
|
661
|
+
}
|
|
662
|
+
```
|
|
663
|
+
|
|
664
|
+
### What each field tells you
|
|
665
|
+
|
|
666
|
+
| Field | Purpose |
|
|
667
|
+
|-------|---------|
|
|
668
|
+
| `version` | Provenance schema version; incremented on breaking changes. Cache keys include this field for automatic invalidation. |
|
|
669
|
+
| `package_version` | The CodeSpine version that generated this response. Ties answers to a specific release. |
|
|
670
|
+
| `index_fingerprint` | Snapshot and overlay mtimes at the time the answer was generated. Enables deterministic replay: same fingerprint + same query → same answer (modulo caching). |
|
|
671
|
+
| `candidate_counts` | How many candidates of each evidence kind were available before reranking. |
|
|
672
|
+
| `context_timings_ms` | Per-stage latency for context assembly (search, impact, community, flows). |
|
|
673
|
+
| `evidence_sources` | Unique set of retrieval backends that supplied selected evidence. |
|
|
674
|
+
|
|
675
|
+
The same provenance envelope is exposed in two places:
|
|
676
|
+
- **Top-level `provenance`** — for direct consumer access
|
|
677
|
+
- **`observability.provenance`** — for monitoring and observability pipelines
|
|
678
|
+
|
|
679
|
+
### Hybrid Search Provenance
|
|
680
|
+
|
|
681
|
+
When `search_hybrid(..., explain=True)` is used, the response includes a parallel provenance structure with the same `version`, `package_version`, `index_fingerprint`, and per-ranker traces:
|
|
682
|
+
|
|
683
|
+
```json
|
|
684
|
+
{
|
|
685
|
+
"retrieval_contract": {
|
|
686
|
+
"version": 10,
|
|
687
|
+
"fusion": "rrf",
|
|
688
|
+
"rankers": ["bm25", "semantic", "fuzzy"],
|
|
689
|
+
"candidate_pool_size": 142
|
|
690
|
+
},
|
|
691
|
+
"provenance": {
|
|
692
|
+
"version": 10,
|
|
693
|
+
"package_version": "1.0.14",
|
|
694
|
+
"candidate_pool_size": 142,
|
|
695
|
+
"index_fingerprint": {
|
|
696
|
+
"snapshot_mtime": 1712345678.123,
|
|
697
|
+
"overlay_mtime": 0.0
|
|
698
|
+
},
|
|
699
|
+
"rankers": {
|
|
700
|
+
"bm25": {"traces": [...]},
|
|
701
|
+
"semantic": {"traces": [...]},
|
|
702
|
+
"fuzzy": {"traces": [...]}
|
|
703
|
+
}
|
|
704
|
+
}
|
|
705
|
+
}
|
|
706
|
+
```
|
|
707
|
+
|
|
708
|
+
---
|
|
709
|
+
|
|
710
|
+
## Quality Gates & Regression Testing
|
|
711
|
+
|
|
712
|
+
CodeSpine includes a **continuous evaluation framework** (Tranche 7) that scores GraphRAG answers against expected contracts and enforces quality thresholds in CI.
|
|
713
|
+
|
|
714
|
+
### Scoring
|
|
715
|
+
|
|
716
|
+
Each answer is scored against an expectation object:
|
|
717
|
+
|
|
718
|
+
```json
|
|
719
|
+
{
|
|
720
|
+
"available": true,
|
|
721
|
+
"abstained": false,
|
|
722
|
+
"focus_id": "com.example.PaymentService#processPayment",
|
|
723
|
+
"min_evidence_count": 2,
|
|
724
|
+
"min_citation_count": 2,
|
|
725
|
+
"requires_evidence_kinds": ["search_result", "impact"],
|
|
726
|
+
"must_include_terms": ["Best match", "Impact"],
|
|
727
|
+
"min_confidence": "medium"
|
|
728
|
+
}
|
|
729
|
+
```
|
|
730
|
+
|
|
731
|
+
The scorer runs 10+ checks covering availability, abstention, focus match, evidence count, citation count, evidence kind coverage, term inclusion/exclusion, and confidence thresholds. Each check contributes a weighted delta to the final score (0–100).
|
|
732
|
+
|
|
733
|
+
### Quality Gates
|
|
734
|
+
|
|
735
|
+
Results are validated against configurable gates:
|
|
736
|
+
|
|
737
|
+
| Gate | Default | Description |
|
|
738
|
+
|------|---------|-------------|
|
|
739
|
+
| `min_average_score` | 80.0 | Minimum average score across all cases |
|
|
740
|
+
| `min_case_score` | 70.0 | Minimum score for any single case |
|
|
741
|
+
| `min_pass_rate` | 1.0 | Fraction of cases that must pass |
|
|
742
|
+
|
|
743
|
+
Suites are JSON files with `cases` and optional `gates`:
|
|
744
|
+
|
|
745
|
+
```json
|
|
746
|
+
{
|
|
747
|
+
"name": "payment-service-regression",
|
|
748
|
+
"gates": { "min_average_score": 85.0, "min_pass_rate": 0.9 },
|
|
749
|
+
"cases": [
|
|
750
|
+
{"name": "processPayment-impact", "question": "what breaks if I change processPayment?", "expect": {"available": true, "focus_id": "...processPayment", "min_evidence_count": 2, "requires_evidence_kinds": ["search_result", "impact"]}},
|
|
751
|
+
{"name": "unknown-symbol", "question": "what breaks if I change NonExistent?", "expect": {"available": false, "abstained": true}}
|
|
752
|
+
]
|
|
753
|
+
}
|
|
754
|
+
```
|
|
755
|
+
|
|
756
|
+
Suite-defined gates take precedence over the built-in CLI defaults; a gate passed explicitly on the command line (for example `--min-average-score`) overrides the suite value:
|
|
757
|
+
|
|
758
|
+
```bash
|
|
759
|
+
codespine answer-eval --suite suite.json --project app --min-average-score 90 --json
|
|
760
|
+
```
|
|
761
|
+
|
|
762
|
+
### Programmatic API
|
|
763
|
+
|
|
764
|
+
```python
|
|
765
|
+
from codespine.graphrag import evaluate_graph_rag_suite, score_graph_rag_answer
|
|
766
|
+
|
|
767
|
+
report = evaluate_graph_rag_suite(store, suite_payload, project="app")
|
|
768
|
+
report["quality_gates"]["passed"] # → True/False
|
|
769
|
+
report["summary"]["average_score"] # → float
|
|
770
|
+
report["summary"]["pass_rate"] # → float
|
|
771
|
+
```
|
|
772
|
+
|
|
773
|
+
---
|
|
774
|
+
|
|
550
775
|
## Result Caching
|
|
551
776
|
|
|
552
777
|
Expensive analysis tools cache their results for 5 minutes. The cache is keyed by `(tool_name, arguments, snapshot_mtime)` so a new index snapshot automatically invalidates stale entries.
|
|
553
778
|
|
|
554
|
-
**Cached tools:** `get_impact`, `detect_dead_code`.
|
|
779
|
+
**Cached tools:** `get_impact`, `detect_dead_code`, `answer`.
|
|
555
780
|
|
|
556
781
|
The cache is per MCP server instance (in-memory, not persisted across restarts). It is invalidated automatically when `reindex_file` or `analyse_project` completes.
|
|
557
782
|
|
|
558
|
-
**Cache stats** are visible via `get_capabilities()`.
|
|
783
|
+
**Cache stats** are visible via `get_capabilities()`. `answer` also exposes per-stage latency timing in its observability payload and cache hit/miss state with snapshot/overlay mtimes for forensic debugging.
|
|
784
|
+
|
|
785
|
+
### GraphRAG Cache Details
|
|
786
|
+
|
|
787
|
+
The `answer` tool uses a purpose-built cache (`ResultCache` with 128-entry LRU and 300 s TTL) keyed on:
|
|
788
|
+
|
|
789
|
+
- Provenance schema version (bumped on contract changes)
|
|
790
|
+
- Question text and project scope
|
|
791
|
+
- Snapshot mtime (base index timestamp)
|
|
792
|
+
- Overlay mtime (active overlay/dirty-file timestamp)
|
|
793
|
+
|
|
794
|
+
This design ensures:
|
|
795
|
+
- An answer is never served from cache after re-indexing
|
|
796
|
+
- Overlay edits immediately invalidate the corresponding cached answer
|
|
797
|
+
- A provenance version bump invalidates all cached answers in one shot
|
|
559
798
|
|
|
560
799
|
---
|
|
561
800
|
|
|
@@ -574,7 +813,7 @@ The deep analysis phase covers four passes that are expensive but optional:
|
|
|
574
813
|
|
|
575
814
|
**Fast mode:** `codespine analyse --fast` keeps the old budgeted behavior for large repos. If the budget expires before the core graph is complete, CodeSpine publishes a partial snapshot, marks the project as partial, and tracks the background continuation until the core graph is repaired.
|
|
576
815
|
|
|
577
|
-
**Health checks:** every analyse run now performs a small self-test query suite and reports index anomalies such as large projects with zero call edges. Use `codespine health` for the terminal dashboard or `codespine self-test --json` in CI.
|
|
816
|
+
**Health checks:** every analyse run now performs a small self-test query suite and reports index anomalies such as large projects with zero call edges, plus graph integrity checks for dangling files/classes/methods/symbols. Use `codespine health` for the terminal dashboard or `codespine self-test --json` in CI.
|
|
578
817
|
|
|
579
818
|
**Background visibility:** `codespine background` shows status, result, last phase, progress, and repair hints in the terminal, and `codespine tasks` remains available as the shorter registry view. `codespine ui` serves a local explorer with project state (`ready`, `enriching`, `partial`, `degraded`, `repair_required`), background tasks, and one-click Repair/Reindex actions at `http://127.0.0.1:8765`.
|
|
580
819
|
|
|
@@ -636,6 +875,7 @@ from codespine.sharding.store import ShardedGraphStore
|
|
|
636
875
|
from codespine.indexer.engine import JavaIndexer
|
|
637
876
|
from codespine.analysis.impact import analyze_impact
|
|
638
877
|
from codespine.search.hybrid import hybrid_search
|
|
878
|
+
from codespine.graphrag import graph_rag_answer, evaluate_graph_rag_suite, score_graph_rag_answer
|
|
639
879
|
|
|
640
880
|
# Open (or create) the store
|
|
641
881
|
sg = ShardedGraphStore()
|
|
@@ -653,6 +893,31 @@ hits = hybrid_search(store, "payment processor", project="my-project")
|
|
|
653
893
|
|
|
654
894
|
# Impact analysis
|
|
655
895
|
impact = analyze_impact(store, "PaymentService", max_depth=4, project="my-project")
|
|
896
|
+
|
|
897
|
+
# GraphRAG answer with full provenance envelope
|
|
898
|
+
answer = graph_rag_answer(store, "what breaks if I change PaymentService?", project="my-project")
|
|
899
|
+
print(answer["answer"]) # → "Best match: … Impact: … Evidence: …"
|
|
900
|
+
print(answer["provenance"]["version"]) # → 10
|
|
901
|
+
print(answer["provenance"]["index_fingerprint"]) # → { snapshot_mtime: …, overlay_mtime: … }
|
|
902
|
+
print(answer["observability"]["latency_ms"]) # → per-stage timings
|
|
903
|
+
print(answer["observability"]["cache"]["hit"]) # → True/False
|
|
904
|
+
|
|
905
|
+
# Score an answer against an expected contract
|
|
906
|
+
report = score_graph_rag_answer(answer, {"available": True, "min_evidence_count": 2})
|
|
907
|
+
print(report["passed"], report["score"]) # → True, 92.5
|
|
908
|
+
|
|
909
|
+
# Run a full regression suite with quality gates
|
|
910
|
+
suite = {
|
|
911
|
+
"name": "payment-regression",
|
|
912
|
+
"gates": {"min_average_score": 85.0},
|
|
913
|
+
"cases": [
|
|
914
|
+
{"question": "what breaks if I change PaymentService?", "expect": {"available": True, "min_evidence_count": 2}},
|
|
915
|
+
{"question": "unknown symbol", "expect": {"available": False, "abstained": True}},
|
|
916
|
+
],
|
|
917
|
+
}
|
|
918
|
+
eval_report = evaluate_graph_rag_suite(store, suite, project="my-project")
|
|
919
|
+
print(eval_report["quality_gates"]["passed"]) # → True/False
|
|
920
|
+
print(eval_report["summary"]["average_score"]) # → float
|
|
656
921
|
```
|
|
657
922
|
|
|
658
923
|
---
|
|
@@ -666,6 +931,10 @@ impact = analyze_impact(store, "PaymentService", max_depth=4, project="my-projec
|
|
|
666
931
|
- `codespine force-reset` is the nuclear option — it deletes all data files without going through the DB engine. Use it when `clear-index` fails due to DB corruption (e.g. after an abrupt Ctrl+C mid-write with KùzuDB).
|
|
667
932
|
- For large Spring or JPA-heavy repos, dead-code results should be reviewed before deletion. The tool is conservative by default; use `strict=True` for a more aggressive audit.
|
|
668
933
|
- The `CODESPINE_BACKEND` env var must be set consistently across the indexer and the MCP server — mixing backends on the same shard path will produce errors.
|
|
934
|
+
- **GraphRAG answers** are never served from cache after re-indexing or overlay edits. Cache keys include both snapshot and overlay mtimes plus the provenance schema version.
|
|
935
|
+
- **Quality gate failures** in CI produce a non-zero exit code and a detailed JSON report listing every failed check per case. Use `--json` for structured CI integration.
|
|
936
|
+
- **Abstained answers** still return full provenance and observability metadata, so CI pipelines can distinguish "system refused to guess" from "system failed".
|
|
937
|
+
- **Evidence diversity** may reorder results compared to earlier versions: the graph-aware diverse reranker prefers underrepresented evidence kinds over pure utility ranking. This produces broader architectural coverage at the cost of different top-k ordering.
|
|
669
938
|
|
|
670
939
|
---
|
|
671
940
|
|