code-review-graph-codeblackwell 2.3.6.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_review_graph/__init__.py +20 -0
- code_review_graph/__main__.py +4 -0
- code_review_graph/analysis.py +410 -0
- code_review_graph/changes.py +409 -0
- code_review_graph/cli.py +1255 -0
- code_review_graph/communities.py +874 -0
- code_review_graph/constants.py +23 -0
- code_review_graph/context_savings.py +317 -0
- code_review_graph/custom_languages.py +322 -0
- code_review_graph/daemon.py +1009 -0
- code_review_graph/daemon_cli.py +320 -0
- code_review_graph/docs/LLM-OPTIMIZED-REFERENCE.md +71 -0
- code_review_graph/embeddings.py +1006 -0
- code_review_graph/enrich.py +303 -0
- code_review_graph/eval/__init__.py +33 -0
- code_review_graph/eval/benchmarks/__init__.py +1 -0
- code_review_graph/eval/benchmarks/agent_baseline.py +193 -0
- code_review_graph/eval/benchmarks/build_performance.py +60 -0
- code_review_graph/eval/benchmarks/flow_completeness.py +36 -0
- code_review_graph/eval/benchmarks/impact_accuracy.py +220 -0
- code_review_graph/eval/benchmarks/multi_hop_retrieval.py +125 -0
- code_review_graph/eval/benchmarks/search_quality.py +59 -0
- code_review_graph/eval/benchmarks/token_efficiency.py +143 -0
- code_review_graph/eval/configs/code-review-graph.yaml +50 -0
- code_review_graph/eval/configs/express.yaml +45 -0
- code_review_graph/eval/configs/fastapi.yaml +48 -0
- code_review_graph/eval/configs/flask.yaml +50 -0
- code_review_graph/eval/configs/gin.yaml +51 -0
- code_review_graph/eval/configs/httpx.yaml +48 -0
- code_review_graph/eval/reporter.py +301 -0
- code_review_graph/eval/runner.py +211 -0
- code_review_graph/eval/scorer.py +85 -0
- code_review_graph/eval/token_benchmark.py +182 -0
- code_review_graph/exports.py +409 -0
- code_review_graph/flows.py +698 -0
- code_review_graph/graph.py +1427 -0
- code_review_graph/graph_diff.py +122 -0
- code_review_graph/hints.py +384 -0
- code_review_graph/incremental.py +1245 -0
- code_review_graph/jedi_resolver.py +303 -0
- code_review_graph/main.py +1079 -0
- code_review_graph/memory.py +142 -0
- code_review_graph/migrations.py +284 -0
- code_review_graph/parser.py +6957 -0
- code_review_graph/postprocessing.py +134 -0
- code_review_graph/prompts.py +159 -0
- code_review_graph/refactor.py +852 -0
- code_review_graph/registry.py +319 -0
- code_review_graph/rescript_resolver.py +206 -0
- code_review_graph/search.py +447 -0
- code_review_graph/skills.py +1481 -0
- code_review_graph/spring_resolver.py +200 -0
- code_review_graph/temporal_resolver.py +199 -0
- code_review_graph/token_benchmark.py +125 -0
- code_review_graph/tools/__init__.py +156 -0
- code_review_graph/tools/_common.py +176 -0
- code_review_graph/tools/analysis_tools.py +184 -0
- code_review_graph/tools/build.py +541 -0
- code_review_graph/tools/community_tools.py +246 -0
- code_review_graph/tools/context.py +152 -0
- code_review_graph/tools/docs.py +274 -0
- code_review_graph/tools/flows_tools.py +176 -0
- code_review_graph/tools/query.py +692 -0
- code_review_graph/tools/refactor_tools.py +168 -0
- code_review_graph/tools/registry_tools.py +125 -0
- code_review_graph/tools/review.py +477 -0
- code_review_graph/tsconfig_resolver.py +257 -0
- code_review_graph/visualization.py +2184 -0
- code_review_graph/wiki.py +305 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/METADATA +718 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/RECORD +74 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/WHEEL +4 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/entry_points.txt +3 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
name: flask
|
|
2
|
+
url: https://github.com/pallets/flask
|
|
3
|
+
# Pinned to the latest test_commit SHA so the snapshot is deterministic and
|
|
4
|
+
# every test_commit below is reachable as an ancestor.
|
|
5
|
+
commit: a29f88ce6f2f9843bd6fcbbfce1390a2071965d6
|
|
6
|
+
language: python
|
|
7
|
+
size_category: small
|
|
8
|
+
|
|
9
|
+
test_commits:
|
|
10
|
+
- sha: fbb6f0bc4c60a0bada0e03c3480d0ccf30a3c1df
|
|
11
|
+
description: "all teardown callbacks are called despite errors"
|
|
12
|
+
changed_files: 10
|
|
13
|
+
- sha: a29f88ce6f2f9843bd6fcbbfce1390a2071965d6
|
|
14
|
+
description: "document that headers must be set before streaming"
|
|
15
|
+
changed_files: 4
|
|
16
|
+
|
|
17
|
+
entry_points:
|
|
18
|
+
- "src/flask/app.py::Flask.wsgi_app"
|
|
19
|
+
- "src/flask/sansio/app.py::App.add_url_rule"
|
|
20
|
+
|
|
21
|
+
search_queries:
|
|
22
|
+
- query: "Flask wsgi"
|
|
23
|
+
expected: "src/flask/app.py::Flask"
|
|
24
|
+
- query: "AppContext globals"
|
|
25
|
+
expected: "src/flask/ctx.py::AppContext"
|
|
26
|
+
- query: "create logger"
|
|
27
|
+
expected: "src/flask/logging.py::create_logger"
|
|
28
|
+
|
|
29
|
+
# Multi-hop retrieval tasks (semantic_search → query_graph one-hop)
|
|
30
|
+
# See docs/REPRODUCING.md for the schema.
|
|
31
|
+
multi_hop_tasks:
|
|
32
|
+
- id: flask-dispatch-callers
|
|
33
|
+
nl_query: "Where Flask dispatches HTTP requests"
|
|
34
|
+
anchor_qualified_suffix: "src/flask/app.py::flask.dispatch_request"
|
|
35
|
+
traversal_pattern: callers_of
|
|
36
|
+
expected_neighbor_names: ["full_dispatch_request"]
|
|
37
|
+
k: 10
|
|
38
|
+
- id: flask-exception-callers
|
|
39
|
+
nl_query: "Where Flask handles uncaught exceptions"
|
|
40
|
+
anchor_qualified_suffix: "src/flask/app.py::flask.handle_exception"
|
|
41
|
+
traversal_pattern: callers_of
|
|
42
|
+
expected_neighbor_names: ["wsgi_app"]
|
|
43
|
+
k: 10
|
|
44
|
+
|
|
45
|
+
# Questions for the agent_baseline benchmark (pure-python grep top-k vs graph
|
|
46
|
+
# query). See docs/REPRODUCING.md for the methodology.
|
|
47
|
+
agent_questions:
|
|
48
|
+
- "How does dispatch_request route an incoming HTTP request"
|
|
49
|
+
- "Where is the AppContext pushed and popped"
|
|
50
|
+
- "How does create_logger configure application logging"
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
name: gin
|
|
2
|
+
url: https://github.com/gin-gonic/gin
|
|
3
|
+
# Pinned to the latest test_commit SHA so the snapshot is deterministic and
|
|
4
|
+
# every test_commit below is reachable as an ancestor.
|
|
5
|
+
commit: 5c00df8afadd06cc5be530dde00fe6d9fa4a2e4a
|
|
6
|
+
language: go
|
|
7
|
+
size_category: small
|
|
8
|
+
|
|
9
|
+
test_commits:
|
|
10
|
+
- sha: 052d1a79aafe3f04078a2716f8e77d4340308383
|
|
11
|
+
description: "feat(render): add PDF renderer and tests"
|
|
12
|
+
changed_files: 5
|
|
13
|
+
- sha: 472d086af2acd924cb4b9d7be0525f7d790f69bc
|
|
14
|
+
description: "fix(tree): panic in findCaseInsensitivePathRec with RedirectFixedPath"
|
|
15
|
+
changed_files: 2
|
|
16
|
+
- sha: 5c00df8afadd06cc5be530dde00fe6d9fa4a2e4a
|
|
17
|
+
description: "fix(render): write content length in Data.Render"
|
|
18
|
+
changed_files: 2
|
|
19
|
+
|
|
20
|
+
entry_points:
|
|
21
|
+
- "gin.go::Engine"
|
|
22
|
+
- "routergroup.go::RouterGroup"
|
|
23
|
+
|
|
24
|
+
search_queries:
|
|
25
|
+
- query: "Engine ServeHTTP"
|
|
26
|
+
expected: "gin.go::Engine"
|
|
27
|
+
- query: "Context request"
|
|
28
|
+
expected: "context.go::Context"
|
|
29
|
+
- query: "node tree"
|
|
30
|
+
expected: "tree.go::node"
|
|
31
|
+
|
|
32
|
+
multi_hop_tasks:
|
|
33
|
+
- id: gin-serve-http-callees
|
|
34
|
+
nl_query: "What does the gin engine do when serving an HTTP request"
|
|
35
|
+
anchor_qualified_suffix: "gin.go::engine.servehttp"
|
|
36
|
+
traversal_pattern: callees_of
|
|
37
|
+
expected_neighbor_names: ["reset"]
|
|
38
|
+
k: 10
|
|
39
|
+
- id: gin-context-next-callers
|
|
40
|
+
nl_query: "Who advances the gin middleware chain via Context.Next"
|
|
41
|
+
anchor_qualified_suffix: "context.go::context.next"
|
|
42
|
+
traversal_pattern: callers_of
|
|
43
|
+
expected_neighbor_names: ["handleHTTPRequest", "serveError"]
|
|
44
|
+
k: 10
|
|
45
|
+
|
|
46
|
+
# Questions for the agent_baseline benchmark (pure-python grep top-k vs graph
|
|
47
|
+
# query). See docs/REPRODUCING.md for the methodology.
|
|
48
|
+
agent_questions:
|
|
49
|
+
- "How does Engine.ServeHTTP route an incoming request"
|
|
50
|
+
- "Where does Context.Next advance the middleware chain"
|
|
51
|
+
- "How does the node tree match wildcard routes"
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
name: httpx
|
|
2
|
+
url: https://github.com/encode/httpx
|
|
3
|
+
# Pinned to the latest test_commit SHA so the snapshot is deterministic and
|
|
4
|
+
# every test_commit below is reachable as an ancestor.
|
|
5
|
+
commit: b55d4635701d9dc22928ee647880c76b078ba3f2
|
|
6
|
+
language: python
|
|
7
|
+
size_category: small
|
|
8
|
+
|
|
9
|
+
test_commits:
|
|
10
|
+
- sha: ae1b9f66238f75ced3ced5e4485408435de10768
|
|
11
|
+
description: "Expose FunctionAuth in __all__"
|
|
12
|
+
changed_files: 3
|
|
13
|
+
- sha: b55d4635701d9dc22928ee647880c76b078ba3f2
|
|
14
|
+
description: "Upgrade Python type checker mypy"
|
|
15
|
+
changed_files: 4
|
|
16
|
+
|
|
17
|
+
entry_points:
|
|
18
|
+
- "httpx/_client.py::Client"
|
|
19
|
+
- "httpx/_client.py::AsyncClient"
|
|
20
|
+
|
|
21
|
+
search_queries:
|
|
22
|
+
- query: "Client request"
|
|
23
|
+
expected: "httpx/_client.py::Client"
|
|
24
|
+
- query: "Response headers"
|
|
25
|
+
expected: "httpx/_models.py::Response"
|
|
26
|
+
- query: "BaseClient"
|
|
27
|
+
expected: "httpx/_client.py::BaseClient"
|
|
28
|
+
|
|
29
|
+
multi_hop_tasks:
|
|
30
|
+
- id: httpx-client-request-callers
|
|
31
|
+
nl_query: "Which HTTP verbs route through the httpx Client.request"
|
|
32
|
+
anchor_qualified_suffix: "httpx/_client.py::client.request"
|
|
33
|
+
traversal_pattern: callers_of
|
|
34
|
+
expected_neighbor_names: ["get", "options", "head", "post", "put", "patch"]
|
|
35
|
+
k: 10
|
|
36
|
+
- id: httpx-async-request-tests
|
|
37
|
+
nl_query: "Tests covering the httpx async client request method"
|
|
38
|
+
anchor_qualified_suffix: "httpx/_client.py::asyncclient.request"
|
|
39
|
+
traversal_pattern: callers_of
|
|
40
|
+
expected_neighbor_names: ["test_raise_for_status"]
|
|
41
|
+
k: 10
|
|
42
|
+
|
|
43
|
+
# Questions for the agent_baseline benchmark (pure-python grep top-k vs graph
|
|
44
|
+
# query). See docs/REPRODUCING.md for the methodology.
|
|
45
|
+
agent_questions:
|
|
46
|
+
- "How does Client.request send an HTTP request"
|
|
47
|
+
- "Where are Response headers parsed and decoded"
|
|
48
|
+
- "How does BaseClient build request URLs"
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
"""Markdown report generator for evaluation benchmark results.
|
|
2
|
+
|
|
3
|
+
Takes a list of benchmark result dicts and produces a formatted markdown table
|
|
4
|
+
suitable for inclusion in documentation or CI output.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import csv
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def generate_markdown_report(results: list[dict[str, Any]]) -> str:
    """Render benchmark result dicts as a markdown report.

    Every dict is expected to carry a ``benchmark`` key naming the benchmark;
    all remaining keys are treated as metrics (e.g. ``ratio``, ``mrr``,
    ``precision``, ``recall``, ``f1``). Metric columns appear in the order
    in which their keys are first seen across *results*; a metric missing
    from a given result is shown as ``-``.

    Args:
        results: List of result dicts from benchmark runs.

    Returns:
        Markdown text with a summary table followed by one detail section
        per result, or a short placeholder report when *results* is empty.
    """
    if not results:
        return "# Evaluation Report\n\nNo benchmark results to report.\n"

    # dict.fromkeys keeps first-seen order while dropping duplicate keys.
    metric_keys = list(
        dict.fromkeys(key for row in results for key in row if key != "benchmark")
    )

    def label(row: dict[str, Any]) -> Any:
        return row.get("benchmark", "unknown")

    summary = [
        "| Benchmark | " + " | ".join(metric_keys) + " |",
        "| --- | " + " | ".join(["---"] * len(metric_keys)) + " |",
    ]
    summary += [
        f"| {label(row)} | "
        + " | ".join(str(row.get(key, "-")) for key in metric_keys)
        + " |"
        for row in results
    ]

    details: list[str] = []
    for row in results:
        details += [f"### {label(row)}", ""]
        details += [f"- **{key}**: {row.get(key, '-')}" for key in metric_keys]
        details.append("")

    return "\n".join(
        [
            "# Evaluation Report",
            "",
            "## Summary",
            "",
            *summary,
            "",
            "## Details",
            "",
            *details,
        ]
    )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _read_csvs(results_dir: Path, prefix: str) -> list[dict[str, str]]:
|
|
75
|
+
"""Read all CSV files matching a prefix from the results directory."""
|
|
76
|
+
rows: list[dict[str, str]] = []
|
|
77
|
+
for p in sorted(results_dir.glob(f"*_{prefix}_*.csv")):
|
|
78
|
+
with open(p, newline="") as f:
|
|
79
|
+
reader = csv.DictReader(f)
|
|
80
|
+
rows.extend(reader)
|
|
81
|
+
return rows
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _md_table(headers: list[str], rows: list[list[str]]) -> str:
|
|
85
|
+
"""Build a markdown table from headers and rows."""
|
|
86
|
+
lines = []
|
|
87
|
+
lines.append("| " + " | ".join(headers) + " |")
|
|
88
|
+
lines.append("| " + " | ".join("---" for _ in headers) + " |")
|
|
89
|
+
for row in rows:
|
|
90
|
+
lines.append("| " + " | ".join(row) + " |")
|
|
91
|
+
return "\n".join(lines)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def generate_full_report(results_dir: str | Path) -> str:
    """Generate a full markdown evaluation report from CSV result files.

    Reads the CSV files in *results_dir* for each known benchmark type and
    produces a markdown report consisting of fixed methodology notes followed
    by one table per benchmark type that has at least one row. Table columns
    are taken from the first row of each benchmark type.

    Args:
        results_dir: Directory containing CSV result files.

    Returns:
        Markdown string with the full report. When no benchmark type has any
        rows, the methodology section is followed by a
        "No benchmark results found." line.
    """
    results_dir = Path(results_dir)
    lines: list[str] = [
        "# Evaluation Report",
        "",
        "## Methodology",
        "",
        "Benchmarks are run against real open-source repositories.",
        "Token counts use a consistent `len(text) // 4` approximation.",
        (
            "Impact accuracy reports two ground-truth modes: "
            "graph-derived (circular — upper bound) and co-change "
            "(files co-changed in the same commit, seed excluded)."
        ),
        (
            "Rows with `status=error` are kept for forensics but excluded "
            "from all aggregates."
        ),
        "",
    ]

    benchmark_types = [
        "token_efficiency",
        "impact_accuracy",
        "agent_baseline",
        "flow_completeness",
        "search_quality",
        "build_performance",
        "multi_hop_retrieval",
    ]

    # Count emitted sections explicitly. Comparing len(lines) against a
    # hard-coded preamble size goes stale whenever the methodology text
    # changes, which silently disables the "no results" notice.
    sections_written = 0
    for btype in benchmark_types:
        rows = _read_csvs(results_dir, btype)
        if not rows:
            continue
        sections_written += 1

        title = btype.replace("_", " ").title()
        lines.append(f"## {title}")
        lines.append("")

        headers = list(rows[0].keys())
        table_rows = []
        for r in rows:
            cells = []
            for h in headers:
                value = r.get(h, "-")
                # csv.DictReader fills cells missing from short rows with
                # None, which str.join cannot handle; render them as "-".
                cells.append("-" if value is None else value)
            table_rows.append(cells)
        lines.append(_md_table(headers, table_rows))
        lines.append("")

    if not sections_written:
        lines.append("No benchmark results found.")
        lines.append("")

    return "\n".join(lines)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def generate_readme_tables(results_dir: str | Path) -> str:
    """Render the compact benchmark tables intended for a README.

    A section is emitted only when matching CSV rows exist:

    - Token Efficiency
    - Accuracy & Quality (per repo: mean impact F1, flow recall, mean
      search reciprocal rank)
    - Agent Baseline (grep top-k vs graph query)
    - Performance

    Args:
        results_dir: Directory containing CSV result files.

    Returns:
        Markdown for the available tables, or a single
        "No benchmark results found." line when nothing could be rendered.
    """
    root = Path(results_dir)
    out: list[str] = []

    def add_section(title: str, columns: list[str], body: list[list[str]]) -> None:
        # Heading, blank line, table, blank line.
        out.extend([title, "", _md_table(columns, body), ""])

    def mean_or_dash(samples: list[float]) -> str:
        if not samples:
            return "-"
        return str(round(sum(samples) / len(samples), 3))

    def as_float(raw: Any) -> float | None:
        try:
            return float(raw)
        except (ValueError, TypeError):
            return None

    # Table A: Token Efficiency
    token_rows = _read_csvs(root, "token_efficiency")
    if token_rows:
        token_fields = (
            "repo",
            "changed_files",
            "naive_tokens",
            "standard_tokens",
            "graph_tokens",
            "naive_to_graph_ratio",
            "standard_to_graph_ratio",
        )
        add_section(
            "### Token Efficiency",
            [
                "Repo", "Files", "Naive Tokens", "Standard Tokens",
                "Graph Tokens", "Naive/Graph", "Std/Graph",
            ],
            [[row.get(field, "-") for field in token_fields] for row in token_rows],
        )

    # Table B: Accuracy & Quality
    impact_rows = _read_csvs(root, "impact_accuracy")
    flow_rows = _read_csvs(root, "flow_completeness")
    search_rows = _read_csvs(root, "search_quality")

    if impact_rows or flow_rows or search_rows:
        per_repo: dict[str, dict[str, object]] = {}
        f1_samples: dict[str, list[float]] = {}
        rr_samples: dict[str, list[float]] = {}

        for row in impact_rows:
            # Rows with a non-ok status stay in the CSV for forensics but
            # never feed a headline number.
            if row.get("status", "ok") not in ("", "ok"):
                continue
            # Only graph-derived ground truth counts here; other modes
            # (co-change) are a different metric.
            mode = row.get("ground_truth_mode", "")
            if mode and not mode.startswith("graph-derived"):
                continue
            repo = row.get("repo", "?")
            per_repo.setdefault(repo, {})
            score = as_float(row.get("f1", ""))
            if score is not None:
                f1_samples.setdefault(repo, []).append(score)

        for row in flow_rows:
            per_repo.setdefault(row.get("repo", "?"), {})["recall"] = row.get(
                "recall", "-"
            )

        for row in search_rows:
            repo = row.get("repo", "?")
            per_repo.setdefault(repo, {})
            rank = as_float(row.get("reciprocal_rank", 0))
            if rank is not None:
                rr_samples.setdefault(repo, []).append(rank)

        accuracy_body = [
            [
                repo,
                mean_or_dash(f1_samples.get(repo, [])),
                str(per_repo[repo].get("recall", "-")),
                mean_or_dash(rr_samples.get(repo, [])),
            ]
            for repo in sorted(per_repo)
        ]
        add_section(
            "### Accuracy & Quality",
            ["Repo", "Impact F1 (graph-derived)", "Flow Recall", "Search MRR"],
            accuracy_body,
        )

    # Table B2: Agent Baseline (grep top-k vs graph query)
    baseline_rows = _read_csvs(root, "agent_baseline")
    if baseline_rows:
        baseline_fields = (
            "repo",
            "question",
            "baseline_tokens",
            "graph_tokens",
            "baseline_to_graph_ratio",
        )
        add_section(
            "### Agent Baseline (grep top-k vs graph query)",
            [
                "Repo", "Question", "Baseline Tokens", "Graph Tokens",
                "Baseline/Graph", "Status",
            ],
            [
                # A missing or empty status is reported as "ok".
                [row.get(field, "-") for field in baseline_fields]
                + [row.get("status", "ok") or "ok"]
                for row in baseline_rows
            ],
        )

    # Table C: Performance
    perf_rows = _read_csvs(root, "build_performance")
    if perf_rows:
        perf_fields = (
            "repo",
            "file_count",
            "node_count",
            "flow_detection_seconds",
            "search_avg_ms",
        )
        add_section(
            "### Performance",
            ["Repo", "Files", "Nodes", "Flow Det. (s)", "Search (ms)"],
            [[row.get(field, "-") for field in perf_fields] for row in perf_rows],
        )

    return "\n".join(out) if out else "No benchmark results found.\n"
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"""Evaluation runner: orchestrates benchmark execution across repositories."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import csv
|
|
6
|
+
import logging
|
|
7
|
+
import subprocess
|
|
8
|
+
from datetime import date
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import yaml # type: ignore[import-untyped]
|
|
13
|
+
except ImportError:
|
|
14
|
+
yaml = None # type: ignore[assignment]
|
|
15
|
+
|
|
16
|
+
from code_review_graph.eval.benchmarks import (
|
|
17
|
+
agent_baseline,
|
|
18
|
+
build_performance,
|
|
19
|
+
flow_completeness,
|
|
20
|
+
impact_accuracy,
|
|
21
|
+
multi_hop_retrieval,
|
|
22
|
+
search_quality,
|
|
23
|
+
token_efficiency,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
logger = logging.getLogger(__name__)

# Benchmark name -> callable that runs it. Insertion order matters: it is the
# default execution order when run_eval is called without an explicit
# benchmark list.
BENCHMARK_REGISTRY = dict(
    token_efficiency=token_efficiency.run,
    impact_accuracy=impact_accuracy.run,
    flow_completeness=flow_completeness.run,
    search_quality=search_quality.run,
    build_performance=build_performance.run,
    multi_hop_retrieval=multi_hop_retrieval.run,
    agent_baseline=agent_baseline.run,
)

# Directory of per-repository YAML configs that sits next to this module.
CONFIGS_DIR = Path(__file__).with_name("configs")
# Default locations (relative to the current working directory) for CSV
# output and cloned repositories.
DEFAULT_OUTPUT = Path("evaluate", "results")
DEFAULT_REPOS = Path("evaluate", "test_repos")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _require_yaml():
    """Raise ``ImportError`` if the optional ``yaml`` import did not succeed."""
    if yaml is not None:
        return
    raise ImportError("pyyaml is required: pip install code-review-graph[eval]")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def load_config(name: str) -> dict:
    """Load a single benchmark config by name.

    Args:
        name: Config name without extension; resolved to
            ``CONFIGS_DIR / "<name>.yaml"``.

    Returns:
        The parsed YAML document.

    Raises:
        ImportError: If PyYAML is not installed.
        OSError: If the config file cannot be opened.
    """
    _require_yaml()
    path = CONFIGS_DIR / f"{name}.yaml"
    # Decode explicitly as UTF-8: the bundled configs contain non-ASCII
    # characters, and open()'s default encoding is locale-dependent.
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def load_all_configs() -> list[dict]:
    """Load all benchmark configs from the configs directory.

    Returns:
        One parsed YAML document per ``*.yaml`` file in ``CONFIGS_DIR``,
        in sorted path order.

    Raises:
        ImportError: If PyYAML is not installed.
    """
    _require_yaml()
    configs = []
    for p in sorted(CONFIGS_DIR.glob("*.yaml")):
        # Decode explicitly as UTF-8: the bundled configs contain non-ASCII
        # characters, and open()'s default encoding is locale-dependent.
        with open(p, encoding="utf-8") as f:
            configs.append(yaml.safe_load(f))
    return configs
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def clone_or_update(config: dict, repos_dir: Path | None = None) -> Path:
    """Ensure a checkout of the configured repository at its pinned commit.

    An existing checkout under *repos_dir* is refreshed with
    ``git fetch --all --tags``; otherwise the repository is cloned in full
    (no ``--depth``), because the pinned ``test_commits`` are often older
    than any reasonable shallow-clone window and a missing SHA used to
    silently fall back to ``git diff HEAD~1 HEAD``. When the config names a
    ``commit`` other than ``HEAD``, that commit is then checked out.

    Args:
        config: Repo config; ``name`` and ``url`` are read, ``commit`` is
            optional and defaults to ``"HEAD"``.
        repos_dir: Parent directory for checkouts; ``DEFAULT_REPOS`` when
            omitted.

    Returns:
        Path of the checkout (``repos_dir / config["name"]``).

    Raises:
        RuntimeError: If any git command exits with a non-zero status.
    """

    def _git(args: list[str], failure: str, cwd: str | None = None) -> None:
        # Run one git command and turn a non-zero exit into RuntimeError.
        outcome = subprocess.run(
            ["git", *args],
            cwd=cwd,
            capture_output=True,
            text=True,
        )
        if outcome.returncode != 0:
            raise RuntimeError(f"{failure}: {outcome.stderr.strip()}")

    repos_dir = repos_dir or DEFAULT_REPOS
    repos_dir.mkdir(parents=True, exist_ok=True)
    repo_path = repos_dir / config["name"]

    if not repo_path.exists():
        _git(
            ["clone", config["url"], str(repo_path)],
            f"git clone failed for {config['url']}",
        )
    else:
        _git(
            ["fetch", "--all", "--tags"],
            f"git fetch failed in {repo_path}",
            cwd=str(repo_path),
        )

    commit = config.get("commit", "HEAD")
    if commit != "HEAD":
        _git(
            ["checkout", commit],
            f"git checkout {commit} failed in {repo_path}",
            cwd=str(repo_path),
        )

    return repo_path
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def write_csv(results: list[dict], path: Path) -> None:
    """Write benchmark results to a CSV file.

    Nothing is written (and no file is created) when *results* is empty.
    Parent directories of *path* are created as needed.

    The header is the union of all keys across *results*, in first-seen
    order, so rows that carry extra keys (for example an error detail only
    present on failed rows) no longer make ``csv.DictWriter`` raise
    ``ValueError``. Cells for keys a row lacks are left empty. For rows that
    all share the same keys the output is unchanged.

    Args:
        results: Result dicts, one per CSV row.
        path: Destination file; overwritten if it exists.
    """
    if not results:
        return
    path.parent.mkdir(parents=True, exist_ok=True)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    fieldnames = list(dict.fromkeys(key for row in results for key in row))
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval="")
        writer.writeheader()
        writer.writerows(results)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def run_eval(
    repos: list[str] | None = None,
    benchmarks: list[str] | None = None,
    output_dir: str | Path | None = None,
) -> dict[str, list[dict]]:
    """Run evaluation benchmarks across repositories.

    For each repo config: clone or update the repository, build its graph,
    run postprocessing, then run each requested benchmark and write its
    results to ``<output_dir>/<repo>_<benchmark>_<today>.csv``. A benchmark
    that raises is logged and recorded with an empty result list; the
    remaining benchmarks still run. Unknown benchmark names are skipped with
    a warning.

    Args:
        repos: List of repo config names to evaluate (None = all).
        benchmarks: List of benchmark names to run (None = all).
        output_dir: Directory for CSV output files.

    Returns:
        Dict mapping ``{repo}_{benchmark}`` to list of result dicts.
    """
    output_dir = Path(output_dir) if output_dir else DEFAULT_OUTPUT
    output_dir.mkdir(parents=True, exist_ok=True)

    configs = [load_config(r) for r in repos] if repos else load_all_configs()

    benchmark_names = benchmarks or list(BENCHMARK_REGISTRY.keys())
    all_results: dict[str, list[dict]] = {}
    today = date.today().isoformat()

    # Kept function-local as in the original, but imported once rather than
    # on every loop iteration.
    from code_review_graph.graph import GraphStore
    from code_review_graph.incremental import full_build, get_db_path
    from code_review_graph.postprocessing import run_post_processing

    for config in configs:
        name = config["name"]
        logger.info("Evaluating %s...", name)

        # Resolve the repo path to an absolute Path before handing it to
        # full_build / get_db_path so the stored qualified_names match what
        # the CLI/MCP layer produces (those paths go through _get_store ->
        # _validate_repo_root which .resolve()s). Without this, a later
        # ``code-review-graph update --repo <relative>`` writes the same
        # function under a new absolute-prefixed qualified_name, leaving the
        # graph with duplicate nodes for the same source location.
        repo_path = clone_or_update(config).resolve()

        # Build graph
        db_path = get_db_path(repo_path)
        store = GraphStore(db_path)

        # Close the store even when the build or postprocessing raises, so a
        # failing repository does not leak the open store.
        try:
            full_build(repo_path, store)
            # full_build is the parsing-only primitive; the higher-level
            # CLI/MCP wrappers run postprocessing on top. The eval framework
            # bypasses those, so call it directly here. Without this, FTS5
            # stays empty and downstream benchmarks (token_efficiency,
            # search_quality) silently produce useless results.
            # See: search.rebuild_fts_index.
            pp_result = run_post_processing(store)
            for warning in pp_result.get("warnings", []):
                logger.warning("  postprocessing: %s", warning)

            for bench_name in benchmark_names:
                if bench_name not in BENCHMARK_REGISTRY:
                    logger.warning("Unknown benchmark: %s", bench_name)
                    continue

                logger.info("  Running %s...", bench_name)
                key = f"{name}_{bench_name}"
                try:
                    bench_fn = BENCHMARK_REGISTRY[bench_name]
                    results = bench_fn(repo_path, store, config)

                    all_results[key] = results
                    write_csv(results, output_dir / f"{key}_{today}.csv")
                    logger.info("  %s: %d result(s)", bench_name, len(results))
                except Exception as e:
                    # Top-level boundary: one failing benchmark must not
                    # abort the rest. logger.exception keeps the traceback.
                    logger.exception("  %s failed: %s", bench_name, e)
                    all_results[key] = []
        finally:
            store.close()

    return all_results
|