code-review-graph-codeblackwell 2.3.6.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- code_review_graph/__init__.py +20 -0
- code_review_graph/__main__.py +4 -0
- code_review_graph/analysis.py +410 -0
- code_review_graph/changes.py +409 -0
- code_review_graph/cli.py +1255 -0
- code_review_graph/communities.py +874 -0
- code_review_graph/constants.py +23 -0
- code_review_graph/context_savings.py +317 -0
- code_review_graph/custom_languages.py +322 -0
- code_review_graph/daemon.py +1009 -0
- code_review_graph/daemon_cli.py +320 -0
- code_review_graph/docs/LLM-OPTIMIZED-REFERENCE.md +71 -0
- code_review_graph/embeddings.py +1006 -0
- code_review_graph/enrich.py +303 -0
- code_review_graph/eval/__init__.py +33 -0
- code_review_graph/eval/benchmarks/__init__.py +1 -0
- code_review_graph/eval/benchmarks/agent_baseline.py +193 -0
- code_review_graph/eval/benchmarks/build_performance.py +60 -0
- code_review_graph/eval/benchmarks/flow_completeness.py +36 -0
- code_review_graph/eval/benchmarks/impact_accuracy.py +220 -0
- code_review_graph/eval/benchmarks/multi_hop_retrieval.py +125 -0
- code_review_graph/eval/benchmarks/search_quality.py +59 -0
- code_review_graph/eval/benchmarks/token_efficiency.py +143 -0
- code_review_graph/eval/configs/code-review-graph.yaml +50 -0
- code_review_graph/eval/configs/express.yaml +45 -0
- code_review_graph/eval/configs/fastapi.yaml +48 -0
- code_review_graph/eval/configs/flask.yaml +50 -0
- code_review_graph/eval/configs/gin.yaml +51 -0
- code_review_graph/eval/configs/httpx.yaml +48 -0
- code_review_graph/eval/reporter.py +301 -0
- code_review_graph/eval/runner.py +211 -0
- code_review_graph/eval/scorer.py +85 -0
- code_review_graph/eval/token_benchmark.py +182 -0
- code_review_graph/exports.py +409 -0
- code_review_graph/flows.py +698 -0
- code_review_graph/graph.py +1427 -0
- code_review_graph/graph_diff.py +122 -0
- code_review_graph/hints.py +384 -0
- code_review_graph/incremental.py +1245 -0
- code_review_graph/jedi_resolver.py +303 -0
- code_review_graph/main.py +1079 -0
- code_review_graph/memory.py +142 -0
- code_review_graph/migrations.py +284 -0
- code_review_graph/parser.py +6957 -0
- code_review_graph/postprocessing.py +134 -0
- code_review_graph/prompts.py +159 -0
- code_review_graph/refactor.py +852 -0
- code_review_graph/registry.py +319 -0
- code_review_graph/rescript_resolver.py +206 -0
- code_review_graph/search.py +447 -0
- code_review_graph/skills.py +1481 -0
- code_review_graph/spring_resolver.py +200 -0
- code_review_graph/temporal_resolver.py +199 -0
- code_review_graph/token_benchmark.py +125 -0
- code_review_graph/tools/__init__.py +156 -0
- code_review_graph/tools/_common.py +176 -0
- code_review_graph/tools/analysis_tools.py +184 -0
- code_review_graph/tools/build.py +541 -0
- code_review_graph/tools/community_tools.py +246 -0
- code_review_graph/tools/context.py +152 -0
- code_review_graph/tools/docs.py +274 -0
- code_review_graph/tools/flows_tools.py +176 -0
- code_review_graph/tools/query.py +692 -0
- code_review_graph/tools/refactor_tools.py +168 -0
- code_review_graph/tools/registry_tools.py +125 -0
- code_review_graph/tools/review.py +477 -0
- code_review_graph/tsconfig_resolver.py +257 -0
- code_review_graph/visualization.py +2184 -0
- code_review_graph/wiki.py +305 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/METADATA +718 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/RECORD +74 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/WHEEL +4 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/entry_points.txt +3 -0
- code_review_graph_codeblackwell-2.3.6.post1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
"""Impact accuracy benchmark: measures precision/recall of change impact analysis.
|
|
2
|
+
|
|
3
|
+
Two ground-truth modes are emitted side by side (``ground_truth_mode`` column):
|
|
4
|
+
|
|
5
|
+
- **graph-derived (circular — upper bound)** — the historical mode. Ground
|
|
6
|
+
truth is the changed files plus files with CALLS/IMPORTS_FROM edges into
|
|
7
|
+
them, i.e. derived from the same graph the predictor traverses. Recall in
|
|
8
|
+
this mode is an upper bound by construction, not independent evidence.
|
|
9
|
+
- **co-change (same commit, seed excluded)** — the honest mode. The predictor
|
|
10
|
+
is seeded with a single changed file and graded against the *other* files
|
|
11
|
+
the author actually touched in the same commit. The ground truth comes from
|
|
12
|
+
git history, not from the graph.
|
|
13
|
+
|
|
14
|
+
Failure semantics: if ``analyze_changes`` throws, the row is recorded with
|
|
15
|
+
``status="error"`` and empty metric fields — it stays in the CSV but is
|
|
16
|
+
excluded from aggregates. (Previously a failure silently set
|
|
17
|
+
``predicted = set(changed)``, guaranteeing a fake recall of 1.0.)
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import logging
|
|
23
|
+
import statistics
|
|
24
|
+
import subprocess
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
MODE_GRAPH_DERIVED = "graph-derived (circular — upper bound)"
|
|
30
|
+
MODE_CO_CHANGE = "co-change (same commit, seed excluded)"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _get_changed_files(repo_path: Path, sha: str) -> list[str]:
    """Return the file paths that ``git diff --name-only`` reports for *sha*.

    The commit is diffed against ``<sha>~1``. When that command exits with a
    non-zero status, the diff of ``HEAD~1..HEAD`` is used instead, so in that
    case the returned paths describe HEAD rather than *sha*. The fallback's
    exit status is not checked.

    Args:
        repo_path: Directory in which git is executed.
        sha: Commit whose changed files are wanted.

    Returns:
        Stripped, non-empty output lines; an empty list when git printed
        nothing.
    """
    def _name_only(old: str, new: str) -> subprocess.CompletedProcess:
        # Both attempts share every option except the revision pair.
        return subprocess.run(
            ["git", "diff", "--name-only", old, new],
            cwd=str(repo_path),
            capture_output=True,
            text=True,
        )

    proc = _name_only(f"{sha}~1", sha)
    if proc.returncode != 0:
        proc = _name_only("HEAD~1", "HEAD")

    stripped = (line.strip() for line in proc.stdout.strip().splitlines())
    return [path for path in stripped if path]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _files_from_analysis(analysis: dict) -> set[str]:
|
|
52
|
+
"""Extract predicted file paths from an ``analyze_changes`` result."""
|
|
53
|
+
predicted: set[str] = set()
|
|
54
|
+
for f in analysis.get("changed_functions", []):
|
|
55
|
+
if isinstance(f, dict) and "file_path" in f:
|
|
56
|
+
predicted.add(f["file_path"])
|
|
57
|
+
elif isinstance(f, dict) and "file" in f:
|
|
58
|
+
predicted.add(f["file"])
|
|
59
|
+
for flow in analysis.get("affected_flows", []):
|
|
60
|
+
if isinstance(flow, dict):
|
|
61
|
+
for node in flow.get("nodes", []):
|
|
62
|
+
if isinstance(node, dict) and "file_path" in node:
|
|
63
|
+
predicted.add(node["file_path"])
|
|
64
|
+
return predicted
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _graph_neighbor_files(store, files: list[str]) -> set[str]:
|
|
68
|
+
"""Files with CALLS/IMPORTS_FROM edges into any node of *files* (one hop)."""
|
|
69
|
+
out: set[str] = set()
|
|
70
|
+
for f in files:
|
|
71
|
+
for node in store.get_nodes_by_file(f):
|
|
72
|
+
for edge in store.get_edges_by_target(node.qualified_name):
|
|
73
|
+
if edge.kind in ("CALLS", "IMPORTS_FROM"):
|
|
74
|
+
src_qual = edge.source_qualified
|
|
75
|
+
src_file = src_qual.split("::")[0] if "::" in src_qual else ""
|
|
76
|
+
if src_file:
|
|
77
|
+
out.add(src_file)
|
|
78
|
+
return out
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _base_row(repo: str, sha: str, mode: str, seed: str) -> dict:
|
|
82
|
+
return {
|
|
83
|
+
"repo": repo,
|
|
84
|
+
"commit": sha,
|
|
85
|
+
"ground_truth_mode": mode,
|
|
86
|
+
"seed_file": seed,
|
|
87
|
+
"predicted_files": "",
|
|
88
|
+
"actual_files": "",
|
|
89
|
+
"true_positives": "",
|
|
90
|
+
"precision": "",
|
|
91
|
+
"recall": "",
|
|
92
|
+
"f1": "",
|
|
93
|
+
"status": "ok",
|
|
94
|
+
"error": "",
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _scored_row(
    repo: str, sha: str, mode: str, seed: str,
    predicted: set[str], actual: set[str],
) -> dict:
    """Build an ``ok`` row carrying precision/recall/F1 for one prediction.

    True positives are the files present in both *predicted* and *actual*.
    Denominators are clamped to at least 1 (and the F1 denominator to at
    least 0.001) so empty sets yield 0 rather than a division error. The
    three ratios are rounded to three decimals.
    """
    hits = len(predicted & actual)
    n_predicted = len(predicted)
    n_actual = len(actual)

    precision = hits / max(n_predicted, 1)
    recall = hits / max(n_actual, 1)
    f1 = 2 * precision * recall / max(precision + recall, 0.001)

    scored = _base_row(repo, sha, mode, seed)
    scored["predicted_files"] = n_predicted
    scored["actual_files"] = n_actual
    scored["true_positives"] = hits
    scored["precision"] = round(precision, 3)
    scored["recall"] = round(recall, 3)
    scored["f1"] = round(f1, 3)
    return scored
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _error_row(repo: str, sha: str, mode: str, seed: str, exc: Exception) -> dict:
    """Build a ``status="error"`` row for a failed measurement.

    Metric columns keep the blank values from ``_base_row``; ``error`` holds
    the first 200 characters of ``str(exc)``.
    """
    failed = _base_row(repo, sha, mode, seed)
    failed.update(status="error", error=str(exc)[:200])
    return failed
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def run(repo_path: Path, store, config: dict) -> list[dict]:
    """Score change-impact predictions for every configured test commit.

    Each entry of ``config["test_commits"]`` whose diff is non-empty produces
    up to two rows, one per ground-truth mode:

    * ``MODE_GRAPH_DERIVED`` — ``analyze_changes`` receives all changed files;
      ground truth is the changed files plus their one-hop graph neighbours.
    * ``MODE_CO_CHANGE`` — ``analyze_changes`` receives only the seed (the
      lexicographically smallest changed path); ground truth is every other
      file of the commit, and the seed is removed from the prediction.

    If ``analyze_changes`` raises, a ``status="error"`` row is recorded for
    that mode. A single-file commit gets a ``status="skipped"`` co-change row.

    Args:
        repo_path: Repository checkout used for git and ``analyze_changes``.
        store: Graph store passed to ``analyze_changes`` and queried for
            neighbouring files.
        config: Benchmark config; ``name`` is required, ``test_commits`` is
            optional.

    Returns:
        The rows in the order they were produced.
    """
    from code_review_graph.changes import analyze_changes

    rows: list[dict] = []
    repo = config["name"]

    for commit in config.get("test_commits", []):
        sha = commit["sha"]
        changed = _get_changed_files(repo_path, sha)
        if not changed:
            continue
        touched = set(changed)

        # --- Mode 1: graph-derived ground truth (circular — upper bound) ---
        full_analysis = None
        try:
            full_analysis = analyze_changes(
                store, changed, repo_root=str(repo_path), base=sha + "~1",
            )
        except Exception as exc:
            # A failed run must not be scored: falling back to
            # predicted = changed would guarantee recall 1.0.
            logger.warning("analyze_changes failed on %s: %s", sha, exc)
            rows.append(_error_row(repo, sha, MODE_GRAPH_DERIVED, "", exc))

        if full_analysis is not None:
            graph_predicted = touched | _files_from_analysis(full_analysis)
            graph_actual = touched | _graph_neighbor_files(store, changed)
            rows.append(
                _scored_row(
                    repo, sha, MODE_GRAPH_DERIVED, "",
                    graph_predicted, graph_actual,
                )
            )

        # --- Mode 2: co-change ground truth (honest) ---
        # The seed analysis deliberately gets no repo_root/diff, so it only
        # ever sees the seed file and never the full commit diff.
        seed = min(changed)
        co_actual = touched - {seed}
        if not co_actual:
            skipped = _base_row(repo, sha, MODE_CO_CHANGE, seed)
            skipped.update(
                status="skipped",
                error="single-file commit: no co-changed files to grade against",
            )
            rows.append(skipped)
            continue

        try:
            seed_analysis = analyze_changes(store, [seed])
        except Exception as exc:
            logger.warning("analyze_changes (seed=%s) failed on %s: %s", seed, sha, exc)
            rows.append(_error_row(repo, sha, MODE_CO_CHANGE, seed, exc))
            continue

        co_predicted = (
            _files_from_analysis(seed_analysis)
            | _graph_neighbor_files(store, [seed])
        )
        co_predicted.discard(seed)
        rows.append(
            _scored_row(repo, sha, MODE_CO_CHANGE, seed, co_predicted, co_actual)
        )

    return rows
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def aggregate(results: list[dict]) -> dict:
    """Summarise impact-accuracy rows as per-mode means.

    Only rows with ``status == "ok"`` feed the means; error and skipped rows
    are counted but contribute to no number.

    Returns:
        A dict with ``total_rows``, ``error_rows`` and ``skipped_rows``, plus
        a ``graph_derived`` and a ``co_change`` entry. Each of those holds
        ``ok_rows`` and the mean precision/recall/F1 rounded to three
        decimals, or ``None`` when the mode has no ``ok`` row.
    """
    def _mean_of(rows: list[dict], column: str):
        # No successful rows means there is nothing to average.
        if not rows:
            return None
        return round(statistics.mean(float(row[column]) for row in rows), 3)

    statuses = [row.get("status") for row in results]
    summary: dict = {
        "total_rows": len(results),
        "error_rows": statuses.count("error"),
        "skipped_rows": statuses.count("skipped"),
    }

    modes = (
        ("graph_derived", MODE_GRAPH_DERIVED),
        ("co_change", MODE_CO_CHANGE),
    )
    for key, mode in modes:
        ok_rows = [
            row for row in results
            if row.get("status") == "ok" and row.get("ground_truth_mode") == mode
        ]
        summary[key] = {
            "ok_rows": len(ok_rows),
            "mean_precision": _mean_of(ok_rows, "precision"),
            "mean_recall": _mean_of(ok_rows, "recall"),
            "mean_f1": _mean_of(ok_rows, "f1"),
        }

    return summary
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Multi-hop retrieval benchmark.
|
|
2
|
+
|
|
3
|
+
Tests a two-step tool chain that mimics how an LLM agent actually uses the
|
|
4
|
+
graph for complex tasks:
|
|
5
|
+
|
|
6
|
+
1. ``hybrid_search(nl_query)`` to find a starting anchor from a natural-
|
|
7
|
+
language question.
|
|
8
|
+
2. ``query_graph(pattern, target=anchor)`` to traverse one hop along the
|
|
9
|
+
requested edge kind (callers_of / callees_of / tests_for / ...).
|
|
10
|
+
|
|
11
|
+
For each task the benchmark records:
|
|
12
|
+
|
|
13
|
+
- ``anchor_found`` — did semantic search return a node whose qualified_name
|
|
14
|
+
ends with the expected suffix in the top-K?
|
|
15
|
+
- ``anchor_rank`` — index in the search result list (lower is better).
|
|
16
|
+
- ``neighbor_count`` — number of neighbors returned by the traversal.
|
|
17
|
+
- ``neighbor_recall`` — fraction of ``expected_neighbor_names`` that appear
|
|
18
|
+
among the neighbor names.
|
|
19
|
+
- ``score`` — ``int(anchor_found) * neighbor_recall``. Range 0–1.
|
|
20
|
+
|
|
21
|
+
Tasks are defined per-config under ``multi_hop_tasks:`` in
|
|
22
|
+
``code_review_graph/eval/configs/*.yaml``. See
|
|
23
|
+
``docs/REPRODUCING.md`` for the schema and the curated canonical task set.
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import logging
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
from typing import Any
|
|
31
|
+
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _name_set(rows: list[dict[str, Any]]) -> set[str]:
|
|
36
|
+
out: set[str] = set()
|
|
37
|
+
for r in rows:
|
|
38
|
+
name = (r.get("name") or "").lower()
|
|
39
|
+
if name:
|
|
40
|
+
out.add(name)
|
|
41
|
+
return out
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def run(repo_path: Path, store, config: dict) -> list[dict]:
    """Run every ``multi_hop_tasks`` entry of *config* and return one row each.

    A task is a two-step chain: ``hybrid_search`` looks for an anchor whose
    lower-cased ``qualified_name`` ends with the task's
    ``anchor_qualified_suffix``, then ``query_graph`` traverses from that
    anchor using the task's ``traversal_pattern`` (default ``callers_of``).
    ``neighbor_recall`` is the share of ``expected_neighbor_names`` found
    among the returned neighbour names; ``score`` equals it when the anchor
    was found and is 0.0 otherwise.

    A failure of either tool is logged and treated as an empty result, so a
    single bad task never aborts the run.
    """
    # Imports stay local so an import-time failure in one optional benchmark
    # does not poison the whole runner.
    from code_review_graph.search import hybrid_search
    from code_review_graph.tools.query import query_graph

    repo_root = str(repo_path)
    results: list[dict] = []

    for task in config.get("multi_hop_tasks", []):
        task_id = task["id"]
        nl_query = task["nl_query"]
        suffix = task["anchor_qualified_suffix"].lower()
        traversal = task.get("traversal_pattern", "callers_of")
        expected = [e.lower() for e in task.get("expected_neighbor_names", [])]
        k = int(task.get("k", 10))

        # Step 1 — semantic search
        try:
            hits = hybrid_search(store, nl_query, limit=k)
        except Exception as exc:  # noqa: BLE001 — benchmark must not abort the runner
            logger.warning("hybrid_search failed on %s: %s", task_id, exc)
            hits = []

        # First hit whose qualified name carries the expected suffix, if any.
        located = next(
            (
                (position, hit)
                for position, hit in enumerate(hits)
                if (hit.get("qualified_name") or "").lower().endswith(suffix)
            ),
            None,
        )

        if located is None:
            anchor_rank = -1
            neighbor_count = 0
            matched = 0
            recall = 0.0
        else:
            anchor_rank, anchor = located

            # Step 2 — single-hop graph traversal from the anchor
            try:
                trav = query_graph(
                    pattern=traversal,
                    target=anchor["qualified_name"],
                    repo_root=repo_root,
                    detail_level="standard",
                )
            except Exception as exc:  # noqa: BLE001
                logger.warning(
                    "query_graph(%s) failed on %s: %s", traversal, task_id, exc,
                )
                trav = {}

            neighbors = trav.get("data") or trav.get("results") or []
            names = _name_set(neighbors)
            matched = len([e for e in expected if e in names])
            recall = matched / len(expected) if expected else 0.0
            neighbor_count = len(neighbors)

        results.append({
            "repo": config["name"],
            "task_id": task_id,
            "nl_query": nl_query,
            "anchor_found": located is not None,
            "anchor_rank": anchor_rank,
            "neighbor_count": neighbor_count,
            "expected_count": len(expected),
            "matched_count": matched,
            "neighbor_recall": round(recall, 3),
            "score": round(recall, 3),
        })

    return results
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Search quality benchmark: measures search result ranking via MRR."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
import sqlite3
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def run(repo_path: Path, store, config: dict) -> list[dict]:
    """Run the search quality benchmark and return one row per query.

    For every entry of ``config["search_queries"]`` the top 20 results of
    ``hybrid_search`` are scanned for the first one matching ``expected``.
    When ``hybrid_search`` cannot be imported or raises
    ``sqlite3.OperationalError``, ``store.search_nodes`` is used instead.

    A result matches (case-insensitively) when ``expected`` is a substring of
    its qualified name, its qualified name is a substring of ``expected``, or
    the parts after the last ``::`` are equal. Results without a qualified
    name never match: an empty string is a substring of every expectation and
    would otherwise be scored as a hit at its position.

    Args:
        repo_path: Unused; kept for the common benchmark signature.
        store: Graph store passed to ``hybrid_search`` / used for the fallback.
        config: Benchmark config; ``name`` is required once a query is run.

    Returns:
        Rows with ``repo``, ``query``, ``expected``, ``rank`` (1-based, 0 when
        nothing matched) and ``reciprocal_rank`` rounded to three decimals.
    """
    results = []
    for sq in config.get("search_queries", []):
        query = sq["query"]
        expected = sq["expected"]

        try:
            from code_review_graph.search import hybrid_search
            search_results = hybrid_search(store, query, limit=20)
        except (ImportError, sqlite3.OperationalError) as exc:
            logger.debug("hybrid_search unavailable, using fallback: %s", exc)
            # Fallback to basic search
            search_results = [
                {"qualified_name": n.qualified_name}
                for n in store.search_nodes(query, limit=20)
            ]

        # Loop-invariant: normalise the expectation once per query.
        exp_lower = expected.lower()
        exp_name = exp_lower.rsplit("::", 1)[-1]

        rank = 0
        for position, hit in enumerate(search_results, start=1):
            if isinstance(hit, dict):
                qn = hit.get("qualified_name") or ""
            else:
                qn = getattr(hit, "qualified_name", None) or ""
            qn_lower = qn.lower()
            if not qn_lower:
                # A nameless hit still occupies its position in the ranking
                # but can never be the expected node.
                continue
            qn_name = qn_lower.rsplit("::", 1)[-1]
            if (
                exp_lower in qn_lower
                or qn_lower in exp_lower
                or exp_name == qn_name
            ):
                rank = position
                break

        results.append({
            "repo": config["name"],
            "query": query,
            "expected": expected,
            "rank": rank,
            "reciprocal_rank": round(1.0 / rank if rank > 0 else 0.0, 3),
        })
    return results
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Token efficiency benchmark: compares naive, standard, and graph-based token counts.
|
|
2
|
+
|
|
3
|
+
Failure semantics: if ``get_review_context`` throws, the row is recorded with
|
|
4
|
+
``status="error"`` and empty metric fields. It stays in the CSV for forensics
|
|
5
|
+
but is excluded from every aggregate — a failed tool call is not a
|
|
6
|
+
measurement. (Previously a failure silently produced ``graph_tokens=0`` and
|
|
7
|
+
``ratio = naive / 1``, inflating the results.)
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import logging
|
|
14
|
+
import statistics
|
|
15
|
+
import subprocess
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _count_tokens(text: str) -> int:
|
|
22
|
+
"""Approximate token count (1 token ~ 4 chars)."""
|
|
23
|
+
return len(text) // 4
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _get_changed_files(repo_path: Path, sha: str) -> list[str]:
    """List the paths ``git diff --name-only`` reports for commit *sha*.

    ``<sha>~1..<sha>`` is tried first. If git exits non-zero, the diff of
    ``HEAD~1..HEAD`` is taken instead and its output is used whatever its
    exit status, so the result then describes HEAD rather than *sha*.

    Returns:
        Stripped, non-empty output lines; an empty list when git printed
        nothing.
    """
    revision_pairs = ((f"{sha}~1", sha), ("HEAD~1", "HEAD"))
    for older, newer in revision_pairs:
        completed = subprocess.run(
            ["git", "diff", "--name-only", older, newer],
            cwd=str(repo_path),
            capture_output=True,
            text=True,
        )
        if completed.returncode == 0:
            break

    names = []
    for raw_line in completed.stdout.strip().splitlines():
        name = raw_line.strip()
        if name:
            names.append(name)
    return names
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _count_file_tokens(repo_path: Path, files: list[str]) -> int:
|
|
46
|
+
"""Count tokens from full file contents (naive approach)."""
|
|
47
|
+
total = 0
|
|
48
|
+
for f in files:
|
|
49
|
+
fp = repo_path / f
|
|
50
|
+
if fp.is_file():
|
|
51
|
+
try:
|
|
52
|
+
total += _count_tokens(fp.read_text(encoding="utf-8", errors="replace"))
|
|
53
|
+
except OSError:
|
|
54
|
+
pass
|
|
55
|
+
return total
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _count_diff_tokens(repo_path: Path, sha: str) -> int:
    """Estimate the tokens in the ``git diff`` text for *sha* (standard approach).

    The diff of ``<sha>~1..<sha>`` is used. If git exits non-zero, the diff
    of ``HEAD~1..HEAD`` is counted instead, whatever its exit status.
    """
    repo_dir = str(repo_path)
    diff = subprocess.run(
        ["git", "diff", f"{sha}~1", sha],
        cwd=repo_dir,
        capture_output=True,
        text=True,
    )
    if diff.returncode:
        diff = subprocess.run(
            ["git", "diff", "HEAD~1", "HEAD"],
            cwd=repo_dir,
            capture_output=True,
            text=True,
        )
    patch_text = diff.stdout
    return _count_tokens(patch_text)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def run(repo_path: Path, store, config: dict) -> list[dict]:
    """Run the token efficiency benchmark and return one row per test commit.

    For every entry of ``config["test_commits"]`` with a non-empty diff, three
    token estimates are compared: the full contents of the changed files
    (naive), the ``git diff`` text (standard) and the JSON-serialised result
    of ``get_review_context`` (graph).

    A row is recorded with ``status="error"`` and blank graph metrics when
    ``get_review_context`` cannot be imported or raises, when its result
    cannot be serialised, or when the serialised result is so short that it
    counts as zero tokens. The last case matters because dividing by a
    clamped denominator of 1 would report ``naive / 1`` as the ratio — an
    inflated number that is not a measurement.

    Args:
        repo_path: Repository checkout used for git and for reading files.
        store: Unused; kept for the common benchmark signature.
        config: Benchmark config; ``name`` is required once a commit is run.

    Returns:
        The rows in commit order.
    """
    results = []
    for tc in config.get("test_commits", []):
        sha = tc["sha"]
        changed = _get_changed_files(repo_path, sha)
        if not changed:
            continue

        naive_tokens = _count_file_tokens(repo_path, changed)
        standard_tokens = _count_diff_tokens(repo_path, sha)

        row: dict = {
            "repo": config["name"],
            "commit": sha,
            "description": tc.get("description", ""),
            "changed_files": len(changed),
            "naive_tokens": naive_tokens,
            "standard_tokens": standard_tokens,
            "graph_tokens": "",
            "naive_to_graph_ratio": "",
            "standard_to_graph_ratio": "",
            "status": "ok",
            "error": "",
        }

        # Graph-based: use get_review_context
        try:
            from code_review_graph.tools import get_review_context
            ctx = get_review_context(
                changed_files=changed, repo_root=str(repo_path)
            )
            graph_tokens = _count_tokens(json.dumps(ctx))
        except Exception as exc:
            # A failed tool call is not a measurement. Mark the row failed;
            # aggregate() excludes it.
            logger.warning("get_review_context failed on %s: %s", sha, exc)
            row["status"] = "error"
            row["error"] = str(exc)[:200]
            results.append(row)
            continue

        if graph_tokens <= 0:
            # An empty context (e.g. "{}") is under one token. Scoring it
            # would yield ratio = naive / 1, the same fake win as a failure.
            logger.warning("get_review_context returned no content on %s", sha)
            row["status"] = "error"
            row["error"] = "get_review_context returned an empty context (0 tokens)"
            results.append(row)
            continue

        row["graph_tokens"] = graph_tokens
        row["naive_to_graph_ratio"] = round(naive_tokens / graph_tokens, 1)
        row["standard_to_graph_ratio"] = round(standard_tokens / graph_tokens, 1)
        results.append(row)
    return results
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def aggregate(results: list[dict]) -> dict:
    """Summarise token-efficiency rows, ignoring failed measurements.

    Only rows with ``status == "ok"`` feed the ratio statistics; every other
    row stays in the CSV for forensics but contributes to no headline number.

    Returns:
        A dict with ``total_rows``, ``ok_rows``, ``error_rows`` and the median
        and mean ``naive_to_graph_ratio`` rounded to one decimal (``None``
        when there is no ``ok`` row).
    """
    measured = [row for row in results if row.get("status") == "ok"]
    ratios = [float(row["naive_to_graph_ratio"]) for row in measured]

    if ratios:
        median_ratio = round(statistics.median(ratios), 1)
        mean_ratio = round(statistics.mean(ratios), 1)
    else:
        median_ratio = None
        mean_ratio = None

    failed = [row for row in results if row.get("status") == "error"]
    return {
        "total_rows": len(results),
        "ok_rows": len(measured),
        "error_rows": len(failed),
        "median_naive_to_graph_ratio": median_ratio,
        "mean_naive_to_graph_ratio": mean_ratio,
    }
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
name: code-review-graph
|
|
2
|
+
url: https://github.com/tirth8205/code-review-graph
|
|
3
|
+
# Pinned to the latest test_commit SHA so the snapshot is deterministic and
|
|
4
|
+
# every test_commit below is reachable as an ancestor. (This config replaces
|
|
5
|
+
# the historical "nextjs" entry, which used the same URL but mis-labelled the
|
|
6
|
+
# target as a Next.js monorepo.)
|
|
7
|
+
commit: 84bde35459c52e1e0c4b25c6c4799743021e0fc7
|
|
8
|
+
language: python
|
|
9
|
+
size_category: medium
|
|
10
|
+
|
|
11
|
+
test_commits:
|
|
12
|
+
- sha: 528801f841e519567ef54d6e52e9b9831d162e1b
|
|
13
|
+
description: "feat: add multi-platform MCP server installation support"
|
|
14
|
+
changed_files: 3
|
|
15
|
+
- sha: 84bde35459c52e1e0c4b25c6c4799743021e0fc7
|
|
16
|
+
description: "feat: add Google Antigravity platform support for MCP install"
|
|
17
|
+
changed_files: 2
|
|
18
|
+
|
|
19
|
+
entry_points:
|
|
20
|
+
- "code_review_graph/cli.py::cli"
|
|
21
|
+
- "code_review_graph/main.py::main"
|
|
22
|
+
|
|
23
|
+
search_queries:
|
|
24
|
+
- query: "GraphStore nodes"
|
|
25
|
+
expected: "code_review_graph/graph.py::GraphStore"
|
|
26
|
+
- query: "parse AST"
|
|
27
|
+
expected: "code_review_graph/parser.py::CodeParser"
|
|
28
|
+
- query: "full build"
|
|
29
|
+
expected: "code_review_graph/incremental.py::full_build"
|
|
30
|
+
|
|
31
|
+
multi_hop_tasks:
|
|
32
|
+
- id: crg-parse-file-callers
|
|
33
|
+
nl_query: "Who invokes the parser entry point on a single source file"
|
|
34
|
+
anchor_qualified_suffix: "code_review_graph/parser.py::codeparser.parse_file"
|
|
35
|
+
traversal_pattern: callers_of
|
|
36
|
+
expected_neighbor_names: ["setup_method"]
|
|
37
|
+
k: 10
|
|
38
|
+
- id: crg-upsert-node-callers
|
|
39
|
+
nl_query: "Where the graph store inserts or updates a node"
|
|
40
|
+
anchor_qualified_suffix: "code_review_graph/graph.py::graphstore.upsert_node"
|
|
41
|
+
traversal_pattern: callers_of
|
|
42
|
+
expected_neighbor_names: ["store_file_nodes_edges"]
|
|
43
|
+
k: 10
|
|
44
|
+
|
|
45
|
+
# Questions for the agent_baseline benchmark (pure-python grep top-k vs graph
|
|
46
|
+
# query). See docs/REPRODUCING.md for the methodology.
|
|
47
|
+
agent_questions:
|
|
48
|
+
- "How does GraphStore upsert_node store a node"
|
|
49
|
+
- "Where does full_build parse the repository"
|
|
50
|
+
- "How does hybrid_search rank search results"
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
name: express
|
|
2
|
+
url: https://github.com/expressjs/express
|
|
3
|
+
# Pinned to the latest test_commit SHA so the snapshot is deterministic and
|
|
4
|
+
# every test_commit below is reachable as an ancestor.
|
|
5
|
+
commit: b4ab7d65d7724d9309b6faaaf82ad492da2a6d35
|
|
6
|
+
language: javascript
|
|
7
|
+
size_category: small
|
|
8
|
+
|
|
9
|
+
test_commits:
|
|
10
|
+
- sha: 925a1dff1e42f1b393c977b8b77757fcf633e09f
|
|
11
|
+
description: "fix: bump qs minimum to ^6.14.2 for CVE-2026-2391"
|
|
12
|
+
changed_files: 1
|
|
13
|
+
- sha: b4ab7d65d7724d9309b6faaaf82ad492da2a6d35
|
|
14
|
+
description: "test: include edge case tests for res.type()"
|
|
15
|
+
changed_files: 1
|
|
16
|
+
|
|
17
|
+
entry_points:
|
|
18
|
+
- "lib/application.js::app.handle"
|
|
19
|
+
- "lib/express.js::createApplication"
|
|
20
|
+
|
|
21
|
+
search_queries:
|
|
22
|
+
- query: "app handle"
|
|
23
|
+
expected: "lib/application.js::app"
|
|
24
|
+
- query: "response send"
|
|
25
|
+
expected: "lib/response.js::res"
|
|
26
|
+
- query: "request"
|
|
27
|
+
expected: "lib/request.js::req"
|
|
28
|
+
|
|
29
|
+
# Express has only one task — JS modules use prototypes + module.exports
|
|
30
|
+
# heavily, so most "method" callers are not represented as proper Function
|
|
31
|
+
# edges in the graph. createApplication is the cleanest anchor.
|
|
32
|
+
multi_hop_tasks:
|
|
33
|
+
- id: express-create-application-callees
|
|
34
|
+
nl_query: "What express does when constructing an application"
|
|
35
|
+
anchor_qualified_suffix: "lib/express.js::createapplication"
|
|
36
|
+
traversal_pattern: callees_of
|
|
37
|
+
expected_neighbor_names: ["mixin", "create", "init"]
|
|
38
|
+
k: 10
|
|
39
|
+
|
|
40
|
+
# Questions for the agent_baseline benchmark (pure-python grep top-k vs graph
|
|
41
|
+
# query). See docs/REPRODUCING.md for the methodology.
|
|
42
|
+
agent_questions:
|
|
43
|
+
- "How does app.handle process the middleware stack"
|
|
44
|
+
- "Where does res.send write the response body"
|
|
45
|
+
- "How does createApplication initialize an app"
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
name: fastapi
|
|
2
|
+
url: https://github.com/tiangolo/fastapi
|
|
3
|
+
# Pinned to the latest test_commit SHA so the snapshot is deterministic and
|
|
4
|
+
# every test_commit below is reachable as an ancestor.
|
|
5
|
+
commit: 0227991a01e61bf5cdd93cc00e9e243f52b47a4a
|
|
6
|
+
language: python
|
|
7
|
+
size_category: medium
|
|
8
|
+
|
|
9
|
+
test_commits:
|
|
10
|
+
- sha: fa3588c38c7473aca7536b12d686102de4b0f407
|
|
11
|
+
description: "Fix typo for client_secret in OAuth2 form docstrings"
|
|
12
|
+
changed_files: 1
|
|
13
|
+
- sha: 0227991a01e61bf5cdd93cc00e9e243f52b47a4a
|
|
14
|
+
description: "Exclude spam comments from statistics in scripts/people.py"
|
|
15
|
+
changed_files: 1
|
|
16
|
+
|
|
17
|
+
entry_points:
|
|
18
|
+
- "fastapi/applications.py::FastAPI"
|
|
19
|
+
- "fastapi/routing.py::APIRouter"
|
|
20
|
+
|
|
21
|
+
search_queries:
|
|
22
|
+
- query: "FastAPI application"
|
|
23
|
+
expected: "fastapi/applications.py::FastAPI"
|
|
24
|
+
- query: "APIRoute routing"
|
|
25
|
+
expected: "fastapi/routing.py::APIRoute"
|
|
26
|
+
- query: "Depends injection"
|
|
27
|
+
expected: "fastapi/params.py::Depends"
|
|
28
|
+
|
|
29
|
+
multi_hop_tasks:
|
|
30
|
+
- id: fastapi-route-handler-callers
|
|
31
|
+
nl_query: "How fastapi binds a route handler to an APIRoute"
|
|
32
|
+
anchor_qualified_suffix: "fastapi/routing.py::apiroute.get_route_handler"
|
|
33
|
+
traversal_pattern: callers_of
|
|
34
|
+
expected_neighbor_names: ["__init__"]
|
|
35
|
+
k: 10
|
|
36
|
+
- id: fastapi-get-dependant-callers
|
|
37
|
+
nl_query: "Where fastapi resolves dependency declarations into a tree"
|
|
38
|
+
anchor_qualified_suffix: "fastapi/dependencies/utils.py::get_dependant"
|
|
39
|
+
traversal_pattern: callers_of
|
|
40
|
+
expected_neighbor_names: ["get_parameterless_sub_dependant", "solve_dependencies"]
|
|
41
|
+
k: 10
|
|
42
|
+
|
|
43
|
+
# Questions for the agent_baseline benchmark (pure-python grep top-k vs graph
|
|
44
|
+
# query). See docs/REPRODUCING.md for the methodology.
|
|
45
|
+
agent_questions:
|
|
46
|
+
- "How does include_router register routes on the application"
|
|
47
|
+
- "Where does APIRoute build its route handler"
|
|
48
|
+
- "How does solve_dependencies resolve Depends parameters"
|