code-review-graph-codeblackwell 2.3.6.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. code_review_graph/__init__.py +20 -0
  2. code_review_graph/__main__.py +4 -0
  3. code_review_graph/analysis.py +410 -0
  4. code_review_graph/changes.py +409 -0
  5. code_review_graph/cli.py +1255 -0
  6. code_review_graph/communities.py +874 -0
  7. code_review_graph/constants.py +23 -0
  8. code_review_graph/context_savings.py +317 -0
  9. code_review_graph/custom_languages.py +322 -0
  10. code_review_graph/daemon.py +1009 -0
  11. code_review_graph/daemon_cli.py +320 -0
  12. code_review_graph/docs/LLM-OPTIMIZED-REFERENCE.md +71 -0
  13. code_review_graph/embeddings.py +1006 -0
  14. code_review_graph/enrich.py +303 -0
  15. code_review_graph/eval/__init__.py +33 -0
  16. code_review_graph/eval/benchmarks/__init__.py +1 -0
  17. code_review_graph/eval/benchmarks/agent_baseline.py +193 -0
  18. code_review_graph/eval/benchmarks/build_performance.py +60 -0
  19. code_review_graph/eval/benchmarks/flow_completeness.py +36 -0
  20. code_review_graph/eval/benchmarks/impact_accuracy.py +220 -0
  21. code_review_graph/eval/benchmarks/multi_hop_retrieval.py +125 -0
  22. code_review_graph/eval/benchmarks/search_quality.py +59 -0
  23. code_review_graph/eval/benchmarks/token_efficiency.py +143 -0
  24. code_review_graph/eval/configs/code-review-graph.yaml +50 -0
  25. code_review_graph/eval/configs/express.yaml +45 -0
  26. code_review_graph/eval/configs/fastapi.yaml +48 -0
  27. code_review_graph/eval/configs/flask.yaml +50 -0
  28. code_review_graph/eval/configs/gin.yaml +51 -0
  29. code_review_graph/eval/configs/httpx.yaml +48 -0
  30. code_review_graph/eval/reporter.py +301 -0
  31. code_review_graph/eval/runner.py +211 -0
  32. code_review_graph/eval/scorer.py +85 -0
  33. code_review_graph/eval/token_benchmark.py +182 -0
  34. code_review_graph/exports.py +409 -0
  35. code_review_graph/flows.py +698 -0
  36. code_review_graph/graph.py +1427 -0
  37. code_review_graph/graph_diff.py +122 -0
  38. code_review_graph/hints.py +384 -0
  39. code_review_graph/incremental.py +1245 -0
  40. code_review_graph/jedi_resolver.py +303 -0
  41. code_review_graph/main.py +1079 -0
  42. code_review_graph/memory.py +142 -0
  43. code_review_graph/migrations.py +284 -0
  44. code_review_graph/parser.py +6957 -0
  45. code_review_graph/postprocessing.py +134 -0
  46. code_review_graph/prompts.py +159 -0
  47. code_review_graph/refactor.py +852 -0
  48. code_review_graph/registry.py +319 -0
  49. code_review_graph/rescript_resolver.py +206 -0
  50. code_review_graph/search.py +447 -0
  51. code_review_graph/skills.py +1481 -0
  52. code_review_graph/spring_resolver.py +200 -0
  53. code_review_graph/temporal_resolver.py +199 -0
  54. code_review_graph/token_benchmark.py +125 -0
  55. code_review_graph/tools/__init__.py +156 -0
  56. code_review_graph/tools/_common.py +176 -0
  57. code_review_graph/tools/analysis_tools.py +184 -0
  58. code_review_graph/tools/build.py +541 -0
  59. code_review_graph/tools/community_tools.py +246 -0
  60. code_review_graph/tools/context.py +152 -0
  61. code_review_graph/tools/docs.py +274 -0
  62. code_review_graph/tools/flows_tools.py +176 -0
  63. code_review_graph/tools/query.py +692 -0
  64. code_review_graph/tools/refactor_tools.py +168 -0
  65. code_review_graph/tools/registry_tools.py +125 -0
  66. code_review_graph/tools/review.py +477 -0
  67. code_review_graph/tsconfig_resolver.py +257 -0
  68. code_review_graph/visualization.py +2184 -0
  69. code_review_graph/wiki.py +305 -0
  70. code_review_graph_codeblackwell-2.3.6.post1.dist-info/METADATA +718 -0
  71. code_review_graph_codeblackwell-2.3.6.post1.dist-info/RECORD +74 -0
  72. code_review_graph_codeblackwell-2.3.6.post1.dist-info/WHEEL +4 -0
  73. code_review_graph_codeblackwell-2.3.6.post1.dist-info/entry_points.txt +3 -0
  74. code_review_graph_codeblackwell-2.3.6.post1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,220 @@
1
+ """Impact accuracy benchmark: measures precision/recall of change impact analysis.
2
+
3
+ Two ground-truth modes are emitted side by side (``ground_truth_mode`` column):
4
+
5
+ - **graph-derived (circular — upper bound)** — the historical mode. Ground
6
+ truth is the changed files plus files with CALLS/IMPORTS_FROM edges into
7
+ them, i.e. derived from the same graph the predictor traverses. Recall in
8
+ this mode is an upper bound by construction, not independent evidence.
9
+ - **co-change (same commit, seed excluded)** — the honest mode. The predictor
10
+ is seeded with a single changed file and graded against the *other* files
11
+ the author actually touched in the same commit. The ground truth comes from
12
+ git history, not from the graph.
13
+
14
+ Failure semantics: if ``analyze_changes`` throws, the row is recorded with
15
+ ``status="error"`` and empty metric fields — it stays in the CSV but is
16
+ excluded from aggregates. (Previously a failure silently set
17
+ ``predicted = set(changed)``, guaranteeing a fake recall of 1.0.)
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import logging
23
+ import statistics
24
+ import subprocess
25
+ from pathlib import Path
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+ MODE_GRAPH_DERIVED = "graph-derived (circular — upper bound)"
30
+ MODE_CO_CHANGE = "co-change (same commit, seed excluded)"
31
+
32
+
33
def _get_changed_files(repo_path: Path, sha: str) -> list[str]:
    """Return the repo-relative paths touched by commit *sha*.

    Runs ``git diff --name-only -z <sha>~1 <sha>`` inside *repo_path*. The
    ``-z`` flag makes git emit NUL-terminated, unquoted path names, so paths
    containing non-ASCII characters, quotes or whitespace come back verbatim
    instead of C-quoted.

    If that diff fails (non-zero exit status) the function falls back to
    ``HEAD~1..HEAD`` and logs a warning, because the returned files then no
    longer describe *sha*. If the fallback fails as well, an empty list is
    returned.

    Args:
        repo_path: Directory in which git is executed.
        sha: Commit whose diff against its first parent is requested.

    Returns:
        The changed paths in the order git reports them; empty when both
        diffs fail or nothing changed.
    """

    def _diff_names(old: str, new: str) -> subprocess.CompletedProcess:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # locale; undecodable bytes are replaced rather than raising.
        return subprocess.run(
            ["git", "diff", "--name-only", "-z", old, new],
            cwd=str(repo_path),
            capture_output=True,
            encoding="utf-8",
            errors="replace",
        )

    result = _diff_names(f"{sha}~1", sha)
    if result.returncode != 0:
        logger.warning(
            "git diff %s~1..%s failed (exit %s); falling back to HEAD~1..HEAD",
            sha, sha, result.returncode,
        )
        result = _diff_names("HEAD~1", "HEAD")
        if result.returncode != 0:
            logger.warning(
                "fallback git diff HEAD~1..HEAD failed (exit %s)",
                result.returncode,
            )
            return []

    # Output is NUL-terminated, so the final split element is empty.
    return [name for name in result.stdout.split("\0") if name]
49
+
50
+
51
def _files_from_analysis(analysis: dict) -> set[str]:
    """Collect every file path mentioned in an ``analyze_changes`` result.

    Two sections of *analysis* are read:

    - ``changed_functions``: each dict entry contributes its ``file_path``
      value, or its ``file`` value when ``file_path`` is absent.
    - ``affected_flows``: each dict flow contributes the ``file_path`` of
      every dict node listed under ``nodes``.

    Entries that are not dicts, or that carry none of those keys, are ignored.
    """
    paths: set[str] = set()

    for entry in analysis.get("changed_functions", []):
        if not isinstance(entry, dict):
            continue
        # "file_path" takes priority; "file" is only consulted without it.
        for key in ("file_path", "file"):
            if key in entry:
                paths.add(entry[key])
                break

    for flow in analysis.get("affected_flows", []):
        if not isinstance(flow, dict):
            continue
        paths.update(
            node["file_path"]
            for node in flow.get("nodes", [])
            if isinstance(node, dict) and "file_path" in node
        )

    return paths
65
+
66
+
67
def _graph_neighbor_files(store, files: list[str]) -> set[str]:
    """Return files that have a CALLS/IMPORTS_FROM edge into *files* (one hop).

    For every node the store reports for each file, the edges targeting that
    node are inspected. For CALLS and IMPORTS_FROM edges, the text before the
    first ``::`` of the edge's ``source_qualified`` is taken as the source
    file. Sources without ``::`` (or with nothing before it) are skipped.
    """
    inbound_kinds = ("CALLS", "IMPORTS_FROM")
    neighbors: set[str] = set()

    for path in files:
        for node in store.get_nodes_by_file(path):
            for edge in store.get_edges_by_target(node.qualified_name):
                if edge.kind not in inbound_kinds:
                    continue
                owner, sep, _rest = edge.source_qualified.partition("::")
                if sep and owner:
                    neighbors.add(owner)

    return neighbors
79
+
80
+
81
def _base_row(repo: str, sha: str, mode: str, seed: str) -> dict:
    """Build a result row with identity columns set and metric columns blank.

    The row starts with ``status="ok"`` and an empty ``error``; callers fill
    in the metrics or overwrite the status afterwards.
    """
    row: dict = {
        "repo": repo,
        "commit": sha,
        "ground_truth_mode": mode,
        "seed_file": seed,
    }
    # Metric columns default to the empty string until a row is scored.
    metric_columns = (
        "predicted_files",
        "actual_files",
        "true_positives",
        "precision",
        "recall",
        "f1",
    )
    row.update(dict.fromkeys(metric_columns, ""))
    row["status"] = "ok"
    row["error"] = ""
    return row
96
+
97
+
98
def _scored_row(
    repo: str, sha: str, mode: str, seed: str,
    predicted: set[str], actual: set[str],
) -> dict:
    """Build an ``ok`` row holding precision/recall/F1 of *predicted* vs *actual*.

    Counts are stored as integers; the three ratios are rounded to three
    decimals.
    """
    n_predicted = len(predicted)
    n_actual = len(actual)
    hits = len(predicted & actual)

    # max(..., 1) keeps an empty set from dividing by zero; the ratio is 0 then.
    precision = hits / max(n_predicted, 1)
    recall = hits / max(n_actual, 1)
    # The 0.001 floor avoids 0/0 when precision and recall are both zero.
    f1 = 2 * precision * recall / max(precision + recall, 0.001)

    scored = _base_row(repo, sha, mode, seed)
    scored["predicted_files"] = n_predicted
    scored["actual_files"] = n_actual
    scored["true_positives"] = hits
    scored["precision"] = round(precision, 3)
    scored["recall"] = round(recall, 3)
    scored["f1"] = round(f1, 3)
    return scored
116
+
117
+
118
def _error_row(repo: str, sha: str, mode: str, seed: str, exc: Exception) -> dict:
    """Build a row marked ``status="error"`` with blank metric columns.

    The ``error`` column holds the first 200 characters of ``str(exc)``.
    """
    failed = _base_row(repo, sha, mode, seed)
    failed.update(status="error", error=str(exc)[:200])
    return failed
123
+
124
+
125
def run(repo_path: Path, store, config: dict) -> list[dict]:
    """Run the impact accuracy benchmark in both ground-truth modes.

    For every entry of ``config["test_commits"]`` that has changed files, up
    to two rows are produced:

    - a graph-derived row (scored, or an error row if ``analyze_changes``
      raised), and
    - a co-change row (scored, an error row, or a ``skipped`` row when the
      commit touched only one file).

    Commits for which no changed files are found produce no rows.
    """
    from code_review_graph.changes import analyze_changes

    rows: list[dict] = []
    repo_name = config["name"]

    for commit in config.get("test_commits", []):
        sha = commit["sha"]
        changed = _get_changed_files(repo_path, sha)
        if not changed:
            continue
        changed_set = set(changed)

        # --- Mode 1: graph-derived ground truth (circular — upper bound) ---
        analysis = None
        try:
            analysis = analyze_changes(
                store, changed, repo_root=str(repo_path), base=sha + "~1",
            )
        except Exception as exc:
            # A failed analysis is recorded as an error row rather than being
            # scored, so it can never contribute a fake recall of 1.0.
            logger.warning("analyze_changes failed on %s: %s", sha, exc)
            rows.append(_error_row(repo_name, sha, MODE_GRAPH_DERIVED, "", exc))

        if analysis is not None:
            graph_predicted = changed_set | _files_from_analysis(analysis)
            graph_actual = changed_set | _graph_neighbor_files(store, changed)
            rows.append(
                _scored_row(
                    repo_name, sha, MODE_GRAPH_DERIVED, "",
                    graph_predicted, graph_actual,
                )
            )

        # --- Mode 2: co-change ground truth (honest) ---
        # The alphabetically first changed file is the seed; the remaining
        # files of the commit are what the prediction is graded against. The
        # seed analysis is called without repo_root/base so it only sees the
        # seed file, never the full commit diff.
        seed = min(changed)
        co_actual = changed_set - {seed}
        if not co_actual:
            skipped = _base_row(repo_name, sha, MODE_CO_CHANGE, seed)
            skipped["status"] = "skipped"
            skipped["error"] = "single-file commit: no co-changed files to grade against"
            rows.append(skipped)
            continue

        try:
            seed_analysis = analyze_changes(store, [seed])
        except Exception as exc:
            logger.warning("analyze_changes (seed=%s) failed on %s: %s", seed, sha, exc)
            rows.append(_error_row(repo_name, sha, MODE_CO_CHANGE, seed, exc))
            continue

        # The seed itself is never counted as a prediction.
        co_predicted = (
            _files_from_analysis(seed_analysis)
            | _graph_neighbor_files(store, [seed])
        ) - {seed}
        rows.append(
            _scored_row(repo_name, sha, MODE_CO_CHANGE, seed, co_predicted, co_actual)
        )

    return rows
185
+
186
+
187
def aggregate(results: list[dict]) -> dict:
    """Summarise rows per ground-truth mode, using successful rows only.

    Rows whose status is ``error`` or ``skipped`` are counted in the totals
    but never enter a mean. A mode without any ``ok`` row reports ``None``
    for each mean.
    """

    def _with_status(status: str) -> int:
        return sum(1 for row in results if row.get("status") == status)

    def _mean_of(rows: list[dict], column: str):
        if not rows:
            return None
        return round(statistics.mean(float(row[column]) for row in rows), 3)

    summary: dict = {
        "total_rows": len(results),
        "error_rows": _with_status("error"),
        "skipped_rows": _with_status("skipped"),
    }

    modes = (
        ("graph_derived", MODE_GRAPH_DERIVED),
        ("co_change", MODE_CO_CHANGE),
    )
    for key, mode in modes:
        ok_rows = [
            row for row in results
            if row.get("ground_truth_mode") == mode and row.get("status") == "ok"
        ]
        summary[key] = {
            "ok_rows": len(ok_rows),
            "mean_precision": _mean_of(ok_rows, "precision"),
            "mean_recall": _mean_of(ok_rows, "recall"),
            "mean_f1": _mean_of(ok_rows, "f1"),
        }

    return summary
@@ -0,0 +1,125 @@
1
+ """Multi-hop retrieval benchmark.
2
+
3
+ Tests a two-step tool chain that mimics how an LLM agent actually uses the
4
+ graph for complex tasks:
5
+
6
+ 1. ``hybrid_search(nl_query)`` to find a starting anchor from a natural-
7
+ language question.
8
+ 2. ``query_graph(pattern, target=anchor)`` to traverse one hop along the
9
+ requested edge kind (callers_of / callees_of / tests_for / ...).
10
+
11
+ For each task the benchmark records:
12
+
13
+ - ``anchor_found`` — did semantic search return a node whose qualified_name
14
+ ends with the expected suffix in the top-K?
15
+ - ``anchor_rank`` — 0-based index of the anchor in the search result list (lower is better; ``-1`` when no anchor was found).
16
+ - ``neighbor_count`` — number of neighbors returned by the traversal.
17
+ - ``neighbor_recall`` — fraction of ``expected_neighbor_names`` that appear
18
+ among the neighbor names.
19
+ - ``score`` — ``int(anchor_found) * neighbor_recall``. Range 0–1.
20
+
21
+ Tasks are defined per-config under ``multi_hop_tasks:`` in
22
+ ``code_review_graph/eval/configs/*.yaml``. See
23
+ ``docs/REPRODUCING.md`` for the schema and the curated canonical task set.
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import logging
29
+ from pathlib import Path
30
+ from typing import Any
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
def _name_set(rows: list[dict[str, Any]]) -> set[str]:
    """Return the lower-cased, non-empty ``name`` values found in *rows*.

    Rows whose ``name`` is missing, ``None`` or empty contribute nothing.
    """
    lowered = ((row.get("name") or "").lower() for row in rows)
    return {name for name in lowered if name}
42
+
43
+
44
def run(repo_path: Path, store, config: dict) -> list[dict]:
    """Run the multi-hop retrieval benchmark for one repo.

    Each task in ``config["multi_hop_tasks"]`` yields exactly one row. The
    anchor is the first search hit whose lower-cased ``qualified_name`` ends
    with the task's suffix. If none is found the row scores 0; otherwise the
    traversal result is compared against the expected neighbor names.
    A failure of either tool call is logged and treated as an empty result.
    """
    # Imports are local so an import-time failure in one optional benchmark
    # does not poison the whole runner.
    from code_review_graph.search import hybrid_search
    from code_review_graph.tools.query import query_graph

    repo_root = str(repo_path)
    results: list[dict] = []

    for task in config.get("multi_hop_tasks", []):
        task_id = task["id"]
        nl_query = task["nl_query"]
        suffix = task["anchor_qualified_suffix"].lower()
        traversal = task.get("traversal_pattern", "callers_of")
        expected = [name.lower() for name in task.get("expected_neighbor_names", [])]
        k = int(task.get("k", 10))

        def _row(found: bool, rank: int, neighbors: int, matched: int, recall: float) -> dict:
            # Both outcomes share one column layout; score equals the recall.
            return {
                "repo": config["name"],
                "task_id": task_id,
                "nl_query": nl_query,
                "anchor_found": found,
                "anchor_rank": rank,
                "neighbor_count": neighbors,
                "expected_count": len(expected),
                "matched_count": matched,
                "neighbor_recall": recall,
                "score": recall,
            }

        # Step 1 — semantic search
        try:
            hits = hybrid_search(store, nl_query, limit=k)
        except Exception as exc:  # noqa: BLE001 — benchmark must not abort the runner
            logger.warning("hybrid_search failed on %s: %s", task_id, exc)
            hits = []

        anchor_rank, anchor = next(
            (
                (position, hit)
                for position, hit in enumerate(hits)
                if (hit.get("qualified_name") or "").lower().endswith(suffix)
            ),
            (-1, None),
        )

        if anchor is None:
            results.append(_row(False, -1, 0, 0, 0.0))
            continue

        # Step 2 — single-hop graph traversal from the anchor
        try:
            trav = query_graph(
                pattern=traversal,
                target=anchor["qualified_name"],
                repo_root=repo_root,
                detail_level="standard",
            )
        except Exception as exc:  # noqa: BLE001
            logger.warning(
                "query_graph(%s) failed on %s: %s", traversal, task_id, exc,
            )
            trav = {}

        neighbor_rows = trav.get("data") or trav.get("results") or []
        neighbor_names = _name_set(neighbor_rows)
        matched = sum(1 for name in expected if name in neighbor_names)
        recall = matched / len(expected) if expected else 0.0

        results.append(
            _row(True, anchor_rank, len(neighbor_rows), matched, round(recall, 3))
        )

    return results
@@ -0,0 +1,59 @@
1
+ """Search quality benchmark: measures search result ranking via MRR."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import sqlite3
7
+ from pathlib import Path
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
def run(repo_path: Path, store, config: dict) -> list[dict]:
    """Run the search quality benchmark.

    For each entry of ``config["search_queries"]`` the query is sent to
    ``hybrid_search`` (or to ``store.search_nodes`` when ``hybrid_search``
    cannot be imported or raises ``sqlite3.OperationalError``). The 1-based
    rank of the first matching hit is recorded along with its reciprocal
    rank. A rank of 0 means no hit matched.

    A hit matches when, case-insensitively, the expected name is a substring
    of the hit's qualified name, the hit's qualified name is a substring of
    the expected name, or the parts after the last ``::`` are equal. Hits
    with an empty or missing qualified name are skipped: an empty string is a
    substring of every expected name and would otherwise count as a match.

    Returns:
        One dict per query with ``repo``, ``query``, ``expected``, ``rank``
        and ``reciprocal_rank``.
    """
    results = []
    for sq in config.get("search_queries", []):
        query = sq["query"]
        expected = sq["expected"]
        # Loop-invariant per query: normalise the expected name once.
        exp_lower = expected.lower()
        exp_name = exp_lower.rsplit("::", 1)[-1]

        try:
            from code_review_graph.search import hybrid_search
            search_results = hybrid_search(store, query, limit=20)
        except (ImportError, sqlite3.OperationalError) as exc:
            logger.debug("hybrid_search unavailable, using fallback: %s", exc)
            # Fallback to basic search
            search_results = [
                {"qualified_name": n.qualified_name}
                for n in store.search_nodes(query, limit=20)
            ]

        rank = 0
        for position, hit in enumerate(search_results, start=1):
            if isinstance(hit, dict):
                qn = hit.get("qualified_name")
            else:
                qn = getattr(hit, "qualified_name", None)
            qn_lower = (qn or "").lower()
            if not qn_lower:
                # A nameless hit cannot be the expected node; skipping it
                # keeps "" from trivially matching via the substring test.
                continue
            qn_name = qn_lower.rsplit("::", 1)[-1]
            if (
                exp_lower in qn_lower
                or qn_lower in exp_lower
                or exp_name == qn_name
            ):
                rank = position
                break

        results.append({
            "repo": config["name"],
            "query": query,
            "expected": expected,
            "rank": rank,
            "reciprocal_rank": round(1.0 / rank if rank > 0 else 0.0, 3),
        })
    return results
@@ -0,0 +1,143 @@
1
+ """Token efficiency benchmark: compares naive, standard, and graph-based token counts.
2
+
3
+ Failure semantics: if ``get_review_context`` throws, the row is recorded with
4
+ ``status="error"`` and empty metric fields. It stays in the CSV for forensics
5
+ but is excluded from every aggregate — a failed tool call is not a
6
+ measurement. (Previously a failure silently produced ``graph_tokens=0`` and
7
+ ``ratio = naive / 1``, inflating the results.)
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import logging
14
+ import statistics
15
+ import subprocess
16
+ from pathlib import Path
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
def _count_tokens(text: str) -> int:
    """Estimate the token count of *text* as one token per four characters."""
    chars_per_token = 4
    return len(text) // chars_per_token
24
+
25
+
26
def _get_changed_files(repo_path: Path, sha: str) -> list[str]:
    """Return the repo-relative paths touched by commit *sha*.

    Runs ``git diff --name-only -z <sha>~1 <sha>`` inside *repo_path*. The
    ``-z`` flag makes git emit NUL-terminated, unquoted path names, so paths
    containing non-ASCII characters, quotes or whitespace come back verbatim
    instead of C-quoted.

    If that diff fails (non-zero exit status) the function falls back to
    ``HEAD~1..HEAD`` and logs a warning, because the returned files then no
    longer describe *sha*. If the fallback fails as well, an empty list is
    returned.

    Args:
        repo_path: Directory in which git is executed.
        sha: Commit whose diff against its first parent is requested.

    Returns:
        The changed paths in the order git reports them; empty when both
        diffs fail or nothing changed.
    """

    def _diff_names(old: str, new: str) -> subprocess.CompletedProcess:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # locale; undecodable bytes are replaced rather than raising.
        return subprocess.run(
            ["git", "diff", "--name-only", "-z", old, new],
            cwd=str(repo_path),
            capture_output=True,
            encoding="utf-8",
            errors="replace",
        )

    result = _diff_names(f"{sha}~1", sha)
    if result.returncode != 0:
        logger.warning(
            "git diff %s~1..%s failed (exit %s); falling back to HEAD~1..HEAD",
            sha, sha, result.returncode,
        )
        # Fallback: diff the checked-out HEAD against its parent.
        result = _diff_names("HEAD~1", "HEAD")
        if result.returncode != 0:
            logger.warning(
                "fallback git diff HEAD~1..HEAD failed (exit %s)",
                result.returncode,
            )
            return []

    # Output is NUL-terminated, so the final split element is empty.
    return [name for name in result.stdout.split("\0") if name]
43
+
44
+
45
def _count_file_tokens(repo_path: Path, files: list[str]) -> int:
    """Sum the token estimates of the full contents of *files* (naive approach).

    Paths are resolved against *repo_path*. Entries that are not regular
    files, or that cannot be read (``OSError``), contribute nothing.
    """
    total = 0
    for relative in files:
        candidate = repo_path / relative
        if not candidate.is_file():
            continue
        try:
            # Undecodable bytes are replaced so odd encodings do not raise.
            text = candidate.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue
        total += _count_tokens(text)
    return total
56
+
57
+
58
def _count_diff_tokens(repo_path: Path, sha: str) -> int:
    """Return the token estimate of the git diff for *sha* (standard approach).

    Runs ``git diff <sha>~1 <sha>`` inside *repo_path*. If that fails
    (non-zero exit status), ``git diff HEAD~1 HEAD`` is used instead and a
    warning is logged, because the measured diff then no longer belongs to
    *sha*.

    The output is decoded as UTF-8 with undecodable bytes replaced, matching
    how ``_count_file_tokens`` reads files. A diff containing non-UTF-8 bytes
    therefore yields a count instead of raising ``UnicodeDecodeError``.
    """

    def _diff(old: str, new: str) -> subprocess.CompletedProcess:
        return subprocess.run(
            ["git", "diff", old, new],
            cwd=str(repo_path),
            capture_output=True,
            encoding="utf-8",
            errors="replace",
        )

    result = _diff(f"{sha}~1", sha)
    if result.returncode != 0:
        logger.warning(
            "git diff %s~1..%s failed (exit %s); falling back to HEAD~1..HEAD",
            sha, sha, result.returncode,
        )
        result = _diff("HEAD~1", "HEAD")
    return _count_tokens(result.stdout)
74
+
75
+
76
def run(repo_path: Path, store, config: dict) -> list[dict]:
    """Run the token efficiency benchmark.

    For each entry of ``config["test_commits"]`` with changed files, one row
    compares three token estimates: full file contents (naive), the git diff
    (standard), and the JSON-serialised ``get_review_context`` result (graph).
    If the graph measurement raises, the row is kept with ``status="error"``
    and blank graph columns.
    """
    rows = []
    for commit in config.get("test_commits", []):
        sha = commit["sha"]
        changed = _get_changed_files(repo_path, sha)
        if not changed:
            continue

        naive_tokens = _count_file_tokens(repo_path, changed)
        standard_tokens = _count_diff_tokens(repo_path, sha)

        row: dict = {
            "repo": config["name"],
            "commit": sha,
            "description": commit.get("description", ""),
            "changed_files": len(changed),
            "naive_tokens": naive_tokens,
            "standard_tokens": standard_tokens,
            "graph_tokens": "",
            "naive_to_graph_ratio": "",
            "standard_to_graph_ratio": "",
            "status": "ok",
            "error": "",
        }

        # Graph-based: use get_review_context
        try:
            from code_review_graph.tools import get_review_context
            context = get_review_context(
                changed_files=changed, repo_root=str(repo_path)
            )
            graph_tokens = _count_tokens(json.dumps(context))
        except Exception as exc:
            # A failed tool call is not a measurement: the row is flagged
            # instead of recording graph_tokens=0, which would turn into a
            # ratio of naive/1. aggregate() excludes flagged rows.
            logger.warning("get_review_context failed on %s: %s", sha, exc)
            row["status"] = "error"
            row["error"] = str(exc)[:200]
        else:
            # Guard the division for an empty graph context.
            divisor = max(graph_tokens, 1)
            row["graph_tokens"] = graph_tokens
            row["naive_to_graph_ratio"] = round(naive_tokens / divisor, 1)
            row["standard_to_graph_ratio"] = round(standard_tokens / divisor, 1)

        rows.append(row)
    return rows
123
+
124
+
125
def aggregate(results: list[dict]) -> dict:
    """Summarise token-efficiency rows, ignoring failed measurements.

    Only rows with ``status == "ok"`` feed the median and mean of
    ``naive_to_graph_ratio``; both are ``None`` when no such row exists.
    Rows with any other status are still counted in ``total_rows``.
    """
    ok_rows = [row for row in results if row.get("status") == "ok"]
    failed = [row for row in results if row.get("status") == "error"]
    ratios = [float(row["naive_to_graph_ratio"]) for row in ok_rows]

    if ratios:
        median_ratio = round(statistics.median(ratios), 1)
        mean_ratio = round(statistics.mean(ratios), 1)
    else:
        median_ratio = None
        mean_ratio = None

    return {
        "total_rows": len(results),
        "ok_rows": len(ok_rows),
        "error_rows": len(failed),
        "median_naive_to_graph_ratio": median_ratio,
        "mean_naive_to_graph_ratio": mean_ratio,
    }
@@ -0,0 +1,50 @@
1
+ name: code-review-graph
2
+ url: https://github.com/tirth8205/code-review-graph
3
+ # Pinned to the latest test_commit SHA so the snapshot is deterministic and
4
+ # every test_commit below is reachable as an ancestor. (This config replaces
5
+ # the historical "nextjs" entry, which used the same URL but mis-labelled the
6
+ # target as a Next.js monorepo.)
7
+ commit: 84bde35459c52e1e0c4b25c6c4799743021e0fc7
8
+ language: python
9
+ size_category: medium
10
+
11
+ test_commits:
12
+ - sha: 528801f841e519567ef54d6e52e9b9831d162e1b
13
+ description: "feat: add multi-platform MCP server installation support"
14
+ changed_files: 3
15
+ - sha: 84bde35459c52e1e0c4b25c6c4799743021e0fc7
16
+ description: "feat: add Google Antigravity platform support for MCP install"
17
+ changed_files: 2
18
+
19
+ entry_points:
20
+ - "code_review_graph/cli.py::cli"
21
+ - "code_review_graph/main.py::main"
22
+
23
+ search_queries:
24
+ - query: "GraphStore nodes"
25
+ expected: "code_review_graph/graph.py::GraphStore"
26
+ - query: "parse AST"
27
+ expected: "code_review_graph/parser.py::CodeParser"
28
+ - query: "full build"
29
+ expected: "code_review_graph/incremental.py::full_build"
30
+
31
+ multi_hop_tasks:
32
+ - id: crg-parse-file-callers
33
+ nl_query: "Who invokes the parser entry point on a single source file"
34
+ anchor_qualified_suffix: "code_review_graph/parser.py::codeparser.parse_file"
35
+ traversal_pattern: callers_of
36
+ expected_neighbor_names: ["setup_method"]
37
+ k: 10
38
+ - id: crg-upsert-node-callers
39
+ nl_query: "Where the graph store inserts or updates a node"
40
+ anchor_qualified_suffix: "code_review_graph/graph.py::graphstore.upsert_node"
41
+ traversal_pattern: callers_of
42
+ expected_neighbor_names: ["store_file_nodes_edges"]
43
+ k: 10
44
+
45
+ # Questions for the agent_baseline benchmark (pure-python grep top-k vs graph
46
+ # query). See docs/REPRODUCING.md for the methodology.
47
+ agent_questions:
48
+ - "How does GraphStore upsert_node store a node"
49
+ - "Where does full_build parse the repository"
50
+ - "How does hybrid_search rank search results"
@@ -0,0 +1,45 @@
1
+ name: express
2
+ url: https://github.com/expressjs/express
3
+ # Pinned to the latest test_commit SHA so the snapshot is deterministic and
4
+ # every test_commit below is reachable as an ancestor.
5
+ commit: b4ab7d65d7724d9309b6faaaf82ad492da2a6d35
6
+ language: javascript
7
+ size_category: small
8
+
9
+ test_commits:
10
+ - sha: 925a1dff1e42f1b393c977b8b77757fcf633e09f
11
+ description: "fix: bump qs minimum to ^6.14.2 for CVE-2026-2391"
12
+ changed_files: 1
13
+ - sha: b4ab7d65d7724d9309b6faaaf82ad492da2a6d35
14
+ description: "test: include edge case tests for res.type()"
15
+ changed_files: 1
16
+
17
+ entry_points:
18
+ - "lib/application.js::app.handle"
19
+ - "lib/express.js::createApplication"
20
+
21
+ search_queries:
22
+ - query: "app handle"
23
+ expected: "lib/application.js::app"
24
+ - query: "response send"
25
+ expected: "lib/response.js::res"
26
+ - query: "request"
27
+ expected: "lib/request.js::req"
28
+
29
+ # Express has only one task — JS modules use prototypes + module.exports
30
+ # heavily, so most "method" callers are not represented as proper Function
31
+ # edges in the graph. createApplication is the cleanest anchor.
32
+ multi_hop_tasks:
33
+ - id: express-create-application-callees
34
+ nl_query: "What express does when constructing an application"
35
+ anchor_qualified_suffix: "lib/express.js::createapplication"
36
+ traversal_pattern: callees_of
37
+ expected_neighbor_names: ["mixin", "create", "init"]
38
+ k: 10
39
+
40
+ # Questions for the agent_baseline benchmark (pure-python grep top-k vs graph
41
+ # query). See docs/REPRODUCING.md for the methodology.
42
+ agent_questions:
43
+ - "How does app.handle process the middleware stack"
44
+ - "Where does res.send write the response body"
45
+ - "How does createApplication initialize an app"
@@ -0,0 +1,48 @@
1
+ name: fastapi
2
+ url: https://github.com/tiangolo/fastapi
3
+ # Pinned to the latest test_commit SHA so the snapshot is deterministic and
4
+ # every test_commit below is reachable as an ancestor.
5
+ commit: 0227991a01e61bf5cdd93cc00e9e243f52b47a4a
6
+ language: python
7
+ size_category: medium
8
+
9
+ test_commits:
10
+ - sha: fa3588c38c7473aca7536b12d686102de4b0f407
11
+ description: "Fix typo for client_secret in OAuth2 form docstrings"
12
+ changed_files: 1
13
+ - sha: 0227991a01e61bf5cdd93cc00e9e243f52b47a4a
14
+ description: "Exclude spam comments from statistics in scripts/people.py"
15
+ changed_files: 1
16
+
17
+ entry_points:
18
+ - "fastapi/applications.py::FastAPI"
19
+ - "fastapi/routing.py::APIRouter"
20
+
21
+ search_queries:
22
+ - query: "FastAPI application"
23
+ expected: "fastapi/applications.py::FastAPI"
24
+ - query: "APIRoute routing"
25
+ expected: "fastapi/routing.py::APIRoute"
26
+ - query: "Depends injection"
27
+ expected: "fastapi/params.py::Depends"
28
+
29
+ multi_hop_tasks:
30
+ - id: fastapi-route-handler-callers
31
+ nl_query: "How fastapi binds a route handler to an APIRoute"
32
+ anchor_qualified_suffix: "fastapi/routing.py::apiroute.get_route_handler"
33
+ traversal_pattern: callers_of
34
+ expected_neighbor_names: ["__init__"]
35
+ k: 10
36
+ - id: fastapi-get-dependant-callers
37
+ nl_query: "Where fastapi resolves dependency declarations into a tree"
38
+ anchor_qualified_suffix: "fastapi/dependencies/utils.py::get_dependant"
39
+ traversal_pattern: callers_of
40
+ expected_neighbor_names: ["get_parameterless_sub_dependant", "solve_dependencies"]
41
+ k: 10
42
+
43
+ # Questions for the agent_baseline benchmark (pure-python grep top-k vs graph
44
+ # query). See docs/REPRODUCING.md for the methodology.
45
+ agent_questions:
46
+ - "How does include_router register routes on the application"
47
+ - "Where does APIRoute build its route handler"
48
+ - "How does solve_dependencies resolve Depends parameters"