code-review-graph-codeblackwell 2.3.6.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. code_review_graph/__init__.py +20 -0
  2. code_review_graph/__main__.py +4 -0
  3. code_review_graph/analysis.py +410 -0
  4. code_review_graph/changes.py +409 -0
  5. code_review_graph/cli.py +1255 -0
  6. code_review_graph/communities.py +874 -0
  7. code_review_graph/constants.py +23 -0
  8. code_review_graph/context_savings.py +317 -0
  9. code_review_graph/custom_languages.py +322 -0
  10. code_review_graph/daemon.py +1009 -0
  11. code_review_graph/daemon_cli.py +320 -0
  12. code_review_graph/docs/LLM-OPTIMIZED-REFERENCE.md +71 -0
  13. code_review_graph/embeddings.py +1006 -0
  14. code_review_graph/enrich.py +303 -0
  15. code_review_graph/eval/__init__.py +33 -0
  16. code_review_graph/eval/benchmarks/__init__.py +1 -0
  17. code_review_graph/eval/benchmarks/agent_baseline.py +193 -0
  18. code_review_graph/eval/benchmarks/build_performance.py +60 -0
  19. code_review_graph/eval/benchmarks/flow_completeness.py +36 -0
  20. code_review_graph/eval/benchmarks/impact_accuracy.py +220 -0
  21. code_review_graph/eval/benchmarks/multi_hop_retrieval.py +125 -0
  22. code_review_graph/eval/benchmarks/search_quality.py +59 -0
  23. code_review_graph/eval/benchmarks/token_efficiency.py +143 -0
  24. code_review_graph/eval/configs/code-review-graph.yaml +50 -0
  25. code_review_graph/eval/configs/express.yaml +45 -0
  26. code_review_graph/eval/configs/fastapi.yaml +48 -0
  27. code_review_graph/eval/configs/flask.yaml +50 -0
  28. code_review_graph/eval/configs/gin.yaml +51 -0
  29. code_review_graph/eval/configs/httpx.yaml +48 -0
  30. code_review_graph/eval/reporter.py +301 -0
  31. code_review_graph/eval/runner.py +211 -0
  32. code_review_graph/eval/scorer.py +85 -0
  33. code_review_graph/eval/token_benchmark.py +182 -0
  34. code_review_graph/exports.py +409 -0
  35. code_review_graph/flows.py +698 -0
  36. code_review_graph/graph.py +1427 -0
  37. code_review_graph/graph_diff.py +122 -0
  38. code_review_graph/hints.py +384 -0
  39. code_review_graph/incremental.py +1245 -0
  40. code_review_graph/jedi_resolver.py +303 -0
  41. code_review_graph/main.py +1079 -0
  42. code_review_graph/memory.py +142 -0
  43. code_review_graph/migrations.py +284 -0
  44. code_review_graph/parser.py +6957 -0
  45. code_review_graph/postprocessing.py +134 -0
  46. code_review_graph/prompts.py +159 -0
  47. code_review_graph/refactor.py +852 -0
  48. code_review_graph/registry.py +319 -0
  49. code_review_graph/rescript_resolver.py +206 -0
  50. code_review_graph/search.py +447 -0
  51. code_review_graph/skills.py +1481 -0
  52. code_review_graph/spring_resolver.py +200 -0
  53. code_review_graph/temporal_resolver.py +199 -0
  54. code_review_graph/token_benchmark.py +125 -0
  55. code_review_graph/tools/__init__.py +156 -0
  56. code_review_graph/tools/_common.py +176 -0
  57. code_review_graph/tools/analysis_tools.py +184 -0
  58. code_review_graph/tools/build.py +541 -0
  59. code_review_graph/tools/community_tools.py +246 -0
  60. code_review_graph/tools/context.py +152 -0
  61. code_review_graph/tools/docs.py +274 -0
  62. code_review_graph/tools/flows_tools.py +176 -0
  63. code_review_graph/tools/query.py +692 -0
  64. code_review_graph/tools/refactor_tools.py +168 -0
  65. code_review_graph/tools/registry_tools.py +125 -0
  66. code_review_graph/tools/review.py +477 -0
  67. code_review_graph/tsconfig_resolver.py +257 -0
  68. code_review_graph/visualization.py +2184 -0
  69. code_review_graph/wiki.py +305 -0
  70. code_review_graph_codeblackwell-2.3.6.post1.dist-info/METADATA +718 -0
  71. code_review_graph_codeblackwell-2.3.6.post1.dist-info/RECORD +74 -0
  72. code_review_graph_codeblackwell-2.3.6.post1.dist-info/WHEEL +4 -0
  73. code_review_graph_codeblackwell-2.3.6.post1.dist-info/entry_points.txt +3 -0
  74. code_review_graph_codeblackwell-2.3.6.post1.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,50 @@
1
+ name: flask
2
+ url: https://github.com/pallets/flask
3
+ # Pinned to the latest test_commit SHA so the snapshot is deterministic and
4
+ # every test_commit below is reachable as an ancestor.
5
+ commit: a29f88ce6f2f9843bd6fcbbfce1390a2071965d6
6
+ language: python
7
+ size_category: small
8
+
9
+ test_commits:
10
+ - sha: fbb6f0bc4c60a0bada0e03c3480d0ccf30a3c1df
11
+ description: "all teardown callbacks are called despite errors"
12
+ changed_files: 10
13
+ - sha: a29f88ce6f2f9843bd6fcbbfce1390a2071965d6
14
+ description: "document that headers must be set before streaming"
15
+ changed_files: 4
16
+
17
+ entry_points:
18
+ - "src/flask/app.py::Flask.wsgi_app"
19
+ - "src/flask/sansio/app.py::App.add_url_rule"
20
+
21
+ search_queries:
22
+ - query: "Flask wsgi"
23
+ expected: "src/flask/app.py::Flask"
24
+ - query: "AppContext globals"
25
+ expected: "src/flask/ctx.py::AppContext"
26
+ - query: "create logger"
27
+ expected: "src/flask/logging.py::create_logger"
28
+
29
+ # Multi-hop retrieval tasks (semantic_search → query_graph one-hop)
30
+ # See docs/REPRODUCING.md for the schema.
31
+ multi_hop_tasks:
32
+ - id: flask-dispatch-callers
33
+ nl_query: "Where Flask dispatches HTTP requests"
34
+ anchor_qualified_suffix: "src/flask/app.py::flask.dispatch_request"
35
+ traversal_pattern: callers_of
36
+ expected_neighbor_names: ["full_dispatch_request"]
37
+ k: 10
38
+ - id: flask-exception-callers
39
+ nl_query: "Where Flask handles uncaught exceptions"
40
+ anchor_qualified_suffix: "src/flask/app.py::flask.handle_exception"
41
+ traversal_pattern: callers_of
42
+ expected_neighbor_names: ["wsgi_app"]
43
+ k: 10
44
+
45
+ # Questions for the agent_baseline benchmark (pure-python grep top-k vs graph
46
+ # query). See docs/REPRODUCING.md for the methodology.
47
+ agent_questions:
48
+ - "How does dispatch_request route an incoming HTTP request"
49
+ - "Where is the AppContext pushed and popped"
50
+ - "How does create_logger configure application logging"
@@ -0,0 +1,51 @@
1
+ name: gin
2
+ url: https://github.com/gin-gonic/gin
3
+ # Pinned to the latest test_commit SHA so the snapshot is deterministic and
4
+ # every test_commit below is reachable as an ancestor.
5
+ commit: 5c00df8afadd06cc5be530dde00fe6d9fa4a2e4a
6
+ language: go
7
+ size_category: small
8
+
9
+ test_commits:
10
+ - sha: 052d1a79aafe3f04078a2716f8e77d4340308383
11
+ description: "feat(render): add PDF renderer and tests"
12
+ changed_files: 5
13
+ - sha: 472d086af2acd924cb4b9d7be0525f7d790f69bc
14
+ description: "fix(tree): panic in findCaseInsensitivePathRec with RedirectFixedPath"
15
+ changed_files: 2
16
+ - sha: 5c00df8afadd06cc5be530dde00fe6d9fa4a2e4a
17
+ description: "fix(render): write content length in Data.Render"
18
+ changed_files: 2
19
+
20
+ entry_points:
21
+ - "gin.go::Engine"
22
+ - "routergroup.go::RouterGroup"
23
+
24
+ search_queries:
25
+ - query: "Engine ServeHTTP"
26
+ expected: "gin.go::Engine"
27
+ - query: "Context request"
28
+ expected: "context.go::Context"
29
+ - query: "node tree"
30
+ expected: "tree.go::node"
31
+
32
+ multi_hop_tasks:
33
+ - id: gin-serve-http-callees
34
+ nl_query: "What does the gin engine do when serving an HTTP request"
35
+ anchor_qualified_suffix: "gin.go::engine.servehttp"
36
+ traversal_pattern: callees_of
37
+ expected_neighbor_names: ["reset"]
38
+ k: 10
39
+ - id: gin-context-next-callers
40
+ nl_query: "Who advances the gin middleware chain via Context.Next"
41
+ anchor_qualified_suffix: "context.go::context.next"
42
+ traversal_pattern: callers_of
43
+ expected_neighbor_names: ["handleHTTPRequest", "serveError"]
44
+ k: 10
45
+
46
+ # Questions for the agent_baseline benchmark (pure-python grep top-k vs graph
47
+ # query). See docs/REPRODUCING.md for the methodology.
48
+ agent_questions:
49
+ - "How does Engine.ServeHTTP route an incoming request"
50
+ - "Where does Context.Next advance the middleware chain"
51
+ - "How does the node tree match wildcard routes"
@@ -0,0 +1,48 @@
1
+ name: httpx
2
+ url: https://github.com/encode/httpx
3
+ # Pinned to the latest test_commit SHA so the snapshot is deterministic and
4
+ # every test_commit below is reachable as an ancestor.
5
+ commit: b55d4635701d9dc22928ee647880c76b078ba3f2
6
+ language: python
7
+ size_category: small
8
+
9
+ test_commits:
10
+ - sha: ae1b9f66238f75ced3ced5e4485408435de10768
11
+ description: "Expose FunctionAuth in __all__"
12
+ changed_files: 3
13
+ - sha: b55d4635701d9dc22928ee647880c76b078ba3f2
14
+ description: "Upgrade Python type checker mypy"
15
+ changed_files: 4
16
+
17
+ entry_points:
18
+ - "httpx/_client.py::Client"
19
+ - "httpx/_client.py::AsyncClient"
20
+
21
+ search_queries:
22
+ - query: "Client request"
23
+ expected: "httpx/_client.py::Client"
24
+ - query: "Response headers"
25
+ expected: "httpx/_models.py::Response"
26
+ - query: "BaseClient"
27
+ expected: "httpx/_client.py::BaseClient"
28
+
29
+ multi_hop_tasks:
30
+ - id: httpx-client-request-callers
31
+ nl_query: "Which HTTP verbs route through the httpx Client.request"
32
+ anchor_qualified_suffix: "httpx/_client.py::client.request"
33
+ traversal_pattern: callers_of
34
+ expected_neighbor_names: ["get", "options", "head", "post", "put", "patch"]
35
+ k: 10
36
+ - id: httpx-async-request-tests
37
+ nl_query: "Tests covering the httpx async client request method"
38
+ anchor_qualified_suffix: "httpx/_client.py::asyncclient.request"
39
+ traversal_pattern: callers_of
40
+ expected_neighbor_names: ["test_raise_for_status"]
41
+ k: 10
42
+
43
+ # Questions for the agent_baseline benchmark (pure-python grep top-k vs graph
44
+ # query). See docs/REPRODUCING.md for the methodology.
45
+ agent_questions:
46
+ - "How does Client.request send an HTTP request"
47
+ - "Where are Response headers parsed and decoded"
48
+ - "How does BaseClient build request URLs"
@@ -0,0 +1,301 @@
1
+ """Markdown report generator for evaluation benchmark results.
2
+
3
+ Takes a list of benchmark result dicts and produces a formatted markdown table
4
+ suitable for inclusion in documentation or CI output.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import csv
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+
14
def generate_markdown_report(results: list[dict[str, Any]]) -> str:
    """Render benchmark result dicts as a markdown report.

    Every dict should carry a ``benchmark`` key naming the benchmark; all
    other keys are treated as metric columns (e.g. ``ratio``, ``mrr``,
    ``precision``, ``recall``, ``f1``). Columns are the union of the metric
    keys across all results, in first-seen order; a metric missing from a
    given result is rendered as ``-``.

    Args:
        results: Result dicts produced by benchmark runs.

    Returns:
        Markdown text with a summary table followed by one bullet-list
        section per result, or a short placeholder report when ``results``
        is empty.
    """
    if not results:
        return "# Evaluation Report\n\nNo benchmark results to report.\n"

    # dict.fromkeys acts as an ordered set: de-duplicates while keeping the
    # order in which metric names first appear.
    metric_names = list(
        dict.fromkeys(
            key for result in results for key in result if key != "benchmark"
        )
    )

    report: list[str] = ["# Evaluation Report", "", "## Summary", ""]
    report.append("| Benchmark | " + " | ".join(metric_names) + " |")
    report.append("| --- | " + " | ".join(["---"] * len(metric_names)) + " |")
    for result in results:
        label = result.get("benchmark", "unknown")
        cells = " | ".join(str(result.get(metric, "-")) for metric in metric_names)
        report.append(f"| {label} | " + cells + " |")
    report.append("")

    report += ["## Details", ""]
    for result in results:
        label = result.get("benchmark", "unknown")
        report += [f"### {label}", ""]
        report.extend(
            f"- **{metric}**: {result.get(metric, '-')}" for metric in metric_names
        )
        report.append("")

    return "\n".join(report)
72
+
73
+
74
def _read_csvs(results_dir: Path, prefix: str) -> list[dict[str, str]]:
    """Collect the rows of every ``*_{prefix}_*.csv`` file in *results_dir*.

    Matching files are visited in sorted path order and their rows are
    concatenated into a single list of header-keyed dicts.
    """
    collected: list[dict[str, str]] = []
    pattern = f"*_{prefix}_*.csv"
    for csv_path in sorted(results_dir.glob(pattern)):
        # newline="" lets the csv module handle line endings itself.
        with csv_path.open(newline="") as handle:
            collected += list(csv.DictReader(handle))
    return collected
82
+
83
+
84
def _md_table(headers: list[str], rows: list[list[str]]) -> str:
    """Format *headers* and *rows* as a pipe-delimited markdown table.

    The result has no trailing newline; cells are emitted verbatim.
    """

    def render(cells: list[str]) -> str:
        return "| " + " | ".join(cells) + " |"

    divider = render(["---"] * len(headers))
    return "\n".join([render(headers), divider, *map(render, rows)])
92
+
93
+
94
def generate_full_report(results_dir: str | Path) -> str:
    """Generate a full markdown evaluation report from CSV result files.

    Reads the CSV files in *results_dir* for each known benchmark type and
    produces a markdown report consisting of a fixed methodology preamble
    followed by one table per benchmark type that has at least one row.
    Column headers for a table are taken from the first row read for that
    benchmark type.

    Args:
        results_dir: Directory containing CSV result files.

    Returns:
        Markdown string with the full report. When no benchmark type has
        any rows, the preamble is followed by "No benchmark results found.".
    """
    results_dir = Path(results_dir)
    lines: list[str] = [
        "# Evaluation Report",
        "",
        "## Methodology",
        "",
        "Benchmarks are run against real open-source repositories.",
        "Token counts use a consistent `len(text) // 4` approximation.",
        (
            "Impact accuracy reports two ground-truth modes: "
            "graph-derived (circular — upper bound) and co-change "
            "(files co-changed in the same commit, seed excluded)."
        ),
        (
            "Rows with `status=error` are kept for forensics but excluded "
            "from all aggregates."
        ),
        "",
    ]

    benchmark_types = [
        "token_efficiency",
        "impact_accuracy",
        "agent_baseline",
        "flow_completeness",
        "search_quality",
        "build_performance",
        "multi_hop_retrieval",
    ]

    # Track emptiness explicitly. Comparing len(lines) against a hard-coded
    # preamble length silently breaks whenever the preamble changes size.
    found_results = False

    for btype in benchmark_types:
        rows = _read_csvs(results_dir, btype)
        if not rows:
            continue
        found_results = True

        title = btype.replace("_", " ").title()
        lines.append(f"## {title}")
        lines.append("")

        headers = list(rows[0].keys())
        table_rows = [[r.get(h, "-") for h in headers] for r in rows]
        lines.append(_md_table(headers, table_rows))
        lines.append("")

    if not found_results:
        lines.append("No benchmark results found.")
        lines.append("")

    return "\n".join(lines)
155
+
156
+
157
def generate_readme_tables(results_dir: str | Path) -> str:
    """Build concise, README-ready markdown tables from CSV result files.

    Sections are emitted in this order, each only when matching CSV rows
    exist:

    - Token Efficiency
    - Accuracy & Quality (per-repo mean impact F1, flow recall, search MRR)
    - Agent Baseline (grep top-k vs graph query)
    - Performance

    Args:
        results_dir: Directory containing CSV result files.

    Returns:
        Markdown string with the available tables, or a one-line
        placeholder when no section produced any output.
    """
    base_dir = Path(results_dir)
    sections: list[str] = []

    def add_table(title: str, headers: list[str], body: list[list[str]]) -> None:
        sections.extend([title, "", _md_table(headers, body), ""])

    def pick(row: dict[str, str], keys: tuple[str, ...]) -> list[str]:
        return [row.get(key, "-") for key in keys]

    def mean_or_dash(values: list[float]) -> str:
        if not values:
            return "-"
        return str(round(sum(values) / len(values), 3))

    # Table A: Token Efficiency
    token_rows = _read_csvs(base_dir, "token_efficiency")
    if token_rows:
        token_keys = (
            "repo", "changed_files", "naive_tokens", "standard_tokens",
            "graph_tokens", "naive_to_graph_ratio", "standard_to_graph_ratio",
        )
        add_table(
            "### Token Efficiency",
            [
                "Repo", "Files", "Naive Tokens", "Standard Tokens",
                "Graph Tokens", "Naive/Graph", "Std/Graph",
            ],
            [pick(row, token_keys) for row in token_rows],
        )

    # Table B: Accuracy & Quality
    impact_rows = _read_csvs(base_dir, "impact_accuracy")
    flow_rows = _read_csvs(base_dir, "flow_completeness")
    search_rows = _read_csvs(base_dir, "search_quality")

    if impact_rows or flow_rows or search_rows:
        per_repo: dict[str, dict[str, object]] = {}
        f1_scores: dict[str, list[float]] = {}
        reciprocal_ranks: dict[str, list[float]] = {}

        for row in impact_rows:
            # Failed rows stay in the CSV for forensics but never feed a
            # headline number; rows from a non-graph-derived ground-truth
            # mode are a different metric and are left out of this column.
            failed = row.get("status", "ok") not in ("", "ok")
            mode = row.get("ground_truth_mode", "")
            other_metric = bool(mode) and not mode.startswith("graph-derived")
            if failed or other_metric:
                continue
            repo = row.get("repo", "?")
            per_repo.setdefault(repo, {})
            try:
                score = float(row.get("f1", ""))
            except (ValueError, TypeError):
                continue
            f1_scores.setdefault(repo, []).append(score)

        for row in flow_rows:
            repo = row.get("repo", "?")
            # A later row for the same repo overwrites the earlier recall.
            per_repo.setdefault(repo, {})["recall"] = row.get("recall", "-")

        for row in search_rows:
            repo = row.get("repo", "?")
            per_repo.setdefault(repo, {})
            try:
                rank = float(row.get("reciprocal_rank", 0))
            except (ValueError, TypeError):
                continue
            reciprocal_ranks.setdefault(repo, []).append(rank)

        quality_body = [
            [
                repo,
                mean_or_dash(f1_scores.get(repo, [])),
                str(per_repo[repo].get("recall", "-")),
                mean_or_dash(reciprocal_ranks.get(repo, [])),
            ]
            for repo in sorted(per_repo)
        ]
        add_table(
            "### Accuracy & Quality",
            ["Repo", "Impact F1 (graph-derived)", "Flow Recall", "Search MRR"],
            quality_body,
        )

    # Table B2: Agent Baseline (grep top-k vs graph query)
    agent_rows = _read_csvs(base_dir, "agent_baseline")
    if agent_rows:
        agent_keys = (
            "repo", "question", "baseline_tokens", "graph_tokens",
            "baseline_to_graph_ratio",
        )
        add_table(
            "### Agent Baseline (grep top-k vs graph query)",
            [
                "Repo", "Question", "Baseline Tokens", "Graph Tokens",
                "Baseline/Graph", "Status",
            ],
            [
                # A missing or empty status is reported as "ok".
                [*pick(row, agent_keys), row.get("status", "ok") or "ok"]
                for row in agent_rows
            ],
        )

    # Table C: Performance
    perf_rows = _read_csvs(base_dir, "build_performance")
    if perf_rows:
        perf_keys = (
            "repo", "file_count", "node_count",
            "flow_detection_seconds", "search_avg_ms",
        )
        add_table(
            "### Performance",
            ["Repo", "Files", "Nodes", "Flow Det. (s)", "Search (ms)"],
            [pick(row, perf_keys) for row in perf_rows],
        )

    if not sections:
        return "No benchmark results found.\n"

    return "\n".join(sections)
@@ -0,0 +1,211 @@
1
+ """Evaluation runner: orchestrates benchmark execution across repositories."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import csv
6
+ import logging
7
+ import subprocess
8
+ from datetime import date
9
+ from pathlib import Path
10
+
11
+ try:
12
+ import yaml # type: ignore[import-untyped]
13
+ except ImportError:
14
+ yaml = None # type: ignore[assignment]
15
+
16
+ from code_review_graph.eval.benchmarks import (
17
+ agent_baseline,
18
+ build_performance,
19
+ flow_completeness,
20
+ impact_accuracy,
21
+ multi_hop_retrieval,
22
+ search_quality,
23
+ token_efficiency,
24
+ )
25
+
26
logger = logging.getLogger(__name__)

# Benchmark name -> callable invoked as ``run(repo_path, store, config)``.
# Insertion order is the order benchmarks run in when none are selected.
BENCHMARK_REGISTRY = dict(
    token_efficiency=token_efficiency.run,
    impact_accuracy=impact_accuracy.run,
    flow_completeness=flow_completeness.run,
    search_quality=search_quality.run,
    build_performance=build_performance.run,
    multi_hop_retrieval=multi_hop_retrieval.run,
    agent_baseline=agent_baseline.run,
)

# Bundled per-repository YAML configs live next to this module.
CONFIGS_DIR = Path(__file__).parent.joinpath("configs")
# Default locations, relative to the current working directory.
DEFAULT_OUTPUT = Path("evaluate", "results")
DEFAULT_REPOS = Path("evaluate", "test_repos")
41
+
42
+
43
def _require_yaml():
    """Raise ``ImportError`` with an install hint when PyYAML is unavailable.

    The module-level ``yaml`` name is ``None`` when the optional import
    failed; in that case config loading cannot proceed.
    """
    if yaml is not None:
        return
    raise ImportError("pyyaml is required: pip install code-review-graph[eval]")
46
+
47
+
48
def load_config(name: str) -> dict:
    """Load a single benchmark config by name.

    Args:
        name: Config name without extension; ``<name>.yaml`` is looked up
            in ``CONFIGS_DIR``.

    Returns:
        The parsed YAML document.

    Raises:
        ImportError: If PyYAML is not installed.
        FileNotFoundError: If no config file with that name exists.
    """
    _require_yaml()
    path = CONFIGS_DIR / f"{name}.yaml"
    # The bundled configs are UTF-8 and contain non-ASCII characters, so do
    # not depend on the platform's locale encoding to decode them.
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)
54
+
55
+
56
def load_all_configs() -> list[dict]:
    """Load all benchmark configs from the configs directory.

    Returns:
        The parsed YAML documents of every ``*.yaml`` file in
        ``CONFIGS_DIR``, in sorted path order.

    Raises:
        ImportError: If PyYAML is not installed.
    """
    _require_yaml()
    configs = []
    for p in sorted(CONFIGS_DIR.glob("*.yaml")):
        # The bundled configs are UTF-8 and contain non-ASCII characters, so
        # do not depend on the platform's locale encoding to decode them.
        with open(p, encoding="utf-8") as f:
            configs.append(yaml.safe_load(f))
    return configs
64
+
65
+
66
def _run_git(args: list[str], failure: str, cwd: Path | None = None) -> None:
    """Run ``git <args>`` and raise ``RuntimeError(failure: detail)`` on failure.

    A non-zero exit status and a failure to launch git at all (for example
    the executable is missing) are both reported as ``RuntimeError``.
    """
    try:
        proc = subprocess.run(
            ["git", *args],
            cwd=str(cwd) if cwd is not None else None,
            capture_output=True,
            text=True,
        )
    except OSError as exc:
        raise RuntimeError(f"{failure}: {exc}") from exc
    if proc.returncode != 0:
        raise RuntimeError(f"{failure}: {proc.stderr.strip()}")


def clone_or_update(config: dict, repos_dir: Path | None = None) -> Path:
    """Clone or update a repository at the config's pinned ``commit`` SHA.

    Full clones (no ``--depth``) are required: the pinned ``test_commits`` are
    often older than any reasonable shallow-clone window, and a missed SHA
    used to silently fall back to ``git diff HEAD~1 HEAD`` — producing
    benchmark numbers tied to whatever upstream HEAD looked like that day.

    Every git invocation is checked; failures raise ``RuntimeError`` so
    reproducibility issues surface immediately instead of yielding garbage
    results.

    Args:
        config: Repo config; reads ``name``, ``url`` and optional ``commit``
            (defaults to ``"HEAD"``, in which case nothing is checked out).
        repos_dir: Directory holding the working copies; defaults to
            ``DEFAULT_REPOS``. Created if missing.

    Returns:
        Path of the working copy, ``repos_dir / config["name"]``.

    Raises:
        RuntimeError: If git cannot be run or fetch/clone/checkout fails.
    """
    if repos_dir is None:
        repos_dir = DEFAULT_REPOS
    repos_dir.mkdir(parents=True, exist_ok=True)
    repo_path = repos_dir / config["name"]

    if repo_path.exists():
        # Existing working copy: refresh all remotes and tags so that the
        # pinned SHA is available locally.
        _run_git(
            ["fetch", "--all", "--tags"],
            f"git fetch failed in {repo_path}",
            cwd=repo_path,
        )
    else:
        _run_git(
            ["clone", config["url"], str(repo_path)],
            f"git clone failed for {config['url']}",
        )

    commit = config.get("commit", "HEAD")
    if commit != "HEAD":
        _run_git(
            ["checkout", commit],
            f"git checkout {commit} failed in {repo_path}",
            cwd=repo_path,
        )

    return repo_path
119
+
120
+
121
def write_csv(results: list[dict], path: Path) -> None:
    """Write benchmark results to a CSV file.

    Does nothing when *results* is empty. Parent directories of *path* are
    created as needed. The header is the union of the keys of all rows in
    first-seen order, so rows that carry extra fields do not abort the write
    part-way through; keys missing from a row are written as empty cells.

    Args:
        results: Result dicts, one per CSV row.
        path: Destination file; overwritten if it exists.
    """
    if not results:
        return
    path.parent.mkdir(parents=True, exist_ok=True)
    # dict.fromkeys acts as an ordered set. For uniform rows this equals the
    # first row's keys.
    fieldnames = list(dict.fromkeys(key for row in results for key in row))
    with open(path, "w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)
131
+
132
+
133
def run_eval(
    repos: list[str] | None = None,
    benchmarks: list[str] | None = None,
    output_dir: str | Path | None = None,
) -> dict[str, list[dict]]:
    """Run evaluation benchmarks across repositories.

    For each selected repo config the repository is cloned or updated, a
    graph is built and post-processed, and every selected benchmark is run
    against it. Each benchmark's results are written to
    ``<output_dir>/<repo>_<benchmark>_<today>.csv``. A benchmark that raises
    is logged and recorded with an empty result list; it does not stop the
    remaining benchmarks.

    Args:
        repos: List of repo config names to evaluate (None = all).
        benchmarks: List of benchmark names to run (None = all). Names not
            in ``BENCHMARK_REGISTRY`` are skipped with a warning.
        output_dir: Directory for CSV output files (None = ``DEFAULT_OUTPUT``).

    Returns:
        Dict mapping ``{repo}_{benchmark}`` to list of result dicts.
    """
    output_dir = Path(output_dir) if output_dir else DEFAULT_OUTPUT
    output_dir.mkdir(parents=True, exist_ok=True)

    if repos:
        configs = [load_config(r) for r in repos]
    else:
        configs = load_all_configs()

    benchmark_names = benchmarks or list(BENCHMARK_REGISTRY.keys())
    all_results: dict[str, list[dict]] = {}
    today = date.today().isoformat()

    # Kept function-local as in the original, but resolved once rather than
    # on every loop iteration.
    from code_review_graph.graph import GraphStore
    from code_review_graph.incremental import full_build, get_db_path
    from code_review_graph.postprocessing import run_post_processing

    for config in configs:
        name = config["name"]
        logger.info("Evaluating %s...", name)

        # Resolve the repo path to an absolute Path before handing it to
        # full_build / get_db_path so the stored qualified_names match what
        # the CLI/MCP layer produces (those paths go through _get_store ->
        # _validate_repo_root which .resolve()s). Without this, a later
        # ``code-review-graph update --repo <relative>`` writes the same
        # function under a new absolute-prefixed qualified_name, leaving the
        # graph with duplicate nodes for the same source location.
        repo_path = clone_or_update(config).resolve()

        db_path = get_db_path(repo_path)
        store = GraphStore(db_path)
        # Close the store even when the build or post-processing raises, so
        # a failing repo does not leak it.
        try:
            full_build(repo_path, store)
            # full_build is the parsing-only primitive; the higher-level
            # CLI/MCP wrappers run postprocessing on top. The eval framework
            # bypasses those, so call it directly here. Without this, FTS5
            # stays empty and downstream benchmarks (token_efficiency,
            # search_quality) silently produce useless results.
            # See: search.rebuild_fts_index.
            pp_result = run_post_processing(store)
            for warning in pp_result.get("warnings", []):
                logger.warning("  postprocessing: %s", warning)

            for bench_name in benchmark_names:
                if bench_name not in BENCHMARK_REGISTRY:
                    logger.warning("Unknown benchmark: %s", bench_name)
                    continue

                logger.info("  Running %s...", bench_name)
                key = f"{name}_{bench_name}"
                try:
                    bench_fn = BENCHMARK_REGISTRY[bench_name]
                    results = bench_fn(repo_path, store, config)

                    all_results[key] = results
                    write_csv(results, output_dir / f"{key}_{today}.csv")
                    logger.info("  %s: %d result(s)", bench_name, len(results))
                except Exception as e:
                    # Best-effort boundary: one failing benchmark must not
                    # abort the rest. Log with traceback so the cause is
                    # diagnosable.
                    logger.exception("  %s failed: %s", bench_name, e)
                    all_results[key] = []
        finally:
            store.close()

    return all_results