codeprobe 0.1.6__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codeprobe-0.1.6 → codeprobe-0.2.0}/PKG-INFO +1 -1
- {codeprobe-0.1.6 → codeprobe-0.2.0}/pyproject.toml +1 -1
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/__init__.py +1 -1
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/claude.py +18 -2
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/api.py +16 -3
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/__init__.py +108 -1
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/experiment_cmd.py +4 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/init_cmd.py +72 -1
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/mine_cmd.py +52 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/probe_cmd.py +11 -9
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/run_cmd.py +23 -10
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/wizard.py +53 -4
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/checkpoint.py +40 -23
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/executor.py +32 -12
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/experiment.py +6 -1
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/isolation.py +9 -1
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/preamble.py +7 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale.py +331 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale_families.py +106 -1
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale_scanner.py +464 -0
- codeprobe-0.2.0/src/codeprobe/mining/sg_ground_truth.py +163 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/writer.py +81 -32
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/models/experiment.py +1 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/models/task.py +1 -0
- codeprobe-0.2.0/src/codeprobe/preambles/github.md +21 -0
- codeprobe-0.2.0/src/codeprobe/preambles/sourcegraph.md +44 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/probe/generator.py +60 -5
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/probe/writer.py +8 -2
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/templates/evalrc-mcp-comparison.yaml +6 -6
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe.egg-info/PKG-INFO +1 -1
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe.egg-info/SOURCES.txt +5 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_adapters.py +31 -0
- codeprobe-0.2.0/tests/test_changed_symbols.py +241 -0
- codeprobe-0.2.0/tests/test_cli.py +201 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_executor.py +9 -1
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_experiment_core.py +100 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_init_wizard.py +173 -2
- codeprobe-0.2.0/tests/test_mcp_families_mining.py +278 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_new_families.py +29 -3
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_org_scale.py +6 -7
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_preamble.py +36 -13
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_probe.py +351 -0
- codeprobe-0.2.0/tests/test_sg_ground_truth.py +318 -0
- codeprobe-0.1.6/src/codeprobe/preambles/sourcegraph.md +0 -32
- codeprobe-0.1.6/tests/test_cli.py +0 -51
- {codeprobe-0.1.6 → codeprobe-0.2.0}/LICENSE +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/README.md +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/setup.cfg +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/__main__.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/__init__.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/_base.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/aider.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/codex.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/copilot.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/openai_compat.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/protocol.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/session.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/telemetry.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/analysis/__init__.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/analysis/ranking.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/analysis/report.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/analysis/stats.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/assess/__init__.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/assess/heuristics.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/assess_cmd.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/interpret_cmd.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/ratings_cmd.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/scaffold_cmd.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/yaml_writer.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/config/__init__.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/config/loader.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/__init__.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/_shared.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/adaptive.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/counterfactual.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/debate.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/decision_tree.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/elo.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/fingerprint.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/mutation.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/pareto.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/sprt.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/tournament.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/__init__.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/llm.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/registry.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/sandbox.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/scoring.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/loaders/__init__.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/__init__.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/_lang.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/curator.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/curator_backends.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/curator_tiers.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/extractor.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale_oracle.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale_validate.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/sources.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/models/__init__.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/models/evalrc.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/models/preamble.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/preambles/__init__.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/probe/__init__.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/ratings/__init__.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/ratings/collector.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/scaffold/__init__.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/scaffold/writer.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/templates/__init__.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/templates/evalrc-model-comparison.yaml +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/templates/evalrc-prompt-comparison.yaml +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe.egg-info/dependency_links.txt +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe.egg-info/entry_points.txt +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe.egg-info/requires.txt +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe.egg-info/top_level.txt +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_analysis.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_api.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_assess.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_checkpoint.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_config_loader.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_contrib.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_curator_backends.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_curator_core.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_curator_integration.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_curator_tiers.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_experiment_cmd.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_llm.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_loaders.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_mcp_validate.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_mining.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_models.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_openai_compat.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_oracle_types.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_pipeline_integration.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_ratings.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_ratings_cmd.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_registry.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_scaffold.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_scanner_refactor.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_scoring.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_session.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_telemetry.py +0 -0
- {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_weighted_f1.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codeprobe
|
|
3
|
-
Version: 0.1.6
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
|
|
5
5
|
Author: codeprobe contributors
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -4,6 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
6
|
import re
|
|
7
|
+
import shutil
|
|
7
8
|
import subprocess
|
|
8
9
|
import tempfile
|
|
9
10
|
from pathlib import Path
|
|
@@ -69,16 +70,31 @@ class ClaudeAdapter(BaseAdapter):
|
|
|
69
70
|
|
|
70
71
|
mcp_path = self._write_mcp_config(config)
|
|
71
72
|
if mcp_path:
|
|
72
|
-
cmd.extend(["--mcp-config", mcp_path])
|
|
73
|
+
cmd.extend(["--mcp-config", mcp_path, "--strict-mcp-config"])
|
|
73
74
|
|
|
74
75
|
return cmd
|
|
75
76
|
|
|
76
77
|
def isolate_session(self, slot_id: int) -> dict[str, str]:
|
|
77
|
-
"""Return a per-slot CLAUDE_CONFIG_DIR for session isolation."""
|
|
78
|
+
"""Return a per-slot CLAUDE_CONFIG_DIR for session isolation.
|
|
79
|
+
|
|
80
|
+
Copies authentication credentials from the real ``~/.claude/``
|
|
81
|
+
directory so the agent subprocess can authenticate.
|
|
82
|
+
"""
|
|
78
83
|
config_dir = (
|
|
79
84
|
Path(tempfile.gettempdir()) / "codeprobe-claude" / f"slot-{slot_id}"
|
|
80
85
|
)
|
|
81
86
|
config_dir.mkdir(parents=True, exist_ok=True)
|
|
87
|
+
|
|
88
|
+
# Copy auth credentials from the user's real config dir.
|
|
89
|
+
# Without these the subprocess gets "Not logged in".
|
|
90
|
+
real_config = Path.home() / ".claude"
|
|
91
|
+
if real_config.is_dir():
|
|
92
|
+
for name in ("credentials.json", ".credentials.json"):
|
|
93
|
+
src = real_config / name
|
|
94
|
+
dst = config_dir / name
|
|
95
|
+
if src.is_file():
|
|
96
|
+
shutil.copy2(src, dst)
|
|
97
|
+
|
|
82
98
|
return {"CLAUDE_CONFIG_DIR": str(config_dir)}
|
|
83
99
|
|
|
84
100
|
def parse_output(
|
|
@@ -58,11 +58,24 @@ def _build_experiment_config(raw: dict) -> ExperimentConfig:
|
|
|
58
58
|
)
|
|
59
59
|
|
|
60
60
|
|
|
61
|
-
def _discover_task_dirs(
|
|
62
|
-
|
|
61
|
+
def _discover_task_dirs(
|
|
62
|
+
tasks_dir: Path, *, task_ids: tuple[str, ...] = ()
|
|
63
|
+
) -> list[Path]:
|
|
64
|
+
"""Find valid task directories (those containing instruction.md).
|
|
65
|
+
|
|
66
|
+
When *task_ids* is non-empty, only return tasks whose directory name
|
|
67
|
+
appears in that tuple.
|
|
68
|
+
"""
|
|
63
69
|
if not tasks_dir.is_dir():
|
|
64
70
|
raise FileNotFoundError(f"Tasks directory not found: {tasks_dir}")
|
|
65
71
|
|
|
72
|
+
if task_ids:
|
|
73
|
+
allowed = set(task_ids)
|
|
74
|
+
return sorted(
|
|
75
|
+
d
|
|
76
|
+
for d in tasks_dir.iterdir()
|
|
77
|
+
if d.is_dir() and d.name in allowed and (d / "instruction.md").exists()
|
|
78
|
+
)
|
|
66
79
|
return sorted(
|
|
67
80
|
d for d in tasks_dir.iterdir() if d.is_dir() and (d / "instruction.md").exists()
|
|
68
81
|
)
|
|
@@ -103,7 +116,7 @@ def run_experiment(
|
|
|
103
116
|
experiment = load_experiment(experiment_dir)
|
|
104
117
|
|
|
105
118
|
tasks_dir = experiment_dir / experiment.tasks_dir
|
|
106
|
-
task_dirs = _discover_task_dirs(tasks_dir)
|
|
119
|
+
task_dirs = _discover_task_dirs(tasks_dir, task_ids=experiment.task_ids)
|
|
107
120
|
|
|
108
121
|
if not task_dirs:
|
|
109
122
|
raise ValueError(
|
|
@@ -1,18 +1,89 @@
|
|
|
1
1
|
"""CLI entry point for codeprobe."""
|
|
2
2
|
|
|
3
|
+
import json as _json
|
|
4
|
+
import logging
|
|
5
|
+
import sys
|
|
6
|
+
|
|
3
7
|
import click
|
|
4
8
|
|
|
5
9
|
from codeprobe import __version__
|
|
6
10
|
|
|
7
11
|
|
|
12
|
+
class _JsonFormatter(logging.Formatter):
|
|
13
|
+
"""Emit one JSON object per log line."""
|
|
14
|
+
|
|
15
|
+
def format(self, record: logging.LogRecord) -> str:
|
|
16
|
+
payload = {
|
|
17
|
+
"level": record.levelname,
|
|
18
|
+
"logger": record.name,
|
|
19
|
+
"message": record.getMessage(),
|
|
20
|
+
"timestamp": self.formatTime(record, "%Y-%m-%dT%H:%M:%S%z"),
|
|
21
|
+
}
|
|
22
|
+
return _json.dumps(payload)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _configure_logging(verbose: int, quiet: bool, log_format: str = "text") -> None:
|
|
26
|
+
"""Configure namespace-scoped logging for codeprobe.* modules.
|
|
27
|
+
|
|
28
|
+
Attaches a StreamHandler to `logging.getLogger("codeprobe")` so that
|
|
29
|
+
all 26+ codeprobe.* modules emit through hierarchy without touching
|
|
30
|
+
third-party loggers (httpx, urllib3, etc.).
|
|
31
|
+
"""
|
|
32
|
+
if quiet:
|
|
33
|
+
level = logging.WARNING
|
|
34
|
+
elif verbose >= 1:
|
|
35
|
+
level = logging.DEBUG
|
|
36
|
+
else:
|
|
37
|
+
level = logging.INFO
|
|
38
|
+
|
|
39
|
+
logger = logging.getLogger("codeprobe")
|
|
40
|
+
logger.setLevel(level)
|
|
41
|
+
logger.propagate = False # don't bubble to root
|
|
42
|
+
|
|
43
|
+
# Idempotent: tests / repeat invocations must not duplicate handlers.
|
|
44
|
+
for h in list(logger.handlers):
|
|
45
|
+
logger.removeHandler(h)
|
|
46
|
+
|
|
47
|
+
handler = logging.StreamHandler(sys.stderr)
|
|
48
|
+
if log_format == "json":
|
|
49
|
+
handler.setFormatter(_JsonFormatter())
|
|
50
|
+
elif verbose >= 1:
|
|
51
|
+
fmt = "%(levelname)s %(name)s: %(message)s"
|
|
52
|
+
handler.setFormatter(logging.Formatter(fmt))
|
|
53
|
+
else:
|
|
54
|
+
fmt = "%(levelname)s: %(message)s"
|
|
55
|
+
handler.setFormatter(logging.Formatter(fmt))
|
|
56
|
+
logger.addHandler(handler)
|
|
57
|
+
|
|
58
|
+
|
|
8
59
|
@click.group()
|
|
60
|
+
@click.option(
|
|
61
|
+
"-v",
|
|
62
|
+
"--verbose",
|
|
63
|
+
count=True,
|
|
64
|
+
help="Increase log verbosity (-v sets DEBUG).",
|
|
65
|
+
)
|
|
66
|
+
@click.option(
|
|
67
|
+
"-q",
|
|
68
|
+
"--quiet",
|
|
69
|
+
is_flag=True,
|
|
70
|
+
default=False,
|
|
71
|
+
help="Suppress INFO logs (WARNING and above only).",
|
|
72
|
+
)
|
|
73
|
+
@click.option(
|
|
74
|
+
"--log-format",
|
|
75
|
+
type=click.Choice(["text", "json"]),
|
|
76
|
+
default="text",
|
|
77
|
+
help="Log output format (default: text). 'json' emits one JSON object per line.",
|
|
78
|
+
)
|
|
9
79
|
@click.version_option(version=__version__, prog_name="codeprobe")
|
|
10
|
-
def main() -> None:
|
|
80
|
+
def main(verbose: int, quiet: bool, log_format: str) -> None:
|
|
11
81
|
"""Benchmark AI coding agents against your own codebase.
|
|
12
82
|
|
|
13
83
|
Mine real tasks from your repo history, run agents against them,
|
|
14
84
|
and interpret the results to find which setup works best for YOUR code.
|
|
15
85
|
"""
|
|
86
|
+
_configure_logging(verbose=verbose, quiet=quiet, log_format=log_format)
|
|
16
87
|
|
|
17
88
|
|
|
18
89
|
@main.command()
|
|
@@ -121,6 +192,20 @@ def init(path: str) -> None:
|
|
|
121
192
|
default=False,
|
|
122
193
|
help="Run LLM verification on curated ground truth.",
|
|
123
194
|
)
|
|
195
|
+
@click.option(
|
|
196
|
+
"--mcp-families",
|
|
197
|
+
is_flag=True,
|
|
198
|
+
default=False,
|
|
199
|
+
help="Include MCP-advantaged task families (symbol-reference-trace, "
|
|
200
|
+
"type-hierarchy-consumers, change-scope-audit). Only with --org-scale.",
|
|
201
|
+
)
|
|
202
|
+
@click.option(
|
|
203
|
+
"--sg-repo",
|
|
204
|
+
default="",
|
|
205
|
+
help="Sourcegraph repo identifier for ground truth enrichment "
|
|
206
|
+
"(e.g. github.com/sg-evals/numpy). Defaults to github.com/sg-evals/{repo_name} "
|
|
207
|
+
"when --mcp-families is used. Requires SOURCEGRAPH_TOKEN env var.",
|
|
208
|
+
)
|
|
124
209
|
def mine(
|
|
125
210
|
path: str,
|
|
126
211
|
count: int,
|
|
@@ -139,6 +224,8 @@ def mine(
|
|
|
139
224
|
curate: bool,
|
|
140
225
|
backends: tuple[str, ...],
|
|
141
226
|
verify_curation_flag: bool,
|
|
227
|
+
mcp_families: bool,
|
|
228
|
+
sg_repo: str,
|
|
142
229
|
) -> None:
|
|
143
230
|
"""Mine eval tasks from a repository's history.
|
|
144
231
|
|
|
@@ -175,6 +262,8 @@ def mine(
|
|
|
175
262
|
curate=curate,
|
|
176
263
|
backends=backends,
|
|
177
264
|
verify_curation_flag=verify_curation_flag,
|
|
265
|
+
mcp_families=mcp_families,
|
|
266
|
+
sg_repo=sg_repo,
|
|
178
267
|
)
|
|
179
268
|
|
|
180
269
|
|
|
@@ -272,6 +361,20 @@ def init_experiment(path: str, name: str, description: str) -> None:
|
|
|
272
361
|
@click.option(
|
|
273
362
|
"--mcp-config", default=None, help="MCP config as JSON string or file path."
|
|
274
363
|
)
|
|
364
|
+
@click.option(
|
|
365
|
+
"--instruction-variant",
|
|
366
|
+
default=None,
|
|
367
|
+
help="Instruction file variant (e.g., instruction_mcp.md). Default: instruction.md.",
|
|
368
|
+
)
|
|
369
|
+
@click.option(
|
|
370
|
+
"--preamble",
|
|
371
|
+
"preambles",
|
|
372
|
+
multiple=True,
|
|
373
|
+
help=(
|
|
374
|
+
"Preamble to prepend to the instruction. Repeatable. "
|
|
375
|
+
"Built-ins: sourcegraph, github. Or path to a custom .md file."
|
|
376
|
+
),
|
|
377
|
+
)
|
|
275
378
|
def add_config(
|
|
276
379
|
path: str,
|
|
277
380
|
label: str,
|
|
@@ -279,6 +382,8 @@ def add_config(
|
|
|
279
382
|
model: str | None,
|
|
280
383
|
permission_mode: str,
|
|
281
384
|
mcp_config: str | None,
|
|
385
|
+
instruction_variant: str | None,
|
|
386
|
+
preambles: tuple[str, ...],
|
|
282
387
|
) -> None:
|
|
283
388
|
"""Add a configuration to an existing experiment."""
|
|
284
389
|
from codeprobe.cli.experiment_cmd import experiment_add_config
|
|
@@ -290,6 +395,8 @@ def add_config(
|
|
|
290
395
|
model=model,
|
|
291
396
|
permission_mode=permission_mode,
|
|
292
397
|
mcp_config_str=mcp_config,
|
|
398
|
+
instruction_variant=instruction_variant,
|
|
399
|
+
preambles=preambles,
|
|
293
400
|
)
|
|
294
401
|
|
|
295
402
|
|
|
@@ -63,6 +63,8 @@ def experiment_add_config(
|
|
|
63
63
|
model: str | None,
|
|
64
64
|
permission_mode: str,
|
|
65
65
|
mcp_config_str: str | None,
|
|
66
|
+
instruction_variant: str | None = None,
|
|
67
|
+
preambles: tuple[str, ...] = (),
|
|
66
68
|
) -> None:
|
|
67
69
|
"""Add a configuration to an existing experiment."""
|
|
68
70
|
exp_dir = Path(path)
|
|
@@ -104,6 +106,8 @@ def experiment_add_config(
|
|
|
104
106
|
model=model,
|
|
105
107
|
permission_mode=permission_mode,
|
|
106
108
|
mcp_config=mcp_config,
|
|
109
|
+
instruction_variant=instruction_variant,
|
|
110
|
+
preambles=preambles,
|
|
107
111
|
)
|
|
108
112
|
|
|
109
113
|
# Validate the label is a safe path component
|
|
@@ -173,12 +173,83 @@ def _prompt_mcp_config() -> str:
|
|
|
173
173
|
click.echo(f" Error: '{expanded}' does not exist. Try again.")
|
|
174
174
|
|
|
175
175
|
|
|
176
|
+
def _detect_sourcegraph_in_mcp(
|
|
177
|
+
discovered: list[tuple[Path, list[str]]],
|
|
178
|
+
mcp_data: dict | None = None,
|
|
179
|
+
) -> bool:
|
|
180
|
+
"""Return True if any discovered MCP config contains a Sourcegraph server.
|
|
181
|
+
|
|
182
|
+
Checks server names for common Sourcegraph patterns (e.g.
|
|
183
|
+
``sourcegraph``, ``sourcegraph-mcp-server``).
|
|
184
|
+
"""
|
|
185
|
+
sg_names = {"sourcegraph", "sourcegraph-mcp-server"}
|
|
186
|
+
for _path, server_names in discovered:
|
|
187
|
+
for name in server_names:
|
|
188
|
+
if name.lower() in sg_names:
|
|
189
|
+
return True
|
|
190
|
+
if mcp_data:
|
|
191
|
+
for name in mcp_data.get("mcpServers", {}):
|
|
192
|
+
if name.lower() in sg_names:
|
|
193
|
+
return True
|
|
194
|
+
return False
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _prompt_sourcegraph_token() -> str:
|
|
198
|
+
"""Prompt for Sourcegraph access token, checking env var first."""
|
|
199
|
+
import os
|
|
200
|
+
|
|
201
|
+
env_token = os.environ.get("SOURCEGRAPH_TOKEN", "")
|
|
202
|
+
if env_token:
|
|
203
|
+
masked = env_token[:4] + "..." + env_token[-4:] if len(env_token) > 8 else "***"
|
|
204
|
+
click.echo(f" Found SOURCEGRAPH_TOKEN in environment ({masked})")
|
|
205
|
+
if click.confirm(" Use this token?", default=True):
|
|
206
|
+
return env_token
|
|
207
|
+
|
|
208
|
+
return click.prompt("Sourcegraph access token")
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _prompt_sourcegraph_url() -> str | None:
|
|
212
|
+
"""Prompt for optional custom Sourcegraph instance URL."""
|
|
213
|
+
url = click.prompt(
|
|
214
|
+
"Sourcegraph URL (press Enter for sourcegraph.com)",
|
|
215
|
+
default="",
|
|
216
|
+
show_default=False,
|
|
217
|
+
)
|
|
218
|
+
return url if url else None
|
|
219
|
+
|
|
220
|
+
|
|
176
221
|
def _goal_mcp(agents: list[str], name: str) -> _Result:
|
|
177
222
|
"""Goal 1: MCP comparison prompts."""
|
|
178
223
|
agent = _prompt_agent(agents)
|
|
179
224
|
model = _prompt_model()
|
|
180
|
-
mcp_path = _prompt_mcp_config()
|
|
181
225
|
|
|
226
|
+
# Check if Sourcegraph is available in discovered MCP configs
|
|
227
|
+
discovered = _discover_mcp_configs()
|
|
228
|
+
use_sourcegraph = False
|
|
229
|
+
|
|
230
|
+
if _detect_sourcegraph_in_mcp(discovered):
|
|
231
|
+
click.echo()
|
|
232
|
+
click.echo("Detected Sourcegraph MCP server in your configuration.")
|
|
233
|
+
click.echo("codeprobe can use the HTTP endpoint for better performance.")
|
|
234
|
+
use_sourcegraph = click.confirm("Use Sourcegraph HTTP MCP?", default=True)
|
|
235
|
+
else:
|
|
236
|
+
click.echo()
|
|
237
|
+
click.echo("Would you like to use Sourcegraph as the MCP server?")
|
|
238
|
+
use_sourcegraph = click.confirm("Use Sourcegraph?", default=False)
|
|
239
|
+
|
|
240
|
+
if use_sourcegraph:
|
|
241
|
+
token = _prompt_sourcegraph_token()
|
|
242
|
+
sg_url = _prompt_sourcegraph_url()
|
|
243
|
+
return ask_mcp_comparison(
|
|
244
|
+
experiment_name=name,
|
|
245
|
+
agent=agent,
|
|
246
|
+
model=model,
|
|
247
|
+
sourcegraph_token=token,
|
|
248
|
+
sourcegraph_url=sg_url,
|
|
249
|
+
)
|
|
250
|
+
|
|
251
|
+
# Fall back to generic MCP config path
|
|
252
|
+
mcp_path = _prompt_mcp_config()
|
|
182
253
|
return ask_mcp_comparison(
|
|
183
254
|
experiment_name=name,
|
|
184
255
|
agent=agent,
|
|
@@ -403,6 +403,41 @@ def _clear_tasks_dir(repo_path: Path) -> Path:
|
|
|
403
403
|
return tasks_dir
|
|
404
404
|
|
|
405
405
|
|
|
406
|
+
def _record_task_ids_in_experiment(repo_path: Path, task_ids: list[str]) -> None:
|
|
407
|
+
"""Update the experiment's task_ids so ``run`` only executes these tasks.
|
|
408
|
+
|
|
409
|
+
If exactly one experiment exists under ``<repo>/.codeprobe/``, its
|
|
410
|
+
``experiment.json`` is updated with the new task ID list. When zero
|
|
411
|
+
or multiple experiments exist, this is a no-op (the user must scope
|
|
412
|
+
manually via ``--config``).
|
|
413
|
+
"""
|
|
414
|
+
from codeprobe.core.experiment import load_experiment, save_experiment
|
|
415
|
+
from codeprobe.models.experiment import Experiment
|
|
416
|
+
|
|
417
|
+
codeprobe_dir = repo_path / ".codeprobe"
|
|
418
|
+
if not codeprobe_dir.is_dir():
|
|
419
|
+
return
|
|
420
|
+
|
|
421
|
+
candidates = sorted(
|
|
422
|
+
d
|
|
423
|
+
for d in codeprobe_dir.iterdir()
|
|
424
|
+
if d.is_dir() and (d / "experiment.json").is_file()
|
|
425
|
+
)
|
|
426
|
+
if len(candidates) != 1:
|
|
427
|
+
return
|
|
428
|
+
|
|
429
|
+
exp_dir = candidates[0]
|
|
430
|
+
experiment = load_experiment(exp_dir)
|
|
431
|
+
updated = Experiment(
|
|
432
|
+
name=experiment.name,
|
|
433
|
+
description=experiment.description,
|
|
434
|
+
configs=experiment.configs,
|
|
435
|
+
tasks_dir=experiment.tasks_dir,
|
|
436
|
+
task_ids=tuple(sorted(task_ids)),
|
|
437
|
+
)
|
|
438
|
+
save_experiment(exp_dir, updated)
|
|
439
|
+
|
|
440
|
+
|
|
406
441
|
def _resolve_repo_path(path: str) -> Path:
|
|
407
442
|
"""Resolve a path or URL to a local repo directory."""
|
|
408
443
|
if _is_git_url(path):
|
|
@@ -483,6 +518,8 @@ def run_mine(
|
|
|
483
518
|
curate: bool = False,
|
|
484
519
|
backends: tuple[str, ...] = (),
|
|
485
520
|
verify_curation_flag: bool = False,
|
|
521
|
+
mcp_families: bool = False,
|
|
522
|
+
sg_repo: str = "",
|
|
486
523
|
) -> None:
|
|
487
524
|
"""Mine eval tasks from a repository."""
|
|
488
525
|
from codeprobe.mining import mine_tasks, write_task_dir
|
|
@@ -518,6 +555,8 @@ def run_mine(
|
|
|
518
555
|
curate=curate,
|
|
519
556
|
backends=backends,
|
|
520
557
|
verify_curation_flag=verify_curation_flag,
|
|
558
|
+
mcp_families=mcp_families,
|
|
559
|
+
sg_repo=sg_repo,
|
|
521
560
|
)
|
|
522
561
|
return
|
|
523
562
|
|
|
@@ -571,6 +610,8 @@ def run_mine(
|
|
|
571
610
|
for task in tasks:
|
|
572
611
|
write_task_dir(task, tasks_dir, repo_path)
|
|
573
612
|
|
|
613
|
+
_record_task_ids_in_experiment(repo_path, [t.id for t in tasks])
|
|
614
|
+
|
|
574
615
|
_show_results_table(tasks)
|
|
575
616
|
|
|
576
617
|
warnings = _quality_review(tasks, goal_name, bias)
|
|
@@ -603,6 +644,8 @@ def _run_org_scale_mine(
|
|
|
603
644
|
curate: bool = False,
|
|
604
645
|
backends: tuple[str, ...] = (),
|
|
605
646
|
verify_curation_flag: bool = False,
|
|
647
|
+
mcp_families: bool = False,
|
|
648
|
+
sg_repo: str = "",
|
|
606
649
|
) -> None:
|
|
607
650
|
"""Mine org-scale comprehension tasks with oracle verification."""
|
|
608
651
|
from codeprobe.mining.org_scale import mine_org_scale_tasks
|
|
@@ -631,12 +674,19 @@ def _run_org_scale_mine(
|
|
|
631
674
|
click.echo("No families selected. Aborted.")
|
|
632
675
|
return
|
|
633
676
|
|
|
677
|
+
# Default sg_repo from primary repo name if not explicitly provided
|
|
678
|
+
effective_sg_repo = sg_repo
|
|
679
|
+
if not effective_sg_repo and mcp_families:
|
|
680
|
+
effective_sg_repo = f"github.com/sg-evals/{repo_paths[0].name}"
|
|
681
|
+
|
|
634
682
|
result = mine_org_scale_tasks(
|
|
635
683
|
repo_paths,
|
|
636
684
|
count=count,
|
|
637
685
|
families=selected_families,
|
|
638
686
|
no_llm=no_llm,
|
|
639
687
|
scan_timeout=scan_timeout,
|
|
688
|
+
include_mcp_families=mcp_families,
|
|
689
|
+
sg_repo=effective_sg_repo,
|
|
640
690
|
)
|
|
641
691
|
|
|
642
692
|
if not result.tasks:
|
|
@@ -682,6 +732,8 @@ def _run_org_scale_mine(
|
|
|
682
732
|
curation_backends=curation_backends_used,
|
|
683
733
|
)
|
|
684
734
|
|
|
735
|
+
_record_task_ids_in_experiment(primary_repo, [t.id for t in curated_tasks])
|
|
736
|
+
|
|
685
737
|
_show_org_scale_results(
|
|
686
738
|
curated_tasks, tasks_dir, primary_repo, curation_backends_used
|
|
687
739
|
)
|
|
@@ -3,10 +3,13 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
|
+
import logging
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
|
|
8
9
|
import click
|
|
9
10
|
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
10
13
|
from codeprobe.probe.generator import DEFAULT_COUNT, MAX_PROBES, MIN_PROBES
|
|
11
14
|
|
|
12
15
|
|
|
@@ -76,7 +79,7 @@ def probe(
|
|
|
76
79
|
output_dir = Path(output) if output else repo_root / "probes"
|
|
77
80
|
effective_repo_name = repo_name or repo_root.name
|
|
78
81
|
|
|
79
|
-
|
|
82
|
+
logger.info("Scanning %s for symbols...", repo_root)
|
|
80
83
|
probes = generate_probes(
|
|
81
84
|
repo_root=repo_root,
|
|
82
85
|
count=count,
|
|
@@ -85,12 +88,11 @@ def probe(
|
|
|
85
88
|
)
|
|
86
89
|
|
|
87
90
|
if not probes:
|
|
88
|
-
|
|
91
|
+
logger.warning("No probes generated -- no suitable symbols found.")
|
|
89
92
|
raise SystemExit(1)
|
|
90
93
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
err=True,
|
|
94
|
+
logger.info(
|
|
95
|
+
"Generated %d probes, writing to %s...", len(probes), output_dir
|
|
94
96
|
)
|
|
95
97
|
created = write_probe_tasks(probes, output_dir, effective_repo_name)
|
|
96
98
|
|
|
@@ -108,9 +110,9 @@ def probe(
|
|
|
108
110
|
}
|
|
109
111
|
click.echo(json.dumps(summary, indent=2))
|
|
110
112
|
else:
|
|
111
|
-
|
|
112
|
-
|
|
113
|
+
logger.info("Probe generation complete:")
|
|
114
|
+
logger.info(" Total probes: %d", len(probes))
|
|
113
115
|
for tpl_name, tpl_count in sorted(by_template.items()):
|
|
114
|
-
|
|
115
|
-
|
|
116
|
+
logger.info(" %s: %d", tpl_name, tpl_count)
|
|
117
|
+
logger.info(" Output: %s", output_dir)
|
|
116
118
|
click.echo(f"Created {len(created)} probe tasks in {output_dir}")
|
|
@@ -22,6 +22,27 @@ def _on_task_complete(result: CompletedTask) -> None:
|
|
|
22
22
|
click.echo(f" {result.task_id}: {status} ({result.duration_seconds:.1f}s)")
|
|
23
23
|
|
|
24
24
|
|
|
25
|
+
def _find_tasks(d: Path, *, task_ids: tuple[str, ...] = ()) -> list[Path]:
|
|
26
|
+
"""Discover task subdirectories with instruction.md.
|
|
27
|
+
|
|
28
|
+
When *task_ids* is non-empty, only return tasks whose directory name
|
|
29
|
+
appears in that tuple. This scopes task discovery to the current
|
|
30
|
+
experiment, preventing tasks from other experiments from leaking in.
|
|
31
|
+
"""
|
|
32
|
+
if not d.is_dir():
|
|
33
|
+
return []
|
|
34
|
+
if task_ids:
|
|
35
|
+
allowed = set(task_ids)
|
|
36
|
+
return sorted(
|
|
37
|
+
sd
|
|
38
|
+
for sd in d.iterdir()
|
|
39
|
+
if sd.is_dir() and sd.name in allowed and (sd / "instruction.md").exists()
|
|
40
|
+
)
|
|
41
|
+
return sorted(
|
|
42
|
+
sd for sd in d.iterdir() if sd.is_dir() and (sd / "instruction.md").exists()
|
|
43
|
+
)
|
|
44
|
+
|
|
45
|
+
|
|
25
46
|
def _print_dry_run(estimate: DryRunEstimate) -> None:
|
|
26
47
|
"""Pretty-print a DryRunEstimate to stdout."""
|
|
27
48
|
cost_lo, cost_hi = estimate.estimated_cost_range
|
|
@@ -88,17 +109,9 @@ def run_eval(
|
|
|
88
109
|
tasks_dir = exp_dir / experiment.tasks_dir
|
|
89
110
|
repo_tasks = Path(path).resolve() / ".codeprobe" / experiment.tasks_dir
|
|
90
111
|
|
|
91
|
-
|
|
92
|
-
def _find_tasks(d: Path) -> list[Path]:
|
|
93
|
-
if not d.is_dir():
|
|
94
|
-
return []
|
|
95
|
-
return sorted(
|
|
96
|
-
sd for sd in d.iterdir() if sd.is_dir() and (sd / "instruction.md").exists()
|
|
97
|
-
)
|
|
98
|
-
|
|
99
|
-
task_dirs = _find_tasks(tasks_dir)
|
|
112
|
+
task_dirs = _find_tasks(tasks_dir, task_ids=experiment.task_ids)
|
|
100
113
|
if not task_dirs and repo_tasks != tasks_dir:
|
|
101
|
-
task_dirs = _find_tasks(repo_tasks)
|
|
114
|
+
task_dirs = _find_tasks(repo_tasks, task_ids=experiment.task_ids)
|
|
102
115
|
if task_dirs:
|
|
103
116
|
tasks_dir = repo_tasks
|
|
104
117
|
|
|
@@ -14,19 +14,68 @@ from codeprobe.models.experiment import ExperimentConfig
|
|
|
14
14
|
_SAFE_NAME = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]*$")
|
|
15
15
|
|
|
16
16
|
|
|
17
|
+
_DEFAULT_SOURCEGRAPH_URL = "https://sourcegraph.com"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def build_sourcegraph_mcp_config(
|
|
21
|
+
*,
|
|
22
|
+
token: str,
|
|
23
|
+
url: str = _DEFAULT_SOURCEGRAPH_URL,
|
|
24
|
+
) -> dict:
|
|
25
|
+
"""Build an HTTP MCP config dict for Sourcegraph.
|
|
26
|
+
|
|
27
|
+
Returns a ``{"mcpServers": {"sourcegraph": {...}}}`` dict suitable for
|
|
28
|
+
passing as ``mcp_config`` on an :class:`ExperimentConfig`.
|
|
29
|
+
"""
|
|
30
|
+
base_url = url.rstrip("/")
|
|
31
|
+
return {
|
|
32
|
+
"mcpServers": {
|
|
33
|
+
"sourcegraph": {
|
|
34
|
+
"type": "http",
|
|
35
|
+
"url": f"{base_url}/.api/mcp/v1",
|
|
36
|
+
"headers": {"Authorization": f"token {token}"},
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
|
|
17
42
|
def ask_mcp_comparison(
|
|
18
43
|
*,
|
|
19
44
|
experiment_name: str,
|
|
20
45
|
agent: str,
|
|
21
46
|
model: str | None,
|
|
22
|
-
mcp_config_path: str,
|
|
47
|
+
mcp_config_path: str | None = None,
|
|
48
|
+
sourcegraph_token: str | None = None,
|
|
49
|
+
sourcegraph_url: str | None = None,
|
|
23
50
|
) -> tuple[EvalrcConfig, list[ExperimentConfig]]:
|
|
24
|
-
"""Goal 1: Compare baseline agent vs MCP-augmented agent."""
|
|
25
|
-
|
|
51
|
+
"""Goal 1: Compare baseline agent vs MCP-augmented agent.
|
|
52
|
+
|
|
53
|
+
When *sourcegraph_token* is provided, generates an HTTP-based Sourcegraph
|
|
54
|
+
MCP config with an ``Authorization`` header and adds the ``sourcegraph``
|
|
55
|
+
preamble. Otherwise falls back to loading the MCP config from
|
|
56
|
+
*mcp_config_path*.
|
|
57
|
+
"""
|
|
58
|
+
if sourcegraph_token is not None:
|
|
59
|
+
mcp_data = build_sourcegraph_mcp_config(
|
|
60
|
+
token=sourcegraph_token,
|
|
61
|
+
url=sourcegraph_url or _DEFAULT_SOURCEGRAPH_URL,
|
|
62
|
+
)
|
|
63
|
+
preambles: tuple[str, ...] = ("sourcegraph",)
|
|
64
|
+
else:
|
|
65
|
+
if mcp_config_path is None:
|
|
66
|
+
raise click.BadParameter(
|
|
67
|
+
"Either sourcegraph_token or mcp_config_path must be provided."
|
|
68
|
+
)
|
|
69
|
+
mcp_data = _load_json(mcp_config_path)
|
|
70
|
+
preambles = ()
|
|
26
71
|
|
|
27
72
|
baseline = ExperimentConfig(label="baseline", agent=agent, model=model)
|
|
28
73
|
with_mcp = ExperimentConfig(
|
|
29
|
-
label="with-mcp",
|
|
74
|
+
label="with-mcp",
|
|
75
|
+
agent=agent,
|
|
76
|
+
model=model,
|
|
77
|
+
mcp_config=mcp_data,
|
|
78
|
+
preambles=preambles,
|
|
30
79
|
)
|
|
31
80
|
|
|
32
81
|
evalrc = EvalrcConfig(name=experiment_name, agents=[agent])
|