codeprobe 0.1.7__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codeprobe-0.1.7 → codeprobe-0.2.0}/PKG-INFO +1 -1
- {codeprobe-0.1.7 → codeprobe-0.2.0}/pyproject.toml +1 -1
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/__init__.py +1 -1
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/claude.py +18 -2
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/__init__.py +108 -1
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/experiment_cmd.py +4 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/init_cmd.py +72 -1
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/mine_cmd.py +13 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/probe_cmd.py +11 -9
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/wizard.py +53 -4
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/checkpoint.py +40 -23
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/executor.py +32 -12
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/isolation.py +9 -1
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/preamble.py +7 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale.py +331 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale_families.py +106 -1
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale_scanner.py +464 -0
- codeprobe-0.2.0/src/codeprobe/mining/sg_ground_truth.py +163 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/writer.py +81 -32
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/models/task.py +1 -0
- codeprobe-0.2.0/src/codeprobe/preambles/github.md +21 -0
- codeprobe-0.2.0/src/codeprobe/preambles/sourcegraph.md +44 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/probe/generator.py +60 -5
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/probe/writer.py +8 -2
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/templates/evalrc-mcp-comparison.yaml +6 -6
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe.egg-info/PKG-INFO +1 -1
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe.egg-info/SOURCES.txt +5 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_adapters.py +31 -0
- codeprobe-0.2.0/tests/test_changed_symbols.py +241 -0
- codeprobe-0.2.0/tests/test_cli.py +201 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_executor.py +9 -1
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_init_wizard.py +173 -2
- codeprobe-0.2.0/tests/test_mcp_families_mining.py +278 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_new_families.py +29 -3
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_org_scale.py +6 -7
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_preamble.py +36 -13
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_probe.py +351 -0
- codeprobe-0.2.0/tests/test_sg_ground_truth.py +318 -0
- codeprobe-0.1.7/src/codeprobe/preambles/sourcegraph.md +0 -32
- codeprobe-0.1.7/tests/test_cli.py +0 -51
- {codeprobe-0.1.7 → codeprobe-0.2.0}/LICENSE +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/README.md +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/setup.cfg +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/__main__.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/__init__.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/_base.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/aider.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/codex.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/copilot.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/openai_compat.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/protocol.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/session.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/telemetry.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/analysis/__init__.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/analysis/ranking.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/analysis/report.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/analysis/stats.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/api.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/assess/__init__.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/assess/heuristics.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/assess_cmd.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/interpret_cmd.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/ratings_cmd.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/run_cmd.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/scaffold_cmd.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/yaml_writer.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/config/__init__.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/config/loader.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/__init__.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/_shared.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/adaptive.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/counterfactual.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/debate.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/decision_tree.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/elo.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/fingerprint.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/mutation.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/pareto.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/sprt.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/tournament.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/__init__.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/experiment.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/llm.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/registry.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/sandbox.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/scoring.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/loaders/__init__.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/__init__.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/_lang.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/curator.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/curator_backends.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/curator_tiers.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/extractor.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale_oracle.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale_validate.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/sources.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/models/__init__.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/models/evalrc.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/models/experiment.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/models/preamble.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/preambles/__init__.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/probe/__init__.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/ratings/__init__.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/ratings/collector.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/scaffold/__init__.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/scaffold/writer.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/templates/__init__.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/templates/evalrc-model-comparison.yaml +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/templates/evalrc-prompt-comparison.yaml +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe.egg-info/dependency_links.txt +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe.egg-info/entry_points.txt +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe.egg-info/requires.txt +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe.egg-info/top_level.txt +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_analysis.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_api.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_assess.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_checkpoint.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_config_loader.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_contrib.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_curator_backends.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_curator_core.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_curator_integration.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_curator_tiers.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_experiment_cmd.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_experiment_core.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_llm.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_loaders.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_mcp_validate.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_mining.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_models.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_openai_compat.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_oracle_types.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_pipeline_integration.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_ratings.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_ratings_cmd.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_registry.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_scaffold.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_scanner_refactor.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_scoring.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_session.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_telemetry.py +0 -0
- {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_weighted_f1.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codeprobe
|
|
3
|
-
Version: 0.1.7
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
|
|
5
5
|
Author: codeprobe contributors
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -4,6 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
6
|
import re
|
|
7
|
+
import shutil
|
|
7
8
|
import subprocess
|
|
8
9
|
import tempfile
|
|
9
10
|
from pathlib import Path
|
|
@@ -69,16 +70,31 @@ class ClaudeAdapter(BaseAdapter):
|
|
|
69
70
|
|
|
70
71
|
mcp_path = self._write_mcp_config(config)
|
|
71
72
|
if mcp_path:
|
|
72
|
-
cmd.extend(["--mcp-config", mcp_path])
|
|
73
|
+
cmd.extend(["--mcp-config", mcp_path, "--strict-mcp-config"])
|
|
73
74
|
|
|
74
75
|
return cmd
|
|
75
76
|
|
|
76
77
|
def isolate_session(self, slot_id: int) -> dict[str, str]:
    """Return a per-slot ``CLAUDE_CONFIG_DIR`` for session isolation.

    A directory ``<tmp>/codeprobe-claude/slot-<slot_id>`` is created (if
    needed) and any credential files found in the user's real
    ``~/.claude/`` directory are copied into it so the agent subprocess
    can authenticate.

    Args:
        slot_id: Index of the execution slot; used in the directory name.

    Returns:
        Environment overrides containing a single ``CLAUDE_CONFIG_DIR``
        entry that points at the slot directory.
    """
    config_dir = Path(tempfile.gettempdir()) / "codeprobe-claude" / f"slot-{slot_id}"
    config_dir.mkdir(parents=True, exist_ok=True)

    # The slot directory is about to hold copies of auth credentials and
    # lives under the shared temp dir, so restrict it to the owner.
    # Best-effort: on platforms/filesystems where chmod is unsupported or
    # not permitted we keep the previous behaviour instead of failing.
    try:
        config_dir.chmod(0o700)
    except OSError:
        pass

    # Copy auth credentials from the user's real config dir.
    # Without these the subprocess gets "Not logged in".
    real_config = Path.home() / ".claude"
    if real_config.is_dir():
        for name in ("credentials.json", ".credentials.json"):
            src = real_config / name
            if src.is_file():
                shutil.copy2(src, config_dir / name)

    return {"CLAUDE_CONFIG_DIR": str(config_dir)}
|
|
83
99
|
|
|
84
100
|
def parse_output(
|
|
@@ -1,18 +1,89 @@
|
|
|
1
1
|
"""CLI entry point for codeprobe."""
|
|
2
2
|
|
|
3
|
+
import json as _json
|
|
4
|
+
import logging
|
|
5
|
+
import sys
|
|
6
|
+
|
|
3
7
|
import click
|
|
4
8
|
|
|
5
9
|
from codeprobe import __version__
|
|
6
10
|
|
|
7
11
|
|
|
12
|
+
class _JsonFormatter(logging.Formatter):
    """Emit one JSON object per log line.

    Each record becomes a single-line JSON object with ``level``,
    ``logger``, ``message`` and ``timestamp`` keys.  When the record
    carries exception or stack information, it is added under the
    ``exception`` / ``stack`` keys so tracebacks are not silently lost
    in JSON mode.
    """

    def format(self, record: logging.LogRecord) -> str:
        """Serialize *record* to a JSON string."""
        payload = {
            "level": record.levelname,
            "logger": record.name,
            "message": record.getMessage(),
            "timestamp": self.formatTime(record, "%Y-%m-%dT%H:%M:%S%z"),
        }
        # Mirror logging.Formatter.format(): cache the rendered traceback
        # on the record so other handlers do not have to re-render it.
        if record.exc_info and not record.exc_text:
            record.exc_text = self.formatException(record.exc_info)
        if record.exc_text:
            payload["exception"] = record.exc_text
        if record.stack_info:
            payload["stack"] = self.formatStack(record.stack_info)
        return _json.dumps(payload)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _configure_logging(verbose: int, quiet: bool, log_format: str = "text") -> None:
    """Configure namespace-scoped logging for ``codeprobe.*`` modules.

    A single ``StreamHandler`` writing to stderr is attached to
    ``logging.getLogger("codeprobe")`` and propagation to the root logger
    is disabled, so third-party loggers are left untouched.

    Args:
        verbose: Verbosity count; ``>= 1`` selects DEBUG and a format that
            includes the logger name.
        quiet: When true, the level is WARNING (takes precedence over
            *verbose* for the level).
        log_format: ``"json"`` selects ``_JsonFormatter``; any other value
            produces plain-text lines.
    """
    if quiet:
        level = logging.WARNING
    elif verbose >= 1:
        level = logging.DEBUG
    else:
        level = logging.INFO

    logger = logging.getLogger("codeprobe")
    logger.setLevel(level)
    logger.propagate = False  # don't bubble to root

    # Idempotent: tests / repeat invocations must not duplicate handlers.
    # Close what we detach so handlers owning resources release them.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    if log_format == "json":
        formatter: logging.Formatter = _JsonFormatter()
    elif verbose >= 1:
        formatter = logging.Formatter("%(levelname)s %(name)s: %(message)s")
    else:
        formatter = logging.Formatter("%(levelname)s: %(message)s")

    handler = logging.StreamHandler(sys.stderr)
    handler.setFormatter(formatter)
    logger.addHandler(handler)
|
|
57
|
+
|
|
58
|
+
|
|
8
59
|
@click.group()
@click.option(
    "-v", "--verbose", count=True, help="Increase log verbosity (-v sets DEBUG)."
)
@click.option(
    "-q",
    "--quiet",
    is_flag=True,
    default=False,
    help="Suppress INFO logs (WARNING and above only).",
)
@click.option(
    "--log-format",
    type=click.Choice(["text", "json"]),
    default="text",
    help="Log output format (default: text). 'json' emits one JSON object per line.",
)
@click.version_option(version=__version__, prog_name="codeprobe")
def main(verbose: int, quiet: bool, log_format: str) -> None:
    """Benchmark AI coding agents against your own codebase.

    Mine real tasks from your repo history, run agents against them,
    and interpret the results to find which setup works best for YOUR code.
    """
    # The group callback only wires the global logging flags; the actual
    # work is done by the subcommands registered on this group.
    _configure_logging(verbose, quiet, log_format)
|
|
16
87
|
|
|
17
88
|
|
|
18
89
|
@main.command()
|
|
@@ -121,6 +192,20 @@ def init(path: str) -> None:
|
|
|
121
192
|
default=False,
|
|
122
193
|
help="Run LLM verification on curated ground truth.",
|
|
123
194
|
)
|
|
195
|
+
@click.option(
|
|
196
|
+
"--mcp-families",
|
|
197
|
+
is_flag=True,
|
|
198
|
+
default=False,
|
|
199
|
+
help="Include MCP-advantaged task families (symbol-reference-trace, "
|
|
200
|
+
"type-hierarchy-consumers, change-scope-audit). Only with --org-scale.",
|
|
201
|
+
)
|
|
202
|
+
@click.option(
|
|
203
|
+
"--sg-repo",
|
|
204
|
+
default="",
|
|
205
|
+
help="Sourcegraph repo identifier for ground truth enrichment "
|
|
206
|
+
"(e.g. github.com/sg-evals/numpy). Defaults to github.com/sg-evals/{repo_name} "
|
|
207
|
+
"when --mcp-families is used. Requires SOURCEGRAPH_TOKEN env var.",
|
|
208
|
+
)
|
|
124
209
|
def mine(
|
|
125
210
|
path: str,
|
|
126
211
|
count: int,
|
|
@@ -139,6 +224,8 @@ def mine(
|
|
|
139
224
|
curate: bool,
|
|
140
225
|
backends: tuple[str, ...],
|
|
141
226
|
verify_curation_flag: bool,
|
|
227
|
+
mcp_families: bool,
|
|
228
|
+
sg_repo: str,
|
|
142
229
|
) -> None:
|
|
143
230
|
"""Mine eval tasks from a repository's history.
|
|
144
231
|
|
|
@@ -175,6 +262,8 @@ def mine(
|
|
|
175
262
|
curate=curate,
|
|
176
263
|
backends=backends,
|
|
177
264
|
verify_curation_flag=verify_curation_flag,
|
|
265
|
+
mcp_families=mcp_families,
|
|
266
|
+
sg_repo=sg_repo,
|
|
178
267
|
)
|
|
179
268
|
|
|
180
269
|
|
|
@@ -272,6 +361,20 @@ def init_experiment(path: str, name: str, description: str) -> None:
|
|
|
272
361
|
@click.option(
|
|
273
362
|
"--mcp-config", default=None, help="MCP config as JSON string or file path."
|
|
274
363
|
)
|
|
364
|
+
@click.option(
|
|
365
|
+
"--instruction-variant",
|
|
366
|
+
default=None,
|
|
367
|
+
help="Instruction file variant (e.g., instruction_mcp.md). Default: instruction.md.",
|
|
368
|
+
)
|
|
369
|
+
@click.option(
|
|
370
|
+
"--preamble",
|
|
371
|
+
"preambles",
|
|
372
|
+
multiple=True,
|
|
373
|
+
help=(
|
|
374
|
+
"Preamble to prepend to the instruction. Repeatable. "
|
|
375
|
+
"Built-ins: sourcegraph, github. Or path to a custom .md file."
|
|
376
|
+
),
|
|
377
|
+
)
|
|
275
378
|
def add_config(
|
|
276
379
|
path: str,
|
|
277
380
|
label: str,
|
|
@@ -279,6 +382,8 @@ def add_config(
|
|
|
279
382
|
model: str | None,
|
|
280
383
|
permission_mode: str,
|
|
281
384
|
mcp_config: str | None,
|
|
385
|
+
instruction_variant: str | None,
|
|
386
|
+
preambles: tuple[str, ...],
|
|
282
387
|
) -> None:
|
|
283
388
|
"""Add a configuration to an existing experiment."""
|
|
284
389
|
from codeprobe.cli.experiment_cmd import experiment_add_config
|
|
@@ -290,6 +395,8 @@ def add_config(
|
|
|
290
395
|
model=model,
|
|
291
396
|
permission_mode=permission_mode,
|
|
292
397
|
mcp_config_str=mcp_config,
|
|
398
|
+
instruction_variant=instruction_variant,
|
|
399
|
+
preambles=preambles,
|
|
293
400
|
)
|
|
294
401
|
|
|
295
402
|
|
|
@@ -63,6 +63,8 @@ def experiment_add_config(
|
|
|
63
63
|
model: str | None,
|
|
64
64
|
permission_mode: str,
|
|
65
65
|
mcp_config_str: str | None,
|
|
66
|
+
instruction_variant: str | None = None,
|
|
67
|
+
preambles: tuple[str, ...] = (),
|
|
66
68
|
) -> None:
|
|
67
69
|
"""Add a configuration to an existing experiment."""
|
|
68
70
|
exp_dir = Path(path)
|
|
@@ -104,6 +106,8 @@ def experiment_add_config(
|
|
|
104
106
|
model=model,
|
|
105
107
|
permission_mode=permission_mode,
|
|
106
108
|
mcp_config=mcp_config,
|
|
109
|
+
instruction_variant=instruction_variant,
|
|
110
|
+
preambles=preambles,
|
|
107
111
|
)
|
|
108
112
|
|
|
109
113
|
# Validate the label is a safe path component
|
|
@@ -173,12 +173,83 @@ def _prompt_mcp_config() -> str:
|
|
|
173
173
|
click.echo(f" Error: '{expanded}' does not exist. Try again.")
|
|
174
174
|
|
|
175
175
|
|
|
176
|
+
def _detect_sourcegraph_in_mcp(
    discovered: list[tuple[Path, list[str]]],
    mcp_data: dict | None = None,
) -> bool:
    """Return True if a Sourcegraph MCP server is configured.

    Server names are compared case-insensitively against the known
    Sourcegraph names ``sourcegraph`` and ``sourcegraph-mcp-server``.

    Args:
        discovered: ``(config_path, server_names)`` pairs for the MCP
            config files that were found.
        mcp_data: Optional parsed MCP config whose ``"mcpServers"`` mapping
            is checked as well.  A missing or ``null`` ``"mcpServers"``
            entry is treated as "no servers".

    Returns:
        True when any server name matches, otherwise False.
    """
    sg_names = {"sourcegraph", "sourcegraph-mcp-server"}

    if any(
        name.lower() in sg_names
        for _path, server_names in discovered
        for name in server_names
    ):
        return True

    # ``"mcpServers": null`` is valid JSON; fall back to an empty mapping
    # instead of failing while iterating ``None``.
    servers = (mcp_data or {}).get("mcpServers") or {}
    return any(name.lower() in sg_names for name in servers)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _prompt_sourcegraph_token() -> str:
    """Prompt for a Sourcegraph access token, checking the env var first.

    If ``SOURCEGRAPH_TOKEN`` is set, a masked form of it is shown and the
    user is asked whether to use it.  Otherwise (or if declined) the token
    is prompted for interactively.

    Returns:
        The token taken from the environment or typed by the user.
    """
    import os

    env_token = os.environ.get("SOURCEGRAPH_TOKEN", "")
    if env_token:
        # Only reveal the first/last four characters, and nothing at all
        # for short tokens.
        masked = env_token[:4] + "..." + env_token[-4:] if len(env_token) > 8 else "***"
        click.echo(f"  Found SOURCEGRAPH_TOKEN in environment ({masked})")
        if click.confirm("  Use this token?", default=True):
            return env_token

    # The token is a secret: do not echo it to the terminal as it is typed.
    return click.prompt("Sourcegraph access token", hide_input=True)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def _prompt_sourcegraph_url() -> str | None:
    """Ask for an optional custom Sourcegraph instance URL.

    Returns:
        The entered URL, or ``None`` when the user just pressed Enter.
    """
    answer = click.prompt(
        "Sourcegraph URL (press Enter for sourcegraph.com)",
        default="",
        show_default=False,
    )
    # An empty answer means "use the default instance".
    return answer or None
|
|
219
|
+
|
|
220
|
+
|
|
176
221
|
def _goal_mcp(agents: list[str], name: str) -> _Result:
|
|
177
222
|
"""Goal 1: MCP comparison prompts."""
|
|
178
223
|
agent = _prompt_agent(agents)
|
|
179
224
|
model = _prompt_model()
|
|
180
|
-
mcp_path = _prompt_mcp_config()
|
|
181
225
|
|
|
226
|
+
# Check if Sourcegraph is available in discovered MCP configs
|
|
227
|
+
discovered = _discover_mcp_configs()
|
|
228
|
+
use_sourcegraph = False
|
|
229
|
+
|
|
230
|
+
if _detect_sourcegraph_in_mcp(discovered):
|
|
231
|
+
click.echo()
|
|
232
|
+
click.echo("Detected Sourcegraph MCP server in your configuration.")
|
|
233
|
+
click.echo("codeprobe can use the HTTP endpoint for better performance.")
|
|
234
|
+
use_sourcegraph = click.confirm("Use Sourcegraph HTTP MCP?", default=True)
|
|
235
|
+
else:
|
|
236
|
+
click.echo()
|
|
237
|
+
click.echo("Would you like to use Sourcegraph as the MCP server?")
|
|
238
|
+
use_sourcegraph = click.confirm("Use Sourcegraph?", default=False)
|
|
239
|
+
|
|
240
|
+
if use_sourcegraph:
|
|
241
|
+
token = _prompt_sourcegraph_token()
|
|
242
|
+
sg_url = _prompt_sourcegraph_url()
|
|
243
|
+
return ask_mcp_comparison(
|
|
244
|
+
experiment_name=name,
|
|
245
|
+
agent=agent,
|
|
246
|
+
model=model,
|
|
247
|
+
sourcegraph_token=token,
|
|
248
|
+
sourcegraph_url=sg_url,
|
|
249
|
+
)
|
|
250
|
+
|
|
251
|
+
# Fall back to generic MCP config path
|
|
252
|
+
mcp_path = _prompt_mcp_config()
|
|
182
253
|
return ask_mcp_comparison(
|
|
183
254
|
experiment_name=name,
|
|
184
255
|
agent=agent,
|
|
@@ -518,6 +518,8 @@ def run_mine(
|
|
|
518
518
|
curate: bool = False,
|
|
519
519
|
backends: tuple[str, ...] = (),
|
|
520
520
|
verify_curation_flag: bool = False,
|
|
521
|
+
mcp_families: bool = False,
|
|
522
|
+
sg_repo: str = "",
|
|
521
523
|
) -> None:
|
|
522
524
|
"""Mine eval tasks from a repository."""
|
|
523
525
|
from codeprobe.mining import mine_tasks, write_task_dir
|
|
@@ -553,6 +555,8 @@ def run_mine(
|
|
|
553
555
|
curate=curate,
|
|
554
556
|
backends=backends,
|
|
555
557
|
verify_curation_flag=verify_curation_flag,
|
|
558
|
+
mcp_families=mcp_families,
|
|
559
|
+
sg_repo=sg_repo,
|
|
556
560
|
)
|
|
557
561
|
return
|
|
558
562
|
|
|
@@ -640,6 +644,8 @@ def _run_org_scale_mine(
|
|
|
640
644
|
curate: bool = False,
|
|
641
645
|
backends: tuple[str, ...] = (),
|
|
642
646
|
verify_curation_flag: bool = False,
|
|
647
|
+
mcp_families: bool = False,
|
|
648
|
+
sg_repo: str = "",
|
|
643
649
|
) -> None:
|
|
644
650
|
"""Mine org-scale comprehension tasks with oracle verification."""
|
|
645
651
|
from codeprobe.mining.org_scale import mine_org_scale_tasks
|
|
@@ -668,12 +674,19 @@ def _run_org_scale_mine(
|
|
|
668
674
|
click.echo("No families selected. Aborted.")
|
|
669
675
|
return
|
|
670
676
|
|
|
677
|
+
# Default sg_repo from primary repo name if not explicitly provided
|
|
678
|
+
effective_sg_repo = sg_repo
|
|
679
|
+
if not effective_sg_repo and mcp_families:
|
|
680
|
+
effective_sg_repo = f"github.com/sg-evals/{repo_paths[0].name}"
|
|
681
|
+
|
|
671
682
|
result = mine_org_scale_tasks(
|
|
672
683
|
repo_paths,
|
|
673
684
|
count=count,
|
|
674
685
|
families=selected_families,
|
|
675
686
|
no_llm=no_llm,
|
|
676
687
|
scan_timeout=scan_timeout,
|
|
688
|
+
include_mcp_families=mcp_families,
|
|
689
|
+
sg_repo=effective_sg_repo,
|
|
677
690
|
)
|
|
678
691
|
|
|
679
692
|
if not result.tasks:
|
|
@@ -3,10 +3,13 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
|
+
import logging
|
|
6
7
|
from pathlib import Path
|
|
7
8
|
|
|
8
9
|
import click
|
|
9
10
|
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
10
13
|
from codeprobe.probe.generator import DEFAULT_COUNT, MAX_PROBES, MIN_PROBES
|
|
11
14
|
|
|
12
15
|
|
|
@@ -76,7 +79,7 @@ def probe(
|
|
|
76
79
|
output_dir = Path(output) if output else repo_root / "probes"
|
|
77
80
|
effective_repo_name = repo_name or repo_root.name
|
|
78
81
|
|
|
79
|
-
|
|
82
|
+
logger.info("Scanning %s for symbols...", repo_root)
|
|
80
83
|
probes = generate_probes(
|
|
81
84
|
repo_root=repo_root,
|
|
82
85
|
count=count,
|
|
@@ -85,12 +88,11 @@ def probe(
|
|
|
85
88
|
)
|
|
86
89
|
|
|
87
90
|
if not probes:
|
|
88
|
-
|
|
91
|
+
logger.warning("No probes generated -- no suitable symbols found.")
|
|
89
92
|
raise SystemExit(1)
|
|
90
93
|
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
err=True,
|
|
94
|
+
logger.info(
|
|
95
|
+
"Generated %d probes, writing to %s...", len(probes), output_dir
|
|
94
96
|
)
|
|
95
97
|
created = write_probe_tasks(probes, output_dir, effective_repo_name)
|
|
96
98
|
|
|
@@ -108,9 +110,9 @@ def probe(
|
|
|
108
110
|
}
|
|
109
111
|
click.echo(json.dumps(summary, indent=2))
|
|
110
112
|
else:
|
|
111
|
-
|
|
112
|
-
|
|
113
|
+
logger.info("Probe generation complete:")
|
|
114
|
+
logger.info(" Total probes: %d", len(probes))
|
|
113
115
|
for tpl_name, tpl_count in sorted(by_template.items()):
|
|
114
|
-
|
|
115
|
-
|
|
116
|
+
logger.info(" %s: %d", tpl_name, tpl_count)
|
|
117
|
+
logger.info(" Output: %s", output_dir)
|
|
116
118
|
click.echo(f"Created {len(created)} probe tasks in {output_dir}")
|
|
@@ -14,19 +14,68 @@ from codeprobe.models.experiment import ExperimentConfig
|
|
|
14
14
|
_SAFE_NAME = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]*$")
|
|
15
15
|
|
|
16
16
|
|
|
17
|
+
_DEFAULT_SOURCEGRAPH_URL = "https://sourcegraph.com"


def build_sourcegraph_mcp_config(
    *,
    token: str,
    url: str = _DEFAULT_SOURCEGRAPH_URL,
) -> dict:
    """Build an HTTP MCP config dict for Sourcegraph.

    Args:
        token: Sourcegraph access token; surrounding whitespace (e.g. a
            trailing newline from a paste) is removed.
        url: Base URL of the Sourcegraph instance.  Surrounding whitespace
            and trailing slashes are removed; an empty value falls back to
            the default instance.

    Returns:
        A ``{"mcpServers": {"sourcegraph": {...}}}`` dict suitable for
        passing as ``mcp_config`` on an :class:`ExperimentConfig`.
    """
    # An empty base would otherwise yield the relative URL "/.api/mcp/v1".
    base_url = url.strip().rstrip("/") or _DEFAULT_SOURCEGRAPH_URL
    auth_token = token.strip()
    return {
        "mcpServers": {
            "sourcegraph": {
                "type": "http",
                "url": f"{base_url}/.api/mcp/v1",
                "headers": {"Authorization": f"token {auth_token}"},
            }
        }
    }
|
|
40
|
+
|
|
41
|
+
|
|
17
42
|
def ask_mcp_comparison(
|
|
18
43
|
*,
|
|
19
44
|
experiment_name: str,
|
|
20
45
|
agent: str,
|
|
21
46
|
model: str | None,
|
|
22
|
-
mcp_config_path: str,
|
|
47
|
+
mcp_config_path: str | None = None,
|
|
48
|
+
sourcegraph_token: str | None = None,
|
|
49
|
+
sourcegraph_url: str | None = None,
|
|
23
50
|
) -> tuple[EvalrcConfig, list[ExperimentConfig]]:
|
|
24
|
-
"""Goal 1: Compare baseline agent vs MCP-augmented agent.
|
|
25
|
-
|
|
51
|
+
"""Goal 1: Compare baseline agent vs MCP-augmented agent.
|
|
52
|
+
|
|
53
|
+
When *sourcegraph_token* is provided, generates an HTTP-based Sourcegraph
|
|
54
|
+
MCP config with an ``Authorization`` header and adds the ``sourcegraph``
|
|
55
|
+
preamble. Otherwise falls back to loading the MCP config from
|
|
56
|
+
*mcp_config_path*.
|
|
57
|
+
"""
|
|
58
|
+
if sourcegraph_token is not None:
|
|
59
|
+
mcp_data = build_sourcegraph_mcp_config(
|
|
60
|
+
token=sourcegraph_token,
|
|
61
|
+
url=sourcegraph_url or _DEFAULT_SOURCEGRAPH_URL,
|
|
62
|
+
)
|
|
63
|
+
preambles: tuple[str, ...] = ("sourcegraph",)
|
|
64
|
+
else:
|
|
65
|
+
if mcp_config_path is None:
|
|
66
|
+
raise click.BadParameter(
|
|
67
|
+
"Either sourcegraph_token or mcp_config_path must be provided."
|
|
68
|
+
)
|
|
69
|
+
mcp_data = _load_json(mcp_config_path)
|
|
70
|
+
preambles = ()
|
|
26
71
|
|
|
27
72
|
baseline = ExperimentConfig(label="baseline", agent=agent, model=model)
|
|
28
73
|
with_mcp = ExperimentConfig(
|
|
29
|
-
label="with-mcp",
|
|
74
|
+
label="with-mcp",
|
|
75
|
+
agent=agent,
|
|
76
|
+
model=model,
|
|
77
|
+
mcp_config=mcp_data,
|
|
78
|
+
preambles=preambles,
|
|
30
79
|
)
|
|
31
80
|
|
|
32
81
|
evalrc = EvalrcConfig(name=experiment_name, agents=[agent])
|
|
@@ -9,6 +9,7 @@ from __future__ import annotations
|
|
|
9
9
|
import json
|
|
10
10
|
import logging
|
|
11
11
|
import sqlite3
|
|
12
|
+
import time
|
|
12
13
|
from dataclasses import asdict
|
|
13
14
|
from datetime import datetime, timezone
|
|
14
15
|
from pathlib import Path
|
|
@@ -173,31 +174,47 @@ class CheckpointStore:
|
|
|
173
174
|
def _open(self, db_path: Path) -> sqlite3.Connection:
    """Open (or create) the SQLite database with WAL mode.

    Retries on transient ``OperationalError`` (lock contention during
    concurrent creation) with exponential backoff.  Only removes and
    recreates the file on non-transient ``DatabaseError`` (genuine
    corruption).

    Args:
        db_path: Location of the checkpoint database file; missing parent
            directories are created.

    Returns:
        An open connection with the table created and the schema migrated.

    Raises:
        sqlite3.OperationalError: If the database could not be opened
            after all attempts; chained from the last underlying error.
    """
    max_attempts = 4

    def _connect_and_init() -> sqlite3.Connection:
        """Connect and initialise the schema; close the handle on failure."""
        conn = sqlite3.connect(str(db_path), timeout=10)
        try:
            conn.execute("PRAGMA journal_mode=WAL")
            conn.execute(_CREATE_TABLE)
            self._migrate_schema(conn)
            conn.commit()
        except Exception:
            # Do not leak the half-initialised connection: an open handle
            # keeps the file busy for the retry and blocks unlink() on
            # platforms that forbid deleting open files.
            conn.close()
            raise
        return conn

    db_path.parent.mkdir(parents=True, exist_ok=True)
    last_err: Exception | None = None
    for attempt in range(max_attempts):
        try:
            return _connect_and_init()
        except sqlite3.OperationalError as exc:
            # Transient lock contention — retry with backoff
            last_err = exc
            logger.debug(
                "Checkpoint DB busy at %s (attempt %d): %s",
                db_path,
                attempt + 1,
                exc,
            )
            # No point sleeping when there is no further attempt.
            if attempt < max_attempts - 1:
                time.sleep(0.1 * (2**attempt))
        except sqlite3.DatabaseError:
            logger.warning(
                "Corrupt checkpoint DB at %s — removing and recreating",
                db_path,
            )
            # SQLite names its sidecar files "<db file name>-wal" / "-shm",
            # whatever the database file's own suffix is.
            for stale in (
                db_path,
                db_path.with_name(db_path.name + "-wal"),
                db_path.with_name(db_path.name + "-shm"),
            ):
                stale.unlink(missing_ok=True)
            return _connect_and_init()
    raise sqlite3.OperationalError(
        f"Could not open checkpoint DB at {db_path} after {max_attempts} attempts: {last_err}"
    ) from last_err
|
|
201
218
|
|
|
202
219
|
@staticmethod
|
|
203
220
|
def _migrate_schema(conn: sqlite3.Connection) -> None:
|
|
@@ -166,21 +166,25 @@ def execute_task(
|
|
|
166
166
|
"""
|
|
167
167
|
task_id = task_dir.name
|
|
168
168
|
|
|
169
|
+
# Load task metadata once — used for reward_type auto-detection and
|
|
170
|
+
# preamble context (e.g. sg_repo for Sourcegraph preamble).
|
|
171
|
+
_task_meta: dict = {}
|
|
172
|
+
meta_path = task_dir / "metadata.json"
|
|
173
|
+
if meta_path.is_file():
|
|
174
|
+
try:
|
|
175
|
+
import json as _json
|
|
176
|
+
|
|
177
|
+
_task_meta = _json.loads(meta_path.read_text(encoding="utf-8"))
|
|
178
|
+
except (ValueError, OSError):
|
|
179
|
+
pass
|
|
180
|
+
|
|
169
181
|
# Auto-detect reward_type from task metadata when caller uses default.
|
|
170
182
|
# Oracle tasks (org-scale) need "continuous" scoring to read reward.txt;
|
|
171
183
|
# the default "binary" would score exit-code-only and always pass.
|
|
172
184
|
if reward_type == "binary":
|
|
173
|
-
|
|
174
|
-
if
|
|
175
|
-
|
|
176
|
-
import json as _json
|
|
177
|
-
|
|
178
|
-
meta = _json.loads(meta_path.read_text(encoding="utf-8"))
|
|
179
|
-
task_rt = (meta.get("verification") or {}).get("reward_type")
|
|
180
|
-
if task_rt and task_rt != "binary":
|
|
181
|
-
reward_type = task_rt
|
|
182
|
-
except (ValueError, OSError):
|
|
183
|
-
pass # Stick with caller's default
|
|
185
|
+
task_rt = (_task_meta.get("verification") or {}).get("reward_type")
|
|
186
|
+
if task_rt and task_rt != "binary":
|
|
187
|
+
reward_type = task_rt
|
|
184
188
|
|
|
185
189
|
def _error_result(error: str, error_category: str | None = None) -> TaskResult:
|
|
186
190
|
return TaskResult(
|
|
@@ -206,6 +210,12 @@ def execute_task(
|
|
|
206
210
|
)
|
|
207
211
|
|
|
208
212
|
if preamble_names and preamble_resolver is not None:
|
|
213
|
+
# Build extra context from task metadata for preamble templates
|
|
214
|
+
extra_ctx: dict[str, str] = {}
|
|
215
|
+
sg_repo = (_task_meta.get("metadata") or {}).get("sg_repo", "")
|
|
216
|
+
if sg_repo:
|
|
217
|
+
extra_ctx["sg_repo"] = sg_repo
|
|
218
|
+
|
|
209
219
|
try:
|
|
210
220
|
prompt, resolved_preambles = compose_instruction(
|
|
211
221
|
instruction,
|
|
@@ -214,6 +224,7 @@ def execute_task(
|
|
|
214
224
|
resolver=preamble_resolver,
|
|
215
225
|
task_id=task_id,
|
|
216
226
|
worktree_path=worktree_path,
|
|
227
|
+
extra_context=extra_ctx or None,
|
|
217
228
|
)
|
|
218
229
|
except (FileNotFoundError, ValueError) as exc:
|
|
219
230
|
return _error_result(f"Preamble resolution failed: {exc}")
|
|
@@ -325,7 +336,7 @@ def _git_reset_workdir(repo_path: Path) -> None:
|
|
|
325
336
|
capture_output=True,
|
|
326
337
|
)
|
|
327
338
|
subprocess.run(
|
|
328
|
-
["git", "clean", "-fd"],
|
|
339
|
+
["git", "clean", "-fd", "-e", ".codeprobe", "-e", ".codeprobe-worktrees"],
|
|
329
340
|
cwd=repo_path,
|
|
330
341
|
check=True,
|
|
331
342
|
capture_output=True,
|
|
@@ -448,6 +459,15 @@ def execute_config(
|
|
|
448
459
|
"""
|
|
449
460
|
checkpointed_ids, results = _restore_checkpointed(checkpoint_store)
|
|
450
461
|
|
|
462
|
+
# Filter checkpointed results to only include tasks in the current
|
|
463
|
+
# experiment. Without this, stale entries from prior runs with different
|
|
464
|
+
# task_ids leak into the results list and inflate/deflate scores.
|
|
465
|
+
current_task_ids = {d.name for d in task_dirs}
|
|
466
|
+
checkpointed_ids = {
|
|
467
|
+
(tid, ri) for tid, ri in checkpointed_ids if tid in current_task_ids
|
|
468
|
+
}
|
|
469
|
+
results = [r for r in results if r.task_id in current_task_ids]
|
|
470
|
+
|
|
451
471
|
# Build expanded work items: (task_dir, repeat_index) for all repeats
|
|
452
472
|
all_work: list[tuple[Path, int]] = [
|
|
453
473
|
(d, ri) for d in task_dirs for ri in range(repeats)
|