codeprobe 0.1.7__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. {codeprobe-0.1.7 → codeprobe-0.2.0}/PKG-INFO +1 -1
  2. {codeprobe-0.1.7 → codeprobe-0.2.0}/pyproject.toml +1 -1
  3. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/__init__.py +1 -1
  4. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/claude.py +18 -2
  5. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/__init__.py +108 -1
  6. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/experiment_cmd.py +4 -0
  7. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/init_cmd.py +72 -1
  8. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/mine_cmd.py +13 -0
  9. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/probe_cmd.py +11 -9
  10. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/wizard.py +53 -4
  11. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/checkpoint.py +40 -23
  12. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/executor.py +32 -12
  13. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/isolation.py +9 -1
  14. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/preamble.py +7 -0
  15. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale.py +331 -0
  16. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale_families.py +106 -1
  17. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale_scanner.py +464 -0
  18. codeprobe-0.2.0/src/codeprobe/mining/sg_ground_truth.py +163 -0
  19. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/writer.py +81 -32
  20. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/models/task.py +1 -0
  21. codeprobe-0.2.0/src/codeprobe/preambles/github.md +21 -0
  22. codeprobe-0.2.0/src/codeprobe/preambles/sourcegraph.md +44 -0
  23. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/probe/generator.py +60 -5
  24. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/probe/writer.py +8 -2
  25. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/templates/evalrc-mcp-comparison.yaml +6 -6
  26. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe.egg-info/PKG-INFO +1 -1
  27. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe.egg-info/SOURCES.txt +5 -0
  28. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_adapters.py +31 -0
  29. codeprobe-0.2.0/tests/test_changed_symbols.py +241 -0
  30. codeprobe-0.2.0/tests/test_cli.py +201 -0
  31. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_executor.py +9 -1
  32. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_init_wizard.py +173 -2
  33. codeprobe-0.2.0/tests/test_mcp_families_mining.py +278 -0
  34. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_new_families.py +29 -3
  35. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_org_scale.py +6 -7
  36. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_preamble.py +36 -13
  37. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_probe.py +351 -0
  38. codeprobe-0.2.0/tests/test_sg_ground_truth.py +318 -0
  39. codeprobe-0.1.7/src/codeprobe/preambles/sourcegraph.md +0 -32
  40. codeprobe-0.1.7/tests/test_cli.py +0 -51
  41. {codeprobe-0.1.7 → codeprobe-0.2.0}/LICENSE +0 -0
  42. {codeprobe-0.1.7 → codeprobe-0.2.0}/README.md +0 -0
  43. {codeprobe-0.1.7 → codeprobe-0.2.0}/setup.cfg +0 -0
  44. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/__main__.py +0 -0
  45. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/__init__.py +0 -0
  46. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/_base.py +0 -0
  47. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/aider.py +0 -0
  48. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/codex.py +0 -0
  49. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/copilot.py +0 -0
  50. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/openai_compat.py +0 -0
  51. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/protocol.py +0 -0
  52. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/session.py +0 -0
  53. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/adapters/telemetry.py +0 -0
  54. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/analysis/__init__.py +0 -0
  55. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/analysis/ranking.py +0 -0
  56. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/analysis/report.py +0 -0
  57. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/analysis/stats.py +0 -0
  58. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/api.py +0 -0
  59. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/assess/__init__.py +0 -0
  60. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/assess/heuristics.py +0 -0
  61. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/assess_cmd.py +0 -0
  62. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/interpret_cmd.py +0 -0
  63. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/ratings_cmd.py +0 -0
  64. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/run_cmd.py +0 -0
  65. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/scaffold_cmd.py +0 -0
  66. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/cli/yaml_writer.py +0 -0
  67. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/config/__init__.py +0 -0
  68. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/config/loader.py +0 -0
  69. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/__init__.py +0 -0
  70. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/_shared.py +0 -0
  71. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/adaptive.py +0 -0
  72. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/counterfactual.py +0 -0
  73. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/debate.py +0 -0
  74. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/decision_tree.py +0 -0
  75. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/elo.py +0 -0
  76. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/fingerprint.py +0 -0
  77. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/mutation.py +0 -0
  78. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/pareto.py +0 -0
  79. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/sprt.py +0 -0
  80. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/contrib/tournament.py +0 -0
  81. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/__init__.py +0 -0
  82. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/experiment.py +0 -0
  83. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/llm.py +0 -0
  84. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/registry.py +0 -0
  85. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/sandbox.py +0 -0
  86. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/core/scoring.py +0 -0
  87. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/loaders/__init__.py +0 -0
  88. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/__init__.py +0 -0
  89. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/_lang.py +0 -0
  90. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/curator.py +0 -0
  91. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/curator_backends.py +0 -0
  92. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/curator_tiers.py +0 -0
  93. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/extractor.py +0 -0
  94. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale_oracle.py +0 -0
  95. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale_validate.py +0 -0
  96. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/mining/sources.py +0 -0
  97. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/models/__init__.py +0 -0
  98. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/models/evalrc.py +0 -0
  99. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/models/experiment.py +0 -0
  100. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/models/preamble.py +0 -0
  101. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/preambles/__init__.py +0 -0
  102. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/probe/__init__.py +0 -0
  103. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/ratings/__init__.py +0 -0
  104. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/ratings/collector.py +0 -0
  105. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/scaffold/__init__.py +0 -0
  106. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/scaffold/writer.py +0 -0
  107. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/templates/__init__.py +0 -0
  108. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/templates/evalrc-model-comparison.yaml +0 -0
  109. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe/templates/evalrc-prompt-comparison.yaml +0 -0
  110. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe.egg-info/dependency_links.txt +0 -0
  111. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe.egg-info/entry_points.txt +0 -0
  112. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe.egg-info/requires.txt +0 -0
  113. {codeprobe-0.1.7 → codeprobe-0.2.0}/src/codeprobe.egg-info/top_level.txt +0 -0
  114. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_analysis.py +0 -0
  115. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_api.py +0 -0
  116. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_assess.py +0 -0
  117. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_checkpoint.py +0 -0
  118. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_config_loader.py +0 -0
  119. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_contrib.py +0 -0
  120. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_curator_backends.py +0 -0
  121. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_curator_core.py +0 -0
  122. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_curator_integration.py +0 -0
  123. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_curator_tiers.py +0 -0
  124. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_experiment_cmd.py +0 -0
  125. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_experiment_core.py +0 -0
  126. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_llm.py +0 -0
  127. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_loaders.py +0 -0
  128. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_mcp_validate.py +0 -0
  129. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_mining.py +0 -0
  130. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_models.py +0 -0
  131. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_openai_compat.py +0 -0
  132. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_oracle_types.py +0 -0
  133. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_pipeline_integration.py +0 -0
  134. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_ratings.py +0 -0
  135. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_ratings_cmd.py +0 -0
  136. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_registry.py +0 -0
  137. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_scaffold.py +0 -0
  138. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_scanner_refactor.py +0 -0
  139. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_scoring.py +0 -0
  140. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_session.py +0 -0
  141. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_telemetry.py +0 -0
  142. {codeprobe-0.1.7 → codeprobe-0.2.0}/tests/test_weighted_f1.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codeprobe
3
- Version: 0.1.7
3
+ Version: 0.2.0
4
4
  Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
5
5
  Author: codeprobe contributors
6
6
  License-Expression: Apache-2.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "codeprobe"
3
- version = "0.1.7"
3
+ version = "0.2.0"
4
4
  description = "Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results."
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -1,3 +1,3 @@
1
1
  """codeprobe — Benchmark AI coding agents against your own codebase."""
2
2
 
3
- __version__ = "0.1.7"
3
+ __version__ = "0.2.0"
@@ -4,6 +4,7 @@ from __future__ import annotations
4
4
 
5
5
  import json
6
6
  import re
7
+ import shutil
7
8
  import subprocess
8
9
  import tempfile
9
10
  from pathlib import Path
@@ -69,16 +70,31 @@ class ClaudeAdapter(BaseAdapter):
69
70
 
70
71
  mcp_path = self._write_mcp_config(config)
71
72
  if mcp_path:
72
- cmd.extend(["--mcp-config", mcp_path])
73
+ cmd.extend(["--mcp-config", mcp_path, "--strict-mcp-config"])
73
74
 
74
75
  return cmd
75
76
 
76
77
  def isolate_session(self, slot_id: int) -> dict[str, str]:
77
- """Return a per-slot CLAUDE_CONFIG_DIR for session isolation."""
78
+ """Return a per-slot CLAUDE_CONFIG_DIR for session isolation.
79
+
80
+ Copies authentication credentials from the real ``~/.claude/``
81
+ directory so the agent subprocess can authenticate.
82
+ """
78
83
  config_dir = (
79
84
  Path(tempfile.gettempdir()) / "codeprobe-claude" / f"slot-{slot_id}"
80
85
  )
81
86
  config_dir.mkdir(parents=True, exist_ok=True)
87
+
88
+ # Copy auth credentials from the user's real config dir.
89
+ # Without these the subprocess gets "Not logged in".
90
+ real_config = Path.home() / ".claude"
91
+ if real_config.is_dir():
92
+ for name in ("credentials.json", ".credentials.json"):
93
+ src = real_config / name
94
+ dst = config_dir / name
95
+ if src.is_file():
96
+ shutil.copy2(src, dst)
97
+
82
98
  return {"CLAUDE_CONFIG_DIR": str(config_dir)}
83
99
 
84
100
  def parse_output(
@@ -1,18 +1,89 @@
1
1
  """CLI entry point for codeprobe."""
2
2
 
3
+ import json as _json
4
+ import logging
5
+ import sys
6
+
3
7
  import click
4
8
 
5
9
  from codeprobe import __version__
6
10
 
7
11
 
12
+ class _JsonFormatter(logging.Formatter):
13
+ """Emit one JSON object per log line."""
14
+
15
+ def format(self, record: logging.LogRecord) -> str:
16
+ payload = {
17
+ "level": record.levelname,
18
+ "logger": record.name,
19
+ "message": record.getMessage(),
20
+ "timestamp": self.formatTime(record, "%Y-%m-%dT%H:%M:%S%z"),
21
+ }
22
+ return _json.dumps(payload)
23
+
24
+
25
+ def _configure_logging(verbose: int, quiet: bool, log_format: str = "text") -> None:
26
+ """Configure namespace-scoped logging for codeprobe.* modules.
27
+
28
+ Attaches a StreamHandler to `logging.getLogger("codeprobe")` so that
29
+ all 26+ codeprobe.* modules emit through hierarchy without touching
30
+ third-party loggers (httpx, urllib3, etc.).
31
+ """
32
+ if quiet:
33
+ level = logging.WARNING
34
+ elif verbose >= 1:
35
+ level = logging.DEBUG
36
+ else:
37
+ level = logging.INFO
38
+
39
+ logger = logging.getLogger("codeprobe")
40
+ logger.setLevel(level)
41
+ logger.propagate = False # don't bubble to root
42
+
43
+ # Idempotent: tests / repeat invocations must not duplicate handlers.
44
+ for h in list(logger.handlers):
45
+ logger.removeHandler(h)
46
+
47
+ handler = logging.StreamHandler(sys.stderr)
48
+ if log_format == "json":
49
+ handler.setFormatter(_JsonFormatter())
50
+ elif verbose >= 1:
51
+ fmt = "%(levelname)s %(name)s: %(message)s"
52
+ handler.setFormatter(logging.Formatter(fmt))
53
+ else:
54
+ fmt = "%(levelname)s: %(message)s"
55
+ handler.setFormatter(logging.Formatter(fmt))
56
+ logger.addHandler(handler)
57
+
58
+
8
59
  @click.group()
60
+ @click.option(
61
+ "-v",
62
+ "--verbose",
63
+ count=True,
64
+ help="Increase log verbosity (-v sets DEBUG).",
65
+ )
66
+ @click.option(
67
+ "-q",
68
+ "--quiet",
69
+ is_flag=True,
70
+ default=False,
71
+ help="Suppress INFO logs (WARNING and above only).",
72
+ )
73
+ @click.option(
74
+ "--log-format",
75
+ type=click.Choice(["text", "json"]),
76
+ default="text",
77
+ help="Log output format (default: text). 'json' emits one JSON object per line.",
78
+ )
9
79
  @click.version_option(version=__version__, prog_name="codeprobe")
10
- def main() -> None:
80
+ def main(verbose: int, quiet: bool, log_format: str) -> None:
11
81
  """Benchmark AI coding agents against your own codebase.
12
82
 
13
83
  Mine real tasks from your repo history, run agents against them,
14
84
  and interpret the results to find which setup works best for YOUR code.
15
85
  """
86
+ _configure_logging(verbose=verbose, quiet=quiet, log_format=log_format)
16
87
 
17
88
 
18
89
  @main.command()
@@ -121,6 +192,20 @@ def init(path: str) -> None:
121
192
  default=False,
122
193
  help="Run LLM verification on curated ground truth.",
123
194
  )
195
+ @click.option(
196
+ "--mcp-families",
197
+ is_flag=True,
198
+ default=False,
199
+ help="Include MCP-advantaged task families (symbol-reference-trace, "
200
+ "type-hierarchy-consumers, change-scope-audit). Only with --org-scale.",
201
+ )
202
+ @click.option(
203
+ "--sg-repo",
204
+ default="",
205
+ help="Sourcegraph repo identifier for ground truth enrichment "
206
+ "(e.g. github.com/sg-evals/numpy). Defaults to github.com/sg-evals/{repo_name} "
207
+ "when --mcp-families is used. Requires SOURCEGRAPH_TOKEN env var.",
208
+ )
124
209
  def mine(
125
210
  path: str,
126
211
  count: int,
@@ -139,6 +224,8 @@ def mine(
139
224
  curate: bool,
140
225
  backends: tuple[str, ...],
141
226
  verify_curation_flag: bool,
227
+ mcp_families: bool,
228
+ sg_repo: str,
142
229
  ) -> None:
143
230
  """Mine eval tasks from a repository's history.
144
231
 
@@ -175,6 +262,8 @@ def mine(
175
262
  curate=curate,
176
263
  backends=backends,
177
264
  verify_curation_flag=verify_curation_flag,
265
+ mcp_families=mcp_families,
266
+ sg_repo=sg_repo,
178
267
  )
179
268
 
180
269
 
@@ -272,6 +361,20 @@ def init_experiment(path: str, name: str, description: str) -> None:
272
361
  @click.option(
273
362
  "--mcp-config", default=None, help="MCP config as JSON string or file path."
274
363
  )
364
+ @click.option(
365
+ "--instruction-variant",
366
+ default=None,
367
+ help="Instruction file variant (e.g., instruction_mcp.md). Default: instruction.md.",
368
+ )
369
+ @click.option(
370
+ "--preamble",
371
+ "preambles",
372
+ multiple=True,
373
+ help=(
374
+ "Preamble to prepend to the instruction. Repeatable. "
375
+ "Built-ins: sourcegraph, github. Or path to a custom .md file."
376
+ ),
377
+ )
275
378
  def add_config(
276
379
  path: str,
277
380
  label: str,
@@ -279,6 +382,8 @@ def add_config(
279
382
  model: str | None,
280
383
  permission_mode: str,
281
384
  mcp_config: str | None,
385
+ instruction_variant: str | None,
386
+ preambles: tuple[str, ...],
282
387
  ) -> None:
283
388
  """Add a configuration to an existing experiment."""
284
389
  from codeprobe.cli.experiment_cmd import experiment_add_config
@@ -290,6 +395,8 @@ def add_config(
290
395
  model=model,
291
396
  permission_mode=permission_mode,
292
397
  mcp_config_str=mcp_config,
398
+ instruction_variant=instruction_variant,
399
+ preambles=preambles,
293
400
  )
294
401
 
295
402
 
@@ -63,6 +63,8 @@ def experiment_add_config(
63
63
  model: str | None,
64
64
  permission_mode: str,
65
65
  mcp_config_str: str | None,
66
+ instruction_variant: str | None = None,
67
+ preambles: tuple[str, ...] = (),
66
68
  ) -> None:
67
69
  """Add a configuration to an existing experiment."""
68
70
  exp_dir = Path(path)
@@ -104,6 +106,8 @@ def experiment_add_config(
104
106
  model=model,
105
107
  permission_mode=permission_mode,
106
108
  mcp_config=mcp_config,
109
+ instruction_variant=instruction_variant,
110
+ preambles=preambles,
107
111
  )
108
112
 
109
113
  # Validate the label is a safe path component
@@ -173,12 +173,83 @@ def _prompt_mcp_config() -> str:
173
173
  click.echo(f" Error: '{expanded}' does not exist. Try again.")
174
174
 
175
175
 
176
+ def _detect_sourcegraph_in_mcp(
177
+ discovered: list[tuple[Path, list[str]]],
178
+ mcp_data: dict | None = None,
179
+ ) -> bool:
180
+ """Return True if any discovered MCP config contains a Sourcegraph server.
181
+
182
+ Checks server names for common Sourcegraph patterns (e.g.
183
+ ``sourcegraph``, ``sourcegraph-mcp-server``).
184
+ """
185
+ sg_names = {"sourcegraph", "sourcegraph-mcp-server"}
186
+ for _path, server_names in discovered:
187
+ for name in server_names:
188
+ if name.lower() in sg_names:
189
+ return True
190
+ if mcp_data:
191
+ for name in mcp_data.get("mcpServers", {}):
192
+ if name.lower() in sg_names:
193
+ return True
194
+ return False
195
+
196
+
197
+ def _prompt_sourcegraph_token() -> str:
198
+ """Prompt for Sourcegraph access token, checking env var first."""
199
+ import os
200
+
201
+ env_token = os.environ.get("SOURCEGRAPH_TOKEN", "")
202
+ if env_token:
203
+ masked = env_token[:4] + "..." + env_token[-4:] if len(env_token) > 8 else "***"
204
+ click.echo(f" Found SOURCEGRAPH_TOKEN in environment ({masked})")
205
+ if click.confirm(" Use this token?", default=True):
206
+ return env_token
207
+
208
+ return click.prompt("Sourcegraph access token")
209
+
210
+
211
+ def _prompt_sourcegraph_url() -> str | None:
212
+ """Prompt for optional custom Sourcegraph instance URL."""
213
+ url = click.prompt(
214
+ "Sourcegraph URL (press Enter for sourcegraph.com)",
215
+ default="",
216
+ show_default=False,
217
+ )
218
+ return url if url else None
219
+
220
+
176
221
  def _goal_mcp(agents: list[str], name: str) -> _Result:
177
222
  """Goal 1: MCP comparison prompts."""
178
223
  agent = _prompt_agent(agents)
179
224
  model = _prompt_model()
180
- mcp_path = _prompt_mcp_config()
181
225
 
226
+ # Check if Sourcegraph is available in discovered MCP configs
227
+ discovered = _discover_mcp_configs()
228
+ use_sourcegraph = False
229
+
230
+ if _detect_sourcegraph_in_mcp(discovered):
231
+ click.echo()
232
+ click.echo("Detected Sourcegraph MCP server in your configuration.")
233
+ click.echo("codeprobe can use the HTTP endpoint for better performance.")
234
+ use_sourcegraph = click.confirm("Use Sourcegraph HTTP MCP?", default=True)
235
+ else:
236
+ click.echo()
237
+ click.echo("Would you like to use Sourcegraph as the MCP server?")
238
+ use_sourcegraph = click.confirm("Use Sourcegraph?", default=False)
239
+
240
+ if use_sourcegraph:
241
+ token = _prompt_sourcegraph_token()
242
+ sg_url = _prompt_sourcegraph_url()
243
+ return ask_mcp_comparison(
244
+ experiment_name=name,
245
+ agent=agent,
246
+ model=model,
247
+ sourcegraph_token=token,
248
+ sourcegraph_url=sg_url,
249
+ )
250
+
251
+ # Fall back to generic MCP config path
252
+ mcp_path = _prompt_mcp_config()
182
253
  return ask_mcp_comparison(
183
254
  experiment_name=name,
184
255
  agent=agent,
@@ -518,6 +518,8 @@ def run_mine(
518
518
  curate: bool = False,
519
519
  backends: tuple[str, ...] = (),
520
520
  verify_curation_flag: bool = False,
521
+ mcp_families: bool = False,
522
+ sg_repo: str = "",
521
523
  ) -> None:
522
524
  """Mine eval tasks from a repository."""
523
525
  from codeprobe.mining import mine_tasks, write_task_dir
@@ -553,6 +555,8 @@ def run_mine(
553
555
  curate=curate,
554
556
  backends=backends,
555
557
  verify_curation_flag=verify_curation_flag,
558
+ mcp_families=mcp_families,
559
+ sg_repo=sg_repo,
556
560
  )
557
561
  return
558
562
 
@@ -640,6 +644,8 @@ def _run_org_scale_mine(
640
644
  curate: bool = False,
641
645
  backends: tuple[str, ...] = (),
642
646
  verify_curation_flag: bool = False,
647
+ mcp_families: bool = False,
648
+ sg_repo: str = "",
643
649
  ) -> None:
644
650
  """Mine org-scale comprehension tasks with oracle verification."""
645
651
  from codeprobe.mining.org_scale import mine_org_scale_tasks
@@ -668,12 +674,19 @@ def _run_org_scale_mine(
668
674
  click.echo("No families selected. Aborted.")
669
675
  return
670
676
 
677
+ # Default sg_repo from primary repo name if not explicitly provided
678
+ effective_sg_repo = sg_repo
679
+ if not effective_sg_repo and mcp_families:
680
+ effective_sg_repo = f"github.com/sg-evals/{repo_paths[0].name}"
681
+
671
682
  result = mine_org_scale_tasks(
672
683
  repo_paths,
673
684
  count=count,
674
685
  families=selected_families,
675
686
  no_llm=no_llm,
676
687
  scan_timeout=scan_timeout,
688
+ include_mcp_families=mcp_families,
689
+ sg_repo=effective_sg_repo,
677
690
  )
678
691
 
679
692
  if not result.tasks:
@@ -3,10 +3,13 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import json
6
+ import logging
6
7
  from pathlib import Path
7
8
 
8
9
  import click
9
10
 
11
+ logger = logging.getLogger(__name__)
12
+
10
13
  from codeprobe.probe.generator import DEFAULT_COUNT, MAX_PROBES, MIN_PROBES
11
14
 
12
15
 
@@ -76,7 +79,7 @@ def probe(
76
79
  output_dir = Path(output) if output else repo_root / "probes"
77
80
  effective_repo_name = repo_name or repo_root.name
78
81
 
79
- click.echo(f"Scanning {repo_root} for symbols...", err=True)
82
+ logger.info("Scanning %s for symbols...", repo_root)
80
83
  probes = generate_probes(
81
84
  repo_root=repo_root,
82
85
  count=count,
@@ -85,12 +88,11 @@ def probe(
85
88
  )
86
89
 
87
90
  if not probes:
88
- click.echo("No probes generated -- no suitable symbols found.", err=True)
91
+ logger.warning("No probes generated -- no suitable symbols found.")
89
92
  raise SystemExit(1)
90
93
 
91
- click.echo(
92
- f"Generated {len(probes)} probes, writing to {output_dir}...",
93
- err=True,
94
+ logger.info(
95
+ "Generated %d probes, writing to %s...", len(probes), output_dir
94
96
  )
95
97
  created = write_probe_tasks(probes, output_dir, effective_repo_name)
96
98
 
@@ -108,9 +110,9 @@ def probe(
108
110
  }
109
111
  click.echo(json.dumps(summary, indent=2))
110
112
  else:
111
- click.echo(f"Probe generation complete:", err=True)
112
- click.echo(f" Total probes: {len(probes)}", err=True)
113
+ logger.info("Probe generation complete:")
114
+ logger.info(" Total probes: %d", len(probes))
113
115
  for tpl_name, tpl_count in sorted(by_template.items()):
114
- click.echo(f" {tpl_name}: {tpl_count}", err=True)
115
- click.echo(f" Output: {output_dir}", err=True)
116
+ logger.info(" %s: %d", tpl_name, tpl_count)
117
+ logger.info(" Output: %s", output_dir)
116
118
  click.echo(f"Created {len(created)} probe tasks in {output_dir}")
@@ -14,19 +14,68 @@ from codeprobe.models.experiment import ExperimentConfig
14
14
  _SAFE_NAME = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]*$")
15
15
 
16
16
 
17
+ _DEFAULT_SOURCEGRAPH_URL = "https://sourcegraph.com"
18
+
19
+
20
+ def build_sourcegraph_mcp_config(
21
+ *,
22
+ token: str,
23
+ url: str = _DEFAULT_SOURCEGRAPH_URL,
24
+ ) -> dict:
25
+ """Build an HTTP MCP config dict for Sourcegraph.
26
+
27
+ Returns a ``{"mcpServers": {"sourcegraph": {...}}}`` dict suitable for
28
+ passing as ``mcp_config`` on an :class:`ExperimentConfig`.
29
+ """
30
+ base_url = url.rstrip("/")
31
+ return {
32
+ "mcpServers": {
33
+ "sourcegraph": {
34
+ "type": "http",
35
+ "url": f"{base_url}/.api/mcp/v1",
36
+ "headers": {"Authorization": f"token {token}"},
37
+ }
38
+ }
39
+ }
40
+
41
+
17
42
  def ask_mcp_comparison(
18
43
  *,
19
44
  experiment_name: str,
20
45
  agent: str,
21
46
  model: str | None,
22
- mcp_config_path: str,
47
+ mcp_config_path: str | None = None,
48
+ sourcegraph_token: str | None = None,
49
+ sourcegraph_url: str | None = None,
23
50
  ) -> tuple[EvalrcConfig, list[ExperimentConfig]]:
24
- """Goal 1: Compare baseline agent vs MCP-augmented agent."""
25
- mcp_data = _load_json(mcp_config_path)
51
+ """Goal 1: Compare baseline agent vs MCP-augmented agent.
52
+
53
+ When *sourcegraph_token* is provided, generates an HTTP-based Sourcegraph
54
+ MCP config with an ``Authorization`` header and adds the ``sourcegraph``
55
+ preamble. Otherwise falls back to loading the MCP config from
56
+ *mcp_config_path*.
57
+ """
58
+ if sourcegraph_token is not None:
59
+ mcp_data = build_sourcegraph_mcp_config(
60
+ token=sourcegraph_token,
61
+ url=sourcegraph_url or _DEFAULT_SOURCEGRAPH_URL,
62
+ )
63
+ preambles: tuple[str, ...] = ("sourcegraph",)
64
+ else:
65
+ if mcp_config_path is None:
66
+ raise click.BadParameter(
67
+ "Either sourcegraph_token or mcp_config_path must be provided."
68
+ )
69
+ mcp_data = _load_json(mcp_config_path)
70
+ preambles = ()
26
71
 
27
72
  baseline = ExperimentConfig(label="baseline", agent=agent, model=model)
28
73
  with_mcp = ExperimentConfig(
29
- label="with-mcp", agent=agent, model=model, mcp_config=mcp_data
74
+ label="with-mcp",
75
+ agent=agent,
76
+ model=model,
77
+ mcp_config=mcp_data,
78
+ preambles=preambles,
30
79
  )
31
80
 
32
81
  evalrc = EvalrcConfig(name=experiment_name, agents=[agent])
@@ -9,6 +9,7 @@ from __future__ import annotations
9
9
  import json
10
10
  import logging
11
11
  import sqlite3
12
+ import time
12
13
  from dataclasses import asdict
13
14
  from datetime import datetime, timezone
14
15
  from pathlib import Path
@@ -173,31 +174,47 @@ class CheckpointStore:
173
174
  def _open(self, db_path: Path) -> sqlite3.Connection:
174
175
  """Open (or create) the SQLite database with WAL mode.
175
176
 
176
- If the file is corrupt, it is removed and recreated.
177
- Automatically migrates the schema for older databases.
177
+ Retries on transient ``OperationalError`` (lock contention during
178
+ concurrent creation). Only removes and recreates the file on
179
+ non-transient ``DatabaseError`` (genuine corruption).
178
180
  """
179
181
  db_path.parent.mkdir(parents=True, exist_ok=True)
180
- try:
181
- conn = sqlite3.connect(str(db_path), timeout=10)
182
- conn.execute("PRAGMA journal_mode=WAL")
183
- conn.execute(_CREATE_TABLE)
184
- self._migrate_schema(conn)
185
- conn.commit()
186
- return conn
187
- except sqlite3.DatabaseError:
188
- logger.warning(
189
- "Corrupt checkpoint DB at %s — removing and recreating", db_path
190
- )
191
- db_path.unlink(missing_ok=True)
192
- # Also remove WAL/SHM sidecar files
193
- db_path.with_suffix(".db-wal").unlink(missing_ok=True)
194
- db_path.with_suffix(".db-shm").unlink(missing_ok=True)
195
- conn = sqlite3.connect(str(db_path), timeout=10)
196
- conn.execute("PRAGMA journal_mode=WAL")
197
- conn.execute(_CREATE_TABLE)
198
- self._migrate_schema(conn)
199
- conn.commit()
200
- return conn
182
+ last_err: Exception | None = None
183
+ for attempt in range(4):
184
+ try:
185
+ conn = sqlite3.connect(str(db_path), timeout=10)
186
+ conn.execute("PRAGMA journal_mode=WAL")
187
+ conn.execute(_CREATE_TABLE)
188
+ self._migrate_schema(conn)
189
+ conn.commit()
190
+ return conn
191
+ except sqlite3.OperationalError as exc:
192
+ # Transient lock contention — retry with backoff
193
+ last_err = exc
194
+ logger.debug(
195
+ "Checkpoint DB busy at %s (attempt %d): %s",
196
+ db_path,
197
+ attempt + 1,
198
+ exc,
199
+ )
200
+ time.sleep(0.1 * (2**attempt))
201
+ except sqlite3.DatabaseError:
202
+ logger.warning(
203
+ "Corrupt checkpoint DB at %s — removing and recreating",
204
+ db_path,
205
+ )
206
+ db_path.unlink(missing_ok=True)
207
+ db_path.with_suffix(".db-wal").unlink(missing_ok=True)
208
+ db_path.with_suffix(".db-shm").unlink(missing_ok=True)
209
+ conn = sqlite3.connect(str(db_path), timeout=10)
210
+ conn.execute("PRAGMA journal_mode=WAL")
211
+ conn.execute(_CREATE_TABLE)
212
+ self._migrate_schema(conn)
213
+ conn.commit()
214
+ return conn
215
+ raise sqlite3.OperationalError(
216
+ f"Could not open checkpoint DB at {db_path} after 4 attempts: {last_err}"
217
+ )
201
218
 
202
219
  @staticmethod
203
220
  def _migrate_schema(conn: sqlite3.Connection) -> None:
@@ -166,21 +166,25 @@ def execute_task(
166
166
  """
167
167
  task_id = task_dir.name
168
168
 
169
+ # Load task metadata once — used for reward_type auto-detection and
170
+ # preamble context (e.g. sg_repo for Sourcegraph preamble).
171
+ _task_meta: dict = {}
172
+ meta_path = task_dir / "metadata.json"
173
+ if meta_path.is_file():
174
+ try:
175
+ import json as _json
176
+
177
+ _task_meta = _json.loads(meta_path.read_text(encoding="utf-8"))
178
+ except (ValueError, OSError):
179
+ pass
180
+
169
181
  # Auto-detect reward_type from task metadata when caller uses default.
170
182
  # Oracle tasks (org-scale) need "continuous" scoring to read reward.txt;
171
183
  # the default "binary" would score exit-code-only and always pass.
172
184
  if reward_type == "binary":
173
- meta_path = task_dir / "metadata.json"
174
- if meta_path.is_file():
175
- try:
176
- import json as _json
177
-
178
- meta = _json.loads(meta_path.read_text(encoding="utf-8"))
179
- task_rt = (meta.get("verification") or {}).get("reward_type")
180
- if task_rt and task_rt != "binary":
181
- reward_type = task_rt
182
- except (ValueError, OSError):
183
- pass # Stick with caller's default
185
+ task_rt = (_task_meta.get("verification") or {}).get("reward_type")
186
+ if task_rt and task_rt != "binary":
187
+ reward_type = task_rt
184
188
 
185
189
  def _error_result(error: str, error_category: str | None = None) -> TaskResult:
186
190
  return TaskResult(
@@ -206,6 +210,12 @@ def execute_task(
206
210
  )
207
211
 
208
212
  if preamble_names and preamble_resolver is not None:
213
+ # Build extra context from task metadata for preamble templates
214
+ extra_ctx: dict[str, str] = {}
215
+ sg_repo = (_task_meta.get("metadata") or {}).get("sg_repo", "")
216
+ if sg_repo:
217
+ extra_ctx["sg_repo"] = sg_repo
218
+
209
219
  try:
210
220
  prompt, resolved_preambles = compose_instruction(
211
221
  instruction,
@@ -214,6 +224,7 @@ def execute_task(
214
224
  resolver=preamble_resolver,
215
225
  task_id=task_id,
216
226
  worktree_path=worktree_path,
227
+ extra_context=extra_ctx or None,
217
228
  )
218
229
  except (FileNotFoundError, ValueError) as exc:
219
230
  return _error_result(f"Preamble resolution failed: {exc}")
@@ -325,7 +336,7 @@ def _git_reset_workdir(repo_path: Path) -> None:
325
336
  capture_output=True,
326
337
  )
327
338
  subprocess.run(
328
- ["git", "clean", "-fd"],
339
+ ["git", "clean", "-fd", "-e", ".codeprobe", "-e", ".codeprobe-worktrees"],
329
340
  cwd=repo_path,
330
341
  check=True,
331
342
  capture_output=True,
@@ -448,6 +459,15 @@ def execute_config(
448
459
  """
449
460
  checkpointed_ids, results = _restore_checkpointed(checkpoint_store)
450
461
 
462
+ # Filter checkpointed results to only include tasks in the current
463
+ # experiment. Without this, stale entries from prior runs with different
464
+ # task_ids leak into the results list and inflate/deflate scores.
465
+ current_task_ids = {d.name for d in task_dirs}
466
+ checkpointed_ids = {
467
+ (tid, ri) for tid, ri in checkpointed_ids if tid in current_task_ids
468
+ }
469
+ results = [r for r in results if r.task_id in current_task_ids]
470
+
451
471
  # Build expanded work items: (task_dir, repeat_index) for all repeats
452
472
  all_work: list[tuple[Path, int]] = [
453
473
  (d, ri) for d in task_dirs for ri in range(repeats)