codeprobe 0.1.6__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. {codeprobe-0.1.6 → codeprobe-0.2.0}/PKG-INFO +1 -1
  2. {codeprobe-0.1.6 → codeprobe-0.2.0}/pyproject.toml +1 -1
  3. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/__init__.py +1 -1
  4. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/claude.py +18 -2
  5. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/api.py +16 -3
  6. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/__init__.py +108 -1
  7. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/experiment_cmd.py +4 -0
  8. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/init_cmd.py +72 -1
  9. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/mine_cmd.py +52 -0
  10. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/probe_cmd.py +11 -9
  11. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/run_cmd.py +23 -10
  12. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/wizard.py +53 -4
  13. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/checkpoint.py +40 -23
  14. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/executor.py +32 -12
  15. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/experiment.py +6 -1
  16. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/isolation.py +9 -1
  17. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/preamble.py +7 -0
  18. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale.py +331 -0
  19. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale_families.py +106 -1
  20. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale_scanner.py +464 -0
  21. codeprobe-0.2.0/src/codeprobe/mining/sg_ground_truth.py +163 -0
  22. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/writer.py +81 -32
  23. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/models/experiment.py +1 -0
  24. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/models/task.py +1 -0
  25. codeprobe-0.2.0/src/codeprobe/preambles/github.md +21 -0
  26. codeprobe-0.2.0/src/codeprobe/preambles/sourcegraph.md +44 -0
  27. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/probe/generator.py +60 -5
  28. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/probe/writer.py +8 -2
  29. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/templates/evalrc-mcp-comparison.yaml +6 -6
  30. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe.egg-info/PKG-INFO +1 -1
  31. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe.egg-info/SOURCES.txt +5 -0
  32. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_adapters.py +31 -0
  33. codeprobe-0.2.0/tests/test_changed_symbols.py +241 -0
  34. codeprobe-0.2.0/tests/test_cli.py +201 -0
  35. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_executor.py +9 -1
  36. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_experiment_core.py +100 -0
  37. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_init_wizard.py +173 -2
  38. codeprobe-0.2.0/tests/test_mcp_families_mining.py +278 -0
  39. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_new_families.py +29 -3
  40. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_org_scale.py +6 -7
  41. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_preamble.py +36 -13
  42. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_probe.py +351 -0
  43. codeprobe-0.2.0/tests/test_sg_ground_truth.py +318 -0
  44. codeprobe-0.1.6/src/codeprobe/preambles/sourcegraph.md +0 -32
  45. codeprobe-0.1.6/tests/test_cli.py +0 -51
  46. {codeprobe-0.1.6 → codeprobe-0.2.0}/LICENSE +0 -0
  47. {codeprobe-0.1.6 → codeprobe-0.2.0}/README.md +0 -0
  48. {codeprobe-0.1.6 → codeprobe-0.2.0}/setup.cfg +0 -0
  49. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/__main__.py +0 -0
  50. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/__init__.py +0 -0
  51. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/_base.py +0 -0
  52. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/aider.py +0 -0
  53. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/codex.py +0 -0
  54. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/copilot.py +0 -0
  55. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/openai_compat.py +0 -0
  56. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/protocol.py +0 -0
  57. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/session.py +0 -0
  58. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/adapters/telemetry.py +0 -0
  59. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/analysis/__init__.py +0 -0
  60. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/analysis/ranking.py +0 -0
  61. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/analysis/report.py +0 -0
  62. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/analysis/stats.py +0 -0
  63. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/assess/__init__.py +0 -0
  64. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/assess/heuristics.py +0 -0
  65. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/assess_cmd.py +0 -0
  66. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/interpret_cmd.py +0 -0
  67. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/ratings_cmd.py +0 -0
  68. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/scaffold_cmd.py +0 -0
  69. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/cli/yaml_writer.py +0 -0
  70. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/config/__init__.py +0 -0
  71. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/config/loader.py +0 -0
  72. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/__init__.py +0 -0
  73. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/_shared.py +0 -0
  74. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/adaptive.py +0 -0
  75. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/counterfactual.py +0 -0
  76. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/debate.py +0 -0
  77. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/decision_tree.py +0 -0
  78. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/elo.py +0 -0
  79. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/fingerprint.py +0 -0
  80. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/mutation.py +0 -0
  81. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/pareto.py +0 -0
  82. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/sprt.py +0 -0
  83. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/contrib/tournament.py +0 -0
  84. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/__init__.py +0 -0
  85. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/llm.py +0 -0
  86. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/registry.py +0 -0
  87. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/sandbox.py +0 -0
  88. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/core/scoring.py +0 -0
  89. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/loaders/__init__.py +0 -0
  90. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/__init__.py +0 -0
  91. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/_lang.py +0 -0
  92. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/curator.py +0 -0
  93. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/curator_backends.py +0 -0
  94. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/curator_tiers.py +0 -0
  95. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/extractor.py +0 -0
  96. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale_oracle.py +0 -0
  97. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/org_scale_validate.py +0 -0
  98. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/mining/sources.py +0 -0
  99. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/models/__init__.py +0 -0
  100. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/models/evalrc.py +0 -0
  101. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/models/preamble.py +0 -0
  102. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/preambles/__init__.py +0 -0
  103. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/probe/__init__.py +0 -0
  104. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/ratings/__init__.py +0 -0
  105. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/ratings/collector.py +0 -0
  106. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/scaffold/__init__.py +0 -0
  107. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/scaffold/writer.py +0 -0
  108. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/templates/__init__.py +0 -0
  109. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/templates/evalrc-model-comparison.yaml +0 -0
  110. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe/templates/evalrc-prompt-comparison.yaml +0 -0
  111. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe.egg-info/dependency_links.txt +0 -0
  112. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe.egg-info/entry_points.txt +0 -0
  113. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe.egg-info/requires.txt +0 -0
  114. {codeprobe-0.1.6 → codeprobe-0.2.0}/src/codeprobe.egg-info/top_level.txt +0 -0
  115. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_analysis.py +0 -0
  116. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_api.py +0 -0
  117. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_assess.py +0 -0
  118. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_checkpoint.py +0 -0
  119. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_config_loader.py +0 -0
  120. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_contrib.py +0 -0
  121. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_curator_backends.py +0 -0
  122. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_curator_core.py +0 -0
  123. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_curator_integration.py +0 -0
  124. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_curator_tiers.py +0 -0
  125. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_experiment_cmd.py +0 -0
  126. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_llm.py +0 -0
  127. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_loaders.py +0 -0
  128. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_mcp_validate.py +0 -0
  129. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_mining.py +0 -0
  130. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_models.py +0 -0
  131. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_openai_compat.py +0 -0
  132. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_oracle_types.py +0 -0
  133. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_pipeline_integration.py +0 -0
  134. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_ratings.py +0 -0
  135. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_ratings_cmd.py +0 -0
  136. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_registry.py +0 -0
  137. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_scaffold.py +0 -0
  138. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_scanner_refactor.py +0 -0
  139. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_scoring.py +0 -0
  140. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_session.py +0 -0
  141. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_telemetry.py +0 -0
  142. {codeprobe-0.1.6 → codeprobe-0.2.0}/tests/test_weighted_f1.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codeprobe
3
- Version: 0.1.6
3
+ Version: 0.2.0
4
4
  Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
5
5
  Author: codeprobe contributors
6
6
  License-Expression: Apache-2.0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "codeprobe"
3
- version = "0.1.6"
3
+ version = "0.2.0"
4
4
  description = "Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results."
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -1,3 +1,3 @@
1
1
  """codeprobe — Benchmark AI coding agents against your own codebase."""
2
2
 
3
- __version__ = "0.1.6"
3
+ __version__ = "0.2.0"
@@ -4,6 +4,7 @@ from __future__ import annotations
4
4
 
5
5
  import json
6
6
  import re
7
+ import shutil
7
8
  import subprocess
8
9
  import tempfile
9
10
  from pathlib import Path
@@ -69,16 +70,31 @@ class ClaudeAdapter(BaseAdapter):
69
70
 
70
71
  mcp_path = self._write_mcp_config(config)
71
72
  if mcp_path:
72
- cmd.extend(["--mcp-config", mcp_path])
73
+ cmd.extend(["--mcp-config", mcp_path, "--strict-mcp-config"])
73
74
 
74
75
  return cmd
75
76
 
76
77
  def isolate_session(self, slot_id: int) -> dict[str, str]:
77
- """Return a per-slot CLAUDE_CONFIG_DIR for session isolation."""
78
+ """Return a per-slot CLAUDE_CONFIG_DIR for session isolation.
79
+
80
+ Copies authentication credentials from the real ``~/.claude/``
81
+ directory so the agent subprocess can authenticate.
82
+ """
78
83
  config_dir = (
79
84
  Path(tempfile.gettempdir()) / "codeprobe-claude" / f"slot-{slot_id}"
80
85
  )
81
86
  config_dir.mkdir(parents=True, exist_ok=True)
87
+
88
+ # Copy auth credentials from the user's real config dir.
89
+ # Without these the subprocess gets "Not logged in".
90
+ real_config = Path.home() / ".claude"
91
+ if real_config.is_dir():
92
+ for name in ("credentials.json", ".credentials.json"):
93
+ src = real_config / name
94
+ dst = config_dir / name
95
+ if src.is_file():
96
+ shutil.copy2(src, dst)
97
+
82
98
  return {"CLAUDE_CONFIG_DIR": str(config_dir)}
83
99
 
84
100
  def parse_output(
@@ -58,11 +58,24 @@ def _build_experiment_config(raw: dict) -> ExperimentConfig:
58
58
  )
59
59
 
60
60
 
61
- def _discover_task_dirs(tasks_dir: Path) -> list[Path]:
62
- """Find valid task directories (those containing instruction.md)."""
61
+ def _discover_task_dirs(
62
+ tasks_dir: Path, *, task_ids: tuple[str, ...] = ()
63
+ ) -> list[Path]:
64
+ """Find valid task directories (those containing instruction.md).
65
+
66
+ When *task_ids* is non-empty, only return tasks whose directory name
67
+ appears in that tuple.
68
+ """
63
69
  if not tasks_dir.is_dir():
64
70
  raise FileNotFoundError(f"Tasks directory not found: {tasks_dir}")
65
71
 
72
+ if task_ids:
73
+ allowed = set(task_ids)
74
+ return sorted(
75
+ d
76
+ for d in tasks_dir.iterdir()
77
+ if d.is_dir() and d.name in allowed and (d / "instruction.md").exists()
78
+ )
66
79
  return sorted(
67
80
  d for d in tasks_dir.iterdir() if d.is_dir() and (d / "instruction.md").exists()
68
81
  )
@@ -103,7 +116,7 @@ def run_experiment(
103
116
  experiment = load_experiment(experiment_dir)
104
117
 
105
118
  tasks_dir = experiment_dir / experiment.tasks_dir
106
- task_dirs = _discover_task_dirs(tasks_dir)
119
+ task_dirs = _discover_task_dirs(tasks_dir, task_ids=experiment.task_ids)
107
120
 
108
121
  if not task_dirs:
109
122
  raise ValueError(
@@ -1,18 +1,89 @@
1
1
  """CLI entry point for codeprobe."""
2
2
 
3
+ import json as _json
4
+ import logging
5
+ import sys
6
+
3
7
  import click
4
8
 
5
9
  from codeprobe import __version__
6
10
 
7
11
 
12
+ class _JsonFormatter(logging.Formatter):
13
+ """Emit one JSON object per log line."""
14
+
15
+ def format(self, record: logging.LogRecord) -> str:
16
+ payload = {
17
+ "level": record.levelname,
18
+ "logger": record.name,
19
+ "message": record.getMessage(),
20
+ "timestamp": self.formatTime(record, "%Y-%m-%dT%H:%M:%S%z"),
21
+ }
22
+ return _json.dumps(payload)
23
+
24
+
25
+ def _configure_logging(verbose: int, quiet: bool, log_format: str = "text") -> None:
26
+ """Configure namespace-scoped logging for codeprobe.* modules.
27
+
28
+ Attaches a StreamHandler to `logging.getLogger("codeprobe")` so that
29
+ all 26+ codeprobe.* modules emit through hierarchy without touching
30
+ third-party loggers (httpx, urllib3, etc.).
31
+ """
32
+ if quiet:
33
+ level = logging.WARNING
34
+ elif verbose >= 1:
35
+ level = logging.DEBUG
36
+ else:
37
+ level = logging.INFO
38
+
39
+ logger = logging.getLogger("codeprobe")
40
+ logger.setLevel(level)
41
+ logger.propagate = False # don't bubble to root
42
+
43
+ # Idempotent: tests / repeat invocations must not duplicate handlers.
44
+ for h in list(logger.handlers):
45
+ logger.removeHandler(h)
46
+
47
+ handler = logging.StreamHandler(sys.stderr)
48
+ if log_format == "json":
49
+ handler.setFormatter(_JsonFormatter())
50
+ elif verbose >= 1:
51
+ fmt = "%(levelname)s %(name)s: %(message)s"
52
+ handler.setFormatter(logging.Formatter(fmt))
53
+ else:
54
+ fmt = "%(levelname)s: %(message)s"
55
+ handler.setFormatter(logging.Formatter(fmt))
56
+ logger.addHandler(handler)
57
+
58
+
8
59
  @click.group()
60
+ @click.option(
61
+ "-v",
62
+ "--verbose",
63
+ count=True,
64
+ help="Increase log verbosity (-v sets DEBUG).",
65
+ )
66
+ @click.option(
67
+ "-q",
68
+ "--quiet",
69
+ is_flag=True,
70
+ default=False,
71
+ help="Suppress INFO logs (WARNING and above only).",
72
+ )
73
+ @click.option(
74
+ "--log-format",
75
+ type=click.Choice(["text", "json"]),
76
+ default="text",
77
+ help="Log output format (default: text). 'json' emits one JSON object per line.",
78
+ )
9
79
  @click.version_option(version=__version__, prog_name="codeprobe")
10
- def main() -> None:
80
+ def main(verbose: int, quiet: bool, log_format: str) -> None:
11
81
  """Benchmark AI coding agents against your own codebase.
12
82
 
13
83
  Mine real tasks from your repo history, run agents against them,
14
84
  and interpret the results to find which setup works best for YOUR code.
15
85
  """
86
+ _configure_logging(verbose=verbose, quiet=quiet, log_format=log_format)
16
87
 
17
88
 
18
89
  @main.command()
@@ -121,6 +192,20 @@ def init(path: str) -> None:
121
192
  default=False,
122
193
  help="Run LLM verification on curated ground truth.",
123
194
  )
195
+ @click.option(
196
+ "--mcp-families",
197
+ is_flag=True,
198
+ default=False,
199
+ help="Include MCP-advantaged task families (symbol-reference-trace, "
200
+ "type-hierarchy-consumers, change-scope-audit). Only with --org-scale.",
201
+ )
202
+ @click.option(
203
+ "--sg-repo",
204
+ default="",
205
+ help="Sourcegraph repo identifier for ground truth enrichment "
206
+ "(e.g. github.com/sg-evals/numpy). Defaults to github.com/sg-evals/{repo_name} "
207
+ "when --mcp-families is used. Requires SOURCEGRAPH_TOKEN env var.",
208
+ )
124
209
  def mine(
125
210
  path: str,
126
211
  count: int,
@@ -139,6 +224,8 @@ def mine(
139
224
  curate: bool,
140
225
  backends: tuple[str, ...],
141
226
  verify_curation_flag: bool,
227
+ mcp_families: bool,
228
+ sg_repo: str,
142
229
  ) -> None:
143
230
  """Mine eval tasks from a repository's history.
144
231
 
@@ -175,6 +262,8 @@ def mine(
175
262
  curate=curate,
176
263
  backends=backends,
177
264
  verify_curation_flag=verify_curation_flag,
265
+ mcp_families=mcp_families,
266
+ sg_repo=sg_repo,
178
267
  )
179
268
 
180
269
 
@@ -272,6 +361,20 @@ def init_experiment(path: str, name: str, description: str) -> None:
272
361
  @click.option(
273
362
  "--mcp-config", default=None, help="MCP config as JSON string or file path."
274
363
  )
364
+ @click.option(
365
+ "--instruction-variant",
366
+ default=None,
367
+ help="Instruction file variant (e.g., instruction_mcp.md). Default: instruction.md.",
368
+ )
369
+ @click.option(
370
+ "--preamble",
371
+ "preambles",
372
+ multiple=True,
373
+ help=(
374
+ "Preamble to prepend to the instruction. Repeatable. "
375
+ "Built-ins: sourcegraph, github. Or path to a custom .md file."
376
+ ),
377
+ )
275
378
  def add_config(
276
379
  path: str,
277
380
  label: str,
@@ -279,6 +382,8 @@ def add_config(
279
382
  model: str | None,
280
383
  permission_mode: str,
281
384
  mcp_config: str | None,
385
+ instruction_variant: str | None,
386
+ preambles: tuple[str, ...],
282
387
  ) -> None:
283
388
  """Add a configuration to an existing experiment."""
284
389
  from codeprobe.cli.experiment_cmd import experiment_add_config
@@ -290,6 +395,8 @@ def add_config(
290
395
  model=model,
291
396
  permission_mode=permission_mode,
292
397
  mcp_config_str=mcp_config,
398
+ instruction_variant=instruction_variant,
399
+ preambles=preambles,
293
400
  )
294
401
 
295
402
 
@@ -63,6 +63,8 @@ def experiment_add_config(
63
63
  model: str | None,
64
64
  permission_mode: str,
65
65
  mcp_config_str: str | None,
66
+ instruction_variant: str | None = None,
67
+ preambles: tuple[str, ...] = (),
66
68
  ) -> None:
67
69
  """Add a configuration to an existing experiment."""
68
70
  exp_dir = Path(path)
@@ -104,6 +106,8 @@ def experiment_add_config(
104
106
  model=model,
105
107
  permission_mode=permission_mode,
106
108
  mcp_config=mcp_config,
109
+ instruction_variant=instruction_variant,
110
+ preambles=preambles,
107
111
  )
108
112
 
109
113
  # Validate the label is a safe path component
@@ -173,12 +173,83 @@ def _prompt_mcp_config() -> str:
173
173
  click.echo(f" Error: '{expanded}' does not exist. Try again.")
174
174
 
175
175
 
176
+ def _detect_sourcegraph_in_mcp(
177
+ discovered: list[tuple[Path, list[str]]],
178
+ mcp_data: dict | None = None,
179
+ ) -> bool:
180
+ """Return True if any discovered MCP config contains a Sourcegraph server.
181
+
182
+ Checks server names for common Sourcegraph patterns (e.g.
183
+ ``sourcegraph``, ``sourcegraph-mcp-server``).
184
+ """
185
+ sg_names = {"sourcegraph", "sourcegraph-mcp-server"}
186
+ for _path, server_names in discovered:
187
+ for name in server_names:
188
+ if name.lower() in sg_names:
189
+ return True
190
+ if mcp_data:
191
+ for name in mcp_data.get("mcpServers", {}):
192
+ if name.lower() in sg_names:
193
+ return True
194
+ return False
195
+
196
+
197
+ def _prompt_sourcegraph_token() -> str:
198
+ """Prompt for Sourcegraph access token, checking env var first."""
199
+ import os
200
+
201
+ env_token = os.environ.get("SOURCEGRAPH_TOKEN", "")
202
+ if env_token:
203
+ masked = env_token[:4] + "..." + env_token[-4:] if len(env_token) > 8 else "***"
204
+ click.echo(f" Found SOURCEGRAPH_TOKEN in environment ({masked})")
205
+ if click.confirm(" Use this token?", default=True):
206
+ return env_token
207
+
208
+ return click.prompt("Sourcegraph access token")
209
+
210
+
211
+ def _prompt_sourcegraph_url() -> str | None:
212
+ """Prompt for optional custom Sourcegraph instance URL."""
213
+ url = click.prompt(
214
+ "Sourcegraph URL (press Enter for sourcegraph.com)",
215
+ default="",
216
+ show_default=False,
217
+ )
218
+ return url if url else None
219
+
220
+
176
221
  def _goal_mcp(agents: list[str], name: str) -> _Result:
177
222
  """Goal 1: MCP comparison prompts."""
178
223
  agent = _prompt_agent(agents)
179
224
  model = _prompt_model()
180
- mcp_path = _prompt_mcp_config()
181
225
 
226
+ # Check if Sourcegraph is available in discovered MCP configs
227
+ discovered = _discover_mcp_configs()
228
+ use_sourcegraph = False
229
+
230
+ if _detect_sourcegraph_in_mcp(discovered):
231
+ click.echo()
232
+ click.echo("Detected Sourcegraph MCP server in your configuration.")
233
+ click.echo("codeprobe can use the HTTP endpoint for better performance.")
234
+ use_sourcegraph = click.confirm("Use Sourcegraph HTTP MCP?", default=True)
235
+ else:
236
+ click.echo()
237
+ click.echo("Would you like to use Sourcegraph as the MCP server?")
238
+ use_sourcegraph = click.confirm("Use Sourcegraph?", default=False)
239
+
240
+ if use_sourcegraph:
241
+ token = _prompt_sourcegraph_token()
242
+ sg_url = _prompt_sourcegraph_url()
243
+ return ask_mcp_comparison(
244
+ experiment_name=name,
245
+ agent=agent,
246
+ model=model,
247
+ sourcegraph_token=token,
248
+ sourcegraph_url=sg_url,
249
+ )
250
+
251
+ # Fall back to generic MCP config path
252
+ mcp_path = _prompt_mcp_config()
182
253
  return ask_mcp_comparison(
183
254
  experiment_name=name,
184
255
  agent=agent,
@@ -403,6 +403,41 @@ def _clear_tasks_dir(repo_path: Path) -> Path:
403
403
  return tasks_dir
404
404
 
405
405
 
406
+ def _record_task_ids_in_experiment(repo_path: Path, task_ids: list[str]) -> None:
407
+ """Update the experiment's task_ids so ``run`` only executes these tasks.
408
+
409
+ If exactly one experiment exists under ``<repo>/.codeprobe/``, its
410
+ ``experiment.json`` is updated with the new task ID list. When zero
411
+ or multiple experiments exist, this is a no-op (the user must scope
412
+ manually via ``--config``).
413
+ """
414
+ from codeprobe.core.experiment import load_experiment, save_experiment
415
+ from codeprobe.models.experiment import Experiment
416
+
417
+ codeprobe_dir = repo_path / ".codeprobe"
418
+ if not codeprobe_dir.is_dir():
419
+ return
420
+
421
+ candidates = sorted(
422
+ d
423
+ for d in codeprobe_dir.iterdir()
424
+ if d.is_dir() and (d / "experiment.json").is_file()
425
+ )
426
+ if len(candidates) != 1:
427
+ return
428
+
429
+ exp_dir = candidates[0]
430
+ experiment = load_experiment(exp_dir)
431
+ updated = Experiment(
432
+ name=experiment.name,
433
+ description=experiment.description,
434
+ configs=experiment.configs,
435
+ tasks_dir=experiment.tasks_dir,
436
+ task_ids=tuple(sorted(task_ids)),
437
+ )
438
+ save_experiment(exp_dir, updated)
439
+
440
+
406
441
  def _resolve_repo_path(path: str) -> Path:
407
442
  """Resolve a path or URL to a local repo directory."""
408
443
  if _is_git_url(path):
@@ -483,6 +518,8 @@ def run_mine(
483
518
  curate: bool = False,
484
519
  backends: tuple[str, ...] = (),
485
520
  verify_curation_flag: bool = False,
521
+ mcp_families: bool = False,
522
+ sg_repo: str = "",
486
523
  ) -> None:
487
524
  """Mine eval tasks from a repository."""
488
525
  from codeprobe.mining import mine_tasks, write_task_dir
@@ -518,6 +555,8 @@ def run_mine(
518
555
  curate=curate,
519
556
  backends=backends,
520
557
  verify_curation_flag=verify_curation_flag,
558
+ mcp_families=mcp_families,
559
+ sg_repo=sg_repo,
521
560
  )
522
561
  return
523
562
 
@@ -571,6 +610,8 @@ def run_mine(
571
610
  for task in tasks:
572
611
  write_task_dir(task, tasks_dir, repo_path)
573
612
 
613
+ _record_task_ids_in_experiment(repo_path, [t.id for t in tasks])
614
+
574
615
  _show_results_table(tasks)
575
616
 
576
617
  warnings = _quality_review(tasks, goal_name, bias)
@@ -603,6 +644,8 @@ def _run_org_scale_mine(
603
644
  curate: bool = False,
604
645
  backends: tuple[str, ...] = (),
605
646
  verify_curation_flag: bool = False,
647
+ mcp_families: bool = False,
648
+ sg_repo: str = "",
606
649
  ) -> None:
607
650
  """Mine org-scale comprehension tasks with oracle verification."""
608
651
  from codeprobe.mining.org_scale import mine_org_scale_tasks
@@ -631,12 +674,19 @@ def _run_org_scale_mine(
631
674
  click.echo("No families selected. Aborted.")
632
675
  return
633
676
 
677
+ # Default sg_repo from primary repo name if not explicitly provided
678
+ effective_sg_repo = sg_repo
679
+ if not effective_sg_repo and mcp_families:
680
+ effective_sg_repo = f"github.com/sg-evals/{repo_paths[0].name}"
681
+
634
682
  result = mine_org_scale_tasks(
635
683
  repo_paths,
636
684
  count=count,
637
685
  families=selected_families,
638
686
  no_llm=no_llm,
639
687
  scan_timeout=scan_timeout,
688
+ include_mcp_families=mcp_families,
689
+ sg_repo=effective_sg_repo,
640
690
  )
641
691
 
642
692
  if not result.tasks:
@@ -682,6 +732,8 @@ def _run_org_scale_mine(
682
732
  curation_backends=curation_backends_used,
683
733
  )
684
734
 
735
+ _record_task_ids_in_experiment(primary_repo, [t.id for t in curated_tasks])
736
+
685
737
  _show_org_scale_results(
686
738
  curated_tasks, tasks_dir, primary_repo, curation_backends_used
687
739
  )
@@ -3,10 +3,13 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import json
6
+ import logging
6
7
  from pathlib import Path
7
8
 
8
9
  import click
9
10
 
11
+ logger = logging.getLogger(__name__)
12
+
10
13
  from codeprobe.probe.generator import DEFAULT_COUNT, MAX_PROBES, MIN_PROBES
11
14
 
12
15
 
@@ -76,7 +79,7 @@ def probe(
76
79
  output_dir = Path(output) if output else repo_root / "probes"
77
80
  effective_repo_name = repo_name or repo_root.name
78
81
 
79
- click.echo(f"Scanning {repo_root} for symbols...", err=True)
82
+ logger.info("Scanning %s for symbols...", repo_root)
80
83
  probes = generate_probes(
81
84
  repo_root=repo_root,
82
85
  count=count,
@@ -85,12 +88,11 @@ def probe(
85
88
  )
86
89
 
87
90
  if not probes:
88
- click.echo("No probes generated -- no suitable symbols found.", err=True)
91
+ logger.warning("No probes generated -- no suitable symbols found.")
89
92
  raise SystemExit(1)
90
93
 
91
- click.echo(
92
- f"Generated {len(probes)} probes, writing to {output_dir}...",
93
- err=True,
94
+ logger.info(
95
+ "Generated %d probes, writing to %s...", len(probes), output_dir
94
96
  )
95
97
  created = write_probe_tasks(probes, output_dir, effective_repo_name)
96
98
 
@@ -108,9 +110,9 @@ def probe(
108
110
  }
109
111
  click.echo(json.dumps(summary, indent=2))
110
112
  else:
111
- click.echo(f"Probe generation complete:", err=True)
112
- click.echo(f" Total probes: {len(probes)}", err=True)
113
+ logger.info("Probe generation complete:")
114
+ logger.info(" Total probes: %d", len(probes))
113
115
  for tpl_name, tpl_count in sorted(by_template.items()):
114
- click.echo(f" {tpl_name}: {tpl_count}", err=True)
115
- click.echo(f" Output: {output_dir}", err=True)
116
+ logger.info(" %s: %d", tpl_name, tpl_count)
117
+ logger.info(" Output: %s", output_dir)
116
118
  click.echo(f"Created {len(created)} probe tasks in {output_dir}")
@@ -22,6 +22,27 @@ def _on_task_complete(result: CompletedTask) -> None:
22
22
  click.echo(f" {result.task_id}: {status} ({result.duration_seconds:.1f}s)")
23
23
 
24
24
 
25
+ def _find_tasks(d: Path, *, task_ids: tuple[str, ...] = ()) -> list[Path]:
26
+ """Discover task subdirectories with instruction.md.
27
+
28
+ When *task_ids* is non-empty, only return tasks whose directory name
29
+ appears in that tuple. This scopes task discovery to the current
30
+ experiment, preventing tasks from other experiments from leaking in.
31
+ """
32
+ if not d.is_dir():
33
+ return []
34
+ if task_ids:
35
+ allowed = set(task_ids)
36
+ return sorted(
37
+ sd
38
+ for sd in d.iterdir()
39
+ if sd.is_dir() and sd.name in allowed and (sd / "instruction.md").exists()
40
+ )
41
+ return sorted(
42
+ sd for sd in d.iterdir() if sd.is_dir() and (sd / "instruction.md").exists()
43
+ )
44
+
45
+
25
46
  def _print_dry_run(estimate: DryRunEstimate) -> None:
26
47
  """Pretty-print a DryRunEstimate to stdout."""
27
48
  cost_lo, cost_hi = estimate.estimated_cost_range
@@ -88,17 +109,9 @@ def run_eval(
88
109
  tasks_dir = exp_dir / experiment.tasks_dir
89
110
  repo_tasks = Path(path).resolve() / ".codeprobe" / experiment.tasks_dir
90
111
 
91
- # Prefer whichever location actually has task subdirectories with instruction.md
92
- def _find_tasks(d: Path) -> list[Path]:
93
- if not d.is_dir():
94
- return []
95
- return sorted(
96
- sd for sd in d.iterdir() if sd.is_dir() and (sd / "instruction.md").exists()
97
- )
98
-
99
- task_dirs = _find_tasks(tasks_dir)
112
+ task_dirs = _find_tasks(tasks_dir, task_ids=experiment.task_ids)
100
113
  if not task_dirs and repo_tasks != tasks_dir:
101
- task_dirs = _find_tasks(repo_tasks)
114
+ task_dirs = _find_tasks(repo_tasks, task_ids=experiment.task_ids)
102
115
  if task_dirs:
103
116
  tasks_dir = repo_tasks
104
117
 
@@ -14,19 +14,68 @@ from codeprobe.models.experiment import ExperimentConfig
14
14
  _SAFE_NAME = re.compile(r"^[a-zA-Z0-9][a-zA-Z0-9._-]*$")
15
15
 
16
16
 
17
+ _DEFAULT_SOURCEGRAPH_URL = "https://sourcegraph.com"
18
+
19
+
20
+ def build_sourcegraph_mcp_config(
21
+ *,
22
+ token: str,
23
+ url: str = _DEFAULT_SOURCEGRAPH_URL,
24
+ ) -> dict:
25
+ """Build an HTTP MCP config dict for Sourcegraph.
26
+
27
+ Returns a ``{"mcpServers": {"sourcegraph": {...}}}`` dict suitable for
28
+ passing as ``mcp_config`` on an :class:`ExperimentConfig`.
29
+ """
30
+ base_url = url.rstrip("/")
31
+ return {
32
+ "mcpServers": {
33
+ "sourcegraph": {
34
+ "type": "http",
35
+ "url": f"{base_url}/.api/mcp/v1",
36
+ "headers": {"Authorization": f"token {token}"},
37
+ }
38
+ }
39
+ }
40
+
41
+
17
42
  def ask_mcp_comparison(
18
43
  *,
19
44
  experiment_name: str,
20
45
  agent: str,
21
46
  model: str | None,
22
- mcp_config_path: str,
47
+ mcp_config_path: str | None = None,
48
+ sourcegraph_token: str | None = None,
49
+ sourcegraph_url: str | None = None,
23
50
  ) -> tuple[EvalrcConfig, list[ExperimentConfig]]:
24
- """Goal 1: Compare baseline agent vs MCP-augmented agent."""
25
- mcp_data = _load_json(mcp_config_path)
51
+ """Goal 1: Compare baseline agent vs MCP-augmented agent.
52
+
53
+ When *sourcegraph_token* is provided, generates an HTTP-based Sourcegraph
54
+ MCP config with an ``Authorization`` header and adds the ``sourcegraph``
55
+ preamble. Otherwise falls back to loading the MCP config from
56
+ *mcp_config_path*.
57
+ """
58
+ if sourcegraph_token is not None:
59
+ mcp_data = build_sourcegraph_mcp_config(
60
+ token=sourcegraph_token,
61
+ url=sourcegraph_url or _DEFAULT_SOURCEGRAPH_URL,
62
+ )
63
+ preambles: tuple[str, ...] = ("sourcegraph",)
64
+ else:
65
+ if mcp_config_path is None:
66
+ raise click.BadParameter(
67
+ "Either sourcegraph_token or mcp_config_path must be provided."
68
+ )
69
+ mcp_data = _load_json(mcp_config_path)
70
+ preambles = ()
26
71
 
27
72
  baseline = ExperimentConfig(label="baseline", agent=agent, model=model)
28
73
  with_mcp = ExperimentConfig(
29
- label="with-mcp", agent=agent, model=model, mcp_config=mcp_data
74
+ label="with-mcp",
75
+ agent=agent,
76
+ model=model,
77
+ mcp_config=mcp_data,
78
+ preambles=preambles,
30
79
  )
31
80
 
32
81
  evalrc = EvalrcConfig(name=experiment_name, agents=[agent])