codeprobe 0.2.7__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codeprobe-0.2.7 → codeprobe-0.3.0}/PKG-INFO +33 -20
- {codeprobe-0.2.7 → codeprobe-0.3.0}/README.md +31 -19
- {codeprobe-0.2.7 → codeprobe-0.3.0}/pyproject.toml +8 -1
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/__init__.py +1 -1
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/cli/__init__.py +203 -1
- codeprobe-0.3.0/src/codeprobe/cli/doctor_cmd.py +115 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/cli/experiment_cmd.py +38 -1
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/cli/init_cmd.py +3 -45
- codeprobe-0.3.0/src/codeprobe/cli/json_display.py +48 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/cli/mine_cmd.py +169 -0
- codeprobe-0.3.0/src/codeprobe/cli/preamble_cmd.py +92 -0
- codeprobe-0.3.0/src/codeprobe/cli/rich_display.py +234 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/cli/run_cmd.py +250 -22
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/cli/yaml_writer.py +17 -5
- codeprobe-0.3.0/src/codeprobe/core/events.py +274 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/core/executor.py +146 -12
- codeprobe-0.3.0/src/codeprobe/core/mcp_discovery.py +47 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/core/registry.py +34 -2
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/core/scoring.py +7 -12
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/mining/writer.py +2 -2
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe.egg-info/PKG-INFO +33 -20
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe.egg-info/SOURCES.txt +17 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe.egg-info/entry_points.txt +6 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe.egg-info/requires.txt +1 -0
- codeprobe-0.3.0/tests/test_adapter_contracts.py +134 -0
- codeprobe-0.3.0/tests/test_ctrlc_integration.py +117 -0
- codeprobe-0.3.0/tests/test_doctor_cmd.py +127 -0
- codeprobe-0.3.0/tests/test_events.py +343 -0
- codeprobe-0.3.0/tests/test_executor_events.py +423 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_init_wizard.py +59 -15
- codeprobe-0.3.0/tests/test_json_display.py +229 -0
- codeprobe-0.3.0/tests/test_mine_presets.py +163 -0
- codeprobe-0.3.0/tests/test_mine_profiles.py +384 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_pipeline_integration.py +233 -0
- codeprobe-0.3.0/tests/test_preamble_cmd.py +115 -0
- codeprobe-0.3.0/tests/test_run_config_resolution.py +221 -0
- codeprobe-0.3.0/tests/test_show_prompt.py +108 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/LICENSE +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/setup.cfg +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/__main__.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/adapters/__init__.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/adapters/_base.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/adapters/aider.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/adapters/claude.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/adapters/codex.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/adapters/copilot.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/adapters/openai_compat.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/adapters/protocol.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/adapters/session.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/adapters/telemetry.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/analysis/__init__.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/analysis/ranking.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/analysis/report.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/analysis/stats.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/api.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/assess/__init__.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/assess/heuristics.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/cli/assess_cmd.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/cli/interpret_cmd.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/cli/probe_cmd.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/cli/ratings_cmd.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/cli/scaffold_cmd.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/cli/wizard.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/config/__init__.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/config/loader.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/contrib/__init__.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/contrib/_shared.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/contrib/adaptive.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/contrib/counterfactual.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/contrib/debate.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/contrib/decision_tree.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/contrib/elo.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/contrib/fingerprint.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/contrib/mutation.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/contrib/pareto.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/contrib/sprt.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/contrib/tournament.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/core/__init__.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/core/checkpoint.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/core/experiment.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/core/isolation.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/core/llm.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/core/preamble.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/core/sandbox.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/loaders/__init__.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/mining/__init__.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/mining/_lang.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/mining/curator.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/mining/curator_backends.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/mining/curator_tiers.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/mining/extractor.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/mining/org_scale.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/mining/org_scale_families.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/mining/org_scale_oracle.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/mining/org_scale_scanner.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/mining/org_scale_validate.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/mining/sg_ground_truth.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/mining/sources.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/models/__init__.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/models/evalrc.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/models/experiment.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/models/preamble.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/models/task.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/preambles/__init__.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/preambles/github.md +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/preambles/sourcegraph.md +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/probe/__init__.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/probe/generator.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/probe/writer.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/ratings/__init__.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/ratings/collector.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/scaffold/__init__.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/scaffold/writer.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/templates/__init__.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/templates/evalrc-mcp-comparison.yaml +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/templates/evalrc-model-comparison.yaml +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe/templates/evalrc-prompt-comparison.yaml +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe.egg-info/dependency_links.txt +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/src/codeprobe.egg-info/top_level.txt +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_adapters.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_analysis.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_api.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_assess.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_changed_symbols.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_checkpoint.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_cli.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_config_loader.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_contrib.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_curator_backends.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_curator_core.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_curator_integration.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_curator_tiers.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_executor.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_experiment_cmd.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_experiment_core.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_llm.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_loaders.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_mcp_families_mining.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_mcp_validate.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_mining.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_models.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_new_families.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_openai_compat.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_oracle_types.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_org_scale.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_preamble.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_probe.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_ratings.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_ratings_cmd.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_registry.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_scaffold.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_scanner_refactor.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_scoring.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_session.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_sg_ground_truth.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_telemetry.py +0 -0
- {codeprobe-0.2.7 → codeprobe-0.3.0}/tests/test_weighted_f1.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codeprobe
|
|
3
|
-
Version: 0.2.7
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
|
|
5
5
|
Author: codeprobe contributors
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -24,6 +24,7 @@ Requires-Dist: anthropic>=0.39
|
|
|
24
24
|
Requires-Dist: openai>=1.66
|
|
25
25
|
Requires-Dist: tiktoken<1,>=0.7
|
|
26
26
|
Requires-Dist: scipy<2,>=1.11
|
|
27
|
+
Requires-Dist: rich<14,>=13.7
|
|
27
28
|
Provides-Extra: dev
|
|
28
29
|
Requires-Dist: pytest<9,>=8.0; extra == "dev"
|
|
29
30
|
Requires-Dist: pytest-cov<6,>=5.0; extra == "dev"
|
|
@@ -84,18 +85,20 @@ codeprobe interpret . # Get recommendations
|
|
|
84
85
|
|
|
85
86
|
## Commands
|
|
86
87
|
|
|
87
|
-
| Command
|
|
88
|
-
|
|
|
89
|
-
| `codeprobe assess`
|
|
90
|
-
| `codeprobe init`
|
|
91
|
-
| `codeprobe mine`
|
|
92
|
-
| `codeprobe probe`
|
|
93
|
-
| `codeprobe experiment`
|
|
94
|
-
| `codeprobe run`
|
|
95
|
-
| `codeprobe interpret`
|
|
96
|
-
| `codeprobe
|
|
97
|
-
| `codeprobe
|
|
98
|
-
| `codeprobe
|
|
88
|
+
| Command | Purpose |
|
|
89
|
+
| -------------------------- | ------------------------------------------------ |
|
|
90
|
+
| `codeprobe assess` | Score a codebase's benchmarking potential |
|
|
91
|
+
| `codeprobe init` | Interactive wizard — choose what to compare |
|
|
92
|
+
| `codeprobe mine` | Mine eval tasks from merged PRs/MRs |
|
|
93
|
+
| `codeprobe probe` | Generate fast micro-benchmark probes (30s each) |
|
|
94
|
+
| `codeprobe experiment` | Manage comparison experiments (init, add-config) |
|
|
95
|
+
| `codeprobe run` | Execute tasks against AI agents |
|
|
96
|
+
| `codeprobe interpret` | Analyze results, rank configurations |
|
|
97
|
+
| `codeprobe doctor` | Check environment readiness (agents, keys, git) |
|
|
98
|
+
| `codeprobe preambles list` | List available preambles at all search levels |
|
|
99
|
+
| `codeprobe oracle-check` | Compare agent answer against oracle ground truth |
|
|
100
|
+
| `codeprobe scaffold` | Create/validate eval task directories |
|
|
101
|
+
| `codeprobe ratings` | Record and analyze agent session quality ratings |
|
|
99
102
|
|
|
100
103
|
## Two Ways to Generate Tasks
|
|
101
104
|
|
|
@@ -181,17 +184,32 @@ Template variables: `{{sg_repo}}`, `{{repo_name}}`, `{{repo_path}}`, `{{task_id}
|
|
|
181
184
|
codeprobe run . --parallel 5 # Run 5 tasks concurrently (worktree-isolated)
|
|
182
185
|
codeprobe run . --max-cost-usd 2.00 # Stop when cost budget is reached
|
|
183
186
|
codeprobe run . --dry-run # Estimate resource usage without running
|
|
187
|
+
codeprobe run . --model opus-4 # Override experiment.json model
|
|
188
|
+
codeprobe run . --timeout 600 # Override default 300s timeout
|
|
189
|
+
codeprobe run . --repeats 3 # Run each task 3 times
|
|
190
|
+
codeprobe run . --show-prompt # Print resolved prompt without running agent
|
|
184
191
|
|
|
185
192
|
# Mining
|
|
186
193
|
codeprobe mine . --enrich # Use LLM to improve weak task instructions
|
|
187
194
|
codeprobe mine . --org-scale # Mine comprehension tasks (not SDLC)
|
|
188
195
|
codeprobe mine . --mcp-families # Include MCP-optimized task families
|
|
189
196
|
codeprobe mine . --sg-repo REPO # Sourcegraph repo for ground truth enrichment
|
|
197
|
+
codeprobe mine . --preset quick # Quick scan: count=3
|
|
198
|
+
codeprobe mine . --preset mcp # MCP eval: org-scale + MCP families + enrich
|
|
199
|
+
|
|
200
|
+
# Mine profiles (save/load custom flag combinations)
|
|
201
|
+
codeprobe mine --save-profile my-setup --count 10 --org-scale .
|
|
202
|
+
codeprobe mine --profile my-setup . # Load saved flags
|
|
203
|
+
codeprobe mine --list-profiles # Show available profiles
|
|
190
204
|
|
|
191
205
|
# Experiment configs
|
|
192
206
|
codeprobe experiment add-config . --preamble sourcegraph # Attach MCP preamble
|
|
193
207
|
codeprobe experiment add-config . --mcp-config config.json # Attach MCP server
|
|
194
208
|
|
|
209
|
+
# Diagnostics
|
|
210
|
+
codeprobe doctor # Check agents, API keys, git, Python
|
|
211
|
+
codeprobe preambles list # Show available preambles at all levels
|
|
212
|
+
|
|
195
213
|
# Output
|
|
196
214
|
codeprobe interpret . --format csv # Export for pivot tables
|
|
197
215
|
codeprobe interpret . --format html # Self-contained HTML report
|
|
@@ -210,14 +228,9 @@ GitHub, GitLab, Bitbucket, Azure DevOps, Gitea/Forgejo, and local repos.
|
|
|
210
228
|
|
|
211
229
|
## Configuration
|
|
212
230
|
|
|
213
|
-
|
|
231
|
+
Configuration lives in `experiment.json` (created by `codeprobe init` or `codeprobe experiment init`). CLI flags override experiment.json values — precedence: built-in defaults < experiment.json < CLI flags.
|
|
214
232
|
|
|
215
|
-
|
|
216
|
-
name: my-experiment
|
|
217
|
-
agents: [claude, copilot]
|
|
218
|
-
models: [claude-sonnet-4-6, claude-opus-4-6]
|
|
219
|
-
tasks_dir: .codeprobe/tasks
|
|
220
|
-
```
|
|
233
|
+
Run-time observability is on by default: Rich Live dashboard in TTY, JSON event lines with `--log-format json` for CI. Cost budget warnings at 80% and 100% thresholds are always visible on stderr.
|
|
221
234
|
|
|
222
235
|
## License
|
|
223
236
|
|
|
@@ -49,18 +49,20 @@ codeprobe interpret . # Get recommendations
|
|
|
49
49
|
|
|
50
50
|
## Commands
|
|
51
51
|
|
|
52
|
-
| Command
|
|
53
|
-
|
|
|
54
|
-
| `codeprobe assess`
|
|
55
|
-
| `codeprobe init`
|
|
56
|
-
| `codeprobe mine`
|
|
57
|
-
| `codeprobe probe`
|
|
58
|
-
| `codeprobe experiment`
|
|
59
|
-
| `codeprobe run`
|
|
60
|
-
| `codeprobe interpret`
|
|
61
|
-
| `codeprobe
|
|
62
|
-
| `codeprobe
|
|
63
|
-
| `codeprobe
|
|
52
|
+
| Command | Purpose |
|
|
53
|
+
| -------------------------- | ------------------------------------------------ |
|
|
54
|
+
| `codeprobe assess` | Score a codebase's benchmarking potential |
|
|
55
|
+
| `codeprobe init` | Interactive wizard — choose what to compare |
|
|
56
|
+
| `codeprobe mine` | Mine eval tasks from merged PRs/MRs |
|
|
57
|
+
| `codeprobe probe` | Generate fast micro-benchmark probes (30s each) |
|
|
58
|
+
| `codeprobe experiment` | Manage comparison experiments (init, add-config) |
|
|
59
|
+
| `codeprobe run` | Execute tasks against AI agents |
|
|
60
|
+
| `codeprobe interpret` | Analyze results, rank configurations |
|
|
61
|
+
| `codeprobe doctor` | Check environment readiness (agents, keys, git) |
|
|
62
|
+
| `codeprobe preambles list` | List available preambles at all search levels |
|
|
63
|
+
| `codeprobe oracle-check` | Compare agent answer against oracle ground truth |
|
|
64
|
+
| `codeprobe scaffold` | Create/validate eval task directories |
|
|
65
|
+
| `codeprobe ratings` | Record and analyze agent session quality ratings |
|
|
64
66
|
|
|
65
67
|
## Two Ways to Generate Tasks
|
|
66
68
|
|
|
@@ -146,17 +148,32 @@ Template variables: `{{sg_repo}}`, `{{repo_name}}`, `{{repo_path}}`, `{{task_id}
|
|
|
146
148
|
codeprobe run . --parallel 5 # Run 5 tasks concurrently (worktree-isolated)
|
|
147
149
|
codeprobe run . --max-cost-usd 2.00 # Stop when cost budget is reached
|
|
148
150
|
codeprobe run . --dry-run # Estimate resource usage without running
|
|
151
|
+
codeprobe run . --model opus-4 # Override experiment.json model
|
|
152
|
+
codeprobe run . --timeout 600 # Override default 300s timeout
|
|
153
|
+
codeprobe run . --repeats 3 # Run each task 3 times
|
|
154
|
+
codeprobe run . --show-prompt # Print resolved prompt without running agent
|
|
149
155
|
|
|
150
156
|
# Mining
|
|
151
157
|
codeprobe mine . --enrich # Use LLM to improve weak task instructions
|
|
152
158
|
codeprobe mine . --org-scale # Mine comprehension tasks (not SDLC)
|
|
153
159
|
codeprobe mine . --mcp-families # Include MCP-optimized task families
|
|
154
160
|
codeprobe mine . --sg-repo REPO # Sourcegraph repo for ground truth enrichment
|
|
161
|
+
codeprobe mine . --preset quick # Quick scan: count=3
|
|
162
|
+
codeprobe mine . --preset mcp # MCP eval: org-scale + MCP families + enrich
|
|
163
|
+
|
|
164
|
+
# Mine profiles (save/load custom flag combinations)
|
|
165
|
+
codeprobe mine --save-profile my-setup --count 10 --org-scale .
|
|
166
|
+
codeprobe mine --profile my-setup . # Load saved flags
|
|
167
|
+
codeprobe mine --list-profiles # Show available profiles
|
|
155
168
|
|
|
156
169
|
# Experiment configs
|
|
157
170
|
codeprobe experiment add-config . --preamble sourcegraph # Attach MCP preamble
|
|
158
171
|
codeprobe experiment add-config . --mcp-config config.json # Attach MCP server
|
|
159
172
|
|
|
173
|
+
# Diagnostics
|
|
174
|
+
codeprobe doctor # Check agents, API keys, git, Python
|
|
175
|
+
codeprobe preambles list # Show available preambles at all levels
|
|
176
|
+
|
|
160
177
|
# Output
|
|
161
178
|
codeprobe interpret . --format csv # Export for pivot tables
|
|
162
179
|
codeprobe interpret . --format html # Self-contained HTML report
|
|
@@ -175,14 +192,9 @@ GitHub, GitLab, Bitbucket, Azure DevOps, Gitea/Forgejo, and local repos.
|
|
|
175
192
|
|
|
176
193
|
## Configuration
|
|
177
194
|
|
|
178
|
-
|
|
195
|
+
Configuration lives in `experiment.json` (created by `codeprobe init` or `codeprobe experiment init`). CLI flags override experiment.json values — precedence: built-in defaults < experiment.json < CLI flags.
|
|
179
196
|
|
|
180
|
-
|
|
181
|
-
name: my-experiment
|
|
182
|
-
agents: [claude, copilot]
|
|
183
|
-
models: [claude-sonnet-4-6, claude-opus-4-6]
|
|
184
|
-
tasks_dir: .codeprobe/tasks
|
|
185
|
-
```
|
|
197
|
+
Run-time observability is on by default: Rich Live dashboard in TTY, JSON event lines with `--log-format json` for CI. Cost budget warnings at 80% and 100% thresholds are always visible on stderr.
|
|
186
198
|
|
|
187
199
|
## License
|
|
188
200
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "codeprobe"
|
|
3
|
-
version = "0.2.7"
|
|
3
|
+
version = "0.3.0"
|
|
4
4
|
description = "Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = "Apache-2.0"
|
|
@@ -25,6 +25,7 @@ dependencies = [
|
|
|
25
25
|
"openai>=1.66",
|
|
26
26
|
"tiktoken>=0.7,<1",
|
|
27
27
|
"scipy>=1.11,<2",
|
|
28
|
+
"rich>=13.7,<14",
|
|
28
29
|
]
|
|
29
30
|
|
|
30
31
|
[project.urls]
|
|
@@ -57,6 +58,12 @@ claude = "codeprobe.adapters.session:ClaudeSessionCollector"
|
|
|
57
58
|
codex = "codeprobe.adapters.session:CodexSessionCollector"
|
|
58
59
|
copilot = "codeprobe.adapters.session:CopilotSessionCollector"
|
|
59
60
|
|
|
61
|
+
[project.entry-points."codeprobe.scorers"]
|
|
62
|
+
binary = "codeprobe.core.scoring:BinaryScorer"
|
|
63
|
+
continuous = "codeprobe.core.scoring:ContinuousScorer"
|
|
64
|
+
checkpoint = "codeprobe.core.scoring:CheckpointScorer"
|
|
65
|
+
test_ratio = "codeprobe.core.scoring:ContinuousScorer"
|
|
66
|
+
|
|
60
67
|
[build-system]
|
|
61
68
|
requires = ["setuptools>=68", "wheel"]
|
|
62
69
|
build-backend = "setuptools.build_meta"
|
|
@@ -84,6 +84,10 @@ def main(verbose: int, quiet: bool, log_format: str) -> None:
|
|
|
84
84
|
and interpret the results to find which setup works best for YOUR code.
|
|
85
85
|
"""
|
|
86
86
|
_configure_logging(verbose=verbose, quiet=quiet, log_format=log_format)
|
|
87
|
+
ctx = click.get_current_context()
|
|
88
|
+
ctx.ensure_object(dict)
|
|
89
|
+
ctx.obj["log_format"] = log_format
|
|
90
|
+
ctx.obj["quiet"] = quiet
|
|
87
91
|
|
|
88
92
|
|
|
89
93
|
@main.command()
|
|
@@ -101,6 +105,32 @@ def init(path: str) -> None:
|
|
|
101
105
|
|
|
102
106
|
@main.command()
|
|
103
107
|
@click.argument("path", default=".")
|
|
108
|
+
@click.option(
|
|
109
|
+
"--preset",
|
|
110
|
+
type=click.Choice(["quick", "mcp"], case_sensitive=False),
|
|
111
|
+
default=None,
|
|
112
|
+
help="Apply a named preset: 'quick' (count=3) or 'mcp' (org-scale + MCP families).",
|
|
113
|
+
)
|
|
114
|
+
@click.option(
|
|
115
|
+
"--profile",
|
|
116
|
+
"profile_name",
|
|
117
|
+
default=None,
|
|
118
|
+
help="Load a user-defined profile from ~/.codeprobe/mine-profiles.json "
|
|
119
|
+
"or .codeprobe/mine-profiles.json. Explicit flags override profile values.",
|
|
120
|
+
)
|
|
121
|
+
@click.option(
|
|
122
|
+
"--save-profile",
|
|
123
|
+
"save_profile_name",
|
|
124
|
+
default=None,
|
|
125
|
+
help="Save current flag values as a named profile to ~/.codeprobe/mine-profiles.json.",
|
|
126
|
+
)
|
|
127
|
+
@click.option(
|
|
128
|
+
"--list-profiles",
|
|
129
|
+
"list_profiles_flag",
|
|
130
|
+
is_flag=True,
|
|
131
|
+
default=False,
|
|
132
|
+
help="Show available profiles from user and project levels.",
|
|
133
|
+
)
|
|
104
134
|
@click.option("--count", default=5, help="Number of tasks to mine (3-20).")
|
|
105
135
|
@click.option(
|
|
106
136
|
"--source",
|
|
@@ -206,8 +236,14 @@ def init(path: str) -> None:
|
|
|
206
236
|
"(e.g. github.com/sg-evals/numpy). Defaults to github.com/sg-evals/{repo_name} "
|
|
207
237
|
"when --mcp-families is used. Requires SOURCEGRAPH_TOKEN env var.",
|
|
208
238
|
)
|
|
239
|
+
@click.pass_context
|
|
209
240
|
def mine(
|
|
241
|
+
ctx: click.Context,
|
|
210
242
|
path: str,
|
|
243
|
+
preset: str | None,
|
|
244
|
+
profile_name: str | None,
|
|
245
|
+
save_profile_name: str | None,
|
|
246
|
+
list_profiles_flag: bool,
|
|
211
247
|
count: int,
|
|
212
248
|
source: str,
|
|
213
249
|
min_files: int,
|
|
@@ -232,6 +268,21 @@ def mine(
|
|
|
232
268
|
Extracts real code-change tasks from merged PRs/MRs with ground truth,
|
|
233
269
|
test scripts, and scoring rubrics.
|
|
234
270
|
|
|
271
|
+
\b
|
|
272
|
+
Presets (--preset):
|
|
273
|
+
quick — Fast scan: count=3, default SDLC mode
|
|
274
|
+
mcp — MCP eval: count=8, org-scale + MCP families + enrich
|
|
275
|
+
|
|
276
|
+
\b
|
|
277
|
+
Profiles (--profile / --save-profile / --list-profiles):
|
|
278
|
+
Save: codeprobe mine --save-profile my-setup --count 10 --org-scale .
|
|
279
|
+
Load: codeprobe mine --profile my-setup /path/to/repo
|
|
280
|
+
List: codeprobe mine --list-profiles
|
|
281
|
+
|
|
282
|
+
\b
|
|
283
|
+
Precedence: built-in defaults < profile < --preset < explicit CLI flags.
|
|
284
|
+
|
|
285
|
+
\b
|
|
235
286
|
Use --org-scale to mine comprehension/IR tasks with oracle verification
|
|
236
287
|
instead of SDLC code-change tasks.
|
|
237
288
|
|
|
@@ -242,10 +293,98 @@ def mine(
|
|
|
242
293
|
choosing an eval goal, task count, and git host before mining.
|
|
243
294
|
Use --no-interactive to skip the prompts and use defaults/flags directly.
|
|
244
295
|
"""
|
|
245
|
-
from codeprobe.cli.mine_cmd import run_mine
|
|
296
|
+
from pathlib import Path as _Path
|
|
297
|
+
|
|
298
|
+
from codeprobe.cli.mine_cmd import (
|
|
299
|
+
list_profiles,
|
|
300
|
+
load_profile,
|
|
301
|
+
run_mine,
|
|
302
|
+
save_profile,
|
|
303
|
+
)
|
|
304
|
+
|
|
305
|
+
# --list-profiles: show and exit
|
|
306
|
+
if list_profiles_flag:
|
|
307
|
+
repo_path = _Path(path).resolve() if path != "." else _Path.cwd()
|
|
308
|
+
entries = list_profiles(repo_path)
|
|
309
|
+
if not entries:
|
|
310
|
+
click.echo("No profiles found.")
|
|
311
|
+
else:
|
|
312
|
+
click.echo(f"{'Name':<20s} {'Source':<10s} {'Settings'}")
|
|
313
|
+
click.echo("-" * 60)
|
|
314
|
+
for name, source_label, prof in entries:
|
|
315
|
+
summary = ", ".join(f"{k}={v}" for k, v in sorted(prof.items()))
|
|
316
|
+
click.echo(f"{name:<20s} {source_label:<10s} {summary}")
|
|
317
|
+
return
|
|
318
|
+
|
|
319
|
+
# --save-profile: save current flags and exit
|
|
320
|
+
if save_profile_name is not None:
|
|
321
|
+
# Collect all current param values, keeping only those that differ
|
|
322
|
+
# from Click defaults.
|
|
323
|
+
param_defaults = {p.name: p.default for p in ctx.command.params}
|
|
324
|
+
# Exclude meta-params that aren't mining flags
|
|
325
|
+
_EXCLUDE_FROM_PROFILE = frozenset(
|
|
326
|
+
{
|
|
327
|
+
"path",
|
|
328
|
+
"profile_name",
|
|
329
|
+
"save_profile_name",
|
|
330
|
+
"list_profiles_flag",
|
|
331
|
+
}
|
|
332
|
+
)
|
|
333
|
+
values = {
|
|
334
|
+
k: (list(v) if isinstance(v, tuple) else v)
|
|
335
|
+
for k, v in ctx.params.items()
|
|
336
|
+
if k not in _EXCLUDE_FROM_PROFILE and v != param_defaults.get(k)
|
|
337
|
+
}
|
|
338
|
+
saved_path = save_profile(save_profile_name, values)
|
|
339
|
+
click.echo(f"Profile '{save_profile_name}' saved to {saved_path}")
|
|
340
|
+
return
|
|
341
|
+
|
|
342
|
+
# --profile: load profile values as defaults, then apply preset and CLI overrides
|
|
343
|
+
if profile_name is not None:
|
|
344
|
+
repo_path = _Path(path).resolve() if path != "." else _Path.cwd()
|
|
345
|
+
prof = load_profile(profile_name, repo_path)
|
|
346
|
+
|
|
347
|
+
# Determine which params were explicitly set on the CLI
|
|
348
|
+
explicitly_set = {
|
|
349
|
+
p.name
|
|
350
|
+
for p in ctx.command.params
|
|
351
|
+
if ctx.get_parameter_source(p.name) is not None
|
|
352
|
+
and ctx.get_parameter_source(p.name).name == "COMMANDLINE"
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
# Apply profile values for params NOT explicitly set on CLI.
|
|
356
|
+
# Tuple-typed params (click multiple=True) need list→tuple coercion.
|
|
357
|
+
_TUPLE_PARAMS = frozenset({"subsystem", "family", "repos", "backends"})
|
|
358
|
+
|
|
359
|
+
def _prof_val(key: str, current: object) -> object:
|
|
360
|
+
if key in explicitly_set or key not in prof:
|
|
361
|
+
return current
|
|
362
|
+
v = prof[key]
|
|
363
|
+
return tuple(v) if key in _TUPLE_PARAMS else v
|
|
364
|
+
|
|
365
|
+
count = _prof_val("count", count) # type: ignore[assignment]
|
|
366
|
+
source = _prof_val("source", source) # type: ignore[assignment]
|
|
367
|
+
min_files = _prof_val("min_files", min_files) # type: ignore[assignment]
|
|
368
|
+
enrich = _prof_val("enrich", enrich) # type: ignore[assignment]
|
|
369
|
+
org_scale = _prof_val("org_scale", org_scale) # type: ignore[assignment]
|
|
370
|
+
mcp_families = _prof_val("mcp_families", mcp_families) # type: ignore[assignment]
|
|
371
|
+
no_llm = _prof_val("no_llm", no_llm) # type: ignore[assignment]
|
|
372
|
+
discover_subsystems = _prof_val("discover_subsystems", discover_subsystems) # type: ignore[assignment]
|
|
373
|
+
scan_timeout = _prof_val("scan_timeout", scan_timeout) # type: ignore[assignment]
|
|
374
|
+
validate_flag = _prof_val("validate_flag", validate_flag) # type: ignore[assignment]
|
|
375
|
+
curate = _prof_val("curate", curate) # type: ignore[assignment]
|
|
376
|
+
verify_curation_flag = _prof_val("verify_curation_flag", verify_curation_flag) # type: ignore[assignment]
|
|
377
|
+
sg_repo = _prof_val("sg_repo", sg_repo) # type: ignore[assignment]
|
|
378
|
+
subsystem = _prof_val("subsystem", subsystem) # type: ignore[assignment]
|
|
379
|
+
family = _prof_val("family", family) # type: ignore[assignment]
|
|
380
|
+
repos = _prof_val("repos", repos) # type: ignore[assignment]
|
|
381
|
+
backends = _prof_val("backends", backends) # type: ignore[assignment]
|
|
382
|
+
interactive = _prof_val("interactive", interactive) # type: ignore[assignment]
|
|
383
|
+
preset = _prof_val("preset", preset) # type: ignore[assignment]
|
|
246
384
|
|
|
247
385
|
run_mine(
|
|
248
386
|
path,
|
|
387
|
+
preset=preset,
|
|
249
388
|
count=count,
|
|
250
389
|
source=source,
|
|
251
390
|
min_files=min_files,
|
|
@@ -294,7 +433,39 @@ def mine(
|
|
|
294
433
|
default=False,
|
|
295
434
|
help="Print estimated resource requirements without executing any agents.",
|
|
296
435
|
)
|
|
436
|
+
@click.option(
|
|
437
|
+
"--force-plain",
|
|
438
|
+
is_flag=True,
|
|
439
|
+
default=False,
|
|
440
|
+
help="Force plain-text output even in a TTY (disable Rich dashboard).",
|
|
441
|
+
)
|
|
442
|
+
@click.option(
|
|
443
|
+
"--force-rich",
|
|
444
|
+
is_flag=True,
|
|
445
|
+
default=False,
|
|
446
|
+
help="Force Rich Live dashboard even in non-TTY environments.",
|
|
447
|
+
)
|
|
448
|
+
@click.option(
|
|
449
|
+
"--timeout",
|
|
450
|
+
default=None,
|
|
451
|
+
type=int,
|
|
452
|
+
help="Timeout in seconds per task (overrides experiment.json extra.timeout_seconds).",
|
|
453
|
+
)
|
|
454
|
+
@click.option(
|
|
455
|
+
"--repeats",
|
|
456
|
+
default=None,
|
|
457
|
+
type=int,
|
|
458
|
+
help="Number of repeats per task (overrides default of 1).",
|
|
459
|
+
)
|
|
460
|
+
@click.option(
|
|
461
|
+
"--show-prompt",
|
|
462
|
+
is_flag=True,
|
|
463
|
+
default=False,
|
|
464
|
+
help="Print the fully-resolved prompt for the first task and exit (no agent spawned).",
|
|
465
|
+
)
|
|
466
|
+
@click.pass_context
|
|
297
467
|
def run(
|
|
468
|
+
ctx: click.Context,
|
|
298
469
|
path: str,
|
|
299
470
|
agent: str,
|
|
300
471
|
model: str | None,
|
|
@@ -302,6 +473,11 @@ def run(
|
|
|
302
473
|
max_cost_usd: float | None,
|
|
303
474
|
parallel: int,
|
|
304
475
|
dry_run: bool,
|
|
476
|
+
force_plain: bool,
|
|
477
|
+
force_rich: bool,
|
|
478
|
+
timeout: int | None,
|
|
479
|
+
repeats: int | None,
|
|
480
|
+
show_prompt: bool,
|
|
305
481
|
) -> None:
|
|
306
482
|
"""Run eval tasks against an AI coding agent.
|
|
307
483
|
|
|
@@ -310,6 +486,16 @@ def run(
|
|
|
310
486
|
"""
|
|
311
487
|
from codeprobe.cli.run_cmd import run_eval
|
|
312
488
|
|
|
489
|
+
ctx.ensure_object(dict)
|
|
490
|
+
log_format = ctx.obj.get("log_format", "text")
|
|
491
|
+
quiet = ctx.obj.get("quiet", False)
|
|
492
|
+
|
|
493
|
+
if show_prompt:
|
|
494
|
+
from codeprobe.cli.run_cmd import show_prompt_and_exit
|
|
495
|
+
|
|
496
|
+
show_prompt_and_exit(path, config=config, agent=agent, model=model)
|
|
497
|
+
return
|
|
498
|
+
|
|
313
499
|
run_eval(
|
|
314
500
|
path,
|
|
315
501
|
agent=agent,
|
|
@@ -318,6 +504,12 @@ def run(
|
|
|
318
504
|
max_cost_usd=max_cost_usd,
|
|
319
505
|
parallel=parallel,
|
|
320
506
|
dry_run=dry_run,
|
|
507
|
+
log_format=log_format,
|
|
508
|
+
quiet=quiet,
|
|
509
|
+
force_plain=force_plain,
|
|
510
|
+
force_rich=force_rich,
|
|
511
|
+
timeout=timeout,
|
|
512
|
+
repeats=repeats if repeats is not None else 1,
|
|
321
513
|
)
|
|
322
514
|
|
|
323
515
|
|
|
@@ -488,3 +680,13 @@ main.add_command(scaffold)
|
|
|
488
680
|
from codeprobe.cli.probe_cmd import probe # noqa: E402
|
|
489
681
|
|
|
490
682
|
main.add_command(probe)
|
|
683
|
+
|
|
684
|
+
# Register the preambles subcommand group
|
|
685
|
+
from codeprobe.cli.preamble_cmd import preambles # noqa: E402
|
|
686
|
+
|
|
687
|
+
main.add_command(preambles)
|
|
688
|
+
|
|
689
|
+
# Register the doctor command
|
|
690
|
+
from codeprobe.cli.doctor_cmd import doctor # noqa: E402
|
|
691
|
+
|
|
692
|
+
main.add_command(doctor)
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""Doctor command — checks environment readiness for codeprobe."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import shutil
|
|
7
|
+
import subprocess
|
|
8
|
+
import sys
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
|
|
11
|
+
import click
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@dataclass(frozen=True)
class CheckResult:
    """Immutable outcome of a single environment readiness check."""

    # Label identifying the check in the printed report.
    name: str
    # True when the check succeeded.
    passed: bool
    # Short status text printed in parentheses after the name.
    detail: str
    # Remediation hint printed when the check did not pass.
    fix: str
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _check_tool(name: str, fix: str) -> CheckResult:
    """Report whether the executable *name* can be located via ``shutil.which``.

    ``fix`` is passed through unchanged as the remediation hint.
    """
    if shutil.which(name) is None:
        located, status = False, "not found"
    else:
        located, status = True, "found"
    return CheckResult(name=f"{name} CLI", passed=located, detail=status, fix=fix)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _check_env_key(key: str, fix: str) -> CheckResult:
    """Report whether the environment variable *key* holds a usable value.

    A variable that is unset, empty, or made up only of whitespace is
    reported as "not set": a blank value is no more usable than a missing
    one, and reporting it as "set" would hide the misconfiguration.

    ``fix`` is passed through unchanged as the remediation hint.
    """
    # Single lookup with a default instead of a membership test plus indexing.
    present = bool(os.environ.get(key, "").strip())
    return CheckResult(
        name=key,
        passed=present,
        detail="set" if present else "not set",
        fix=fix,
    )
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _check_git_repo() -> CheckResult:
    """Report whether the current working directory is inside a git work tree.

    Runs ``git rev-parse --is-inside-work-tree`` with a 5-second timeout.
    The check fails when git cannot be launched, the command times out,
    exits non-zero, or prints anything other than ``true``.
    """
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--is-inside-work-tree"],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (OSError, subprocess.TimeoutExpired):
        # git is missing, not executable, or hung: report "no repository"
        # instead of letting the doctor command crash.
        is_repo = False
    else:
        # The exit status alone is not sufficient: inside a ``.git``
        # directory or a bare repository git exits 0 but prints "false".
        is_repo = result.returncode == 0 and result.stdout.strip() == "true"
    return CheckResult(
        name="git repo",
        passed=is_repo,
        detail="inside git repo" if is_repo else "not a git repository",
        fix="Run 'git init' or cd into an existing git repository.",
    )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _check_python_version() -> CheckResult:
    """Report whether the running interpreter is Python 3.11 or newer."""
    minimum = (3, 11)
    major = sys.version_info[0]
    minor = sys.version_info[1]
    return CheckResult(
        name="Python version",
        passed=(major, minor) >= minimum,
        detail=f"{major}.{minor}",
        fix="Install Python 3.11 or later. See https://www.python.org/downloads/",
    )
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def run_checks() -> list[CheckResult]:
    """Run all environment checks and return results.

    Results are ordered: command-line tools, environment variables,
    git repository, Python version.
    """
    # (executable name, remediation hint) pairs, in report order.
    tool_hints = (
        ("claude", "Install Claude Code: https://docs.anthropic.com/en/docs/claude-code"),
        ("copilot", "Install GitHub Copilot CLI: https://github.com/github/gh-copilot"),
        ("codex", "Install OpenAI Codex CLI: https://github.com/openai/codex"),
        ("aider", "Install aider: https://aider.chat/docs/install.html"),
    )
    # (environment variable, remediation hint) pairs, in report order.
    env_hints = (
        ("ANTHROPIC_API_KEY", "Set ANTHROPIC_API_KEY in your environment."),
        ("OPENAI_API_KEY", "Set OPENAI_API_KEY in your environment."),
        (
            "GITHUB_TOKEN",
            "Set GITHUB_TOKEN in your environment. See https://github.com/settings/tokens",
        ),
    )

    outcomes = [_check_tool(tool, hint) for tool, hint in tool_hints]
    outcomes.extend(_check_env_key(var, hint) for var, hint in env_hints)
    outcomes.append(_check_git_repo())
    outcomes.append(_check_python_version())
    return outcomes
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@click.command("doctor")
def doctor() -> None:
    """Check environment readiness for running codeprobe."""
    # One line per check; a failed check gets a second line with its fix hint.
    failures = 0
    for r in run_checks():
        if not r.passed:
            failures += 1
            click.echo(f" FAIL {r.name} ({r.detail})")
            click.echo(f" -> {r.fix}")
            continue
        click.echo(f" PASS {r.name} ({r.detail})")

    # Exit with status 1 when at least one check failed.
    if failures:
        raise SystemExit(1)
|
|
@@ -4,6 +4,7 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
import json
|
|
6
6
|
import statistics
|
|
7
|
+
import sys
|
|
7
8
|
from datetime import datetime, timezone
|
|
8
9
|
from pathlib import Path
|
|
9
10
|
|
|
@@ -56,6 +57,36 @@ def experiment_init(
|
|
|
56
57
|
)
|
|
57
58
|
|
|
58
59
|
|
|
60
|
+
def _interactive_mcp_selection() -> str | None:
    """Let the user pick one of the discovered MCP configs at a prompt.

    Returns the selected config's path as a string. Returns None when
    nothing was discovered or when the user chooses the final "Skip"
    entry, which is also the prompt's default.
    """
    from codeprobe.core.mcp_discovery import discover_mcp_configs

    discovered = discover_mcp_configs()
    if not discovered:
        return None

    # Menu: one numbered entry per discovered config, then a "Skip" entry.
    click.echo()
    click.echo("Discovered MCP configurations:")
    for i, (p, servers) in enumerate(discovered, start=1):
        click.echo(f" {i}. {p} ({len(servers)} servers)")
        for s in servers:
            click.echo(f" - {s}")
    click.echo(f" {len(discovered) + 1}. Skip (no MCP config)")
    click.echo()

    skip_option = len(discovered) + 1
    choice = click.prompt(
        "Select MCP config",
        type=click.IntRange(1, skip_option),
        default=skip_option,
    )
    if choice > len(discovered):
        return None
    # Menu numbers are 1-based; each entry's first element is the path.
    selected = discovered[choice - 1]
    return str(selected[0])
|
|
88
|
+
|
|
89
|
+
|
|
59
90
|
def experiment_add_config(
|
|
60
91
|
path: str,
|
|
61
92
|
label: str,
|
|
@@ -84,7 +115,7 @@ def experiment_add_config(
|
|
|
84
115
|
)
|
|
85
116
|
raise SystemExit(1)
|
|
86
117
|
|
|
87
|
-
# Parse MCP config
|
|
118
|
+
# Parse MCP config — offer interactive discovery when omitted in a TTY
|
|
88
119
|
mcp_config: dict | None = None
|
|
89
120
|
if mcp_config_str:
|
|
90
121
|
try:
|
|
@@ -99,6 +130,12 @@ def experiment_add_config(
|
|
|
99
130
|
err=True,
|
|
100
131
|
)
|
|
101
132
|
raise SystemExit(1)
|
|
133
|
+
elif sys.stderr.isatty():
|
|
134
|
+
mcp_config_str = _interactive_mcp_selection()
|
|
135
|
+
if mcp_config_str:
|
|
136
|
+
mcp_path = Path(mcp_config_str).expanduser().resolve()
|
|
137
|
+
if mcp_path.is_file():
|
|
138
|
+
mcp_config = json.loads(mcp_path.read_text(encoding="utf-8"))
|
|
102
139
|
|
|
103
140
|
new_config = ExperimentConfig(
|
|
104
141
|
label=label,
|