codeprobe 0.2.8__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codeprobe-0.2.8 → codeprobe-0.3.1}/PKG-INFO +35 -22
- {codeprobe-0.2.8 → codeprobe-0.3.1}/README.md +33 -21
- {codeprobe-0.2.8 → codeprobe-0.3.1}/pyproject.toml +8 -3
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/__init__.py +1 -1
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/assess/heuristics.py +42 -9
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/__init__.py +228 -1
- codeprobe-0.3.1/src/codeprobe/cli/doctor_cmd.py +114 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/experiment_cmd.py +38 -1
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/init_cmd.py +3 -45
- codeprobe-0.3.1/src/codeprobe/cli/json_display.py +48 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/mine_cmd.py +605 -54
- codeprobe-0.3.1/src/codeprobe/cli/preamble_cmd.py +92 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/probe_cmd.py +24 -4
- codeprobe-0.3.1/src/codeprobe/cli/rich_display.py +234 -0
- codeprobe-0.3.1/src/codeprobe/cli/run_cmd.py +556 -0
- codeprobe-0.3.1/src/codeprobe/cli/validate_cmd.py +288 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/yaml_writer.py +17 -5
- codeprobe-0.3.1/src/codeprobe/core/__main__.py +8 -0
- codeprobe-0.3.1/src/codeprobe/core/events.py +274 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/executor.py +157 -15
- codeprobe-0.3.1/src/codeprobe/core/mcp_discovery.py +47 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/registry.py +35 -3
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/scoring.py +260 -21
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/loaders/__init__.py +19 -1
- codeprobe-0.3.1/src/codeprobe/loaders/suite.py +76 -0
- codeprobe-0.3.1/src/codeprobe/mining/_graph.py +310 -0
- codeprobe-0.3.1/src/codeprobe/mining/comprehension.py +473 -0
- codeprobe-0.3.1/src/codeprobe/mining/comprehension_writer.py +114 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/models/__init__.py +8 -1
- codeprobe-0.3.1/src/codeprobe/models/suite.py +23 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/models/task.py +40 -0
- codeprobe-0.3.1/src/codeprobe/probe/adapter.py +151 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe.egg-info/PKG-INFO +35 -22
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe.egg-info/SOURCES.txt +33 -1
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe.egg-info/entry_points.txt +6 -2
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe.egg-info/requires.txt +1 -0
- codeprobe-0.3.1/tests/test_adapter_contracts.py +104 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_adapters.py +0 -161
- codeprobe-0.3.1/tests/test_artifact_scorer.py +316 -0
- codeprobe-0.3.1/tests/test_checkpoint_scoring.py +369 -0
- codeprobe-0.3.1/tests/test_comprehension.py +329 -0
- codeprobe-0.3.1/tests/test_ctrlc_integration.py +119 -0
- codeprobe-0.3.1/tests/test_doctor_cmd.py +127 -0
- codeprobe-0.3.1/tests/test_events.py +343 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_executor.py +167 -0
- codeprobe-0.3.1/tests/test_executor_events.py +423 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_init_wizard.py +59 -15
- codeprobe-0.3.1/tests/test_json_display.py +229 -0
- codeprobe-0.3.1/tests/test_mine_goals.py +518 -0
- codeprobe-0.3.1/tests/test_mine_presets.py +163 -0
- codeprobe-0.3.1/tests/test_mine_profiles.py +384 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_pipeline_integration.py +233 -0
- codeprobe-0.3.1/tests/test_preamble_cmd.py +115 -0
- codeprobe-0.3.1/tests/test_probe_adapter.py +317 -0
- codeprobe-0.3.1/tests/test_run_config_resolution.py +221 -0
- codeprobe-0.3.1/tests/test_shell_shim.py +177 -0
- codeprobe-0.3.1/tests/test_show_prompt.py +108 -0
- codeprobe-0.3.1/tests/test_suite.py +243 -0
- codeprobe-0.3.1/tests/test_validate_cmd.py +272 -0
- codeprobe-0.2.8/src/codeprobe/adapters/aider.py +0 -79
- codeprobe-0.2.8/src/codeprobe/cli/run_cmd.py +0 -251
- {codeprobe-0.2.8 → codeprobe-0.3.1}/LICENSE +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/setup.cfg +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/__main__.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/__init__.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/_base.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/claude.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/codex.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/copilot.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/openai_compat.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/protocol.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/session.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/telemetry.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/analysis/__init__.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/analysis/ranking.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/analysis/report.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/analysis/stats.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/api.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/assess/__init__.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/assess_cmd.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/interpret_cmd.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/ratings_cmd.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/scaffold_cmd.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/wizard.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/config/__init__.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/config/loader.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/__init__.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/_shared.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/adaptive.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/counterfactual.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/debate.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/decision_tree.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/elo.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/fingerprint.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/mutation.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/pareto.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/sprt.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/tournament.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/__init__.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/checkpoint.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/experiment.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/isolation.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/llm.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/preamble.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/sandbox.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/__init__.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/_lang.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/curator.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/curator_backends.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/curator_tiers.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/extractor.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/org_scale.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/org_scale_families.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/org_scale_oracle.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/org_scale_scanner.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/org_scale_validate.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/sg_ground_truth.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/sources.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/writer.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/models/evalrc.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/models/experiment.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/models/preamble.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/preambles/__init__.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/preambles/github.md +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/preambles/sourcegraph.md +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/probe/__init__.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/probe/generator.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/probe/writer.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/ratings/__init__.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/ratings/collector.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/scaffold/__init__.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/scaffold/writer.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/templates/__init__.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/templates/evalrc-mcp-comparison.yaml +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/templates/evalrc-model-comparison.yaml +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/templates/evalrc-prompt-comparison.yaml +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe.egg-info/dependency_links.txt +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe.egg-info/top_level.txt +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_analysis.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_api.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_assess.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_changed_symbols.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_checkpoint.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_cli.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_config_loader.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_contrib.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_curator_backends.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_curator_core.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_curator_integration.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_curator_tiers.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_experiment_cmd.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_experiment_core.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_llm.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_loaders.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_mcp_families_mining.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_mcp_validate.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_mining.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_models.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_new_families.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_openai_compat.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_oracle_types.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_org_scale.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_preamble.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_probe.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_ratings.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_ratings_cmd.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_registry.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_scaffold.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_scanner_refactor.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_scoring.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_session.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_sg_ground_truth.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_telemetry.py +0 -0
- {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_weighted_f1.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codeprobe
|
|
3
|
-
Version: 0.2.8
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
|
|
5
5
|
Author: codeprobe contributors
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -24,6 +24,7 @@ Requires-Dist: anthropic>=0.39
|
|
|
24
24
|
Requires-Dist: openai>=1.66
|
|
25
25
|
Requires-Dist: tiktoken<1,>=0.7
|
|
26
26
|
Requires-Dist: scipy<2,>=1.11
|
|
27
|
+
Requires-Dist: rich<14,>=13.7
|
|
27
28
|
Provides-Extra: dev
|
|
28
29
|
Requires-Dist: pytest<9,>=8.0; extra == "dev"
|
|
29
30
|
Requires-Dist: pytest-cov<6,>=5.0; extra == "dev"
|
|
@@ -37,11 +38,11 @@ Dynamic: license-file
|
|
|
37
38
|
|
|
38
39
|
Benchmark AI coding agents against **your own codebase**.
|
|
39
40
|
|
|
40
|
-
Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for
|
|
41
|
+
Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for **your** code, not someone else's benchmark suite.
|
|
41
42
|
|
|
42
43
|
## Why codeprobe?
|
|
43
44
|
|
|
44
|
-
Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate.
|
|
45
|
+
Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data, and as general public benchmarks likely don't capture what is most important to your unique workflows. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate. You can also point the tool at any public repo to mine tasks from.
|
|
45
46
|
|
|
46
47
|
## Prerequisites
|
|
47
48
|
|
|
@@ -84,18 +85,20 @@ codeprobe interpret . # Get recommendations
|
|
|
84
85
|
|
|
85
86
|
## Commands
|
|
86
87
|
|
|
87
|
-
| Command
|
|
88
|
-
|
|
|
89
|
-
| `codeprobe assess`
|
|
90
|
-
| `codeprobe init`
|
|
91
|
-
| `codeprobe mine`
|
|
92
|
-
| `codeprobe probe`
|
|
93
|
-
| `codeprobe experiment`
|
|
94
|
-
| `codeprobe run`
|
|
95
|
-
| `codeprobe interpret`
|
|
96
|
-
| `codeprobe
|
|
97
|
-
| `codeprobe
|
|
98
|
-
| `codeprobe
|
|
88
|
+
| Command | Purpose |
|
|
89
|
+
| -------------------------- | ------------------------------------------------ |
|
|
90
|
+
| `codeprobe assess` | Score a codebase's benchmarking potential |
|
|
91
|
+
| `codeprobe init` | Interactive wizard — choose what to compare |
|
|
92
|
+
| `codeprobe mine` | Mine eval tasks from merged PRs/MRs |
|
|
93
|
+
| `codeprobe probe` | Generate fast micro-benchmark probes (30s each) |
|
|
94
|
+
| `codeprobe experiment` | Manage comparison experiments (init, add-config) |
|
|
95
|
+
| `codeprobe run` | Execute tasks against AI agents |
|
|
96
|
+
| `codeprobe interpret` | Analyze results, rank configurations |
|
|
97
|
+
| `codeprobe doctor` | Check environment readiness (agents, keys, git) |
|
|
98
|
+
| `codeprobe preambles list` | List available preambles at all search levels |
|
|
99
|
+
| `codeprobe oracle-check` | Compare agent answer against oracle ground truth |
|
|
100
|
+
| `codeprobe scaffold` | Create/validate eval task directories |
|
|
101
|
+
| `codeprobe ratings` | Record and analyze agent session quality ratings |
|
|
99
102
|
|
|
100
103
|
## Two Ways to Generate Tasks
|
|
101
104
|
|
|
@@ -181,17 +184,32 @@ Template variables: `{{sg_repo}}`, `{{repo_name}}`, `{{repo_path}}`, `{{task_id}
|
|
|
181
184
|
codeprobe run . --parallel 5 # Run 5 tasks concurrently (worktree-isolated)
|
|
182
185
|
codeprobe run . --max-cost-usd 2.00 # Stop when cost budget is reached
|
|
183
186
|
codeprobe run . --dry-run # Estimate resource usage without running
|
|
187
|
+
codeprobe run . --model opus-4 # Override experiment.json model
|
|
188
|
+
codeprobe run . --timeout 600 # Override default 300s timeout
|
|
189
|
+
codeprobe run . --repeats 3 # Run each task 3 times
|
|
190
|
+
codeprobe run . --show-prompt # Print resolved prompt without running agent
|
|
184
191
|
|
|
185
192
|
# Mining
|
|
186
193
|
codeprobe mine . --enrich # Use LLM to improve weak task instructions
|
|
187
194
|
codeprobe mine . --org-scale # Mine comprehension tasks (not SDLC)
|
|
188
195
|
codeprobe mine . --mcp-families # Include MCP-optimized task families
|
|
189
196
|
codeprobe mine . --sg-repo REPO # Sourcegraph repo for ground truth enrichment
|
|
197
|
+
codeprobe mine . --preset quick # Quick scan: count=3
|
|
198
|
+
codeprobe mine . --preset mcp # MCP eval: org-scale + MCP families + enrich
|
|
199
|
+
|
|
200
|
+
# Mine profiles (save/load custom flag combinations)
|
|
201
|
+
codeprobe mine --save-profile my-setup --count 10 --org-scale .
|
|
202
|
+
codeprobe mine --profile my-setup . # Load saved flags
|
|
203
|
+
codeprobe mine --list-profiles # Show available profiles
|
|
190
204
|
|
|
191
205
|
# Experiment configs
|
|
192
206
|
codeprobe experiment add-config . --preamble sourcegraph # Attach MCP preamble
|
|
193
207
|
codeprobe experiment add-config . --mcp-config config.json # Attach MCP server
|
|
194
208
|
|
|
209
|
+
# Diagnostics
|
|
210
|
+
codeprobe doctor # Check agents, API keys, git, Python
|
|
211
|
+
codeprobe preambles list # Show available preambles at all levels
|
|
212
|
+
|
|
195
213
|
# Output
|
|
196
214
|
codeprobe interpret . --format csv # Export for pivot tables
|
|
197
215
|
codeprobe interpret . --format html # Self-contained HTML report
|
|
@@ -210,14 +228,9 @@ GitHub, GitLab, Bitbucket, Azure DevOps, Gitea/Forgejo, and local repos.
|
|
|
210
228
|
|
|
211
229
|
## Configuration
|
|
212
230
|
|
|
213
|
-
|
|
231
|
+
Configuration lives in `experiment.json` (created by `codeprobe init` or `codeprobe experiment init`). CLI flags override experiment.json values — precedence: built-in defaults < experiment.json < CLI flags.
|
|
214
232
|
|
|
215
|
-
|
|
216
|
-
name: my-experiment
|
|
217
|
-
agents: [claude, copilot]
|
|
218
|
-
models: [claude-sonnet-4-6, claude-opus-4-6]
|
|
219
|
-
tasks_dir: .codeprobe/tasks
|
|
220
|
-
```
|
|
233
|
+
Run-time observability is on by default: Rich Live dashboard in TTY, JSON event lines with `--log-format json` for CI. Cost budget warnings at 80% and 100% thresholds are always visible on stderr.
|
|
221
234
|
|
|
222
235
|
## License
|
|
223
236
|
|
|
@@ -2,11 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
Benchmark AI coding agents against **your own codebase**.
|
|
4
4
|
|
|
5
|
-
Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for
|
|
5
|
+
Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for **your** code, not someone else's benchmark suite.
|
|
6
6
|
|
|
7
7
|
## Why codeprobe?
|
|
8
8
|
|
|
9
|
-
Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate.
|
|
9
|
+
Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data, and as general public benchmarks likely don't capture what is most important to your unique workflows. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate. You can also point the tool at any public repo to mine tasks from.
|
|
10
10
|
|
|
11
11
|
## Prerequisites
|
|
12
12
|
|
|
@@ -49,18 +49,20 @@ codeprobe interpret . # Get recommendations
|
|
|
49
49
|
|
|
50
50
|
## Commands
|
|
51
51
|
|
|
52
|
-
| Command
|
|
53
|
-
|
|
|
54
|
-
| `codeprobe assess`
|
|
55
|
-
| `codeprobe init`
|
|
56
|
-
| `codeprobe mine`
|
|
57
|
-
| `codeprobe probe`
|
|
58
|
-
| `codeprobe experiment`
|
|
59
|
-
| `codeprobe run`
|
|
60
|
-
| `codeprobe interpret`
|
|
61
|
-
| `codeprobe
|
|
62
|
-
| `codeprobe
|
|
63
|
-
| `codeprobe
|
|
52
|
+
| Command | Purpose |
|
|
53
|
+
| -------------------------- | ------------------------------------------------ |
|
|
54
|
+
| `codeprobe assess` | Score a codebase's benchmarking potential |
|
|
55
|
+
| `codeprobe init` | Interactive wizard — choose what to compare |
|
|
56
|
+
| `codeprobe mine` | Mine eval tasks from merged PRs/MRs |
|
|
57
|
+
| `codeprobe probe` | Generate fast micro-benchmark probes (30s each) |
|
|
58
|
+
| `codeprobe experiment` | Manage comparison experiments (init, add-config) |
|
|
59
|
+
| `codeprobe run` | Execute tasks against AI agents |
|
|
60
|
+
| `codeprobe interpret` | Analyze results, rank configurations |
|
|
61
|
+
| `codeprobe doctor` | Check environment readiness (agents, keys, git) |
|
|
62
|
+
| `codeprobe preambles list` | List available preambles at all search levels |
|
|
63
|
+
| `codeprobe oracle-check` | Compare agent answer against oracle ground truth |
|
|
64
|
+
| `codeprobe scaffold` | Create/validate eval task directories |
|
|
65
|
+
| `codeprobe ratings` | Record and analyze agent session quality ratings |
|
|
64
66
|
|
|
65
67
|
## Two Ways to Generate Tasks
|
|
66
68
|
|
|
@@ -146,17 +148,32 @@ Template variables: `{{sg_repo}}`, `{{repo_name}}`, `{{repo_path}}`, `{{task_id}
|
|
|
146
148
|
codeprobe run . --parallel 5 # Run 5 tasks concurrently (worktree-isolated)
|
|
147
149
|
codeprobe run . --max-cost-usd 2.00 # Stop when cost budget is reached
|
|
148
150
|
codeprobe run . --dry-run # Estimate resource usage without running
|
|
151
|
+
codeprobe run . --model opus-4 # Override experiment.json model
|
|
152
|
+
codeprobe run . --timeout 600 # Override default 300s timeout
|
|
153
|
+
codeprobe run . --repeats 3 # Run each task 3 times
|
|
154
|
+
codeprobe run . --show-prompt # Print resolved prompt without running agent
|
|
149
155
|
|
|
150
156
|
# Mining
|
|
151
157
|
codeprobe mine . --enrich # Use LLM to improve weak task instructions
|
|
152
158
|
codeprobe mine . --org-scale # Mine comprehension tasks (not SDLC)
|
|
153
159
|
codeprobe mine . --mcp-families # Include MCP-optimized task families
|
|
154
160
|
codeprobe mine . --sg-repo REPO # Sourcegraph repo for ground truth enrichment
|
|
161
|
+
codeprobe mine . --preset quick # Quick scan: count=3
|
|
162
|
+
codeprobe mine . --preset mcp # MCP eval: org-scale + MCP families + enrich
|
|
163
|
+
|
|
164
|
+
# Mine profiles (save/load custom flag combinations)
|
|
165
|
+
codeprobe mine --save-profile my-setup --count 10 --org-scale .
|
|
166
|
+
codeprobe mine --profile my-setup . # Load saved flags
|
|
167
|
+
codeprobe mine --list-profiles # Show available profiles
|
|
155
168
|
|
|
156
169
|
# Experiment configs
|
|
157
170
|
codeprobe experiment add-config . --preamble sourcegraph # Attach MCP preamble
|
|
158
171
|
codeprobe experiment add-config . --mcp-config config.json # Attach MCP server
|
|
159
172
|
|
|
173
|
+
# Diagnostics
|
|
174
|
+
codeprobe doctor # Check agents, API keys, git, Python
|
|
175
|
+
codeprobe preambles list # Show available preambles at all levels
|
|
176
|
+
|
|
160
177
|
# Output
|
|
161
178
|
codeprobe interpret . --format csv # Export for pivot tables
|
|
162
179
|
codeprobe interpret . --format html # Self-contained HTML report
|
|
@@ -175,14 +192,9 @@ GitHub, GitLab, Bitbucket, Azure DevOps, Gitea/Forgejo, and local repos.
|
|
|
175
192
|
|
|
176
193
|
## Configuration
|
|
177
194
|
|
|
178
|
-
|
|
195
|
+
Configuration lives in `experiment.json` (created by `codeprobe init` or `codeprobe experiment init`). CLI flags override experiment.json values — precedence: built-in defaults < experiment.json < CLI flags.
|
|
179
196
|
|
|
180
|
-
|
|
181
|
-
name: my-experiment
|
|
182
|
-
agents: [claude, copilot]
|
|
183
|
-
models: [claude-sonnet-4-6, claude-opus-4-6]
|
|
184
|
-
tasks_dir: .codeprobe/tasks
|
|
185
|
-
```
|
|
197
|
+
Run-time observability is on by default: Rich Live dashboard in TTY, JSON event lines with `--log-format json` for CI. Cost budget warnings at 80% and 100% thresholds are always visible on stderr.
|
|
186
198
|
|
|
187
199
|
## License
|
|
188
200
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "codeprobe"
|
|
3
|
-
version = "0.2.8"
|
|
3
|
+
version = "0.3.1"
|
|
4
4
|
description = "Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = "Apache-2.0"
|
|
@@ -25,6 +25,7 @@ dependencies = [
|
|
|
25
25
|
"openai>=1.66",
|
|
26
26
|
"tiktoken>=0.7,<1",
|
|
27
27
|
"scipy>=1.11,<2",
|
|
28
|
+
"rich>=13.7,<14",
|
|
28
29
|
]
|
|
29
30
|
|
|
30
31
|
[project.urls]
|
|
@@ -46,17 +47,21 @@ dev = [
|
|
|
46
47
|
codeprobe = "codeprobe.cli:main"
|
|
47
48
|
|
|
48
49
|
[project.entry-points."codeprobe.agents"]
|
|
49
|
-
aider = "codeprobe.adapters.aider:AiderAdapter"
|
|
50
50
|
claude = "codeprobe.adapters.claude:ClaudeAdapter"
|
|
51
51
|
codex = "codeprobe.adapters.codex:CodexAdapter"
|
|
52
52
|
copilot = "codeprobe.adapters.copilot:CopilotAdapter"
|
|
53
|
-
openai = "codeprobe.adapters.openai_compat:OpenAICompatAdapter"
|
|
54
53
|
|
|
55
54
|
[project.entry-points."codeprobe.sessions"]
|
|
56
55
|
claude = "codeprobe.adapters.session:ClaudeSessionCollector"
|
|
57
56
|
codex = "codeprobe.adapters.session:CodexSessionCollector"
|
|
58
57
|
copilot = "codeprobe.adapters.session:CopilotSessionCollector"
|
|
59
58
|
|
|
59
|
+
[project.entry-points."codeprobe.scorers"]
|
|
60
|
+
binary = "codeprobe.core.scoring:BinaryScorer"
|
|
61
|
+
continuous = "codeprobe.core.scoring:ContinuousScorer"
|
|
62
|
+
checkpoint = "codeprobe.core.scoring:CheckpointScorer"
|
|
63
|
+
test_ratio = "codeprobe.core.scoring:ContinuousScorer"
|
|
64
|
+
|
|
60
65
|
[build-system]
|
|
61
66
|
requires = ["setuptools>=68", "wheel"]
|
|
62
67
|
build-backend = "setuptools.build_meta"
|
|
@@ -142,7 +142,12 @@ def _run_git(args: list[str], cwd: Path) -> str:
|
|
|
142
142
|
timeout=30,
|
|
143
143
|
)
|
|
144
144
|
if result.returncode != 0:
|
|
145
|
-
logger.debug(
|
|
145
|
+
logger.debug(
|
|
146
|
+
"git %s exited %d: %s",
|
|
147
|
+
" ".join(args),
|
|
148
|
+
result.returncode,
|
|
149
|
+
result.stderr.strip(),
|
|
150
|
+
)
|
|
146
151
|
return ""
|
|
147
152
|
return result.stdout.strip()
|
|
148
153
|
except (subprocess.TimeoutExpired, OSError) as exc:
|
|
@@ -307,7 +312,9 @@ def gather_heuristics(repo_path: Path) -> RepoHeuristics:
|
|
|
307
312
|
history, CI presence, test coverage, languages, and activity.
|
|
308
313
|
"""
|
|
309
314
|
total_commits_str = _run_git(["rev-list", "--count", "HEAD"], cwd=repo_path)
|
|
310
|
-
merge_commits_str = _run_git(
|
|
315
|
+
merge_commits_str = _run_git(
|
|
316
|
+
["rev-list", "--merges", "--count", "HEAD"], cwd=repo_path
|
|
317
|
+
)
|
|
311
318
|
contributors_str = _run_git(["shortlog", "-sn", "HEAD"], cwd=repo_path)
|
|
312
319
|
file_list = _run_git(["ls-files"], cwd=repo_path)
|
|
313
320
|
|
|
@@ -354,7 +361,10 @@ def score_repo_heuristic(heuristics: RepoHeuristics) -> AssessmentScore:
|
|
|
354
361
|
has_ci = heuristics.has_ci
|
|
355
362
|
has_fw = len(heuristics.test_frameworks) > 0
|
|
356
363
|
if has_tests and has_ci and has_fw:
|
|
357
|
-
tc_score, tc_reason =
|
|
364
|
+
tc_score, tc_reason = (
|
|
365
|
+
1.0,
|
|
366
|
+
f"Tests + CI + framework ({', '.join(heuristics.test_frameworks)})",
|
|
367
|
+
)
|
|
358
368
|
elif has_tests and (has_ci or has_fw):
|
|
359
369
|
tc_score, tc_reason = 0.7, "Tests present with partial CI/framework support"
|
|
360
370
|
elif has_tests:
|
|
@@ -409,15 +419,29 @@ def score_repo_heuristic(heuristics: RepoHeuristics) -> AssessmentScore:
|
|
|
409
419
|
DimensionScore(name="ci_maturity", score=ci_score, reasoning=ci_reason),
|
|
410
420
|
)
|
|
411
421
|
|
|
412
|
-
#
|
|
413
|
-
|
|
422
|
+
# Weighted average — ci_maturity is a weak signal because CI configs are
|
|
423
|
+
# often absent in shallow clones / Sourcegraph views, and codeprobe
|
|
424
|
+
# validates via mined test.sh scripts, not CI pipelines.
|
|
425
|
+
_WEIGHTS: dict[str, float] = {
|
|
426
|
+
"task_richness": 0.25,
|
|
427
|
+
"test_coverage": 0.25,
|
|
428
|
+
"complexity": 0.20,
|
|
429
|
+
"activity": 0.15,
|
|
430
|
+
"documentation": 0.10,
|
|
431
|
+
"ci_maturity": 0.05,
|
|
432
|
+
}
|
|
433
|
+
overall = sum(d.score * _WEIGHTS[d.name] for d in dimensions)
|
|
414
434
|
|
|
415
435
|
if overall >= 0.7:
|
|
416
436
|
recommendation = "Excellent benchmarking candidate — rich history with tests"
|
|
417
437
|
elif overall >= 0.5:
|
|
418
|
-
recommendation =
|
|
438
|
+
recommendation = (
|
|
439
|
+
"Good candidate — may need more merge history for diverse tasks"
|
|
440
|
+
)
|
|
419
441
|
elif overall >= 0.3:
|
|
420
|
-
recommendation =
|
|
442
|
+
recommendation = (
|
|
443
|
+
"Fair candidate — limited test coverage may reduce task quality"
|
|
444
|
+
)
|
|
421
445
|
else:
|
|
422
446
|
recommendation = "Poor candidate — consider a repo with more history and tests"
|
|
423
447
|
|
|
@@ -458,11 +482,15 @@ def _parse_model_assessment(
|
|
|
458
482
|
score_val = float(item.get("score", 0))
|
|
459
483
|
score_val = max(0.0, min(1.0, score_val))
|
|
460
484
|
reasoning = str(item.get("reasoning", ""))
|
|
461
|
-
dim_by_name[name] = DimensionScore(
|
|
485
|
+
dim_by_name[name] = DimensionScore(
|
|
486
|
+
name=name, score=score_val, reasoning=reasoning
|
|
487
|
+
)
|
|
462
488
|
|
|
463
489
|
missing = set(RUBRIC_V1) - set(dim_by_name)
|
|
464
490
|
if missing:
|
|
465
|
-
raise LLMParseError(
|
|
491
|
+
raise LLMParseError(
|
|
492
|
+
f"Model response missing dimensions: {', '.join(sorted(missing))}"
|
|
493
|
+
)
|
|
466
494
|
|
|
467
495
|
dimensions = tuple(dim_by_name[name] for name in RUBRIC_V1)
|
|
468
496
|
|
|
@@ -498,6 +526,11 @@ def score_repo_with_model(heuristics: RepoHeuristics) -> AssessmentScore:
|
|
|
498
526
|
"You are evaluating a code repository's suitability for AI agent benchmarking.\n\n"
|
|
499
527
|
f"Here are the raw repository statistics:\n{stats_json}\n\n"
|
|
500
528
|
f"Score this repository on each of these dimensions (0.0 to 1.0):\n{rubric_list}\n\n"
|
|
529
|
+
"Weighting guidance for the overall score: task_richness and test_coverage "
|
|
530
|
+
"are the most important (~25% each), followed by complexity (~20%), "
|
|
531
|
+
"activity (~15%), documentation (~10%). ci_maturity should be a minor "
|
|
532
|
+
"signal (~5%) because CI configs are often absent in cloned repos and "
|
|
533
|
+
"codeprobe validates via mined test scripts, not CI pipelines.\n\n"
|
|
501
534
|
"Respond with ONLY valid JSON matching this exact schema:\n"
|
|
502
535
|
"{\n"
|
|
503
536
|
' "overall": <float 0.0-1.0>,\n'
|
|
@@ -84,6 +84,10 @@ def main(verbose: int, quiet: bool, log_format: str) -> None:
|
|
|
84
84
|
and interpret the results to find which setup works best for YOUR code.
|
|
85
85
|
"""
|
|
86
86
|
_configure_logging(verbose=verbose, quiet=quiet, log_format=log_format)
|
|
87
|
+
ctx = click.get_current_context()
|
|
88
|
+
ctx.ensure_object(dict)
|
|
89
|
+
ctx.obj["log_format"] = log_format
|
|
90
|
+
ctx.obj["quiet"] = quiet
|
|
87
91
|
|
|
88
92
|
|
|
89
93
|
@main.command()
|
|
@@ -101,6 +105,40 @@ def init(path: str) -> None:
|
|
|
101
105
|
|
|
102
106
|
@main.command()
|
|
103
107
|
@click.argument("path", default=".")
|
|
108
|
+
@click.option(
|
|
109
|
+
"--preset",
|
|
110
|
+
type=click.Choice(["quick", "mcp"], case_sensitive=False),
|
|
111
|
+
default=None,
|
|
112
|
+
help="Apply a named preset: 'quick' (count=3) or 'mcp' (org-scale + MCP families).",
|
|
113
|
+
)
|
|
114
|
+
@click.option(
|
|
115
|
+
"--goal",
|
|
116
|
+
type=click.Choice(
|
|
117
|
+
["quality", "navigation", "mcp", "general"], case_sensitive=False
|
|
118
|
+
),
|
|
119
|
+
default=None,
|
|
120
|
+
help="Eval goal: quality, navigation, mcp, general. Skips interactive goal prompt.",
|
|
121
|
+
)
|
|
122
|
+
@click.option(
|
|
123
|
+
"--profile",
|
|
124
|
+
"profile_name",
|
|
125
|
+
default=None,
|
|
126
|
+
help="Load a user-defined profile from ~/.codeprobe/mine-profiles.json "
|
|
127
|
+
"or .codeprobe/mine-profiles.json. Explicit flags override profile values.",
|
|
128
|
+
)
|
|
129
|
+
@click.option(
|
|
130
|
+
"--save-profile",
|
|
131
|
+
"save_profile_name",
|
|
132
|
+
default=None,
|
|
133
|
+
help="Save current flag values as a named profile to ~/.codeprobe/mine-profiles.json.",
|
|
134
|
+
)
|
|
135
|
+
@click.option(
|
|
136
|
+
"--list-profiles",
|
|
137
|
+
"list_profiles_flag",
|
|
138
|
+
is_flag=True,
|
|
139
|
+
default=False,
|
|
140
|
+
help="Show available profiles from user and project levels.",
|
|
141
|
+
)
|
|
104
142
|
@click.option("--count", default=5, help="Number of tasks to mine (3-20).")
|
|
105
143
|
@click.option(
|
|
106
144
|
"--source",
|
|
@@ -206,8 +244,15 @@ def init(path: str) -> None:
|
|
|
206
244
|
"(e.g. github.com/sg-evals/numpy). Defaults to github.com/sg-evals/{repo_name} "
|
|
207
245
|
"when --mcp-families is used. Requires SOURCEGRAPH_TOKEN env var.",
|
|
208
246
|
)
|
|
247
|
+
@click.pass_context
|
|
209
248
|
def mine(
|
|
249
|
+
ctx: click.Context,
|
|
210
250
|
path: str,
|
|
251
|
+
preset: str | None,
|
|
252
|
+
goal: str | None,
|
|
253
|
+
profile_name: str | None,
|
|
254
|
+
save_profile_name: str | None,
|
|
255
|
+
list_profiles_flag: bool,
|
|
211
256
|
count: int,
|
|
212
257
|
source: str,
|
|
213
258
|
min_files: int,
|
|
@@ -232,6 +277,21 @@ def mine(
|
|
|
232
277
|
Extracts real code-change tasks from merged PRs/MRs with ground truth,
|
|
233
278
|
test scripts, and scoring rubrics.
|
|
234
279
|
|
|
280
|
+
\b
|
|
281
|
+
Presets (--preset):
|
|
282
|
+
quick — Fast scan: count=3, default SDLC mode
|
|
283
|
+
mcp — MCP eval: count=8, org-scale + MCP families + enrich
|
|
284
|
+
|
|
285
|
+
\b
|
|
286
|
+
Profiles (--profile / --save-profile / --list-profiles):
|
|
287
|
+
Save: codeprobe mine --save-profile my-setup --count 10 --org-scale .
|
|
288
|
+
Load: codeprobe mine --profile my-setup /path/to/repo
|
|
289
|
+
List: codeprobe mine --list-profiles
|
|
290
|
+
|
|
291
|
+
\b
|
|
292
|
+
Precedence: built-in defaults < profile < --preset < explicit CLI flags.
|
|
293
|
+
|
|
294
|
+
\b
|
|
235
295
|
Use --org-scale to mine comprehension/IR tasks with oracle verification
|
|
236
296
|
instead of SDLC code-change tasks.
|
|
237
297
|
|
|
@@ -242,10 +302,100 @@ def mine(
|
|
|
242
302
|
choosing an eval goal, task count, and git host before mining.
|
|
243
303
|
Use --no-interactive to skip the prompts and use defaults/flags directly.
|
|
244
304
|
"""
|
|
245
|
-
from codeprobe.cli.mine_cmd import run_mine
|
|
305
|
+
from pathlib import Path as _Path
|
|
306
|
+
|
|
307
|
+
from codeprobe.cli.mine_cmd import (
|
|
308
|
+
list_profiles,
|
|
309
|
+
load_profile,
|
|
310
|
+
run_mine,
|
|
311
|
+
save_profile,
|
|
312
|
+
)
|
|
313
|
+
|
|
314
|
+
# --list-profiles: show and exit
|
|
315
|
+
if list_profiles_flag:
|
|
316
|
+
repo_path = _Path(path).resolve() if path != "." else _Path.cwd()
|
|
317
|
+
entries = list_profiles(repo_path)
|
|
318
|
+
if not entries:
|
|
319
|
+
click.echo("No profiles found.")
|
|
320
|
+
else:
|
|
321
|
+
click.echo(f"{'Name':<20s} {'Source':<10s} {'Settings'}")
|
|
322
|
+
click.echo("-" * 60)
|
|
323
|
+
for name, source_label, prof in entries:
|
|
324
|
+
summary = ", ".join(f"{k}={v}" for k, v in sorted(prof.items()))
|
|
325
|
+
click.echo(f"{name:<20s} {source_label:<10s} {summary}")
|
|
326
|
+
return
|
|
327
|
+
|
|
328
|
+
# --save-profile: save current flags and exit
|
|
329
|
+
if save_profile_name is not None:
|
|
330
|
+
# Collect all current param values, keeping only those that differ
|
|
331
|
+
# from Click defaults.
|
|
332
|
+
param_defaults = {p.name: p.default for p in ctx.command.params}
|
|
333
|
+
# Exclude meta-params that aren't mining flags
|
|
334
|
+
_EXCLUDE_FROM_PROFILE = frozenset(
|
|
335
|
+
{
|
|
336
|
+
"path",
|
|
337
|
+
"profile_name",
|
|
338
|
+
"save_profile_name",
|
|
339
|
+
"list_profiles_flag",
|
|
340
|
+
}
|
|
341
|
+
)
|
|
342
|
+
values = {
|
|
343
|
+
k: (list(v) if isinstance(v, tuple) else v)
|
|
344
|
+
for k, v in ctx.params.items()
|
|
345
|
+
if k not in _EXCLUDE_FROM_PROFILE and v != param_defaults.get(k)
|
|
346
|
+
}
|
|
347
|
+
saved_path = save_profile(save_profile_name, values)
|
|
348
|
+
click.echo(f"Profile '{save_profile_name}' saved to {saved_path}")
|
|
349
|
+
return
|
|
350
|
+
|
|
351
|
+
# --profile: load profile values as defaults, then apply preset and CLI overrides
|
|
352
|
+
if profile_name is not None:
|
|
353
|
+
repo_path = _Path(path).resolve() if path != "." else _Path.cwd()
|
|
354
|
+
prof = load_profile(profile_name, repo_path)
|
|
355
|
+
|
|
356
|
+
# Determine which params were explicitly set on the CLI
|
|
357
|
+
explicitly_set = {
|
|
358
|
+
p.name
|
|
359
|
+
for p in ctx.command.params
|
|
360
|
+
if ctx.get_parameter_source(p.name) is not None
|
|
361
|
+
and ctx.get_parameter_source(p.name).name == "COMMANDLINE"
|
|
362
|
+
}
|
|
363
|
+
|
|
364
|
+
# Apply profile values for params NOT explicitly set on CLI.
|
|
365
|
+
# Tuple-typed params (click multiple=True) need list→tuple coercion.
|
|
366
|
+
_TUPLE_PARAMS = frozenset({"subsystem", "family", "repos", "backends"})
|
|
367
|
+
|
|
368
|
+
def _prof_val(key: str, current: object) -> object:
|
|
369
|
+
if key in explicitly_set or key not in prof:
|
|
370
|
+
return current
|
|
371
|
+
v = prof[key]
|
|
372
|
+
return tuple(v) if key in _TUPLE_PARAMS else v
|
|
373
|
+
|
|
374
|
+
count = _prof_val("count", count) # type: ignore[assignment]
|
|
375
|
+
source = _prof_val("source", source) # type: ignore[assignment]
|
|
376
|
+
min_files = _prof_val("min_files", min_files) # type: ignore[assignment]
|
|
377
|
+
enrich = _prof_val("enrich", enrich) # type: ignore[assignment]
|
|
378
|
+
org_scale = _prof_val("org_scale", org_scale) # type: ignore[assignment]
|
|
379
|
+
mcp_families = _prof_val("mcp_families", mcp_families) # type: ignore[assignment]
|
|
380
|
+
no_llm = _prof_val("no_llm", no_llm) # type: ignore[assignment]
|
|
381
|
+
discover_subsystems = _prof_val("discover_subsystems", discover_subsystems) # type: ignore[assignment]
|
|
382
|
+
scan_timeout = _prof_val("scan_timeout", scan_timeout) # type: ignore[assignment]
|
|
383
|
+
validate_flag = _prof_val("validate_flag", validate_flag) # type: ignore[assignment]
|
|
384
|
+
curate = _prof_val("curate", curate) # type: ignore[assignment]
|
|
385
|
+
verify_curation_flag = _prof_val("verify_curation_flag", verify_curation_flag) # type: ignore[assignment]
|
|
386
|
+
sg_repo = _prof_val("sg_repo", sg_repo) # type: ignore[assignment]
|
|
387
|
+
subsystem = _prof_val("subsystem", subsystem) # type: ignore[assignment]
|
|
388
|
+
family = _prof_val("family", family) # type: ignore[assignment]
|
|
389
|
+
repos = _prof_val("repos", repos) # type: ignore[assignment]
|
|
390
|
+
backends = _prof_val("backends", backends) # type: ignore[assignment]
|
|
391
|
+
interactive = _prof_val("interactive", interactive) # type: ignore[assignment]
|
|
392
|
+
preset = _prof_val("preset", preset) # type: ignore[assignment]
|
|
393
|
+
goal = _prof_val("goal", goal) # type: ignore[assignment]
|
|
246
394
|
|
|
247
395
|
run_mine(
|
|
248
396
|
path,
|
|
397
|
+
preset=preset,
|
|
398
|
+
goal=goal,
|
|
249
399
|
count=count,
|
|
250
400
|
source=source,
|
|
251
401
|
min_files=min_files,
|
|
@@ -294,7 +444,46 @@ def mine(
|
|
|
294
444
|
default=False,
|
|
295
445
|
help="Print estimated resource requirements without executing any agents.",
|
|
296
446
|
)
|
|
447
|
+
@click.option(
|
|
448
|
+
"--force-plain",
|
|
449
|
+
is_flag=True,
|
|
450
|
+
default=False,
|
|
451
|
+
help="Force plain-text output even in a TTY (disable Rich dashboard).",
|
|
452
|
+
)
|
|
453
|
+
@click.option(
|
|
454
|
+
"--force-rich",
|
|
455
|
+
is_flag=True,
|
|
456
|
+
default=False,
|
|
457
|
+
help="Force Rich Live dashboard even in non-TTY environments.",
|
|
458
|
+
)
|
|
459
|
+
@click.option(
|
|
460
|
+
"--timeout",
|
|
461
|
+
default=None,
|
|
462
|
+
type=int,
|
|
463
|
+
help="Timeout in seconds per task (overrides experiment.json extra.timeout_seconds).",
|
|
464
|
+
)
|
|
465
|
+
@click.option(
|
|
466
|
+
"--repeats",
|
|
467
|
+
default=None,
|
|
468
|
+
type=int,
|
|
469
|
+
help="Number of repeats per task (overrides default of 1).",
|
|
470
|
+
)
|
|
471
|
+
@click.option(
|
|
472
|
+
"--show-prompt",
|
|
473
|
+
is_flag=True,
|
|
474
|
+
default=False,
|
|
475
|
+
help="Print the fully-resolved prompt for the first task and exit (no agent spawned).",
|
|
476
|
+
)
|
|
477
|
+
@click.option(
|
|
478
|
+
"--suite",
|
|
479
|
+
"suite_path",
|
|
480
|
+
default=None,
|
|
481
|
+
type=click.Path(exists=True),
|
|
482
|
+
help="Path to a suite.toml manifest to filter tasks by type, difficulty, and tags.",
|
|
483
|
+
)
|
|
484
|
+
@click.pass_context
|
|
297
485
|
def run(
|
|
486
|
+
ctx: click.Context,
|
|
298
487
|
path: str,
|
|
299
488
|
agent: str,
|
|
300
489
|
model: str | None,
|
|
@@ -302,6 +491,12 @@ def run(
|
|
|
302
491
|
max_cost_usd: float | None,
|
|
303
492
|
parallel: int,
|
|
304
493
|
dry_run: bool,
|
|
494
|
+
force_plain: bool,
|
|
495
|
+
force_rich: bool,
|
|
496
|
+
timeout: int | None,
|
|
497
|
+
repeats: int | None,
|
|
498
|
+
show_prompt: bool,
|
|
499
|
+
suite_path: str | None,
|
|
305
500
|
) -> None:
|
|
306
501
|
"""Run eval tasks against an AI coding agent.
|
|
307
502
|
|
|
@@ -310,6 +505,16 @@ def run(
|
|
|
310
505
|
"""
|
|
311
506
|
from codeprobe.cli.run_cmd import run_eval
|
|
312
507
|
|
|
508
|
+
ctx.ensure_object(dict)
|
|
509
|
+
log_format = ctx.obj.get("log_format", "text")
|
|
510
|
+
quiet = ctx.obj.get("quiet", False)
|
|
511
|
+
|
|
512
|
+
if show_prompt:
|
|
513
|
+
from codeprobe.cli.run_cmd import show_prompt_and_exit
|
|
514
|
+
|
|
515
|
+
show_prompt_and_exit(path, config=config, agent=agent, model=model)
|
|
516
|
+
return
|
|
517
|
+
|
|
313
518
|
run_eval(
|
|
314
519
|
path,
|
|
315
520
|
agent=agent,
|
|
@@ -318,6 +523,13 @@ def run(
|
|
|
318
523
|
max_cost_usd=max_cost_usd,
|
|
319
524
|
parallel=parallel,
|
|
320
525
|
dry_run=dry_run,
|
|
526
|
+
log_format=log_format,
|
|
527
|
+
quiet=quiet,
|
|
528
|
+
force_plain=force_plain,
|
|
529
|
+
force_rich=force_rich,
|
|
530
|
+
timeout=timeout,
|
|
531
|
+
repeats=repeats if repeats is not None else 1,
|
|
532
|
+
suite_path=suite_path,
|
|
321
533
|
)
|
|
322
534
|
|
|
323
535
|
|
|
@@ -488,3 +700,18 @@ main.add_command(scaffold)
|
|
|
488
700
|
from codeprobe.cli.probe_cmd import probe # noqa: E402
|
|
489
701
|
|
|
490
702
|
main.add_command(probe)
|
|
703
|
+
|
|
704
|
+
# Register the preambles subcommand group
|
|
705
|
+
from codeprobe.cli.preamble_cmd import preambles # noqa: E402
|
|
706
|
+
|
|
707
|
+
main.add_command(preambles)
|
|
708
|
+
|
|
709
|
+
# Register the doctor command
|
|
710
|
+
from codeprobe.cli.doctor_cmd import doctor # noqa: E402
|
|
711
|
+
|
|
712
|
+
main.add_command(doctor)
|
|
713
|
+
|
|
714
|
+
# Register the validate command
|
|
715
|
+
from codeprobe.cli.validate_cmd import validate # noqa: E402
|
|
716
|
+
|
|
717
|
+
main.add_command(validate)
|