codeprobe 0.3.0__tar.gz → 0.3.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codeprobe-0.3.0 → codeprobe-0.3.1}/PKG-INFO +3 -3
- {codeprobe-0.3.0 → codeprobe-0.3.1}/README.md +2 -2
- {codeprobe-0.3.0 → codeprobe-0.3.1}/pyproject.toml +1 -3
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/__init__.py +1 -1
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/assess/heuristics.py +42 -9
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/__init__.py +25 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/doctor_cmd.py +0 -1
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/mine_cmd.py +436 -54
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/probe_cmd.py +24 -4
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/run_cmd.py +77 -0
- codeprobe-0.3.1/src/codeprobe/cli/validate_cmd.py +288 -0
- codeprobe-0.3.1/src/codeprobe/core/__main__.py +8 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/core/executor.py +18 -3
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/core/registry.py +1 -1
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/core/scoring.py +253 -9
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/loaders/__init__.py +19 -1
- codeprobe-0.3.1/src/codeprobe/loaders/suite.py +76 -0
- codeprobe-0.3.1/src/codeprobe/mining/_graph.py +310 -0
- codeprobe-0.3.1/src/codeprobe/mining/comprehension.py +473 -0
- codeprobe-0.3.1/src/codeprobe/mining/comprehension_writer.py +114 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/models/__init__.py +8 -1
- codeprobe-0.3.1/src/codeprobe/models/suite.py +23 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/models/task.py +40 -0
- codeprobe-0.3.1/src/codeprobe/probe/adapter.py +151 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe.egg-info/PKG-INFO +3 -3
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe.egg-info/SOURCES.txt +16 -1
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe.egg-info/entry_points.txt +0 -2
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_adapter_contracts.py +1 -31
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_adapters.py +0 -161
- codeprobe-0.3.1/tests/test_artifact_scorer.py +316 -0
- codeprobe-0.3.1/tests/test_checkpoint_scoring.py +369 -0
- codeprobe-0.3.1/tests/test_comprehension.py +329 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_ctrlc_integration.py +2 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_executor.py +167 -0
- codeprobe-0.3.1/tests/test_mine_goals.py +518 -0
- codeprobe-0.3.1/tests/test_probe_adapter.py +317 -0
- codeprobe-0.3.1/tests/test_shell_shim.py +177 -0
- codeprobe-0.3.1/tests/test_suite.py +243 -0
- codeprobe-0.3.1/tests/test_validate_cmd.py +272 -0
- codeprobe-0.3.0/src/codeprobe/adapters/aider.py +0 -79
- {codeprobe-0.3.0 → codeprobe-0.3.1}/LICENSE +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/setup.cfg +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/__main__.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/adapters/__init__.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/adapters/_base.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/adapters/claude.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/adapters/codex.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/adapters/copilot.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/adapters/openai_compat.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/adapters/protocol.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/adapters/session.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/adapters/telemetry.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/analysis/__init__.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/analysis/ranking.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/analysis/report.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/analysis/stats.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/api.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/assess/__init__.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/assess_cmd.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/experiment_cmd.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/init_cmd.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/interpret_cmd.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/json_display.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/preamble_cmd.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/ratings_cmd.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/rich_display.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/scaffold_cmd.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/wizard.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/cli/yaml_writer.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/config/__init__.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/config/loader.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/contrib/__init__.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/contrib/_shared.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/contrib/adaptive.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/contrib/counterfactual.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/contrib/debate.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/contrib/decision_tree.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/contrib/elo.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/contrib/fingerprint.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/contrib/mutation.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/contrib/pareto.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/contrib/sprt.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/contrib/tournament.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/core/__init__.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/core/checkpoint.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/core/events.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/core/experiment.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/core/isolation.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/core/llm.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/core/mcp_discovery.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/core/preamble.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/core/sandbox.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/mining/__init__.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/mining/_lang.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/mining/curator.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/mining/curator_backends.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/mining/curator_tiers.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/mining/extractor.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/mining/org_scale.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/mining/org_scale_families.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/mining/org_scale_oracle.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/mining/org_scale_scanner.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/mining/org_scale_validate.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/mining/sg_ground_truth.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/mining/sources.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/mining/writer.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/models/evalrc.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/models/experiment.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/models/preamble.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/preambles/__init__.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/preambles/github.md +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/preambles/sourcegraph.md +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/probe/__init__.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/probe/generator.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/probe/writer.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/ratings/__init__.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/ratings/collector.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/scaffold/__init__.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/scaffold/writer.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/templates/__init__.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/templates/evalrc-mcp-comparison.yaml +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/templates/evalrc-model-comparison.yaml +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe/templates/evalrc-prompt-comparison.yaml +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe.egg-info/dependency_links.txt +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe.egg-info/requires.txt +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/src/codeprobe.egg-info/top_level.txt +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_analysis.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_api.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_assess.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_changed_symbols.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_checkpoint.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_cli.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_config_loader.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_contrib.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_curator_backends.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_curator_core.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_curator_integration.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_curator_tiers.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_doctor_cmd.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_events.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_executor_events.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_experiment_cmd.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_experiment_core.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_init_wizard.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_json_display.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_llm.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_loaders.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_mcp_families_mining.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_mcp_validate.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_mine_presets.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_mine_profiles.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_mining.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_models.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_new_families.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_openai_compat.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_oracle_types.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_org_scale.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_pipeline_integration.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_preamble.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_preamble_cmd.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_probe.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_ratings.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_ratings_cmd.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_registry.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_run_config_resolution.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_scaffold.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_scanner_refactor.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_scoring.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_session.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_sg_ground_truth.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_show_prompt.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_telemetry.py +0 -0
- {codeprobe-0.3.0 → codeprobe-0.3.1}/tests/test_weighted_f1.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codeprobe
|
|
3
|
-
Version: 0.3.0
|
|
3
|
+
Version: 0.3.1
|
|
4
4
|
Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
|
|
5
5
|
Author: codeprobe contributors
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -38,11 +38,11 @@ Dynamic: license-file
|
|
|
38
38
|
|
|
39
39
|
Benchmark AI coding agents against **your own codebase**.
|
|
40
40
|
|
|
41
|
-
Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for
|
|
41
|
+
Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for **your** code, not someone else's benchmark suite.
|
|
42
42
|
|
|
43
43
|
## Why codeprobe?
|
|
44
44
|
|
|
45
|
-
Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate.
|
|
45
|
+
Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data. As general public benchmarks, they also likely don't capture what is most important to your unique workflows. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate. You can also point the tool at any public repo and mine tasks from it.
|
|
46
46
|
|
|
47
47
|
## Prerequisites
|
|
48
48
|
|
|
@@ -2,11 +2,11 @@
|
|
|
2
2
|
|
|
3
3
|
Benchmark AI coding agents against **your own codebase**.
|
|
4
4
|
|
|
5
|
-
Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for
|
|
5
|
+
Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for **your** code, not someone else's benchmark suite.
|
|
6
6
|
|
|
7
7
|
## Why codeprobe?
|
|
8
8
|
|
|
9
|
-
Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate.
|
|
9
|
+
Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data. As general public benchmarks, they also likely don't capture what is most important to your unique workflows. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate. You can also point the tool at any public repo and mine tasks from it.
|
|
10
10
|
|
|
11
11
|
## Prerequisites
|
|
12
12
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "codeprobe"
|
|
3
|
-
version = "0.3.0"
|
|
3
|
+
version = "0.3.1"
|
|
4
4
|
description = "Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = "Apache-2.0"
|
|
@@ -47,11 +47,9 @@ dev = [
|
|
|
47
47
|
codeprobe = "codeprobe.cli:main"
|
|
48
48
|
|
|
49
49
|
[project.entry-points."codeprobe.agents"]
|
|
50
|
-
aider = "codeprobe.adapters.aider:AiderAdapter"
|
|
51
50
|
claude = "codeprobe.adapters.claude:ClaudeAdapter"
|
|
52
51
|
codex = "codeprobe.adapters.codex:CodexAdapter"
|
|
53
52
|
copilot = "codeprobe.adapters.copilot:CopilotAdapter"
|
|
54
|
-
openai = "codeprobe.adapters.openai_compat:OpenAICompatAdapter"
|
|
55
53
|
|
|
56
54
|
[project.entry-points."codeprobe.sessions"]
|
|
57
55
|
claude = "codeprobe.adapters.session:ClaudeSessionCollector"
|
|
@@ -142,7 +142,12 @@ def _run_git(args: list[str], cwd: Path) -> str:
|
|
|
142
142
|
timeout=30,
|
|
143
143
|
)
|
|
144
144
|
if result.returncode != 0:
|
|
145
|
-
logger.debug(
|
|
145
|
+
logger.debug(
|
|
146
|
+
"git %s exited %d: %s",
|
|
147
|
+
" ".join(args),
|
|
148
|
+
result.returncode,
|
|
149
|
+
result.stderr.strip(),
|
|
150
|
+
)
|
|
146
151
|
return ""
|
|
147
152
|
return result.stdout.strip()
|
|
148
153
|
except (subprocess.TimeoutExpired, OSError) as exc:
|
|
@@ -307,7 +312,9 @@ def gather_heuristics(repo_path: Path) -> RepoHeuristics:
|
|
|
307
312
|
history, CI presence, test coverage, languages, and activity.
|
|
308
313
|
"""
|
|
309
314
|
total_commits_str = _run_git(["rev-list", "--count", "HEAD"], cwd=repo_path)
|
|
310
|
-
merge_commits_str = _run_git(
|
|
315
|
+
merge_commits_str = _run_git(
|
|
316
|
+
["rev-list", "--merges", "--count", "HEAD"], cwd=repo_path
|
|
317
|
+
)
|
|
311
318
|
contributors_str = _run_git(["shortlog", "-sn", "HEAD"], cwd=repo_path)
|
|
312
319
|
file_list = _run_git(["ls-files"], cwd=repo_path)
|
|
313
320
|
|
|
@@ -354,7 +361,10 @@ def score_repo_heuristic(heuristics: RepoHeuristics) -> AssessmentScore:
|
|
|
354
361
|
has_ci = heuristics.has_ci
|
|
355
362
|
has_fw = len(heuristics.test_frameworks) > 0
|
|
356
363
|
if has_tests and has_ci and has_fw:
|
|
357
|
-
tc_score, tc_reason =
|
|
364
|
+
tc_score, tc_reason = (
|
|
365
|
+
1.0,
|
|
366
|
+
f"Tests + CI + framework ({', '.join(heuristics.test_frameworks)})",
|
|
367
|
+
)
|
|
358
368
|
elif has_tests and (has_ci or has_fw):
|
|
359
369
|
tc_score, tc_reason = 0.7, "Tests present with partial CI/framework support"
|
|
360
370
|
elif has_tests:
|
|
@@ -409,15 +419,29 @@ def score_repo_heuristic(heuristics: RepoHeuristics) -> AssessmentScore:
|
|
|
409
419
|
DimensionScore(name="ci_maturity", score=ci_score, reasoning=ci_reason),
|
|
410
420
|
)
|
|
411
421
|
|
|
412
|
-
#
|
|
413
|
-
|
|
422
|
+
# Weighted average — ci_maturity is a weak signal because CI configs are
|
|
423
|
+
# often absent in shallow clones / Sourcegraph views, and codeprobe
|
|
424
|
+
# validates via mined test.sh scripts, not CI pipelines.
|
|
425
|
+
_WEIGHTS: dict[str, float] = {
|
|
426
|
+
"task_richness": 0.25,
|
|
427
|
+
"test_coverage": 0.25,
|
|
428
|
+
"complexity": 0.20,
|
|
429
|
+
"activity": 0.15,
|
|
430
|
+
"documentation": 0.10,
|
|
431
|
+
"ci_maturity": 0.05,
|
|
432
|
+
}
|
|
433
|
+
overall = sum(d.score * _WEIGHTS[d.name] for d in dimensions)
|
|
414
434
|
|
|
415
435
|
if overall >= 0.7:
|
|
416
436
|
recommendation = "Excellent benchmarking candidate — rich history with tests"
|
|
417
437
|
elif overall >= 0.5:
|
|
418
|
-
recommendation =
|
|
438
|
+
recommendation = (
|
|
439
|
+
"Good candidate — may need more merge history for diverse tasks"
|
|
440
|
+
)
|
|
419
441
|
elif overall >= 0.3:
|
|
420
|
-
recommendation =
|
|
442
|
+
recommendation = (
|
|
443
|
+
"Fair candidate — limited test coverage may reduce task quality"
|
|
444
|
+
)
|
|
421
445
|
else:
|
|
422
446
|
recommendation = "Poor candidate — consider a repo with more history and tests"
|
|
423
447
|
|
|
@@ -458,11 +482,15 @@ def _parse_model_assessment(
|
|
|
458
482
|
score_val = float(item.get("score", 0))
|
|
459
483
|
score_val = max(0.0, min(1.0, score_val))
|
|
460
484
|
reasoning = str(item.get("reasoning", ""))
|
|
461
|
-
dim_by_name[name] = DimensionScore(
|
|
485
|
+
dim_by_name[name] = DimensionScore(
|
|
486
|
+
name=name, score=score_val, reasoning=reasoning
|
|
487
|
+
)
|
|
462
488
|
|
|
463
489
|
missing = set(RUBRIC_V1) - set(dim_by_name)
|
|
464
490
|
if missing:
|
|
465
|
-
raise LLMParseError(
|
|
491
|
+
raise LLMParseError(
|
|
492
|
+
f"Model response missing dimensions: {', '.join(sorted(missing))}"
|
|
493
|
+
)
|
|
466
494
|
|
|
467
495
|
dimensions = tuple(dim_by_name[name] for name in RUBRIC_V1)
|
|
468
496
|
|
|
@@ -498,6 +526,11 @@ def score_repo_with_model(heuristics: RepoHeuristics) -> AssessmentScore:
|
|
|
498
526
|
"You are evaluating a code repository's suitability for AI agent benchmarking.\n\n"
|
|
499
527
|
f"Here are the raw repository statistics:\n{stats_json}\n\n"
|
|
500
528
|
f"Score this repository on each of these dimensions (0.0 to 1.0):\n{rubric_list}\n\n"
|
|
529
|
+
"Weighting guidance for the overall score: task_richness and test_coverage "
|
|
530
|
+
"are the most important (~25% each), followed by complexity (~20%), "
|
|
531
|
+
"activity (~15%), documentation (~10%). ci_maturity should be a minor "
|
|
532
|
+
"signal (~5%) because CI configs are often absent in cloned repos and "
|
|
533
|
+
"codeprobe validates via mined test scripts, not CI pipelines.\n\n"
|
|
501
534
|
"Respond with ONLY valid JSON matching this exact schema:\n"
|
|
502
535
|
"{\n"
|
|
503
536
|
' "overall": <float 0.0-1.0>,\n'
|
|
@@ -111,6 +111,14 @@ def init(path: str) -> None:
|
|
|
111
111
|
default=None,
|
|
112
112
|
help="Apply a named preset: 'quick' (count=3) or 'mcp' (org-scale + MCP families).",
|
|
113
113
|
)
|
|
114
|
+
@click.option(
|
|
115
|
+
"--goal",
|
|
116
|
+
type=click.Choice(
|
|
117
|
+
["quality", "navigation", "mcp", "general"], case_sensitive=False
|
|
118
|
+
),
|
|
119
|
+
default=None,
|
|
120
|
+
help="Eval goal: quality, navigation, mcp, general. Skips interactive goal prompt.",
|
|
121
|
+
)
|
|
114
122
|
@click.option(
|
|
115
123
|
"--profile",
|
|
116
124
|
"profile_name",
|
|
@@ -241,6 +249,7 @@ def mine(
|
|
|
241
249
|
ctx: click.Context,
|
|
242
250
|
path: str,
|
|
243
251
|
preset: str | None,
|
|
252
|
+
goal: str | None,
|
|
244
253
|
profile_name: str | None,
|
|
245
254
|
save_profile_name: str | None,
|
|
246
255
|
list_profiles_flag: bool,
|
|
@@ -381,10 +390,12 @@ def mine(
|
|
|
381
390
|
backends = _prof_val("backends", backends) # type: ignore[assignment]
|
|
382
391
|
interactive = _prof_val("interactive", interactive) # type: ignore[assignment]
|
|
383
392
|
preset = _prof_val("preset", preset) # type: ignore[assignment]
|
|
393
|
+
goal = _prof_val("goal", goal) # type: ignore[assignment]
|
|
384
394
|
|
|
385
395
|
run_mine(
|
|
386
396
|
path,
|
|
387
397
|
preset=preset,
|
|
398
|
+
goal=goal,
|
|
388
399
|
count=count,
|
|
389
400
|
source=source,
|
|
390
401
|
min_files=min_files,
|
|
@@ -463,6 +474,13 @@ def mine(
|
|
|
463
474
|
default=False,
|
|
464
475
|
help="Print the fully-resolved prompt for the first task and exit (no agent spawned).",
|
|
465
476
|
)
|
|
477
|
+
@click.option(
|
|
478
|
+
"--suite",
|
|
479
|
+
"suite_path",
|
|
480
|
+
default=None,
|
|
481
|
+
type=click.Path(exists=True),
|
|
482
|
+
help="Path to a suite.toml manifest to filter tasks by type, difficulty, and tags.",
|
|
483
|
+
)
|
|
466
484
|
@click.pass_context
|
|
467
485
|
def run(
|
|
468
486
|
ctx: click.Context,
|
|
@@ -478,6 +496,7 @@ def run(
|
|
|
478
496
|
timeout: int | None,
|
|
479
497
|
repeats: int | None,
|
|
480
498
|
show_prompt: bool,
|
|
499
|
+
suite_path: str | None,
|
|
481
500
|
) -> None:
|
|
482
501
|
"""Run eval tasks against an AI coding agent.
|
|
483
502
|
|
|
@@ -510,6 +529,7 @@ def run(
|
|
|
510
529
|
force_rich=force_rich,
|
|
511
530
|
timeout=timeout,
|
|
512
531
|
repeats=repeats if repeats is not None else 1,
|
|
532
|
+
suite_path=suite_path,
|
|
513
533
|
)
|
|
514
534
|
|
|
515
535
|
|
|
@@ -690,3 +710,8 @@ main.add_command(preambles)
|
|
|
690
710
|
from codeprobe.cli.doctor_cmd import doctor # noqa: E402
|
|
691
711
|
|
|
692
712
|
main.add_command(doctor)
|
|
713
|
+
|
|
714
|
+
# Register the validate command
|
|
715
|
+
from codeprobe.cli.validate_cmd import validate # noqa: E402
|
|
716
|
+
|
|
717
|
+
main.add_command(validate)
|
|
@@ -83,7 +83,6 @@ def run_checks() -> list[CheckResult]:
|
|
|
83
83
|
_check_tool(
|
|
84
84
|
"codex", "Install OpenAI Codex CLI: https://github.com/openai/codex"
|
|
85
85
|
),
|
|
86
|
-
_check_tool("aider", "Install aider: https://aider.chat/docs/install.html"),
|
|
87
86
|
_check_env_key(
|
|
88
87
|
"ANTHROPIC_API_KEY", "Set ANTHROPIC_API_KEY in your environment."
|
|
89
88
|
),
|