codeprobe 0.3.4__tar.gz → 0.3.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codeprobe-0.3.4 → codeprobe-0.3.6}/PKG-INFO +24 -2
- {codeprobe-0.3.4 → codeprobe-0.3.6}/README.md +23 -1
- {codeprobe-0.3.4 → codeprobe-0.3.6}/pyproject.toml +4 -1
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/__init__.py +1 -1
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/adapters/_base.py +51 -4
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/adapters/claude.py +1 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/adapters/protocol.py +8 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/adapters/telemetry.py +26 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/api.py +11 -2
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/assess/heuristics.py +21 -7
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/cli/__init__.py +148 -40
- codeprobe-0.3.6/src/codeprobe/cli/auth_cmd.py +81 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/cli/mine_cmd.py +360 -57
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/cli/rich_display.py +10 -2
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/cli/run_cmd.py +24 -4
- codeprobe-0.3.6/src/codeprobe/config/redact.py +45 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/core/executor.py +115 -3
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/core/experiment.py +8 -1
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/core/isolation.py +138 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/mining/extractor.py +1 -0
- codeprobe-0.3.6/src/codeprobe/mining/multi_repo.py +499 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/mining/org_scale.py +19 -16
- codeprobe-0.3.6/src/codeprobe/mining/sg_auth.py +318 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/mining/sg_ground_truth.py +103 -28
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/mining/writer.py +130 -6
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/models/experiment.py +15 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/models/task.py +37 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe.egg-info/PKG-INFO +24 -2
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe.egg-info/SOURCES.txt +12 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_adapters.py +173 -9
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_assess.py +90 -11
- codeprobe-0.3.6/tests/test_auth_cmd.py +149 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_cli.py +101 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_executor.py +303 -7
- codeprobe-0.3.6/tests/test_isolation.py +179 -0
- codeprobe-0.3.6/tests/test_mine_cli.py +138 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_mine_goals.py +383 -49
- codeprobe-0.3.6/tests/test_mine_presets.py +309 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_mining.py +142 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_models.py +10 -0
- codeprobe-0.3.6/tests/test_multi_repo_e2e.py +465 -0
- codeprobe-0.3.6/tests/test_multi_repo_mining.py +225 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_org_scale.py +134 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_pipeline_integration.py +6 -6
- codeprobe-0.3.6/tests/test_secret_redaction.py +234 -0
- codeprobe-0.3.6/tests/test_sg_auth.py +307 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_sg_ground_truth.py +141 -29
- codeprobe-0.3.6/tests/test_suite_manifest.py +142 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_telemetry.py +36 -0
- codeprobe-0.3.4/tests/test_mine_presets.py +0 -163
- {codeprobe-0.3.4 → codeprobe-0.3.6}/LICENSE +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/setup.cfg +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/__main__.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/adapters/__init__.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/adapters/codex.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/adapters/copilot.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/adapters/openai_compat.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/adapters/session.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/analysis/__init__.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/analysis/ranking.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/analysis/report.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/analysis/stats.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/assess/__init__.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/cli/assess_cmd.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/cli/doctor_cmd.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/cli/experiment_cmd.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/cli/init_cmd.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/cli/interpret_cmd.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/cli/json_display.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/cli/preamble_cmd.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/cli/probe_cmd.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/cli/ratings_cmd.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/cli/scaffold_cmd.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/cli/validate_cmd.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/cli/wizard.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/cli/yaml_writer.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/config/__init__.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/config/loader.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/contrib/__init__.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/contrib/_shared.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/contrib/adaptive.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/contrib/counterfactual.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/contrib/debate.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/contrib/decision_tree.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/contrib/elo.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/contrib/fingerprint.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/contrib/mutation.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/contrib/pareto.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/contrib/sprt.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/contrib/tournament.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/core/__init__.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/core/__main__.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/core/checkpoint.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/core/events.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/core/llm.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/core/mcp_discovery.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/core/preamble.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/core/registry.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/core/sandbox.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/core/scoring.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/loaders/__init__.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/loaders/suite.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/mining/__init__.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/mining/_graph.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/mining/_lang.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/mining/comprehension.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/mining/comprehension_writer.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/mining/curator.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/mining/curator_backends.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/mining/curator_tiers.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/mining/org_scale_families.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/mining/org_scale_oracle.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/mining/org_scale_scanner.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/mining/org_scale_validate.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/mining/sources.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/models/__init__.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/models/evalrc.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/models/preamble.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/models/suite.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/preambles/__init__.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/preambles/github.md +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/preambles/sourcegraph.md +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/probe/__init__.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/probe/adapter.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/probe/generator.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/probe/writer.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/ratings/__init__.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/ratings/collector.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/scaffold/__init__.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/scaffold/writer.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/templates/__init__.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/templates/evalrc-mcp-comparison.yaml +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/templates/evalrc-model-comparison.yaml +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe/templates/evalrc-prompt-comparison.yaml +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe.egg-info/dependency_links.txt +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe.egg-info/entry_points.txt +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe.egg-info/requires.txt +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/src/codeprobe.egg-info/top_level.txt +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_adapter_contracts.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_analysis.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_api.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_artifact_scorer.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_changed_symbols.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_checkpoint.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_checkpoint_scoring.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_comprehension.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_config_loader.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_contrib.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_ctrlc_integration.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_curator_backends.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_curator_core.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_curator_integration.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_curator_tiers.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_doctor_cmd.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_events.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_executor_events.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_experiment_cmd.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_experiment_core.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_init_wizard.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_json_display.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_llm.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_loaders.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_mcp_families_mining.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_mcp_validate.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_mine_profiles.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_new_families.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_openai_compat.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_oracle_types.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_preamble.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_preamble_cmd.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_probe.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_probe_adapter.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_ratings.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_ratings_cmd.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_registry.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_run_config_resolution.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_scaffold.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_scanner_refactor.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_scoring.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_session.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_shell_shim.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_show_prompt.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_suite.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_validate_cmd.py +0 -0
- {codeprobe-0.3.4 → codeprobe-0.3.6}/tests/test_weighted_f1.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codeprobe
|
|
3
|
-
Version: 0.3.4
|
|
3
|
+
Version: 0.3.6
|
|
4
4
|
Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
|
|
5
5
|
Author: codeprobe contributors
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -42,7 +42,7 @@ Mine real tasks from your repo history, run agents against them, and find out wh
|
|
|
42
42
|
|
|
43
43
|
## Why codeprobe?
|
|
44
44
|
|
|
45
|
-
Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data, and as general public benchmarks likely don't capture what is most important to your unique
|
|
45
|
+
Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data, and as general public benchmarks likely don't capture what is most important to your unique workflows. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate. You can also point the tool at any public repo to mine tasks from.
|
|
46
46
|
|
|
47
47
|
## Prerequisites
|
|
48
48
|
|
|
@@ -122,6 +122,28 @@ codeprobe probe . -n 10 -l python -s 42 -o ./probes
|
|
|
122
122
|
|
|
123
123
|
Generates four probe types: find-function, count-callers, return-type, module-dependency.
|
|
124
124
|
|
|
125
|
+
## Curation Workflows
|
|
126
|
+
|
|
127
|
+
End-to-end flows from a raw repo to ranked agent results. Each workflow covers the full `assess → mine → validate → run → interpret` pipeline.
|
|
128
|
+
|
|
129
|
+
| Workflow | When to use | Guide |
|
|
130
|
+
| -------------- | ----------------------------------------- | ------------------------------------------------------------ |
|
|
131
|
+
| **Standard** | Repo has merged PRs/MRs | [docs/workflows/standard.md](docs/workflows/standard.md) |
|
|
132
|
+
| **Cold-start** | New repo, squashed history, vendored code | [docs/workflows/cold-start.md](docs/workflows/cold-start.md) |
|
|
133
|
+
| **Cross-repo** | Tasks spanning multiple repositories | [docs/workflows/cross-repo.md](docs/workflows/cross-repo.md) |
|
|
134
|
+
|
|
135
|
+
**Quick start (standard path):**
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
codeprobe assess /path/to/repo
|
|
139
|
+
codeprobe mine /path/to/repo --goal quality --count 10 --no-interactive
|
|
140
|
+
codeprobe validate /path/to/repo/.codeprobe/tasks/<task-id>
|
|
141
|
+
codeprobe run /path/to/repo --agent claude --max-cost-usd 5.00
|
|
142
|
+
codeprobe interpret /path/to/repo
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
For the full MCP comparison setup (preambles, baseline vs with-MCP configs), see the next section.
|
|
146
|
+
|
|
125
147
|
## MCP Comparison Experiments
|
|
126
148
|
|
|
127
149
|
Compare agent performance with and without MCP tools (Sourcegraph, GitHub, etc.).
|
|
@@ -6,7 +6,7 @@ Mine real tasks from your repo history, run agents against them, and find out wh
|
|
|
6
6
|
|
|
7
7
|
## Why codeprobe?
|
|
8
8
|
|
|
9
|
-
Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data, and as general public benchmarks likely don't capture what is most important to your unique
|
|
9
|
+
Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data, and as general public benchmarks likely don't capture what is most important to your unique workflows. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate. You can also point the tool at any public repo to mine tasks from.
|
|
10
10
|
|
|
11
11
|
## Prerequisites
|
|
12
12
|
|
|
@@ -86,6 +86,28 @@ codeprobe probe . -n 10 -l python -s 42 -o ./probes
|
|
|
86
86
|
|
|
87
87
|
Generates four probe types: find-function, count-callers, return-type, module-dependency.
|
|
88
88
|
|
|
89
|
+
## Curation Workflows
|
|
90
|
+
|
|
91
|
+
End-to-end flows from a raw repo to ranked agent results. Each workflow covers the full `assess → mine → validate → run → interpret` pipeline.
|
|
92
|
+
|
|
93
|
+
| Workflow | When to use | Guide |
|
|
94
|
+
| -------------- | ----------------------------------------- | ------------------------------------------------------------ |
|
|
95
|
+
| **Standard** | Repo has merged PRs/MRs | [docs/workflows/standard.md](docs/workflows/standard.md) |
|
|
96
|
+
| **Cold-start** | New repo, squashed history, vendored code | [docs/workflows/cold-start.md](docs/workflows/cold-start.md) |
|
|
97
|
+
| **Cross-repo** | Tasks spanning multiple repositories | [docs/workflows/cross-repo.md](docs/workflows/cross-repo.md) |
|
|
98
|
+
|
|
99
|
+
**Quick start (standard path):**
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
codeprobe assess /path/to/repo
|
|
103
|
+
codeprobe mine /path/to/repo --goal quality --count 10 --no-interactive
|
|
104
|
+
codeprobe validate /path/to/repo/.codeprobe/tasks/<task-id>
|
|
105
|
+
codeprobe run /path/to/repo --agent claude --max-cost-usd 5.00
|
|
106
|
+
codeprobe interpret /path/to/repo
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
For the full MCP comparison setup (preambles, baseline vs with-MCP configs), see the next section.
|
|
110
|
+
|
|
89
111
|
## MCP Comparison Experiments
|
|
90
112
|
|
|
91
113
|
Compare agent performance with and without MCP tools (Sourcegraph, GitHub, etc.).
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "codeprobe"
|
|
3
|
-
version = "0.3.4"
|
|
3
|
+
version = "0.3.6"
|
|
4
4
|
description = "Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = "Apache-2.0"
|
|
@@ -75,6 +75,9 @@ where = ["src"]
|
|
|
75
75
|
|
|
76
76
|
[tool.pytest.ini_options]
|
|
77
77
|
testpaths = ["tests"]
|
|
78
|
+
markers = [
|
|
79
|
+
"integration: requires external services (skipped by default in CI)",
|
|
80
|
+
]
|
|
78
81
|
|
|
79
82
|
[tool.mypy]
|
|
80
83
|
python_version = "3.11"
|
|
@@ -63,6 +63,19 @@ def _adapter_safe_env(extra: dict[str, str] | None = None) -> dict[str, str]:
|
|
|
63
63
|
return env
|
|
64
64
|
|
|
65
65
|
|
|
66
|
+
def _decode_timeout_output(raw: str | bytes | None) -> str:
|
|
67
|
+
"""Decode stdout/stderr from a TimeoutExpired exception.
|
|
68
|
+
|
|
69
|
+
The exception may carry ``str``, ``bytes``, or ``None`` depending on
|
|
70
|
+
how ``subprocess.run`` was called and how the process was killed.
|
|
71
|
+
"""
|
|
72
|
+
if raw is None:
|
|
73
|
+
return ""
|
|
74
|
+
if isinstance(raw, bytes):
|
|
75
|
+
return raw.decode("utf-8", errors="replace")
|
|
76
|
+
return raw
|
|
77
|
+
|
|
78
|
+
|
|
66
79
|
class BaseAdapter:
|
|
67
80
|
"""Base class for CLI-based agent adapters.
|
|
68
81
|
|
|
@@ -158,16 +171,50 @@ class BaseAdapter:
|
|
|
158
171
|
text=True,
|
|
159
172
|
timeout=config.timeout_seconds,
|
|
160
173
|
cwd=config.cwd,
|
|
161
|
-
env=_adapter_safe_env(session_env),
|
|
174
|
+
env=_adapter_safe_env(session_env) if session_env else None,
|
|
162
175
|
)
|
|
163
176
|
except subprocess.TimeoutExpired as exc:
|
|
164
177
|
duration = time.monotonic() - start
|
|
178
|
+
timeout_error = f"Agent timed out after {config.timeout_seconds}s"
|
|
179
|
+
|
|
180
|
+
raw_stdout = _decode_timeout_output(exc.stdout)
|
|
181
|
+
raw_stderr = _decode_timeout_output(exc.stderr) or None
|
|
182
|
+
|
|
183
|
+
if raw_stdout:
|
|
184
|
+
try:
|
|
185
|
+
partial_result = subprocess.CompletedProcess(
|
|
186
|
+
args=cmd,
|
|
187
|
+
returncode=-1,
|
|
188
|
+
stdout=raw_stdout,
|
|
189
|
+
stderr=raw_stderr or "",
|
|
190
|
+
)
|
|
191
|
+
parsed = self.parse_output(partial_result, duration)
|
|
192
|
+
merged_error = timeout_error
|
|
193
|
+
if parsed.error:
|
|
194
|
+
merged_error = f"{timeout_error}; {parsed.error}"
|
|
195
|
+
return AgentOutput(
|
|
196
|
+
stdout=parsed.stdout,
|
|
197
|
+
stderr=parsed.stderr,
|
|
198
|
+
exit_code=-1,
|
|
199
|
+
duration_seconds=duration,
|
|
200
|
+
input_tokens=parsed.input_tokens,
|
|
201
|
+
output_tokens=parsed.output_tokens,
|
|
202
|
+
cache_read_tokens=parsed.cache_read_tokens,
|
|
203
|
+
cost_usd=parsed.cost_usd,
|
|
204
|
+
cost_model=parsed.cost_model,
|
|
205
|
+
cost_source=parsed.cost_source,
|
|
206
|
+
error=merged_error,
|
|
207
|
+
tool_call_count=parsed.tool_call_count,
|
|
208
|
+
)
|
|
209
|
+
except Exception as parse_exc:
|
|
210
|
+
timeout_error = f"{timeout_error}; parse_output failed: {parse_exc}"
|
|
211
|
+
|
|
165
212
|
return AgentOutput(
|
|
166
|
-
stdout=
|
|
167
|
-
stderr=
|
|
213
|
+
stdout=raw_stdout,
|
|
214
|
+
stderr=raw_stderr,
|
|
168
215
|
exit_code=-1,
|
|
169
216
|
duration_seconds=duration,
|
|
170
|
-
error=
|
|
217
|
+
error=timeout_error,
|
|
171
218
|
)
|
|
172
219
|
except FileNotFoundError as exc:
|
|
173
220
|
raise AdapterSetupError(f"Binary not found at runtime: {exc}") from exc
|
|
@@ -44,6 +44,7 @@ class AgentOutput:
|
|
|
44
44
|
cost_model: str = "unknown"
|
|
45
45
|
error: str | None = None
|
|
46
46
|
cost_source: str = "unavailable"
|
|
47
|
+
tool_call_count: int | None = None
|
|
47
48
|
|
|
48
49
|
def __post_init__(self) -> None:
|
|
49
50
|
if self.cost_model not in ALLOWED_COST_MODELS:
|
|
@@ -81,6 +82,13 @@ class AgentAdapter(Protocol):
|
|
|
81
82
|
|
|
82
83
|
[project.entry-points."codeprobe.agents"]
|
|
83
84
|
myagent = "my_package:MyAgentAdapter"
|
|
85
|
+
|
|
86
|
+
For cross-repo tasks, the executor may lay out additional
|
|
87
|
+
repositories under ``<workspace>/repos/<name>``, each pinned to its
|
|
88
|
+
own pre-merge commit. Adapters don't need special handling — the
|
|
89
|
+
paths are available for the model to navigate, and the primary
|
|
90
|
+
workspace remains at its existing location for backwards
|
|
91
|
+
compatibility with single-repo tasks.
|
|
84
92
|
"""
|
|
85
93
|
|
|
86
94
|
@property
|
|
@@ -65,6 +65,7 @@ class UsageData:
|
|
|
65
65
|
cost_model: str = "unknown"
|
|
66
66
|
cost_source: str = "unavailable"
|
|
67
67
|
error: str | None = None
|
|
68
|
+
tool_call_count: int | None = None
|
|
68
69
|
|
|
69
70
|
def __post_init__(self) -> None:
|
|
70
71
|
if self.cost_model not in ALLOWED_COST_MODELS:
|
|
@@ -86,6 +87,28 @@ class TelemetryCollector(Protocol):
|
|
|
86
87
|
def collect(self, raw_output: str, **context: Any) -> UsageData: ...
|
|
87
88
|
|
|
88
89
|
|
|
90
|
+
def _count_tool_use_blocks(envelope: dict[str, Any]) -> int | None:
|
|
91
|
+
"""Count ``tool_use`` content blocks in a Claude CLI JSON envelope.
|
|
92
|
+
|
|
93
|
+
Iterates the ``messages`` array (when present) and counts content
|
|
94
|
+
blocks with ``type == "tool_use"`` in assistant messages.
|
|
95
|
+
Returns ``None`` when the envelope has no ``messages`` key.
|
|
96
|
+
"""
|
|
97
|
+
messages = envelope.get("messages")
|
|
98
|
+
if messages is None:
|
|
99
|
+
return None
|
|
100
|
+
|
|
101
|
+
count = 0
|
|
102
|
+
for msg in messages:
|
|
103
|
+
content = msg.get("content")
|
|
104
|
+
if not isinstance(content, list):
|
|
105
|
+
continue
|
|
106
|
+
for block in content:
|
|
107
|
+
if isinstance(block, dict) and block.get("type") == "tool_use":
|
|
108
|
+
count += 1
|
|
109
|
+
return count
|
|
110
|
+
|
|
111
|
+
|
|
89
112
|
class JsonStdoutCollector:
|
|
90
113
|
"""Extract telemetry from Claude CLI JSON envelope on stdout.
|
|
91
114
|
|
|
@@ -125,6 +148,8 @@ class JsonStdoutCollector:
|
|
|
125
148
|
cost_model = "unknown"
|
|
126
149
|
cost_source = "unavailable"
|
|
127
150
|
|
|
151
|
+
tool_call_count = _count_tool_use_blocks(envelope)
|
|
152
|
+
|
|
128
153
|
return UsageData(
|
|
129
154
|
input_tokens=input_tokens,
|
|
130
155
|
output_tokens=output_tokens,
|
|
@@ -132,6 +157,7 @@ class JsonStdoutCollector:
|
|
|
132
157
|
cost_usd=cost_usd_raw,
|
|
133
158
|
cost_model=cost_model,
|
|
134
159
|
cost_source=cost_source,
|
|
160
|
+
tool_call_count=tool_call_count,
|
|
135
161
|
)
|
|
136
162
|
|
|
137
163
|
|
|
@@ -185,8 +185,17 @@ def run_experiment(
|
|
|
185
185
|
|
|
186
186
|
save_config_results(experiment_dir, exp_config.label, results)
|
|
187
187
|
|
|
188
|
-
|
|
189
|
-
|
|
188
|
+
scoring = sum(1 for r in results if r.automated_score > 0.0)
|
|
189
|
+
mean = (
|
|
190
|
+
sum(r.automated_score for r in results) / len(results) if results else 0.0
|
|
191
|
+
)
|
|
192
|
+
logger.info(
|
|
193
|
+
"[%s] %d/%d scored (mean=%.2f)",
|
|
194
|
+
exp_config.label,
|
|
195
|
+
scoring,
|
|
196
|
+
len(results),
|
|
197
|
+
mean,
|
|
198
|
+
)
|
|
190
199
|
|
|
191
200
|
all_config_results.append(
|
|
192
201
|
ConfigResults(config=exp_config.label, completed=results)
|
|
@@ -69,6 +69,16 @@ _TEST_GLOBS: list[str] = [
|
|
|
69
69
|
"*.spec.js",
|
|
70
70
|
]
|
|
71
71
|
|
|
72
|
+
# Recursive variants for repos with nested test layouts (e.g. numpy/_core/tests/).
|
|
73
|
+
_RECURSIVE_TEST_DIR_GLOBS: list[str] = [
|
|
74
|
+
"**/tests/**",
|
|
75
|
+
"**/test/**",
|
|
76
|
+
"**/spec/**",
|
|
77
|
+
"**/__tests__/**",
|
|
78
|
+
]
|
|
79
|
+
|
|
80
|
+
_RECURSIVE_TEST_FILE_GLOBS: list[str] = [f"**/{p}" for p in _TEST_GLOBS]
|
|
81
|
+
|
|
72
82
|
# ---------------------------------------------------------------------------
|
|
73
83
|
# Fixed rubric — model scores against these, doesn't invent them
|
|
74
84
|
# ---------------------------------------------------------------------------
|
|
@@ -217,16 +227,20 @@ def _detect_primary_languages(file_list: str) -> list[str]:
|
|
|
217
227
|
|
|
218
228
|
|
|
219
229
|
def _has_tests(repo_path: Path) -> bool:
|
|
220
|
-
"""Check whether the repo appears to contain tests.
|
|
230
|
+
"""Check whether the repo appears to contain tests.
|
|
231
|
+
|
|
232
|
+
Checks top-level test directories first, then falls back to recursive
|
|
233
|
+
git ls-files glob patterns to catch repos with nested test layouts
|
|
234
|
+
(e.g. numpy/_core/tests/, numpy/tests/).
|
|
235
|
+
"""
|
|
236
|
+
# Fast path: top-level test directories
|
|
221
237
|
for d in _TEST_DIRS:
|
|
222
238
|
if (repo_path / d).is_dir():
|
|
223
239
|
return True
|
|
224
|
-
#
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
return True
|
|
229
|
-
return False
|
|
240
|
+
# Single git ls-files call with all patterns (top-level + recursive)
|
|
241
|
+
all_patterns = _TEST_GLOBS + _RECURSIVE_TEST_DIR_GLOBS + _RECURSIVE_TEST_FILE_GLOBS
|
|
242
|
+
out = _run_git(["ls-files", "--", *all_patterns], cwd=repo_path)
|
|
243
|
+
return bool(out)
|
|
230
244
|
|
|
231
245
|
|
|
232
246
|
def _has_ci(repo_path: Path) -> bool:
|