codeprobe 0.5.2__tar.gz → 0.5.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codeprobe-0.5.2 → codeprobe-0.5.4}/PKG-INFO +1 -1
- {codeprobe-0.5.2 → codeprobe-0.5.4}/pyproject.toml +1 -1
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/claude.py +52 -4
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/protocol.py +14 -1
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/telemetry.py +86 -5
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/analysis/stats.py +25 -1
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/api.py +2 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/__init__.py +28 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/experiment_cmd.py +4 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/run_cmd.py +2 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/executor.py +1 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/experiment.py +4 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/models/experiment.py +16 -1
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe.egg-info/PKG-INFO +1 -1
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_adapters.py +162 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_stats.py +57 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/LICENSE +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/README.md +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/setup.cfg +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/__init__.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/__main__.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/acceptance_compiler.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/__init__.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/_base.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/codex.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/copilot.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/openai_compat.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/adapters/session.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/analysis/__init__.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/analysis/dual.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/analysis/ranking.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/analysis/report.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/assess/__init__.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/assess/heuristics.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/assess/oracle_diff.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/assess_cmd.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/auth_cmd.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/doctor_cmd.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/init_cmd.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/interpret_cmd.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/json_display.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/mine_cmd.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/preamble_cmd.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/probe_cmd.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/ratings_cmd.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/rich_display.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/scaffold_cmd.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/validate_cmd.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/wizard.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/cli/yaml_writer.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/config/__init__.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/config/loader.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/config/redact.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/__init__.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/_shared.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/adaptive.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/counterfactual.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/debate.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/decision_tree.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/elo.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/fingerprint.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/mutation.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/pareto.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/sprt.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/contrib/tournament.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/__init__.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/__main__.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/checkpoint.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/events.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/isolation.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/llm.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/mcp_discovery.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/preamble.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/registry.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/repo_hygiene.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/sandbox.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/core/scoring.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/loaders/__init__.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/loaders/suite.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/__init__.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/_graph.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/_lang.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/comprehension.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/comprehension_writer.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/curator.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/curator_backends.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/curator_tiers.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/extractor.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/multi_repo.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/org_scale.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/org_scale_families.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/org_scale_oracle.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/org_scale_scanner.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/org_scale_validate.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/sg_auth.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/sg_ground_truth.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/sources.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/task_types.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/mining/writer.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/models/__init__.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/models/evalrc.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/models/preamble.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/models/suite.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/models/task.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/preambles/__init__.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/preambles/github.md +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/preambles/sourcegraph.md +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/probe/__init__.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/probe/adapter.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/probe/generator.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/probe/writer.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/ratings/__init__.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/ratings/collector.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/scaffold/__init__.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/scaffold/writer.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/templates/__init__.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/templates/evalrc-mcp-comparison.yaml +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/templates/evalrc-model-comparison.yaml +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe/templates/evalrc-prompt-comparison.yaml +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe.egg-info/SOURCES.txt +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe.egg-info/dependency_links.txt +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe.egg-info/entry_points.txt +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe.egg-info/requires.txt +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/src/codeprobe.egg-info/top_level.txt +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_acceptance_compiler.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_acceptance_compiler_integration.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_adapter_contracts.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_analysis.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_api.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_artifact_scorer.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_assess.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_auth_cmd.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_changed_symbols.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_checkpoint.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_checkpoint_scoring.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_cli.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_comprehension.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_config_loader.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_contrib.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_convergence.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_criteria_loader.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_ctrlc_integration.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_curator_backends.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_curator_core.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_curator_integration.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_curator_tiers.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_doctor_cmd.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_dual_adversarial_fixes.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_dual_composite.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_dual_e2e.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_dual_matrix.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_dual_scorer.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_dual_scoring_details.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_events.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_examples_dual.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_executor.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_executor_dual_isolation.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_executor_events.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_experiment_cmd.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_experiment_core.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_ground_truth_schema.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_init_wizard.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_isolation.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_json_display.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_listeners_dual.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_llm.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_loader.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_loaders.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_loaders_dual.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_mcp_families_mining.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_mcp_validate.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_mine_cli.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_mine_goals.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_mine_presets.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_mine_profiles.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_mining.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_mining_dual.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_models.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_multi_repo_e2e.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_multi_repo_mining.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_new_families.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_openai_compat.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_oracle_diff.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_oracle_registry.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_oracle_types.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_org_scale.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_pipeline_integration.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_preamble.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_preamble_cmd.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_probe.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_probe_adapter.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_ratings.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_ratings_cmd.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_registry.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_regression_gate.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_release_gate.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_repo_hygiene.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_report_dual.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_run_config_resolution.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_safe_leg_score.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_scaffold.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_scaffold_upgrade.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_scanner_refactor.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_score_result.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_scoring.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_scoring_extended.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_scoring_v2.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_sdlc_ground_truth.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_secret_redaction.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_session.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_sg_auth.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_sg_ground_truth.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_shell_shim.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_show_prompt.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_suite.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_suite_manifest.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_task_model.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_task_types.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_telemetry.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_validate_cmd.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_validate_dual.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_verifier.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_verify.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_weighted_checklist.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_weighted_f1.py +0 -0
- {codeprobe-0.5.2 → codeprobe-0.5.4}/tests/test_writer_dual.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codeprobe
|
|
3
|
-
Version: 0.5.2
|
|
3
|
+
Version: 0.5.4
|
|
4
4
|
Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
|
|
5
5
|
Author: codeprobe contributors
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -243,7 +243,13 @@ class ClaudeAdapter(BaseAdapter):
|
|
|
243
243
|
|
|
244
244
|
def build_command(self, prompt: str, config: AgentConfig) -> list[str]:
|
|
245
245
|
binary = self._require_binary()
|
|
246
|
-
|
|
246
|
+
# stream-json + --verbose emits newline-delimited events including
|
|
247
|
+
# every assistant message (with tool_use content blocks) and ends
|
|
248
|
+
# with a ``type: "result"`` event mirroring the ``json`` envelope.
|
|
249
|
+
# This is what gives us accurate per-run tool_call_count and
|
|
250
|
+
# per-tool observability; the collector reconstructs the envelope
|
|
251
|
+
# from the terminal event.
|
|
252
|
+
cmd = [binary, "-p", prompt, "--output-format", "stream-json", "--verbose"]
|
|
247
253
|
|
|
248
254
|
if config.model:
|
|
249
255
|
cmd.extend(["--model", _normalize_model_for_cli(config.model)])
|
|
@@ -262,6 +268,27 @@ class ClaudeAdapter(BaseAdapter):
|
|
|
262
268
|
if mcp_path:
|
|
263
269
|
cmd.extend(["--mcp-config", mcp_path, "--strict-mcp-config"])
|
|
264
270
|
|
|
271
|
+
# Tool restrictions. Claude CLI has three related flags:
|
|
272
|
+
# --tools "" disables all built-in tools
|
|
273
|
+
# --allowedTools X,Y auto-approves these tools (no permission
|
|
274
|
+
# prompt); names may include MCP tools as
|
|
275
|
+
# ``mcp__<server>__<tool>``
|
|
276
|
+
# --disallowedTools X,Y blocks these tools outright
|
|
277
|
+
# We treat ``allowed_tools`` as a whitelist: when set, built-ins
|
|
278
|
+
# are disabled (``--tools ""``) and listed names are auto-approved
|
|
279
|
+
# (``--allowedTools``). This yields true MCP-only runs when the
|
|
280
|
+
# whitelist contains only ``mcp__*`` names — verified against
|
|
281
|
+
# claude 2.1.x: without auto-approval the agent hits permission
|
|
282
|
+
# prompts and ends the turn early.
|
|
283
|
+
if config.allowed_tools is not None:
|
|
284
|
+
cmd.extend(["--tools", ""])
|
|
285
|
+
if config.allowed_tools:
|
|
286
|
+
cmd.extend(["--allowedTools", ",".join(config.allowed_tools)])
|
|
287
|
+
if config.disallowed_tools:
|
|
288
|
+
cmd.extend(
|
|
289
|
+
["--disallowedTools", ",".join(config.disallowed_tools)]
|
|
290
|
+
)
|
|
291
|
+
|
|
265
292
|
return cmd
|
|
266
293
|
|
|
267
294
|
def isolate_session(self, slot_id: int) -> dict[str, str]:
|
|
@@ -290,15 +317,35 @@ class ClaudeAdapter(BaseAdapter):
|
|
|
290
317
|
return {}
|
|
291
318
|
|
|
292
319
|
def parse_output(self, result: subprocess.CompletedProcess[str], duration: float) -> AgentOutput:
|
|
293
|
-
"""Parse Claude CLI JSON envelope into AgentOutput."""
|
|
320
|
+
"""Parse Claude CLI JSON envelope into AgentOutput.
|
|
321
|
+
|
|
322
|
+
Handles both ``--output-format json`` (single envelope) and
|
|
323
|
+
``--output-format stream-json --verbose`` (newline-delimited
|
|
324
|
+
events) — the collector auto-detects. When parsing a stream, the
|
|
325
|
+
final ``type: "result"`` event carries the same fields as the
|
|
326
|
+
single-envelope shape, so we reconstruct ``result`` text from it.
|
|
327
|
+
"""
|
|
294
328
|
usage = self._collector.collect(result.stdout)
|
|
295
329
|
|
|
296
|
-
# Extract content text
|
|
330
|
+
# Extract content text. For stream-json, the terminal result event
|
|
331
|
+
# has a ``result`` field; iterate events to find it. For single
|
|
332
|
+
# envelope, json.loads works directly.
|
|
333
|
+
stdout_text = result.stdout
|
|
297
334
|
try:
|
|
298
335
|
envelope = json.loads(result.stdout)
|
|
299
336
|
stdout_text = envelope.get("result", result.stdout)
|
|
300
337
|
except (json.JSONDecodeError, ValueError):
|
|
301
|
-
|
|
338
|
+
for line in reversed(result.stdout.splitlines()):
|
|
339
|
+
line = line.strip()
|
|
340
|
+
if not line:
|
|
341
|
+
continue
|
|
342
|
+
try:
|
|
343
|
+
ev = json.loads(line)
|
|
344
|
+
except (json.JSONDecodeError, ValueError):
|
|
345
|
+
continue
|
|
346
|
+
if isinstance(ev, dict) and ev.get("type") == "result":
|
|
347
|
+
stdout_text = ev.get("result", result.stdout)
|
|
348
|
+
break
|
|
302
349
|
|
|
303
350
|
return AgentOutput(
|
|
304
351
|
stdout=stdout_text,
|
|
@@ -313,4 +360,5 @@ class ClaudeAdapter(BaseAdapter):
|
|
|
313
360
|
cost_source=usage.cost_source,
|
|
314
361
|
error=usage.error,
|
|
315
362
|
tool_call_count=usage.tool_call_count,
|
|
363
|
+
tool_use_by_name=usage.tool_use_by_name,
|
|
316
364
|
)
|
|
@@ -45,6 +45,9 @@ class AgentOutput:
|
|
|
45
45
|
error: str | None = None
|
|
46
46
|
cost_source: str = "unavailable"
|
|
47
47
|
tool_call_count: int | None = None
|
|
48
|
+
# Per-tool usage counts (e.g. {"Read": 5, "mcp__sourcegraph__...": 2}).
|
|
49
|
+
# None when the adapter couldn't capture a streaming transcript.
|
|
50
|
+
tool_use_by_name: dict[str, int] | None = None
|
|
48
51
|
|
|
49
52
|
def __post_init__(self) -> None:
|
|
50
53
|
if self.cost_model not in ALLOWED_COST_MODELS:
|
|
@@ -63,12 +66,22 @@ class AgentOutput:
|
|
|
63
66
|
|
|
64
67
|
@dataclass(frozen=True)
|
|
65
68
|
class AgentConfig:
|
|
66
|
-
"""Configuration passed to an agent adapter."""
|
|
69
|
+
"""Configuration passed to an agent adapter.
|
|
70
|
+
|
|
71
|
+
``allowed_tools`` / ``disallowed_tools`` restrict which tools the agent
|
|
72
|
+
may call. When both are ``None`` the adapter uses its default tool set.
|
|
73
|
+
When ``allowed_tools`` is an empty list, the adapter disables all
|
|
74
|
+
built-in tools (useful for MCP-only experiments: MCP tools are still
|
|
75
|
+
available because they come from ``mcp_config``, but no built-in
|
|
76
|
+
``Read``/``Grep``/``Bash``/etc. are).
|
|
77
|
+
"""
|
|
67
78
|
|
|
68
79
|
model: str | None = None
|
|
69
80
|
permission_mode: str = "default"
|
|
70
81
|
timeout_seconds: int = 3600
|
|
71
82
|
mcp_config: dict | None = None
|
|
83
|
+
allowed_tools: list[str] | None = None
|
|
84
|
+
disallowed_tools: list[str] | None = None
|
|
72
85
|
extra: dict | None = None
|
|
73
86
|
cwd: str | None = None
|
|
74
87
|
|
|
@@ -66,6 +66,11 @@ class UsageData:
|
|
|
66
66
|
cost_source: str = "unavailable"
|
|
67
67
|
error: str | None = None
|
|
68
68
|
tool_call_count: int | None = None
|
|
69
|
+
# Tool-use counts broken down by tool name (e.g. ``{"Read": 5,
|
|
70
|
+
# "mcp__sourcegraph__keyword_search": 2}``). Populated only when the
|
|
71
|
+
# adapter captured a streaming transcript. None means "not captured",
|
|
72
|
+
# not "no tool calls".
|
|
73
|
+
tool_use_by_name: dict[str, int] | None = None
|
|
69
74
|
|
|
70
75
|
def __post_init__(self) -> None:
|
|
71
76
|
if self.cost_model not in ALLOWED_COST_MODELS:
|
|
@@ -145,6 +150,45 @@ def _count_tool_use_blocks(envelope: dict[str, Any]) -> int | None:
|
|
|
145
150
|
return count
|
|
146
151
|
|
|
147
152
|
|
|
153
|
+
def _parse_stream_json(raw_output: str) -> tuple[dict[str, Any] | None, int, dict[str, int]]:
|
|
154
|
+
"""Parse a ``--output-format stream-json --verbose`` transcript.
|
|
155
|
+
|
|
156
|
+
Returns ``(result_event, tool_use_count, tool_use_by_name)``.
|
|
157
|
+
``result_event`` is the final ``type: "result"`` event (same shape as
|
|
158
|
+
``--output-format json`` envelope), or None when the stream is
|
|
159
|
+
malformed or has no terminal event. ``tool_use_by_name`` aggregates
|
|
160
|
+
tool-use block counts by tool name (including MCP tools, which appear
|
|
161
|
+
as ``mcp__<server>__<tool>``), useful for observability.
|
|
162
|
+
"""
|
|
163
|
+
result_event: dict[str, Any] | None = None
|
|
164
|
+
tool_use_count = 0
|
|
165
|
+
by_name: dict[str, int] = {}
|
|
166
|
+
for line in raw_output.splitlines():
|
|
167
|
+
line = line.strip()
|
|
168
|
+
if not line:
|
|
169
|
+
continue
|
|
170
|
+
try:
|
|
171
|
+
ev = json.loads(line)
|
|
172
|
+
except (json.JSONDecodeError, ValueError):
|
|
173
|
+
continue
|
|
174
|
+
if not isinstance(ev, dict):
|
|
175
|
+
continue
|
|
176
|
+
if ev.get("type") == "assistant":
|
|
177
|
+
msg = ev.get("message")
|
|
178
|
+
if isinstance(msg, dict):
|
|
179
|
+
for block in msg.get("content", []) or []:
|
|
180
|
+
if not isinstance(block, dict):
|
|
181
|
+
continue
|
|
182
|
+
if block.get("type") == "tool_use":
|
|
183
|
+
tool_use_count += 1
|
|
184
|
+
name = block.get("name", "")
|
|
185
|
+
if isinstance(name, str) and name:
|
|
186
|
+
by_name[name] = by_name.get(name, 0) + 1
|
|
187
|
+
if ev.get("type") == "result":
|
|
188
|
+
result_event = ev
|
|
189
|
+
return result_event, tool_use_count, by_name
|
|
190
|
+
|
|
191
|
+
|
|
148
192
|
class JsonStdoutCollector:
|
|
149
193
|
"""Extract telemetry from Claude CLI JSON envelope on stdout.
|
|
150
194
|
|
|
@@ -162,10 +206,40 @@ class JsonStdoutCollector:
|
|
|
162
206
|
"""
|
|
163
207
|
|
|
164
208
|
def collect(self, raw_output: str, **context: Any) -> UsageData:
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
209
|
+
# Two accepted shapes:
|
|
210
|
+
# 1. ``--output-format json`` — a single JSON envelope; no
|
|
211
|
+
# per-tool-use trace, so tool_call_count stays None.
|
|
212
|
+
# 2. ``--output-format stream-json --verbose`` — newline-delimited
|
|
213
|
+
# events ending in a ``type: "result"`` event that mirrors
|
|
214
|
+
# shape (1). We also count ``tool_use`` blocks across all
|
|
215
|
+
# ``assistant`` events for accurate tool_call_count.
|
|
216
|
+
stream_tool_count: int | None = None
|
|
217
|
+
stream_tool_by_name: dict[str, int] = {}
|
|
218
|
+
trimmed = raw_output.lstrip()
|
|
219
|
+
if trimmed.startswith("{\n") or trimmed.startswith("{"):
|
|
220
|
+
# Try single-envelope path first — most adapters still use
|
|
221
|
+
# ``--output-format json``.
|
|
222
|
+
try:
|
|
223
|
+
envelope = json.loads(raw_output)
|
|
224
|
+
if envelope.get("type") == "result" and "\n" in raw_output.rstrip():
|
|
225
|
+
# Ambiguous: looks like a single-line event from the
|
|
226
|
+
# stream. Fall through to stream parsing below.
|
|
227
|
+
raise ValueError("ambiguous envelope — retry as stream")
|
|
228
|
+
except (json.JSONDecodeError, ValueError):
|
|
229
|
+
envelope = None
|
|
230
|
+
else:
|
|
231
|
+
envelope = None
|
|
232
|
+
if envelope is None:
|
|
233
|
+
result_ev, stream_tool_count, stream_tool_by_name = _parse_stream_json(
|
|
234
|
+
raw_output
|
|
235
|
+
)
|
|
236
|
+
if result_ev is None:
|
|
237
|
+
return UsageData(
|
|
238
|
+
error="JSON parse failed: output is neither a valid "
|
|
239
|
+
"envelope nor a stream-json transcript ending in a "
|
|
240
|
+
"'result' event"
|
|
241
|
+
)
|
|
242
|
+
envelope = result_ev
|
|
169
243
|
|
|
170
244
|
usage = envelope.get("usage")
|
|
171
245
|
if usage is None:
|
|
@@ -197,7 +271,13 @@ class JsonStdoutCollector:
|
|
|
197
271
|
cost_model = "unknown"
|
|
198
272
|
cost_source = "unavailable"
|
|
199
273
|
|
|
200
|
-
|
|
274
|
+
# Prefer stream-json count when the transcript was streamed — it's
|
|
275
|
+
# always present and accurate. Fall back to the envelope's
|
|
276
|
+
# ``messages`` array (when some future CLI flag surfaces it), else
|
|
277
|
+
# stays None.
|
|
278
|
+
tool_call_count = stream_tool_count
|
|
279
|
+
if tool_call_count is None:
|
|
280
|
+
tool_call_count = _count_tool_use_blocks(envelope)
|
|
201
281
|
|
|
202
282
|
return UsageData(
|
|
203
283
|
input_tokens=input_tokens,
|
|
@@ -207,6 +287,7 @@ class JsonStdoutCollector:
|
|
|
207
287
|
cost_model=cost_model,
|
|
208
288
|
cost_source=cost_source,
|
|
209
289
|
tool_call_count=tool_call_count,
|
|
290
|
+
tool_use_by_name=stream_tool_by_name or None,
|
|
210
291
|
error=envelope_error,
|
|
211
292
|
)
|
|
212
293
|
|
|
@@ -583,7 +583,31 @@ def compare_configs(
|
|
|
583
583
|
elif speed_diff > 0:
|
|
584
584
|
parts.append(f"{speed_diff:.1f}s slower")
|
|
585
585
|
|
|
586
|
-
|
|
586
|
+
# Soften the verdict when the effect is negligible or the test is
|
|
587
|
+
# underpowered, so we don't confidently declare a "winner" on what may
|
|
588
|
+
# be noise. Thresholds:
|
|
589
|
+
# Cohen's d: |d| < 0.2 is "negligible" (Cohen 1988).
|
|
590
|
+
# Cliff's delta: |delta| < 0.147 is "negligible" (Romano et al. 2006).
|
|
591
|
+
# p-value > 0.05: not significant at the conventional threshold.
|
|
592
|
+
scores_tied = abs(score_diff) < 0.01
|
|
593
|
+
negligible_threshold = 0.2 if eff_method == "cohens_d" else 0.147
|
|
594
|
+
small_effect = (
|
|
595
|
+
eff_size is not None and abs(eff_size) < negligible_threshold
|
|
596
|
+
)
|
|
597
|
+
not_significant = p_val is not None and p_val > 0.05
|
|
598
|
+
|
|
599
|
+
if scores_tied:
|
|
600
|
+
verdict = "effectively tied"
|
|
601
|
+
elif small_effect and not_significant:
|
|
602
|
+
verdict = f"{winner} nominally ahead (not significant; small effect)"
|
|
603
|
+
elif small_effect:
|
|
604
|
+
verdict = f"{winner} nominally ahead (small effect size)"
|
|
605
|
+
elif not_significant:
|
|
606
|
+
verdict = f"{winner} nominally ahead (not significant at p=0.05)"
|
|
607
|
+
else:
|
|
608
|
+
verdict = f"{winner} wins"
|
|
609
|
+
|
|
610
|
+
summary = f"{a.label} vs {b.label}: {', '.join(parts)} \u2192 {verdict}"
|
|
587
611
|
|
|
588
612
|
return PairwiseComparison(
|
|
589
613
|
config_a=a.label,
|
|
@@ -151,6 +151,8 @@ def run_experiment(
|
|
|
151
151
|
permission_mode=perm,
|
|
152
152
|
timeout_seconds=timeout,
|
|
153
153
|
mcp_config=exp_config.mcp_config,
|
|
154
|
+
allowed_tools=exp_config.allowed_tools,
|
|
155
|
+
disallowed_tools=exp_config.disallowed_tools,
|
|
154
156
|
cwd=str(experiment_dir.resolve()),
|
|
155
157
|
)
|
|
156
158
|
|
|
@@ -794,6 +794,23 @@ def init_experiment(
|
|
|
794
794
|
"Built-ins: sourcegraph, github. Or path to a custom .md file."
|
|
795
795
|
),
|
|
796
796
|
)
|
|
797
|
+
@click.option(
|
|
798
|
+
"--allowed-tools",
|
|
799
|
+
default=None,
|
|
800
|
+
help=(
|
|
801
|
+
"Restrict the agent to this comma-separated list of built-in "
|
|
802
|
+
"tool names (e.g. 'Read,Grep'). Pass an empty string ('') to "
|
|
803
|
+
"disable all built-in tools for an MCP-only comparison."
|
|
804
|
+
),
|
|
805
|
+
)
|
|
806
|
+
@click.option(
|
|
807
|
+
"--disallowed-tools",
|
|
808
|
+
default=None,
|
|
809
|
+
help=(
|
|
810
|
+
"Block the agent from these comma-separated built-in tool names "
|
|
811
|
+
"(e.g. 'Bash,Write'). Applies on top of --allowed-tools."
|
|
812
|
+
),
|
|
813
|
+
)
|
|
797
814
|
def add_config(
|
|
798
815
|
path: str,
|
|
799
816
|
label: str,
|
|
@@ -803,10 +820,19 @@ def add_config(
|
|
|
803
820
|
mcp_config: str | None,
|
|
804
821
|
instruction_variant: str | None,
|
|
805
822
|
preambles: tuple[str, ...],
|
|
823
|
+
allowed_tools: str | None,
|
|
824
|
+
disallowed_tools: str | None,
|
|
806
825
|
) -> None:
|
|
807
826
|
"""Add a configuration to an existing experiment."""
|
|
808
827
|
from codeprobe.cli.experiment_cmd import experiment_add_config
|
|
809
828
|
|
|
829
|
+
# Parse comma-separated tool lists. An empty string means "MCP-only":
|
|
830
|
+
# disable all built-in tools. None means "adapter default".
|
|
831
|
+
def _parse_tools(raw: str | None) -> list[str] | None:
|
|
832
|
+
if raw is None:
|
|
833
|
+
return None
|
|
834
|
+
return [t.strip() for t in raw.split(",") if t.strip()]
|
|
835
|
+
|
|
810
836
|
experiment_add_config(
|
|
811
837
|
path,
|
|
812
838
|
label=label,
|
|
@@ -816,6 +842,8 @@ def add_config(
|
|
|
816
842
|
mcp_config_str=mcp_config,
|
|
817
843
|
instruction_variant=instruction_variant,
|
|
818
844
|
preambles=preambles,
|
|
845
|
+
allowed_tools=_parse_tools(allowed_tools),
|
|
846
|
+
disallowed_tools=_parse_tools(disallowed_tools),
|
|
819
847
|
)
|
|
820
848
|
|
|
821
849
|
|
|
@@ -142,6 +142,8 @@ def experiment_add_config(
|
|
|
142
142
|
mcp_config_str: str | None,
|
|
143
143
|
instruction_variant: str | None = None,
|
|
144
144
|
preambles: tuple[str, ...] = (),
|
|
145
|
+
allowed_tools: list[str] | None = None,
|
|
146
|
+
disallowed_tools: list[str] | None = None,
|
|
145
147
|
) -> None:
|
|
146
148
|
"""Add a configuration to an existing experiment."""
|
|
147
149
|
exp_dir = Path(path)
|
|
@@ -191,6 +193,8 @@ def experiment_add_config(
|
|
|
191
193
|
mcp_config=mcp_config,
|
|
192
194
|
instruction_variant=instruction_variant,
|
|
193
195
|
preambles=preambles,
|
|
196
|
+
allowed_tools=allowed_tools,
|
|
197
|
+
disallowed_tools=disallowed_tools,
|
|
194
198
|
)
|
|
195
199
|
|
|
196
200
|
# Validate the label is a safe path component
|
|
@@ -387,6 +387,7 @@ def execute_task(
|
|
|
387
387
|
cost_model=output.cost_model,
|
|
388
388
|
cost_source=output.cost_source,
|
|
389
389
|
tool_call_count=output.tool_call_count,
|
|
390
|
+
tool_use_by_name=output.tool_use_by_name,
|
|
390
391
|
)
|
|
391
392
|
|
|
392
393
|
# For oracle tasks, the agent writes answer.txt / answer.json to the
|
|
@@ -98,7 +98,11 @@ def load_experiment(exp_dir: Path) -> Experiment:
|
|
|
98
98
|
model=c.get("model"),
|
|
99
99
|
permission_mode=c.get("permission_mode", "default"),
|
|
100
100
|
mcp_config=c.get("mcp_config"),
|
|
101
|
+
allowed_tools=c.get("allowed_tools"),
|
|
102
|
+
disallowed_tools=c.get("disallowed_tools"),
|
|
101
103
|
instruction_variant=c.get("instruction_variant"),
|
|
104
|
+
preambles=tuple(c.get("preambles", ())),
|
|
105
|
+
reward_type=c.get("reward_type", "binary"),
|
|
102
106
|
extra=c.get("extra", {}),
|
|
103
107
|
)
|
|
104
108
|
for c in data.get("configs", [])
|
|
@@ -8,13 +8,23 @@ from typing import Any
|
|
|
8
8
|
|
|
9
9
|
@dataclass(frozen=True)
|
|
10
10
|
class ExperimentConfig:
|
|
11
|
-
"""A single configuration to evaluate (e.g., 'baseline' or 'with-mcp')."""
|
|
11
|
+
"""A single configuration to evaluate (e.g., 'baseline' or 'with-mcp').
|
|
12
|
+
|
|
13
|
+
``allowed_tools`` / ``disallowed_tools`` restrict which tools the
|
|
14
|
+
agent is allowed to call during this config's runs. Semantics mirror
|
|
15
|
+
the underlying CLI (Claude's ``--allowedTools`` / ``--disallowedTools``
|
|
16
|
+
/ ``--tools``). Set ``allowed_tools=[]`` to disable all built-in tools
|
|
17
|
+
for an MCP-only comparison — MCP tools are still reachable because
|
|
18
|
+
they come from ``mcp_config``.
|
|
19
|
+
"""
|
|
12
20
|
|
|
13
21
|
label: str
|
|
14
22
|
agent: str = "claude"
|
|
15
23
|
model: str | None = None
|
|
16
24
|
permission_mode: str = "default"
|
|
17
25
|
mcp_config: dict | None = None
|
|
26
|
+
allowed_tools: list[str] | None = None
|
|
27
|
+
disallowed_tools: list[str] | None = None
|
|
18
28
|
instruction_variant: str | None = None
|
|
19
29
|
preambles: tuple[str, ...] = ()
|
|
20
30
|
reward_type: str = "binary"
|
|
@@ -29,6 +39,8 @@ class ExperimentConfig:
|
|
|
29
39
|
f"ExperimentConfig(label={self.label!r}, agent={self.agent!r}, "
|
|
30
40
|
f"model={self.model!r}, permission_mode={self.permission_mode!r}, "
|
|
31
41
|
f"mcp_config={redacted_mcp!r}, "
|
|
42
|
+
f"allowed_tools={self.allowed_tools!r}, "
|
|
43
|
+
f"disallowed_tools={self.disallowed_tools!r}, "
|
|
32
44
|
f"instruction_variant={self.instruction_variant!r}, "
|
|
33
45
|
f"preambles={self.preambles!r}, reward_type={self.reward_type!r}, "
|
|
34
46
|
f"extra={self.extra!r})"
|
|
@@ -113,6 +125,9 @@ class CompletedTask:
|
|
|
113
125
|
cost_model: str = "unknown"
|
|
114
126
|
cost_source: str = "unavailable"
|
|
115
127
|
tool_call_count: int | None = None
|
|
128
|
+
# Per-tool usage breakdown (e.g. {"Read": 5,
|
|
129
|
+
# "mcp__sourcegraph__keyword_search": 2}). None when not captured.
|
|
130
|
+
tool_use_by_name: dict[str, int] | None = None
|
|
116
131
|
error_category: str | None = None
|
|
117
132
|
scoring_details: dict = field(default_factory=dict)
|
|
118
133
|
metadata: dict = field(default_factory=dict)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codeprobe
|
|
3
|
-
Version: 0.5.2
|
|
3
|
+
Version: 0.5.4
|
|
4
4
|
Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
|
|
5
5
|
Author: codeprobe contributors
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -1824,3 +1824,165 @@ class TestClaudeModelNormalization:
|
|
|
1824
1824
|
cmd = adapter.build_command("test", config)
|
|
1825
1825
|
idx = cmd.index("--model")
|
|
1826
1826
|
assert cmd[idx + 1] == "claude-sonnet-4-6"
|
|
1827
|
+
|
|
1828
|
+
|
|
1829
|
+
# ---------------------------------------------------------------------------
|
|
1830
|
+
# 0.5.4: tool-restriction flags + stream-json tool_use capture
|
|
1831
|
+
# ---------------------------------------------------------------------------
|
|
1832
|
+
|
|
1833
|
+
|
|
1834
|
+
class TestClaudeToolRestrictions:
|
|
1835
|
+
"""Claude adapter wires AgentConfig.{allowed,disallowed}_tools to CLI."""
|
|
1836
|
+
|
|
1837
|
+
def test_allowed_tools_empty_list_maps_to_tools_empty(self) -> None:
|
|
1838
|
+
"""``allowed_tools=[]`` means MCP-only → ``--tools ""``."""
|
|
1839
|
+
adapter = ClaudeAdapter()
|
|
1840
|
+
if not adapter.find_binary():
|
|
1841
|
+
pytest.skip("claude binary not available")
|
|
1842
|
+
config = AgentConfig(allowed_tools=[])
|
|
1843
|
+
cmd = adapter.build_command("test", config)
|
|
1844
|
+
# Should contain --tools followed immediately by empty string.
|
|
1845
|
+
assert "--tools" in cmd
|
|
1846
|
+
idx = cmd.index("--tools")
|
|
1847
|
+
assert cmd[idx + 1] == ""
|
|
1848
|
+
|
|
1849
|
+
def test_allowed_tools_nonempty_emits_both_flags(self) -> None:
|
|
1850
|
+
"""Non-empty allowed_tools = whitelist. Adapter disables built-ins
|
|
1851
|
+
via --tools "" AND auto-approves listed names via --allowedTools,
|
|
1852
|
+
because in claude 2.1.x, --allowedTools alone doesn't restrict
|
|
1853
|
+
the available tool set (it just auto-approves) and without both
|
|
1854
|
+
flags the agent either burns turns on permission prompts or calls
|
|
1855
|
+
unlisted tools."""
|
|
1856
|
+
adapter = ClaudeAdapter()
|
|
1857
|
+
if not adapter.find_binary():
|
|
1858
|
+
pytest.skip("claude binary not available")
|
|
1859
|
+
config = AgentConfig(allowed_tools=["Read", "Grep"])
|
|
1860
|
+
cmd = adapter.build_command("test", config)
|
|
1861
|
+
assert "--tools" in cmd
|
|
1862
|
+
assert cmd[cmd.index("--tools") + 1] == ""
|
|
1863
|
+
assert "--allowedTools" in cmd
|
|
1864
|
+
assert cmd[cmd.index("--allowedTools") + 1] == "Read,Grep"
|
|
1865
|
+
|
|
1866
|
+
def test_disallowed_tools_maps_to_disallowedTools(self) -> None:
|
|
1867
|
+
adapter = ClaudeAdapter()
|
|
1868
|
+
if not adapter.find_binary():
|
|
1869
|
+
pytest.skip("claude binary not available")
|
|
1870
|
+
config = AgentConfig(disallowed_tools=["Bash", "Write"])
|
|
1871
|
+
cmd = adapter.build_command("test", config)
|
|
1872
|
+
assert "--disallowedTools" in cmd
|
|
1873
|
+
idx = cmd.index("--disallowedTools")
|
|
1874
|
+
assert cmd[idx + 1] == "Bash,Write"
|
|
1875
|
+
|
|
1876
|
+
def test_both_tool_restrictions_coexist(self) -> None:
|
|
1877
|
+
adapter = ClaudeAdapter()
|
|
1878
|
+
if not adapter.find_binary():
|
|
1879
|
+
pytest.skip("claude binary not available")
|
|
1880
|
+
config = AgentConfig(
|
|
1881
|
+
allowed_tools=["Read"], disallowed_tools=["Bash"]
|
|
1882
|
+
)
|
|
1883
|
+
cmd = adapter.build_command("test", config)
|
|
1884
|
+
assert "--allowedTools" in cmd
|
|
1885
|
+
assert "--disallowedTools" in cmd
|
|
1886
|
+
|
|
1887
|
+
def test_none_tool_restrictions_omit_flags(self) -> None:
|
|
1888
|
+
"""Default behavior: no --tools / --allowedTools / --disallowedTools."""
|
|
1889
|
+
adapter = ClaudeAdapter()
|
|
1890
|
+
if not adapter.find_binary():
|
|
1891
|
+
pytest.skip("claude binary not available")
|
|
1892
|
+
config = AgentConfig()
|
|
1893
|
+
cmd = adapter.build_command("test", config)
|
|
1894
|
+
assert "--tools" not in cmd
|
|
1895
|
+
assert "--allowedTools" not in cmd
|
|
1896
|
+
assert "--disallowedTools" not in cmd
|
|
1897
|
+
|
|
1898
|
+
def test_stream_json_is_default_output_format(self) -> None:
|
|
1899
|
+
"""Claude adapter switched to stream-json for tool_use capture."""
|
|
1900
|
+
adapter = ClaudeAdapter()
|
|
1901
|
+
if not adapter.find_binary():
|
|
1902
|
+
pytest.skip("claude binary not available")
|
|
1903
|
+
cmd = adapter.build_command("test", AgentConfig())
|
|
1904
|
+
assert "--output-format" in cmd
|
|
1905
|
+
idx = cmd.index("--output-format")
|
|
1906
|
+
assert cmd[idx + 1] == "stream-json"
|
|
1907
|
+
assert "--verbose" in cmd
|
|
1908
|
+
|
|
1909
|
+
|
|
1910
|
+
class TestStreamJsonToolUseCapture:
|
|
1911
|
+
"""JsonStdoutCollector parses stream-json and counts tool_use blocks."""
|
|
1912
|
+
|
|
1913
|
+
def _make_stream(self, tool_names: list[str]) -> str:
|
|
1914
|
+
"""Build a minimal stream-json transcript with given tool_use blocks."""
|
|
1915
|
+
import json as _json
|
|
1916
|
+
|
|
1917
|
+
lines = [
|
|
1918
|
+
_json.dumps({
|
|
1919
|
+
"type": "system", "subtype": "init",
|
|
1920
|
+
"mcp_servers": [{"name": "sourcegraph", "status": "connected"}],
|
|
1921
|
+
})
|
|
1922
|
+
]
|
|
1923
|
+
for name in tool_names:
|
|
1924
|
+
lines.append(_json.dumps({
|
|
1925
|
+
"type": "assistant",
|
|
1926
|
+
"message": {"content": [{"type": "tool_use", "name": name}]},
|
|
1927
|
+
}))
|
|
1928
|
+
# Terminal result event carries the envelope-shape fields.
|
|
1929
|
+
lines.append(_json.dumps({
|
|
1930
|
+
"type": "result",
|
|
1931
|
+
"subtype": "success",
|
|
1932
|
+
"result": "Done.",
|
|
1933
|
+
"is_error": False,
|
|
1934
|
+
"usage": {
|
|
1935
|
+
"input_tokens": 10,
|
|
1936
|
+
"output_tokens": 20,
|
|
1937
|
+
"cache_read_input_tokens": 100,
|
|
1938
|
+
},
|
|
1939
|
+
"total_cost_usd": 0.05,
|
|
1940
|
+
}))
|
|
1941
|
+
return "\n".join(lines) + "\n"
|
|
1942
|
+
|
|
1943
|
+
def test_counts_all_tool_use_blocks(self) -> None:
|
|
1944
|
+
from codeprobe.adapters.telemetry import JsonStdoutCollector
|
|
1945
|
+
|
|
1946
|
+
stream = self._make_stream(["Read", "Grep", "Read", "Bash"])
|
|
1947
|
+
u = JsonStdoutCollector().collect(stream)
|
|
1948
|
+
assert u.tool_call_count == 4
|
|
1949
|
+
assert u.tool_use_by_name == {"Read": 2, "Grep": 1, "Bash": 1}
|
|
1950
|
+
|
|
1951
|
+
def test_counts_mcp_tool_names(self) -> None:
|
|
1952
|
+
"""MCP tools show up as ``mcp__<server>__<tool>``; counted correctly."""
|
|
1953
|
+
from codeprobe.adapters.telemetry import JsonStdoutCollector
|
|
1954
|
+
|
|
1955
|
+
stream = self._make_stream([
|
|
1956
|
+
"Read", "mcp__sourcegraph__keyword_search",
|
|
1957
|
+
"mcp__sourcegraph__find_references",
|
|
1958
|
+
])
|
|
1959
|
+
u = JsonStdoutCollector().collect(stream)
|
|
1960
|
+
assert u.tool_call_count == 3
|
|
1961
|
+
assert u.tool_use_by_name["mcp__sourcegraph__keyword_search"] == 1
|
|
1962
|
+
assert u.tool_use_by_name["mcp__sourcegraph__find_references"] == 1
|
|
1963
|
+
|
|
1964
|
+
def test_empty_stream_returns_no_tool_calls(self) -> None:
|
|
1965
|
+
from codeprobe.adapters.telemetry import JsonStdoutCollector
|
|
1966
|
+
|
|
1967
|
+
stream = self._make_stream([])
|
|
1968
|
+
u = JsonStdoutCollector().collect(stream)
|
|
1969
|
+
assert u.tool_call_count == 0
|
|
1970
|
+
assert u.tool_use_by_name is None # sentinel: nothing captured
|
|
1971
|
+
|
|
1972
|
+
def test_single_envelope_still_works(self) -> None:
|
|
1973
|
+
"""Back-compat: legacy --output-format json single envelope parses."""
|
|
1974
|
+
from codeprobe.adapters.telemetry import JsonStdoutCollector
|
|
1975
|
+
|
|
1976
|
+
envelope = {
|
|
1977
|
+
"result": "ok",
|
|
1978
|
+
"usage": {
|
|
1979
|
+
"input_tokens": 5, "output_tokens": 10,
|
|
1980
|
+
"cache_read_input_tokens": 0,
|
|
1981
|
+
},
|
|
1982
|
+
"total_cost_usd": 0.01,
|
|
1983
|
+
}
|
|
1984
|
+
import json as _json
|
|
1985
|
+
|
|
1986
|
+
u = JsonStdoutCollector().collect(_json.dumps(envelope))
|
|
1987
|
+
assert u.input_tokens == 5
|
|
1988
|
+
assert u.tool_call_count is None # envelope has no messages
|