codeprobe 0.4.1__tar.gz → 0.5.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codeprobe-0.4.1 → codeprobe-0.5.2}/PKG-INFO +6 -3
- {codeprobe-0.4.1 → codeprobe-0.5.2}/README.md +2 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/pyproject.toml +7 -3
- codeprobe-0.5.2/src/codeprobe/acceptance_compiler.py +355 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/adapters/_base.py +12 -6
- codeprobe-0.5.2/src/codeprobe/adapters/claude.py +316 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/adapters/protocol.py +1 -1
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/adapters/telemetry.py +52 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/analysis/report.py +90 -13
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/analysis/stats.py +84 -5
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/api.py +5 -4
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/assess/__init__.py +14 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/assess/heuristics.py +1 -1
- codeprobe-0.5.2/src/codeprobe/assess/oracle_diff.py +517 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/__init__.py +106 -9
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/experiment_cmd.py +49 -3
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/init_cmd.py +5 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/interpret_cmd.py +10 -1
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/json_display.py +1 -1
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/mine_cmd.py +686 -203
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/rich_display.py +2 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/run_cmd.py +73 -65
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/validate_cmd.py +56 -1
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/config/loader.py +47 -21
- codeprobe-0.5.2/src/codeprobe/config/redact.py +123 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/core/checkpoint.py +2 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/core/events.py +2 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/core/executor.py +26 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/core/experiment.py +7 -0
- codeprobe-0.5.2/src/codeprobe/core/repo_hygiene.py +51 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/core/scoring.py +26 -7
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/comprehension.py +15 -10
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/comprehension_writer.py +1 -1
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/curator.py +1 -1
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/curator_backends.py +11 -3
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/extractor.py +385 -74
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/org_scale.py +88 -25
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/org_scale_oracle.py +70 -11
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/org_scale_scanner.py +171 -10
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/sg_auth.py +28 -6
- codeprobe-0.5.2/src/codeprobe/mining/task_types.py +180 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/writer.py +413 -18
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/models/suite.py +1 -1
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/models/task.py +3 -3
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/probe/generator.py +19 -11
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/probe/writer.py +3 -4
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/ratings/collector.py +2 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe.egg-info/PKG-INFO +6 -3
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe.egg-info/SOURCES.txt +20 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe.egg-info/requires.txt +3 -2
- codeprobe-0.5.2/tests/test_acceptance_compiler.py +566 -0
- codeprobe-0.5.2/tests/test_acceptance_compiler_integration.py +161 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_adapters.py +326 -169
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_artifact_scorer.py +1 -1
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_auth_cmd.py +0 -1
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_changed_symbols.py +0 -1
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_checkpoint.py +0 -4
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_checkpoint_scoring.py +1 -1
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_cli.py +1 -1
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_contrib.py +9 -5
- codeprobe-0.5.2/tests/test_convergence.py +441 -0
- codeprobe-0.5.2/tests/test_criteria_loader.py +411 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_curator_backends.py +2 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_curator_integration.py +7 -6
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_curator_tiers.py +0 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_doctor_cmd.py +0 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_dual_adversarial_fixes.py +175 -3
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_executor.py +53 -8
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_executor_dual_isolation.py +2 -4
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_executor_events.py +0 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_experiment_cmd.py +78 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_ground_truth_schema.py +1 -3
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_init_wizard.py +10 -3
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_json_display.py +0 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_listeners_dual.py +50 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_llm.py +1 -3
- codeprobe-0.5.2/tests/test_loader.py +128 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_mcp_families_mining.py +3 -6
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_mcp_validate.py +0 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_mine_cli.py +59 -3
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_mine_goals.py +3 -1
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_mine_presets.py +2 -3
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_mine_profiles.py +0 -1
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_mining.py +422 -17
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_multi_repo_e2e.py +1 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_new_families.py +2 -3
- codeprobe-0.5.2/tests/test_oracle_diff.py +341 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_oracle_registry.py +1 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_org_scale.py +173 -11
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_pipeline_integration.py +4 -8
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_preamble.py +3 -3
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_ratings.py +2 -2
- codeprobe-0.5.2/tests/test_regression_gate.py +719 -0
- codeprobe-0.5.2/tests/test_release_gate.py +477 -0
- codeprobe-0.5.2/tests/test_repo_hygiene.py +106 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_run_config_resolution.py +0 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_scaffold.py +0 -3
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_scoring.py +1 -3
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_scoring_extended.py +1 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_scoring_v2.py +1 -1
- codeprobe-0.5.2/tests/test_sdlc_ground_truth.py +519 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_secret_redaction.py +181 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_sg_auth.py +4 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_sg_ground_truth.py +0 -1
- codeprobe-0.5.2/tests/test_stats.py +261 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_suite_manifest.py +0 -2
- codeprobe-0.5.2/tests/test_task_model.py +186 -0
- codeprobe-0.5.2/tests/test_task_types.py +187 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_telemetry.py +45 -7
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_validate_cmd.py +65 -1
- codeprobe-0.5.2/tests/test_verifier.py +629 -0
- codeprobe-0.5.2/tests/test_verify.py +172 -0
- codeprobe-0.5.2/tests/test_weighted_checklist.py +662 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_weighted_f1.py +154 -8
- codeprobe-0.4.1/src/codeprobe/adapters/claude.py +0 -136
- codeprobe-0.4.1/src/codeprobe/config/redact.py +0 -45
- {codeprobe-0.4.1 → codeprobe-0.5.2}/LICENSE +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/setup.cfg +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/__init__.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/__main__.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/adapters/__init__.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/adapters/codex.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/adapters/copilot.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/adapters/openai_compat.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/adapters/session.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/analysis/__init__.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/analysis/dual.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/analysis/ranking.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/assess_cmd.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/auth_cmd.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/doctor_cmd.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/preamble_cmd.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/probe_cmd.py +2 -2
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/ratings_cmd.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/scaffold_cmd.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/wizard.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/cli/yaml_writer.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/config/__init__.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/contrib/__init__.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/contrib/_shared.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/contrib/adaptive.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/contrib/counterfactual.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/contrib/debate.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/contrib/decision_tree.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/contrib/elo.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/contrib/fingerprint.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/contrib/mutation.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/contrib/pareto.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/contrib/sprt.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/contrib/tournament.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/core/__init__.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/core/__main__.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/core/isolation.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/core/llm.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/core/mcp_discovery.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/core/preamble.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/core/registry.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/core/sandbox.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/loaders/__init__.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/loaders/suite.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/__init__.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/_graph.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/_lang.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/curator_tiers.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/multi_repo.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/org_scale_families.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/org_scale_validate.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/sg_ground_truth.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/mining/sources.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/models/__init__.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/models/evalrc.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/models/experiment.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/models/preamble.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/preambles/__init__.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/preambles/github.md +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/preambles/sourcegraph.md +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/probe/__init__.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/probe/adapter.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/ratings/__init__.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/scaffold/__init__.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/scaffold/writer.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/templates/__init__.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/templates/evalrc-mcp-comparison.yaml +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/templates/evalrc-model-comparison.yaml +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe/templates/evalrc-prompt-comparison.yaml +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe.egg-info/dependency_links.txt +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe.egg-info/entry_points.txt +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/src/codeprobe.egg-info/top_level.txt +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_adapter_contracts.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_analysis.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_api.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_assess.py +4 -4
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_comprehension.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_config_loader.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_ctrlc_integration.py +1 -1
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_curator_core.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_dual_composite.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_dual_e2e.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_dual_matrix.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_dual_scorer.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_dual_scoring_details.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_events.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_examples_dual.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_experiment_core.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_isolation.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_loaders.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_loaders_dual.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_mining_dual.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_models.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_multi_repo_mining.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_openai_compat.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_oracle_types.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_preamble_cmd.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_probe.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_probe_adapter.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_ratings_cmd.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_registry.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_report_dual.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_safe_leg_score.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_scaffold_upgrade.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_scanner_refactor.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_score_result.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_session.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_shell_shim.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_show_prompt.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_suite.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_validate_dual.py +0 -0
- {codeprobe-0.4.1 → codeprobe-0.5.2}/tests/test_writer_dual.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codeprobe
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.5.2
|
|
4
4
|
Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
|
|
5
5
|
Author: codeprobe contributors
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -20,8 +20,8 @@ Description-Content-Type: text/markdown
|
|
|
20
20
|
License-File: LICENSE
|
|
21
21
|
Requires-Dist: click<9,>=8.0
|
|
22
22
|
Requires-Dist: pyyaml<7,>=6.0
|
|
23
|
-
Requires-Dist: anthropic>=0.39
|
|
24
|
-
Requires-Dist: openai>=1.66
|
|
23
|
+
Requires-Dist: anthropic<1,>=0.39
|
|
24
|
+
Requires-Dist: openai<3,>=1.66
|
|
25
25
|
Requires-Dist: tiktoken<1,>=0.7
|
|
26
26
|
Requires-Dist: scipy<2,>=1.11
|
|
27
27
|
Requires-Dist: rich<14,>=13.7
|
|
@@ -32,6 +32,7 @@ Requires-Dist: ruff<1,>=0.4; extra == "dev"
|
|
|
32
32
|
Requires-Dist: mypy<2,>=1.10; extra == "dev"
|
|
33
33
|
Requires-Dist: types-PyYAML<7,>=6.0; extra == "dev"
|
|
34
34
|
Requires-Dist: scipy<2,>=1.11; extra == "dev"
|
|
35
|
+
Requires-Dist: build<2,>=1.0; extra == "dev"
|
|
35
36
|
Dynamic: license-file
|
|
36
37
|
|
|
37
38
|
# codeprobe
|
|
@@ -83,6 +84,8 @@ codeprobe run . # Run agents against tasks
|
|
|
83
84
|
codeprobe interpret . # Get recommendations
|
|
84
85
|
```
|
|
85
86
|
|
|
87
|
+
Prefer driving codeprobe through a coding agent instead? See [docs/workflows/with-agents.md](docs/workflows/with-agents.md) for the skills-based workflow (`/experiment`, `/assess-codebase`, `/interpret`).
|
|
88
|
+
|
|
86
89
|
## Commands
|
|
87
90
|
|
|
88
91
|
| Command | Purpose |
|
|
@@ -47,6 +47,8 @@ codeprobe run . # Run agents against tasks
|
|
|
47
47
|
codeprobe interpret . # Get recommendations
|
|
48
48
|
```
|
|
49
49
|
|
|
50
|
+
Prefer driving codeprobe through a coding agent instead? See [docs/workflows/with-agents.md](docs/workflows/with-agents.md) for the skills-based workflow (`/experiment`, `/assess-codebase`, `/interpret`).
|
|
51
|
+
|
|
50
52
|
## Commands
|
|
51
53
|
|
|
52
54
|
| Command | Purpose |
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "codeprobe"
|
|
3
|
-
version = "0.4.1"
|
|
3
|
+
version = "0.5.2"
|
|
4
4
|
description = "Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results."
|
|
5
5
|
readme = "README.md"
|
|
6
6
|
license = "Apache-2.0"
|
|
@@ -21,8 +21,8 @@ classifiers = [
|
|
|
21
21
|
dependencies = [
|
|
22
22
|
"click>=8.0,<9",
|
|
23
23
|
"pyyaml>=6.0,<7",
|
|
24
|
-
"anthropic>=0.39",
|
|
25
|
-
"openai>=1.66",
|
|
24
|
+
"anthropic>=0.39,<1",
|
|
25
|
+
"openai>=1.66,<3",
|
|
26
26
|
"tiktoken>=0.7,<1",
|
|
27
27
|
"scipy>=1.11,<2",
|
|
28
28
|
"rich>=13.7,<14",
|
|
@@ -41,6 +41,10 @@ dev = [
|
|
|
41
41
|
"mypy>=1.10,<2",
|
|
42
42
|
"types-PyYAML>=6.0,<7",
|
|
43
43
|
"scipy>=1.11,<2",
|
|
44
|
+
# Needed by tests/test_release_gate.py::test_build_and_stage_real_wheel
|
|
45
|
+
# which shells out to ``python -m build --wheel`` to verify the wheel
|
|
46
|
+
# produced at release time is installable and version-consistent.
|
|
47
|
+
"build>=1.0,<2",
|
|
44
48
|
]
|
|
45
49
|
|
|
46
50
|
[project.scripts]
|
|
@@ -0,0 +1,355 @@
|
|
|
1
|
+
"""Criterion-driven Test Agent action compiler.
|
|
2
|
+
|
|
3
|
+
Reads acceptance criteria from ``acceptance/criteria.toml`` (via
|
|
4
|
+
:func:`acceptance.loader.load_criteria`) and compiles each criterion whose
|
|
5
|
+
``check_type`` requires a workspace artifact into a :class:`TestAction` —
|
|
6
|
+
a frozen dataclass holding a bash snippet that the Test Agent executes to
|
|
7
|
+
populate the artifact(s) the Verifier reads.
|
|
8
|
+
|
|
9
|
+
Structural check types (``import_equals``, ``regex_present``, etc.) require
|
|
10
|
+
no workspace artifact and produce no action. Check types that have no
|
|
11
|
+
handler registered in ``acceptance.verify.Verifier._handlers()`` also
|
|
12
|
+
produce no action — emitting artifacts for them would be pure waste since
|
|
13
|
+
the Verifier skips them regardless.
|
|
14
|
+
|
|
15
|
+
This module is a **pure function** — no IO beyond what the caller passes in,
|
|
16
|
+
no subprocesses, no LLM calls. Token substitution uses ``.replace()``
|
|
17
|
+
chains (never ``.format()``) to avoid crashes on shell ``${VAR}`` braces.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import re
|
|
23
|
+
import textwrap
|
|
24
|
+
from collections.abc import Callable
|
|
25
|
+
from dataclasses import dataclass
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
from typing import Any
|
|
28
|
+
|
|
29
|
+
from acceptance.loader import Criterion
|
|
30
|
+
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
# Public types
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass(frozen=True)
class TestAction:
    """Immutable record of one compiled step for the Test Agent.

    Each instance ties a criterion to the bash text compiled for it and to
    the artifact files associated with that step.
    """

    # ID of the criterion this action was compiled from.
    criterion_id: str
    # Short human-readable label for the action.
    description: str
    # Bash text handed to the Test Agent for execution.
    shell_snippet: str
    # Artifact file names tied to this action (the emitters in this module
    # mostly use workspace-relative names such as "<id>.stdout").
    artifact_paths: tuple[str, ...]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
# ---------------------------------------------------------------------------
|
|
47
|
+
# Check types that the Verifier handles AND that read workspace artifacts.
|
|
48
|
+
# Structural types are excluded (they introspect Python or source files).
|
|
49
|
+
# Handler-less types are excluded (no Verifier reader → artifacts are waste).
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
#: Check types handled by the Verifier that DO NOT need workspace artifacts.
|
|
53
|
+
_STRUCTURAL_TYPES: frozenset[str] = frozenset(
    {
        "import_equals",
        "dataclass_has_fields",
        "regex_present",
        "regex_absent",
        "pyproject_deps_bounded",
    }
)

#: Check types present in criteria.toml but absent from Verifier._handlers().
_HANDLERLESS_TYPES: frozenset[str] = frozenset(
    {
        "stream_separation",
        "log_level_matches",
        "json_lines_valid",
        "dataclass_roundtrip",
        "yaml_field_equal",
    }
)

#: Criterion IDs must match this pattern to be safe for shell embedding.
#: Prevents command injection via $() or backticks in double-quoted contexts.
_SAFE_ID_RE: re.Pattern[str] = re.compile(r"^[A-Za-z0-9][A-Za-z0-9_\-]{0,63}$")

#: Shell environment variable names must match this pattern.
_SAFE_ENV_RE: re.Pattern[str] = re.compile(r"^[A-Z_][A-Z0-9_]{0,127}$")
|
|
80
|
+
|
|
81
|
+
# ---------------------------------------------------------------------------
|
|
82
|
+
# Public API
|
|
83
|
+
# ---------------------------------------------------------------------------
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def compile_actions(
    criteria: list[Criterion],
    *,
    target_repo: Path,
    workspace: Path,
    project_root: Path,
) -> list[TestAction]:
    """Compile criteria into the actions the Test Agent has to run.

    Criteria are processed in order. A criterion contributes an action only
    when its ``check_type`` is neither structural nor handler-less, an emitter
    is registered for it in ``_EMITTERS``, and that emitter returns a
    non-``None`` action. Everything else is skipped without an entry in the
    result.

    Raises:
        ValueError: when a criterion id does not match ``_SAFE_ID_RE``. The
            check runs for every criterion, including ones that would
            otherwise be skipped.
    """
    compiled: list[TestAction] = []
    for crit in criteria:
        # IDs are embedded in shell text by the emitters, so reject unsafe
        # ones before any emitter sees the criterion.
        if _SAFE_ID_RE.fullmatch(crit.id) is None:
            raise ValueError(
                f"Criterion id {crit.id!r} contains characters unsafe "
                "for shell embedding; only [A-Za-z0-9_-] allowed."
            )

        check_type = crit.check_type
        if check_type in _STRUCTURAL_TYPES:
            continue
        if check_type in _HANDLERLESS_TYPES:
            continue

        build = _EMITTERS.get(check_type)
        if build is None:
            continue

        produced = build(crit, target_repo, workspace, project_root)
        if produced is None:
            continue
        compiled.append(produced)
    return compiled
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
# ---------------------------------------------------------------------------
|
|
120
|
+
# Token substitution
|
|
121
|
+
# ---------------------------------------------------------------------------
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _substitute_command(
    raw: str,
    target_repo: Path,
    workspace: Path,
    project_root: Path,
    params: dict[str, Any],
) -> str:
    """Substitute ``{repo}``, ``{results}``, ``{tasks_dir}``, ``{experiment}``
    tokens inside a command string.

    Uses ``.replace()`` (not ``.format()``) so shell ``${VAR}`` braces are
    left intact.

    Args:
        raw: Command text that may contain the tokens listed above.
        target_repo: Value substituted for ``{repo}``.
        workspace: Base directory for ``{results}`` (``workspace/results``),
            ``{experiment}`` (``workspace/.codeprobe/experiment.json``) and
            the default ``{tasks_dir}`` (``workspace/tasks``).
        project_root: Directory a ``fixture`` param is resolved against and
            must stay inside.
        params: Criterion params; a non-empty string ``fixture`` entry
            overrides ``{tasks_dir}``.

    Returns:
        The command with every recognised token replaced.

    Raises:
        ValueError: when the resolved ``fixture`` path lies outside
            ``project_root``.
    """
    result = raw.replace("{repo}", str(target_repo))
    result = result.replace("{results}", str(workspace / "results"))
    result = result.replace(
        "{experiment}", str(workspace / ".codeprobe" / "experiment.json")
    )

    # {tasks_dir} resolves via the fixture param if present, else workspace/tasks.
    fixture = params.get("fixture")
    if fixture and isinstance(fixture, str):
        resolved = (project_root / fixture).resolve()
        root_resolved = project_root.resolve()
        # Compare path components, not string prefixes: a prefix test would
        # accept a sibling such as "<root>_evil" as being inside "<root>".
        try:
            resolved.relative_to(root_resolved)
        except ValueError:
            raise ValueError(
                f"fixture param {fixture!r} escapes project_root — path traversal denied"
            ) from None
        tasks_dir = str(resolved)
    else:
        tasks_dir = str(workspace / "tasks")
    result = result.replace("{tasks_dir}", tasks_dir)

    return result
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
# ---------------------------------------------------------------------------
|
|
161
|
+
# Per-check-type emitters
|
|
162
|
+
# ---------------------------------------------------------------------------
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _emit_cli_help_contains(
    c: Criterion, target_repo: Path, workspace: Path, project_root: Path
) -> TestAction | None:
    """Compile a help-check criterion into one action that runs every command.

    Reads the ``commands`` param (a list of command strings), substitutes
    tokens in each, and emits one subshell line per command. All commands
    share ``<id>.stdout`` / ``<id>.stderr`` in the workspace: the first
    emitted command truncates both files and later ones append.

    Non-string entries in ``commands`` are ignored. When ``commands`` is not
    a list, is empty, or holds no string at all, the compile-error stub is
    returned instead so the failure is explicit.
    """
    commands = c.params.get("commands")
    if not isinstance(commands, list):
        return _stub_compile_error(c, workspace)
    runnable = [entry for entry in commands if isinstance(entry, str)]
    if not runnable:
        return _stub_compile_error(c, workspace)

    lines: list[str] = []
    # Index over the commands actually emitted, so the first emitted line is
    # always the one that truncates, even if earlier entries were skipped.
    for position, raw_cmd in enumerate(runnable):
        cmd = _substitute_command(
            raw_cmd, target_repo, workspace, project_root, c.params
        )
        op = ">" if position == 0 else ">>"
        lines.append(
            f'( {cmd} ) {op} "{workspace}/{c.id}.stdout" 2{op} "{workspace}/{c.id}.stderr"'
        )
    # NOTE(review): the exit artifact is always "0"; per-command exit
    # statuses are not recorded. Confirm this is what the Verifier expects.
    lines.append(f'echo "0" > "{workspace}/{c.id}.exit"')
    snippet = "\n".join(lines)
    return TestAction(
        criterion_id=c.id,
        description=f"help-check: {len(runnable)} commands",
        shell_snippet=snippet,
        artifact_paths=(f"{c.id}.stdout", f"{c.id}.stderr", f"{c.id}.exit"),
    )
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def _emit_command_capture(
    c: Criterion, target_repo: Path, workspace: Path, project_root: Path
) -> TestAction | None:
    """Shared emitter for types that capture stdout+stderr from a command.

    Reads the ``command`` param, substitutes tokens, and emits a snippet that
    runs the command in a subshell, redirects its stdout and stderr to
    ``<id>.stdout`` / ``<id>.stderr`` in the workspace, and writes the
    subshell's exit status to ``<id>.exit``.

    Returns the compile-error stub when ``command`` is missing, empty, or not
    a string.
    """
    raw_cmd = c.params.get("command")
    if not isinstance(raw_cmd, str) or not raw_cmd:
        return _stub_compile_error(c, workspace)
    cmd = _substitute_command(raw_cmd, target_repo, workspace, project_root, c.params)
    # Assemble the snippet line by line rather than dedenting an f-string:
    # dedent would run after interpolation, so a multi-line command could
    # change the computed margin or have its own leading whitespace removed.
    snippet = "\n".join(
        [
            f"( {cmd} ) \\",
            f'> "{workspace}/{c.id}.stdout" \\',
            f'2> "{workspace}/{c.id}.stderr"',
            f'echo "$?" > "{workspace}/{c.id}.exit"',
        ]
    )
    return TestAction(
        criterion_id=c.id,
        description=f"run: {cmd}",
        shell_snippet=snippet,
        artifact_paths=(f"{c.id}.stdout", f"{c.id}.stderr", f"{c.id}.exit"),
    )
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _emit_cli_writes_file(
    c: Criterion, target_repo: Path, workspace: Path, project_root: Path
) -> TestAction | None:
    """Compile a criterion whose command is expected to write a file.

    Reads the ``command`` param, substitutes tokens, and emits a snippet that
    runs the command from inside the workspace directory, capturing stdout,
    stderr and the exit status to ``<id>.stdout`` / ``<id>.stderr`` /
    ``<id>.exit``. A non-empty string ``expected_path`` param is added to the
    action's artifact paths as given.

    Returns the compile-error stub when ``command`` is missing, empty, or not
    a string, matching ``_emit_command_capture`` so the problem shows up as an
    explicit failure rather than a silently skipped criterion.
    """
    raw_cmd = c.params.get("command")
    expected_path = c.params.get("expected_path")
    if not isinstance(raw_cmd, str) or not raw_cmd:
        return _stub_compile_error(c, workspace)
    cmd = _substitute_command(raw_cmd, target_repo, workspace, project_root, c.params)
    # Assemble the snippet line by line rather than dedenting an f-string:
    # dedent would run after interpolation, so a multi-line command could
    # change the computed margin or have its own leading whitespace removed.
    snippet = "\n".join(
        [
            f'( cd "{workspace}" && {cmd} ) \\',
            f'> "{workspace}/{c.id}.stdout" \\',
            f'2> "{workspace}/{c.id}.stderr"',
            f'echo "$?" > "{workspace}/{c.id}.exit"',
        ]
    )
    artifact_paths = [f"{c.id}.stdout", f"{c.id}.stderr", f"{c.id}.exit"]
    if isinstance(expected_path, str) and expected_path:
        artifact_paths.append(expected_path)
    return TestAction(
        criterion_id=c.id,
        description=f"writes-file: {expected_path or '?'}",
        shell_snippet=snippet,
        artifact_paths=tuple(artifact_paths),
    )
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def _emit_file_exists(
    c: Criterion, target_repo: Path, workspace: Path, project_root: Path
) -> TestAction | None:
    """Compile a passive file-existence criterion.

    Reads the ``path`` param (falling back to ``expected_path``) and returns
    an action whose snippet is a single shell comment naming the file under
    the workspace; nothing is executed or created. The path itself becomes
    the action's only artifact path.

    Returns ``None`` when neither param is a non-empty string.
    """
    rel = c.params.get("path") or c.params.get("expected_path")
    if not isinstance(rel, str) or not rel:
        return None
    # file_exists checks are passive — the artifact should already be produced
    # by a dependency. Emit a comment-only snippet that documents what we expect.
    # A shell comment ends at the first line break, so flatten any line breaks
    # in the path; otherwise the remainder would be run as shell commands.
    comment_rel = rel.replace("\r", " ").replace("\n", " ")
    snippet = (
        f'# file_exists: verifier checks "{workspace}/{comment_rel}" — no command needed'
    )
    return TestAction(
        criterion_id=c.id,
        description=f"file-exists: {rel}",
        shell_snippet=snippet,
        artifact_paths=(rel,),
    )
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _emit_sync_action(
    c: Criterion, target_repo: Path, workspace: Path, project_root: Path
) -> TestAction | None:
    """Build an action that mirrors ``target_repo/.codeprobe/`` into the workspace.

    The copy lets the Verifier's ``{repo}`` → workspace substitution find
    the artifacts where the real ``codeprobe`` tool wrote them.  After the
    copy the snippet touches ``<id>.synced`` in the workspace, which is the
    action's only declared artifact.

    ``params["source"]`` (or ``params["search_in"]``) is used only as a
    guard: ``None`` is returned unless it resolves to a string.
    """
    origin = c.params.get("source") or c.params.get("search_in")
    if isinstance(origin, str):
        script = textwrap.dedent(f"""\
            # Sync target_repo output into workspace for {c.id}
            mkdir -p "{workspace}/.codeprobe"
            if [ -d "{target_repo}/.codeprobe" ]; then
            cp -r "{target_repo}/.codeprobe/." "{workspace}/.codeprobe/"
            fi
            touch "{workspace}/{c.id}.synced"
        """).strip()
        return TestAction(
            criterion_id=c.id,
            description=f"sync .codeprobe for {c.check_type}",
            shell_snippet=script,
            artifact_paths=(f"{c.id}.synced",),
        )
    return None
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _emit_canary_detect(
    c: Criterion, target_repo: Path, workspace: Path, project_root: Path
) -> TestAction | None:
    """Build the action for a ``canary_detect`` criterion.

    The snippet echoes the value of the canary environment variable into
    ``canary.txt`` in the workspace and copies ``target_repo/.codeprobe/``
    into the workspace, so the Verifier's rglob can find the UUID in at
    least one workspace file.

    The variable name comes from ``params["canary_env"]`` and defaults to
    ``CODEPROBE_CANARY_UUID``.  ``None`` is returned when the name is not a
    string or is rejected by ``_SAFE_ENV_RE``.
    """
    env_name = c.params.get("canary_env", "CODEPROBE_CANARY_UUID")
    name_is_safe = isinstance(env_name, str) and _SAFE_ENV_RE.fullmatch(env_name)
    if not name_is_safe:
        return None

    # "$" followed by the interpolated name yields a shell variable
    # reference, expanded when the snippet runs rather than at compile time.
    script = textwrap.dedent(f"""\
        # Canary detection for {c.id}
        echo "${env_name}" > "{workspace}/canary.txt"
        mkdir -p "{workspace}/.codeprobe"
        if [ -d "{target_repo}/.codeprobe" ]; then
        cp -r "{target_repo}/.codeprobe/." "{workspace}/.codeprobe/"
        fi
    """).strip()
    return TestAction(
        criterion_id=c.id,
        description="canary: write UUID + sync workspace",
        shell_snippet=script,
        artifact_paths=("canary.txt",),
    )
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
# ---------------------------------------------------------------------------
|
|
309
|
+
# Stub emitters (for missing/invalid params)
|
|
310
|
+
# ---------------------------------------------------------------------------
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def _stub_compile_error(c: Criterion, workspace: Path) -> TestAction:
    """Build a placeholder action for a criterion that could not be compiled.

    The snippet writes a ``COMPILE_ERROR`` message into both ``<id>.stdout``
    and ``<id>.stderr`` and records ``255`` in ``<id>.exit``, so the
    Verifier sees an explicit failure rather than a silent skip.
    """
    script = textwrap.dedent(f"""\
        echo "COMPILE_ERROR: missing or invalid params for {c.id}" \\
        > "{workspace}/{c.id}.stdout"
        echo "COMPILE_ERROR: missing or invalid params for {c.id}" \\
        > "{workspace}/{c.id}.stderr"
        echo "255" > "{workspace}/{c.id}.exit"
    """).strip()
    produced = tuple(f"{c.id}.{suffix}" for suffix in ("exit", "stdout", "stderr"))
    return TestAction(
        criterion_id=c.id,
        description=f"STUB: {c.id} (missing params)",
        shell_snippet=script,
        artifact_paths=produced,
    )
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
# ---------------------------------------------------------------------------
|
|
334
|
+
# Emitter dispatch table
|
|
335
|
+
# ---------------------------------------------------------------------------
|
|
336
|
+
|
|
337
|
+
# Signature shared by the emitters registered below:
# (criterion, target_repo, workspace, project_root) -> action, or None when
# the criterion's params are missing or invalid.
_Emitter = Callable[[Criterion, Path, Path, Path], TestAction | None]

# Dispatch table from check-type name to the emitter that compiles it.
# NOTE(review): presumably looked up by ``Criterion.check_type`` — confirm
# against the dispatcher. Several check types share one emitter.
_EMITTERS: dict[str, _Emitter] = {
    "cli_exit_code": _emit_command_capture,
    "cli_help_contains": _emit_cli_help_contains,
    "cli_stdout_contains": _emit_command_capture,
    "stdout_contains": _emit_command_capture,
    "stderr_contains": _emit_command_capture,
    "cli_writes_file": _emit_cli_writes_file,
    "file_exists": _emit_file_exists,
    "count_ge": _emit_sync_action,
    "json_count_ge": _emit_sync_action,
    "json_field_not_null": _emit_sync_action,
    "json_field_equals": _emit_sync_action,
    "json_field_type": _emit_sync_action,
    "canary_detect": _emit_canary_detect,
}

# Public names exported by this module.
__all__ = ["TestAction", "compile_actions"]
|
|
@@ -24,14 +24,24 @@ _ADAPTER_ENV_WHITELIST: frozenset[str] = frozenset(
|
|
|
24
24
|
# System essentials
|
|
25
25
|
"PATH",
|
|
26
26
|
"HOME",
|
|
27
|
+
"USER",
|
|
28
|
+
"LOGNAME",
|
|
27
29
|
"LANG",
|
|
28
30
|
"TERM",
|
|
29
31
|
"TMPDIR",
|
|
30
32
|
"LC_ALL",
|
|
33
|
+
# XDG / desktop-session env — required for Linux keyring (libsecret)
|
|
34
|
+
# lookups so OAuth/keychain-auth agents can reach the session bus
|
|
35
|
+
# when CLAUDE_CONFIG_DIR is overridden for isolation.
|
|
36
|
+
"DBUS_SESSION_BUS_ADDRESS",
|
|
37
|
+
"XDG_RUNTIME_DIR",
|
|
38
|
+
"XDG_DATA_HOME",
|
|
39
|
+
"XDG_CONFIG_HOME",
|
|
31
40
|
# Codeprobe sandbox signal (eval harness sets this)
|
|
32
41
|
"CODEPROBE_SANDBOX",
|
|
33
42
|
# Agent-specific API keys (required by the adapters)
|
|
34
43
|
"ANTHROPIC_API_KEY",
|
|
44
|
+
"CLAUDE_CODE_OAUTH_TOKEN",
|
|
35
45
|
"CLAUDE_CONFIG_DIR",
|
|
36
46
|
"GITHUB_TOKEN",
|
|
37
47
|
"OPENAI_API_KEY",
|
|
@@ -114,9 +124,7 @@ class BaseAdapter:
|
|
|
114
124
|
@abstractmethod
|
|
115
125
|
def build_command(self, prompt: str, config: AgentConfig) -> list[str]: ...
|
|
116
126
|
|
|
117
|
-
def parse_output(
|
|
118
|
-
self, result: subprocess.CompletedProcess[str], duration: float
|
|
119
|
-
) -> AgentOutput:
|
|
127
|
+
def parse_output(self, result: subprocess.CompletedProcess[str], duration: float) -> AgentOutput:
|
|
120
128
|
"""Convert subprocess result to AgentOutput.
|
|
121
129
|
|
|
122
130
|
Subclasses override to extract tokens, cost, etc. from agent output.
|
|
@@ -137,9 +145,7 @@ class BaseAdapter:
|
|
|
137
145
|
if not config.mcp_config:
|
|
138
146
|
return None
|
|
139
147
|
expanded = json.loads(os.path.expandvars(json.dumps(config.mcp_config)))
|
|
140
|
-
tmp = tempfile.NamedTemporaryFile(
|
|
141
|
-
mode="w", suffix=".json", prefix="codeprobe-mcp-", delete=False
|
|
142
|
-
)
|
|
148
|
+
tmp = tempfile.NamedTemporaryFile(mode="w", suffix=".json", prefix="codeprobe-mcp-", delete=False)
|
|
143
149
|
json.dump(expanded, tmp)
|
|
144
150
|
tmp.close()
|
|
145
151
|
return tmp.name
|