codeprobe 0.3.0__tar.gz → 0.3.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (173)
  1. {codeprobe-0.3.0 → codeprobe-0.3.2}/PKG-INFO +3 -3
  2. {codeprobe-0.3.0 → codeprobe-0.3.2}/README.md +2 -2
  3. {codeprobe-0.3.0 → codeprobe-0.3.2}/pyproject.toml +1 -3
  4. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/__init__.py +1 -1
  5. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/assess/heuristics.py +42 -9
  6. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/cli/__init__.py +25 -0
  7. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/cli/doctor_cmd.py +0 -1
  8. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/cli/mine_cmd.py +436 -54
  9. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/cli/probe_cmd.py +24 -4
  10. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/cli/run_cmd.py +77 -0
  11. codeprobe-0.3.2/src/codeprobe/cli/validate_cmd.py +288 -0
  12. codeprobe-0.3.2/src/codeprobe/core/__main__.py +8 -0
  13. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/core/executor.py +18 -3
  14. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/core/isolation.py +41 -6
  15. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/core/registry.py +1 -1
  16. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/core/scoring.py +253 -9
  17. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/loaders/__init__.py +19 -1
  18. codeprobe-0.3.2/src/codeprobe/loaders/suite.py +76 -0
  19. codeprobe-0.3.2/src/codeprobe/mining/_graph.py +310 -0
  20. codeprobe-0.3.2/src/codeprobe/mining/comprehension.py +473 -0
  21. codeprobe-0.3.2/src/codeprobe/mining/comprehension_writer.py +114 -0
  22. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/models/__init__.py +8 -1
  23. codeprobe-0.3.2/src/codeprobe/models/suite.py +23 -0
  24. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/models/task.py +40 -0
  25. codeprobe-0.3.2/src/codeprobe/probe/adapter.py +151 -0
  26. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe.egg-info/PKG-INFO +3 -3
  27. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe.egg-info/SOURCES.txt +16 -1
  28. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe.egg-info/entry_points.txt +0 -2
  29. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_adapter_contracts.py +1 -31
  30. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_adapters.py +0 -161
  31. codeprobe-0.3.2/tests/test_artifact_scorer.py +316 -0
  32. codeprobe-0.3.2/tests/test_checkpoint_scoring.py +369 -0
  33. codeprobe-0.3.2/tests/test_comprehension.py +329 -0
  34. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_ctrlc_integration.py +2 -0
  35. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_executor.py +182 -7
  36. codeprobe-0.3.2/tests/test_mine_goals.py +518 -0
  37. codeprobe-0.3.2/tests/test_probe_adapter.py +317 -0
  38. codeprobe-0.3.2/tests/test_shell_shim.py +177 -0
  39. codeprobe-0.3.2/tests/test_suite.py +243 -0
  40. codeprobe-0.3.2/tests/test_validate_cmd.py +272 -0
  41. codeprobe-0.3.0/src/codeprobe/adapters/aider.py +0 -79
  42. {codeprobe-0.3.0 → codeprobe-0.3.2}/LICENSE +0 -0
  43. {codeprobe-0.3.0 → codeprobe-0.3.2}/setup.cfg +0 -0
  44. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/__main__.py +0 -0
  45. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/adapters/__init__.py +0 -0
  46. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/adapters/_base.py +0 -0
  47. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/adapters/claude.py +0 -0
  48. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/adapters/codex.py +0 -0
  49. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/adapters/copilot.py +0 -0
  50. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/adapters/openai_compat.py +0 -0
  51. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/adapters/protocol.py +0 -0
  52. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/adapters/session.py +0 -0
  53. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/adapters/telemetry.py +0 -0
  54. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/analysis/__init__.py +0 -0
  55. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/analysis/ranking.py +0 -0
  56. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/analysis/report.py +0 -0
  57. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/analysis/stats.py +0 -0
  58. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/api.py +0 -0
  59. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/assess/__init__.py +0 -0
  60. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/cli/assess_cmd.py +0 -0
  61. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/cli/experiment_cmd.py +0 -0
  62. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/cli/init_cmd.py +0 -0
  63. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/cli/interpret_cmd.py +0 -0
  64. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/cli/json_display.py +0 -0
  65. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/cli/preamble_cmd.py +0 -0
  66. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/cli/ratings_cmd.py +0 -0
  67. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/cli/rich_display.py +0 -0
  68. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/cli/scaffold_cmd.py +0 -0
  69. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/cli/wizard.py +0 -0
  70. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/cli/yaml_writer.py +0 -0
  71. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/config/__init__.py +0 -0
  72. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/config/loader.py +0 -0
  73. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/contrib/__init__.py +0 -0
  74. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/contrib/_shared.py +0 -0
  75. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/contrib/adaptive.py +0 -0
  76. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/contrib/counterfactual.py +0 -0
  77. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/contrib/debate.py +0 -0
  78. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/contrib/decision_tree.py +0 -0
  79. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/contrib/elo.py +0 -0
  80. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/contrib/fingerprint.py +0 -0
  81. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/contrib/mutation.py +0 -0
  82. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/contrib/pareto.py +0 -0
  83. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/contrib/sprt.py +0 -0
  84. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/contrib/tournament.py +0 -0
  85. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/core/__init__.py +0 -0
  86. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/core/checkpoint.py +0 -0
  87. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/core/events.py +0 -0
  88. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/core/experiment.py +0 -0
  89. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/core/llm.py +0 -0
  90. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/core/mcp_discovery.py +0 -0
  91. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/core/preamble.py +0 -0
  92. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/core/sandbox.py +0 -0
  93. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/mining/__init__.py +0 -0
  94. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/mining/_lang.py +0 -0
  95. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/mining/curator.py +0 -0
  96. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/mining/curator_backends.py +0 -0
  97. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/mining/curator_tiers.py +0 -0
  98. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/mining/extractor.py +0 -0
  99. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/mining/org_scale.py +0 -0
  100. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/mining/org_scale_families.py +0 -0
  101. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/mining/org_scale_oracle.py +0 -0
  102. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/mining/org_scale_scanner.py +0 -0
  103. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/mining/org_scale_validate.py +0 -0
  104. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/mining/sg_ground_truth.py +0 -0
  105. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/mining/sources.py +0 -0
  106. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/mining/writer.py +0 -0
  107. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/models/evalrc.py +0 -0
  108. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/models/experiment.py +0 -0
  109. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/models/preamble.py +0 -0
  110. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/preambles/__init__.py +0 -0
  111. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/preambles/github.md +0 -0
  112. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/preambles/sourcegraph.md +0 -0
  113. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/probe/__init__.py +0 -0
  114. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/probe/generator.py +0 -0
  115. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/probe/writer.py +0 -0
  116. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/ratings/__init__.py +0 -0
  117. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/ratings/collector.py +0 -0
  118. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/scaffold/__init__.py +0 -0
  119. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/scaffold/writer.py +0 -0
  120. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/templates/__init__.py +0 -0
  121. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/templates/evalrc-mcp-comparison.yaml +0 -0
  122. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/templates/evalrc-model-comparison.yaml +0 -0
  123. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe/templates/evalrc-prompt-comparison.yaml +0 -0
  124. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe.egg-info/dependency_links.txt +0 -0
  125. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe.egg-info/requires.txt +0 -0
  126. {codeprobe-0.3.0 → codeprobe-0.3.2}/src/codeprobe.egg-info/top_level.txt +0 -0
  127. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_analysis.py +0 -0
  128. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_api.py +0 -0
  129. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_assess.py +0 -0
  130. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_changed_symbols.py +0 -0
  131. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_checkpoint.py +0 -0
  132. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_cli.py +0 -0
  133. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_config_loader.py +0 -0
  134. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_contrib.py +0 -0
  135. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_curator_backends.py +0 -0
  136. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_curator_core.py +0 -0
  137. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_curator_integration.py +0 -0
  138. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_curator_tiers.py +0 -0
  139. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_doctor_cmd.py +0 -0
  140. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_events.py +0 -0
  141. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_executor_events.py +0 -0
  142. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_experiment_cmd.py +0 -0
  143. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_experiment_core.py +0 -0
  144. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_init_wizard.py +0 -0
  145. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_json_display.py +0 -0
  146. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_llm.py +0 -0
  147. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_loaders.py +0 -0
  148. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_mcp_families_mining.py +0 -0
  149. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_mcp_validate.py +0 -0
  150. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_mine_presets.py +0 -0
  151. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_mine_profiles.py +0 -0
  152. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_mining.py +0 -0
  153. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_models.py +0 -0
  154. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_new_families.py +0 -0
  155. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_openai_compat.py +0 -0
  156. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_oracle_types.py +0 -0
  157. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_org_scale.py +0 -0
  158. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_pipeline_integration.py +0 -0
  159. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_preamble.py +0 -0
  160. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_preamble_cmd.py +0 -0
  161. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_probe.py +0 -0
  162. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_ratings.py +0 -0
  163. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_ratings_cmd.py +0 -0
  164. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_registry.py +0 -0
  165. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_run_config_resolution.py +0 -0
  166. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_scaffold.py +0 -0
  167. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_scanner_refactor.py +0 -0
  168. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_scoring.py +0 -0
  169. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_session.py +0 -0
  170. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_sg_ground_truth.py +0 -0
  171. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_show_prompt.py +0 -0
  172. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_telemetry.py +0 -0
  173. {codeprobe-0.3.0 → codeprobe-0.3.2}/tests/test_weighted_f1.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codeprobe
3
- Version: 0.3.0
3
+ Version: 0.3.2
4
4
  Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
5
5
  Author: codeprobe contributors
6
6
  License-Expression: Apache-2.0
@@ -38,11 +38,11 @@ Dynamic: license-file
38
38
 
39
39
  Benchmark AI coding agents against **your own codebase**.
40
40
 
41
- Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for YOUR code not someone else's benchmark suite.
41
+ Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for **your** code, not someone else's benchmark suite.
42
42
 
43
43
  ## Why codeprobe?
44
44
 
45
- Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate.
45
+ Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data, and as general public benchmarks likely don't capture what is most important to your unique workflows. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate. You can also point the tool at any public repo to mine tasks from.
46
46
 
47
47
  ## Prerequisites
48
48
 
@@ -2,11 +2,11 @@
2
2
 
3
3
  Benchmark AI coding agents against **your own codebase**.
4
4
 
5
- Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for YOUR code not someone else's benchmark suite.
5
+ Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for **your** code, not someone else's benchmark suite.
6
6
 
7
7
  ## Why codeprobe?
8
8
 
9
- Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate.
9
+ Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data, and as general public benchmarks likely don't capture what is most important to your unique workflows. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate. You can also point the tool at any public repo to mine tasks from.
10
10
 
11
11
  ## Prerequisites
12
12
 
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "codeprobe"
3
- version = "0.3.0"
3
+ version = "0.3.2"
4
4
  description = "Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results."
5
5
  readme = "README.md"
6
6
  license = "Apache-2.0"
@@ -47,11 +47,9 @@ dev = [
47
47
  codeprobe = "codeprobe.cli:main"
48
48
 
49
49
  [project.entry-points."codeprobe.agents"]
50
- aider = "codeprobe.adapters.aider:AiderAdapter"
51
50
  claude = "codeprobe.adapters.claude:ClaudeAdapter"
52
51
  codex = "codeprobe.adapters.codex:CodexAdapter"
53
52
  copilot = "codeprobe.adapters.copilot:CopilotAdapter"
54
- openai = "codeprobe.adapters.openai_compat:OpenAICompatAdapter"
55
53
 
56
54
  [project.entry-points."codeprobe.sessions"]
57
55
  claude = "codeprobe.adapters.session:ClaudeSessionCollector"
@@ -1,3 +1,3 @@
1
1
  """codeprobe — Benchmark AI coding agents against your own codebase."""
2
2
 
3
- __version__ = "0.3.0"
3
+ __version__ = "0.3.1"
@@ -142,7 +142,12 @@ def _run_git(args: list[str], cwd: Path) -> str:
142
142
  timeout=30,
143
143
  )
144
144
  if result.returncode != 0:
145
- logger.debug("git %s exited %d: %s", " ".join(args), result.returncode, result.stderr.strip())
145
+ logger.debug(
146
+ "git %s exited %d: %s",
147
+ " ".join(args),
148
+ result.returncode,
149
+ result.stderr.strip(),
150
+ )
146
151
  return ""
147
152
  return result.stdout.strip()
148
153
  except (subprocess.TimeoutExpired, OSError) as exc:
@@ -307,7 +312,9 @@ def gather_heuristics(repo_path: Path) -> RepoHeuristics:
307
312
  history, CI presence, test coverage, languages, and activity.
308
313
  """
309
314
  total_commits_str = _run_git(["rev-list", "--count", "HEAD"], cwd=repo_path)
310
- merge_commits_str = _run_git(["rev-list", "--merges", "--count", "HEAD"], cwd=repo_path)
315
+ merge_commits_str = _run_git(
316
+ ["rev-list", "--merges", "--count", "HEAD"], cwd=repo_path
317
+ )
311
318
  contributors_str = _run_git(["shortlog", "-sn", "HEAD"], cwd=repo_path)
312
319
  file_list = _run_git(["ls-files"], cwd=repo_path)
313
320
 
@@ -354,7 +361,10 @@ def score_repo_heuristic(heuristics: RepoHeuristics) -> AssessmentScore:
354
361
  has_ci = heuristics.has_ci
355
362
  has_fw = len(heuristics.test_frameworks) > 0
356
363
  if has_tests and has_ci and has_fw:
357
- tc_score, tc_reason = 1.0, f"Tests + CI + framework ({', '.join(heuristics.test_frameworks)})"
364
+ tc_score, tc_reason = (
365
+ 1.0,
366
+ f"Tests + CI + framework ({', '.join(heuristics.test_frameworks)})",
367
+ )
358
368
  elif has_tests and (has_ci or has_fw):
359
369
  tc_score, tc_reason = 0.7, "Tests present with partial CI/framework support"
360
370
  elif has_tests:
@@ -409,15 +419,29 @@ def score_repo_heuristic(heuristics: RepoHeuristics) -> AssessmentScore:
409
419
  DimensionScore(name="ci_maturity", score=ci_score, reasoning=ci_reason),
410
420
  )
411
421
 
412
- # Equal weights for heuristic path (model path lets the model weight them).
413
- overall = sum(d.score for d in dimensions) / len(dimensions)
422
+ # Weighted average — ci_maturity is a weak signal because CI configs are
423
+ # often absent in shallow clones / Sourcegraph views, and codeprobe
424
+ # validates via mined test.sh scripts, not CI pipelines.
425
+ _WEIGHTS: dict[str, float] = {
426
+ "task_richness": 0.25,
427
+ "test_coverage": 0.25,
428
+ "complexity": 0.20,
429
+ "activity": 0.15,
430
+ "documentation": 0.10,
431
+ "ci_maturity": 0.05,
432
+ }
433
+ overall = sum(d.score * _WEIGHTS[d.name] for d in dimensions)
414
434
 
415
435
  if overall >= 0.7:
416
436
  recommendation = "Excellent benchmarking candidate — rich history with tests"
417
437
  elif overall >= 0.5:
418
- recommendation = "Good candidate — may need more merge history for diverse tasks"
438
+ recommendation = (
439
+ "Good candidate — may need more merge history for diverse tasks"
440
+ )
419
441
  elif overall >= 0.3:
420
- recommendation = "Fair candidate — limited test coverage may reduce task quality"
442
+ recommendation = (
443
+ "Fair candidate — limited test coverage may reduce task quality"
444
+ )
421
445
  else:
422
446
  recommendation = "Poor candidate — consider a repo with more history and tests"
423
447
 
@@ -458,11 +482,15 @@ def _parse_model_assessment(
458
482
  score_val = float(item.get("score", 0))
459
483
  score_val = max(0.0, min(1.0, score_val))
460
484
  reasoning = str(item.get("reasoning", ""))
461
- dim_by_name[name] = DimensionScore(name=name, score=score_val, reasoning=reasoning)
485
+ dim_by_name[name] = DimensionScore(
486
+ name=name, score=score_val, reasoning=reasoning
487
+ )
462
488
 
463
489
  missing = set(RUBRIC_V1) - set(dim_by_name)
464
490
  if missing:
465
- raise LLMParseError(f"Model response missing dimensions: {', '.join(sorted(missing))}")
491
+ raise LLMParseError(
492
+ f"Model response missing dimensions: {', '.join(sorted(missing))}"
493
+ )
466
494
 
467
495
  dimensions = tuple(dim_by_name[name] for name in RUBRIC_V1)
468
496
 
@@ -498,6 +526,11 @@ def score_repo_with_model(heuristics: RepoHeuristics) -> AssessmentScore:
498
526
  "You are evaluating a code repository's suitability for AI agent benchmarking.\n\n"
499
527
  f"Here are the raw repository statistics:\n{stats_json}\n\n"
500
528
  f"Score this repository on each of these dimensions (0.0 to 1.0):\n{rubric_list}\n\n"
529
+ "Weighting guidance for the overall score: task_richness and test_coverage "
530
+ "are the most important (~25% each), followed by complexity (~20%), "
531
+ "activity (~15%), documentation (~10%). ci_maturity should be a minor "
532
+ "signal (~5%) because CI configs are often absent in cloned repos and "
533
+ "codeprobe validates via mined test scripts, not CI pipelines.\n\n"
501
534
  "Respond with ONLY valid JSON matching this exact schema:\n"
502
535
  "{\n"
503
536
  ' "overall": <float 0.0-1.0>,\n'
@@ -111,6 +111,14 @@ def init(path: str) -> None:
111
111
  default=None,
112
112
  help="Apply a named preset: 'quick' (count=3) or 'mcp' (org-scale + MCP families).",
113
113
  )
114
+ @click.option(
115
+ "--goal",
116
+ type=click.Choice(
117
+ ["quality", "navigation", "mcp", "general"], case_sensitive=False
118
+ ),
119
+ default=None,
120
+ help="Eval goal: quality, navigation, mcp, general. Skips interactive goal prompt.",
121
+ )
114
122
  @click.option(
115
123
  "--profile",
116
124
  "profile_name",
@@ -241,6 +249,7 @@ def mine(
241
249
  ctx: click.Context,
242
250
  path: str,
243
251
  preset: str | None,
252
+ goal: str | None,
244
253
  profile_name: str | None,
245
254
  save_profile_name: str | None,
246
255
  list_profiles_flag: bool,
@@ -381,10 +390,12 @@ def mine(
381
390
  backends = _prof_val("backends", backends) # type: ignore[assignment]
382
391
  interactive = _prof_val("interactive", interactive) # type: ignore[assignment]
383
392
  preset = _prof_val("preset", preset) # type: ignore[assignment]
393
+ goal = _prof_val("goal", goal) # type: ignore[assignment]
384
394
 
385
395
  run_mine(
386
396
  path,
387
397
  preset=preset,
398
+ goal=goal,
388
399
  count=count,
389
400
  source=source,
390
401
  min_files=min_files,
@@ -463,6 +474,13 @@ def mine(
463
474
  default=False,
464
475
  help="Print the fully-resolved prompt for the first task and exit (no agent spawned).",
465
476
  )
477
+ @click.option(
478
+ "--suite",
479
+ "suite_path",
480
+ default=None,
481
+ type=click.Path(exists=True),
482
+ help="Path to a suite.toml manifest to filter tasks by type, difficulty, and tags.",
483
+ )
466
484
  @click.pass_context
467
485
  def run(
468
486
  ctx: click.Context,
@@ -478,6 +496,7 @@ def run(
478
496
  timeout: int | None,
479
497
  repeats: int | None,
480
498
  show_prompt: bool,
499
+ suite_path: str | None,
481
500
  ) -> None:
482
501
  """Run eval tasks against an AI coding agent.
483
502
 
@@ -510,6 +529,7 @@ def run(
510
529
  force_rich=force_rich,
511
530
  timeout=timeout,
512
531
  repeats=repeats if repeats is not None else 1,
532
+ suite_path=suite_path,
513
533
  )
514
534
 
515
535
 
@@ -690,3 +710,8 @@ main.add_command(preambles)
690
710
  from codeprobe.cli.doctor_cmd import doctor # noqa: E402
691
711
 
692
712
  main.add_command(doctor)
713
+
714
+ # Register the validate command
715
+ from codeprobe.cli.validate_cmd import validate # noqa: E402
716
+
717
+ main.add_command(validate)
@@ -83,7 +83,6 @@ def run_checks() -> list[CheckResult]:
83
83
  _check_tool(
84
84
  "codex", "Install OpenAI Codex CLI: https://github.com/openai/codex"
85
85
  ),
86
- _check_tool("aider", "Install aider: https://aider.chat/docs/install.html"),
87
86
  _check_env_key(
88
87
  "ANTHROPIC_API_KEY", "Set ANTHROPIC_API_KEY in your environment."
89
88
  ),