codeprobe 0.2.8.tar.gz → 0.3.1.tar.gz

This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (174)
  1. {codeprobe-0.2.8 → codeprobe-0.3.1}/PKG-INFO +35 -22
  2. {codeprobe-0.2.8 → codeprobe-0.3.1}/README.md +33 -21
  3. {codeprobe-0.2.8 → codeprobe-0.3.1}/pyproject.toml +8 -3
  4. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/__init__.py +1 -1
  5. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/assess/heuristics.py +42 -9
  6. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/__init__.py +228 -1
  7. codeprobe-0.3.1/src/codeprobe/cli/doctor_cmd.py +114 -0
  8. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/experiment_cmd.py +38 -1
  9. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/init_cmd.py +3 -45
  10. codeprobe-0.3.1/src/codeprobe/cli/json_display.py +48 -0
  11. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/mine_cmd.py +605 -54
  12. codeprobe-0.3.1/src/codeprobe/cli/preamble_cmd.py +92 -0
  13. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/probe_cmd.py +24 -4
  14. codeprobe-0.3.1/src/codeprobe/cli/rich_display.py +234 -0
  15. codeprobe-0.3.1/src/codeprobe/cli/run_cmd.py +556 -0
  16. codeprobe-0.3.1/src/codeprobe/cli/validate_cmd.py +288 -0
  17. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/yaml_writer.py +17 -5
  18. codeprobe-0.3.1/src/codeprobe/core/__main__.py +8 -0
  19. codeprobe-0.3.1/src/codeprobe/core/events.py +274 -0
  20. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/executor.py +157 -15
  21. codeprobe-0.3.1/src/codeprobe/core/mcp_discovery.py +47 -0
  22. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/registry.py +35 -3
  23. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/scoring.py +260 -21
  24. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/loaders/__init__.py +19 -1
  25. codeprobe-0.3.1/src/codeprobe/loaders/suite.py +76 -0
  26. codeprobe-0.3.1/src/codeprobe/mining/_graph.py +310 -0
  27. codeprobe-0.3.1/src/codeprobe/mining/comprehension.py +473 -0
  28. codeprobe-0.3.1/src/codeprobe/mining/comprehension_writer.py +114 -0
  29. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/models/__init__.py +8 -1
  30. codeprobe-0.3.1/src/codeprobe/models/suite.py +23 -0
  31. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/models/task.py +40 -0
  32. codeprobe-0.3.1/src/codeprobe/probe/adapter.py +151 -0
  33. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe.egg-info/PKG-INFO +35 -22
  34. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe.egg-info/SOURCES.txt +33 -1
  35. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe.egg-info/entry_points.txt +6 -2
  36. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe.egg-info/requires.txt +1 -0
  37. codeprobe-0.3.1/tests/test_adapter_contracts.py +104 -0
  38. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_adapters.py +0 -161
  39. codeprobe-0.3.1/tests/test_artifact_scorer.py +316 -0
  40. codeprobe-0.3.1/tests/test_checkpoint_scoring.py +369 -0
  41. codeprobe-0.3.1/tests/test_comprehension.py +329 -0
  42. codeprobe-0.3.1/tests/test_ctrlc_integration.py +119 -0
  43. codeprobe-0.3.1/tests/test_doctor_cmd.py +127 -0
  44. codeprobe-0.3.1/tests/test_events.py +343 -0
  45. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_executor.py +167 -0
  46. codeprobe-0.3.1/tests/test_executor_events.py +423 -0
  47. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_init_wizard.py +59 -15
  48. codeprobe-0.3.1/tests/test_json_display.py +229 -0
  49. codeprobe-0.3.1/tests/test_mine_goals.py +518 -0
  50. codeprobe-0.3.1/tests/test_mine_presets.py +163 -0
  51. codeprobe-0.3.1/tests/test_mine_profiles.py +384 -0
  52. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_pipeline_integration.py +233 -0
  53. codeprobe-0.3.1/tests/test_preamble_cmd.py +115 -0
  54. codeprobe-0.3.1/tests/test_probe_adapter.py +317 -0
  55. codeprobe-0.3.1/tests/test_run_config_resolution.py +221 -0
  56. codeprobe-0.3.1/tests/test_shell_shim.py +177 -0
  57. codeprobe-0.3.1/tests/test_show_prompt.py +108 -0
  58. codeprobe-0.3.1/tests/test_suite.py +243 -0
  59. codeprobe-0.3.1/tests/test_validate_cmd.py +272 -0
  60. codeprobe-0.2.8/src/codeprobe/adapters/aider.py +0 -79
  61. codeprobe-0.2.8/src/codeprobe/cli/run_cmd.py +0 -251
  62. {codeprobe-0.2.8 → codeprobe-0.3.1}/LICENSE +0 -0
  63. {codeprobe-0.2.8 → codeprobe-0.3.1}/setup.cfg +0 -0
  64. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/__main__.py +0 -0
  65. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/__init__.py +0 -0
  66. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/_base.py +0 -0
  67. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/claude.py +0 -0
  68. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/codex.py +0 -0
  69. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/copilot.py +0 -0
  70. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/openai_compat.py +0 -0
  71. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/protocol.py +0 -0
  72. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/session.py +0 -0
  73. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/adapters/telemetry.py +0 -0
  74. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/analysis/__init__.py +0 -0
  75. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/analysis/ranking.py +0 -0
  76. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/analysis/report.py +0 -0
  77. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/analysis/stats.py +0 -0
  78. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/api.py +0 -0
  79. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/assess/__init__.py +0 -0
  80. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/assess_cmd.py +0 -0
  81. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/interpret_cmd.py +0 -0
  82. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/ratings_cmd.py +0 -0
  83. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/scaffold_cmd.py +0 -0
  84. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/wizard.py +0 -0
  85. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/config/__init__.py +0 -0
  86. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/config/loader.py +0 -0
  87. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/__init__.py +0 -0
  88. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/_shared.py +0 -0
  89. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/adaptive.py +0 -0
  90. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/counterfactual.py +0 -0
  91. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/debate.py +0 -0
  92. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/decision_tree.py +0 -0
  93. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/elo.py +0 -0
  94. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/fingerprint.py +0 -0
  95. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/mutation.py +0 -0
  96. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/pareto.py +0 -0
  97. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/sprt.py +0 -0
  98. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/contrib/tournament.py +0 -0
  99. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/__init__.py +0 -0
  100. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/checkpoint.py +0 -0
  101. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/experiment.py +0 -0
  102. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/isolation.py +0 -0
  103. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/llm.py +0 -0
  104. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/preamble.py +0 -0
  105. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/core/sandbox.py +0 -0
  106. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/__init__.py +0 -0
  107. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/_lang.py +0 -0
  108. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/curator.py +0 -0
  109. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/curator_backends.py +0 -0
  110. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/curator_tiers.py +0 -0
  111. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/extractor.py +0 -0
  112. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/org_scale.py +0 -0
  113. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/org_scale_families.py +0 -0
  114. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/org_scale_oracle.py +0 -0
  115. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/org_scale_scanner.py +0 -0
  116. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/org_scale_validate.py +0 -0
  117. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/sg_ground_truth.py +0 -0
  118. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/sources.py +0 -0
  119. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/mining/writer.py +0 -0
  120. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/models/evalrc.py +0 -0
  121. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/models/experiment.py +0 -0
  122. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/models/preamble.py +0 -0
  123. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/preambles/__init__.py +0 -0
  124. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/preambles/github.md +0 -0
  125. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/preambles/sourcegraph.md +0 -0
  126. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/probe/__init__.py +0 -0
  127. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/probe/generator.py +0 -0
  128. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/probe/writer.py +0 -0
  129. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/ratings/__init__.py +0 -0
  130. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/ratings/collector.py +0 -0
  131. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/scaffold/__init__.py +0 -0
  132. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/scaffold/writer.py +0 -0
  133. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/templates/__init__.py +0 -0
  134. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/templates/evalrc-mcp-comparison.yaml +0 -0
  135. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/templates/evalrc-model-comparison.yaml +0 -0
  136. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/templates/evalrc-prompt-comparison.yaml +0 -0
  137. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe.egg-info/dependency_links.txt +0 -0
  138. {codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe.egg-info/top_level.txt +0 -0
  139. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_analysis.py +0 -0
  140. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_api.py +0 -0
  141. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_assess.py +0 -0
  142. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_changed_symbols.py +0 -0
  143. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_checkpoint.py +0 -0
  144. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_cli.py +0 -0
  145. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_config_loader.py +0 -0
  146. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_contrib.py +0 -0
  147. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_curator_backends.py +0 -0
  148. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_curator_core.py +0 -0
  149. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_curator_integration.py +0 -0
  150. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_curator_tiers.py +0 -0
  151. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_experiment_cmd.py +0 -0
  152. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_experiment_core.py +0 -0
  153. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_llm.py +0 -0
  154. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_loaders.py +0 -0
  155. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_mcp_families_mining.py +0 -0
  156. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_mcp_validate.py +0 -0
  157. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_mining.py +0 -0
  158. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_models.py +0 -0
  159. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_new_families.py +0 -0
  160. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_openai_compat.py +0 -0
  161. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_oracle_types.py +0 -0
  162. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_org_scale.py +0 -0
  163. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_preamble.py +0 -0
  164. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_probe.py +0 -0
  165. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_ratings.py +0 -0
  166. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_ratings_cmd.py +0 -0
  167. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_registry.py +0 -0
  168. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_scaffold.py +0 -0
  169. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_scanner_refactor.py +0 -0
  170. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_scoring.py +0 -0
  171. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_session.py +0 -0
  172. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_sg_ground_truth.py +0 -0
  173. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_telemetry.py +0 -0
  174. {codeprobe-0.2.8 → codeprobe-0.3.1}/tests/test_weighted_f1.py +0 -0
{codeprobe-0.2.8 → codeprobe-0.3.1}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: codeprobe
- Version: 0.2.8
+ Version: 0.3.1
  Summary: Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results.
  Author: codeprobe contributors
  License-Expression: Apache-2.0
@@ -24,6 +24,7 @@ Requires-Dist: anthropic>=0.39
  Requires-Dist: openai>=1.66
  Requires-Dist: tiktoken<1,>=0.7
  Requires-Dist: scipy<2,>=1.11
+ Requires-Dist: rich<14,>=13.7
  Provides-Extra: dev
  Requires-Dist: pytest<9,>=8.0; extra == "dev"
  Requires-Dist: pytest-cov<6,>=5.0; extra == "dev"
@@ -37,11 +38,11 @@ Dynamic: license-file

  Benchmark AI coding agents against **your own codebase**.

- Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for YOUR code not someone else's benchmark suite.
+ Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for **your** code, not someone else's benchmark suite.

  ## Why codeprobe?

- Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate.
+ Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data, and as general public benchmarks likely don't capture what is most important to your unique workflows. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate. You can also point the tool at any public repo to mine tasks from.

  ## Prerequisites

@@ -84,18 +85,20 @@ codeprobe interpret . # Get recommendations

  ## Commands

- | Command | Purpose |
- | ------------------------ | ------------------------------------------------ |
- | `codeprobe assess` | Score a codebase's benchmarking potential |
- | `codeprobe init` | Interactive wizard — choose what to compare |
- | `codeprobe mine` | Mine eval tasks from merged PRs/MRs |
- | `codeprobe probe` | Generate fast micro-benchmark probes (30s each) |
- | `codeprobe experiment` | Manage comparison experiments (init, add-config) |
- | `codeprobe run` | Execute tasks against AI agents |
- | `codeprobe interpret` | Analyze results, rank configurations |
- | `codeprobe oracle-check` | Compare agent answer against oracle ground truth |
- | `codeprobe scaffold` | Create/validate eval task directories |
- | `codeprobe ratings` | Record and analyze agent session quality ratings |
+ | Command | Purpose |
+ | -------------------------- | ------------------------------------------------ |
+ | `codeprobe assess` | Score a codebase's benchmarking potential |
+ | `codeprobe init` | Interactive wizard — choose what to compare |
+ | `codeprobe mine` | Mine eval tasks from merged PRs/MRs |
+ | `codeprobe probe` | Generate fast micro-benchmark probes (30s each) |
+ | `codeprobe experiment` | Manage comparison experiments (init, add-config) |
+ | `codeprobe run` | Execute tasks against AI agents |
+ | `codeprobe interpret` | Analyze results, rank configurations |
+ | `codeprobe doctor` | Check environment readiness (agents, keys, git) |
+ | `codeprobe preambles list` | List available preambles at all search levels |
+ | `codeprobe oracle-check` | Compare agent answer against oracle ground truth |
+ | `codeprobe scaffold` | Create/validate eval task directories |
+ | `codeprobe ratings` | Record and analyze agent session quality ratings |

  ## Two Ways to Generate Tasks

@@ -181,17 +184,32 @@ Template variables: `{{sg_repo}}`, `{{repo_name}}`, `{{repo_path}}`, `{{task_id}
  codeprobe run . --parallel 5 # Run 5 tasks concurrently (worktree-isolated)
  codeprobe run . --max-cost-usd 2.00 # Stop when cost budget is reached
  codeprobe run . --dry-run # Estimate resource usage without running
+ codeprobe run . --model opus-4 # Override experiment.json model
+ codeprobe run . --timeout 600 # Override default 300s timeout
+ codeprobe run . --repeats 3 # Run each task 3 times
+ codeprobe run . --show-prompt # Print resolved prompt without running agent

  # Mining
  codeprobe mine . --enrich # Use LLM to improve weak task instructions
  codeprobe mine . --org-scale # Mine comprehension tasks (not SDLC)
  codeprobe mine . --mcp-families # Include MCP-optimized task families
  codeprobe mine . --sg-repo REPO # Sourcegraph repo for ground truth enrichment
+ codeprobe mine . --preset quick # Quick scan: count=3
+ codeprobe mine . --preset mcp # MCP eval: org-scale + MCP families + enrich
+
+ # Mine profiles (save/load custom flag combinations)
+ codeprobe mine --save-profile my-setup --count 10 --org-scale .
+ codeprobe mine --profile my-setup . # Load saved flags
+ codeprobe mine --list-profiles # Show available profiles

  # Experiment configs
  codeprobe experiment add-config . --preamble sourcegraph # Attach MCP preamble
  codeprobe experiment add-config . --mcp-config config.json # Attach MCP server

+ # Diagnostics
+ codeprobe doctor # Check agents, API keys, git, Python
+ codeprobe preambles list # Show available preambles at all levels
+
  # Output
  codeprobe interpret . --format csv # Export for pivot tables
  codeprobe interpret . --format html # Self-contained HTML report
@@ -210,14 +228,9 @@ GitHub, GitLab, Bitbucket, Azure DevOps, Gitea/Forgejo, and local repos.

  ## Configuration

- Create a `.evalrc.yaml` in your repo root:
+ Configuration lives in `experiment.json` (created by `codeprobe init` or `codeprobe experiment init`). CLI flags override experiment.json values — precedence: built-in defaults < experiment.json < CLI flags.

- ```yaml
- name: my-experiment
- agents: [claude, copilot]
- models: [claude-sonnet-4-6, claude-opus-4-6]
- tasks_dir: .codeprobe/tasks
- ```
+ Run-time observability is on by default: Rich Live dashboard in TTY, JSON event lines with `--log-format json` for CI. Cost budget warnings at 80% and 100% thresholds are always visible on stderr.

  ## License

{codeprobe-0.2.8 → codeprobe-0.3.1}/README.md
@@ -2,11 +2,11 @@

  Benchmark AI coding agents against **your own codebase**.

- Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for YOUR code not someone else's benchmark suite.
+ Mine real tasks from your repo history, run agents against them, and find out which setup actually works best for **your** code, not someone else's benchmark suite.

  ## Why codeprobe?

- Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate.
+ Existing benchmarks (SWE-bench, HumanEval) use fixed task sets that AI models may have memorized from training data, and as general public benchmarks likely don't capture what is most important to your unique workflows. codeprobe mines tasks from **your private repo history**, producing benchmarks that are impossible to contaminate. You can also point the tool at any public repo to mine tasks from.

  ## Prerequisites

@@ -49,18 +49,20 @@ codeprobe interpret . # Get recommendations

  ## Commands

- | Command | Purpose |
- | ------------------------ | ------------------------------------------------ |
- | `codeprobe assess` | Score a codebase's benchmarking potential |
- | `codeprobe init` | Interactive wizard — choose what to compare |
- | `codeprobe mine` | Mine eval tasks from merged PRs/MRs |
- | `codeprobe probe` | Generate fast micro-benchmark probes (30s each) |
- | `codeprobe experiment` | Manage comparison experiments (init, add-config) |
- | `codeprobe run` | Execute tasks against AI agents |
- | `codeprobe interpret` | Analyze results, rank configurations |
- | `codeprobe oracle-check` | Compare agent answer against oracle ground truth |
- | `codeprobe scaffold` | Create/validate eval task directories |
- | `codeprobe ratings` | Record and analyze agent session quality ratings |
+ | Command | Purpose |
+ | -------------------------- | ------------------------------------------------ |
+ | `codeprobe assess` | Score a codebase's benchmarking potential |
+ | `codeprobe init` | Interactive wizard — choose what to compare |
+ | `codeprobe mine` | Mine eval tasks from merged PRs/MRs |
+ | `codeprobe probe` | Generate fast micro-benchmark probes (30s each) |
+ | `codeprobe experiment` | Manage comparison experiments (init, add-config) |
+ | `codeprobe run` | Execute tasks against AI agents |
+ | `codeprobe interpret` | Analyze results, rank configurations |
+ | `codeprobe doctor` | Check environment readiness (agents, keys, git) |
+ | `codeprobe preambles list` | List available preambles at all search levels |
+ | `codeprobe oracle-check` | Compare agent answer against oracle ground truth |
+ | `codeprobe scaffold` | Create/validate eval task directories |
+ | `codeprobe ratings` | Record and analyze agent session quality ratings |

  ## Two Ways to Generate Tasks

@@ -146,17 +148,32 @@ Template variables: `{{sg_repo}}`, `{{repo_name}}`, `{{repo_path}}`, `{{task_id}
  codeprobe run . --parallel 5 # Run 5 tasks concurrently (worktree-isolated)
  codeprobe run . --max-cost-usd 2.00 # Stop when cost budget is reached
  codeprobe run . --dry-run # Estimate resource usage without running
+ codeprobe run . --model opus-4 # Override experiment.json model
+ codeprobe run . --timeout 600 # Override default 300s timeout
+ codeprobe run . --repeats 3 # Run each task 3 times
+ codeprobe run . --show-prompt # Print resolved prompt without running agent

  # Mining
  codeprobe mine . --enrich # Use LLM to improve weak task instructions
  codeprobe mine . --org-scale # Mine comprehension tasks (not SDLC)
  codeprobe mine . --mcp-families # Include MCP-optimized task families
  codeprobe mine . --sg-repo REPO # Sourcegraph repo for ground truth enrichment
+ codeprobe mine . --preset quick # Quick scan: count=3
+ codeprobe mine . --preset mcp # MCP eval: org-scale + MCP families + enrich
+
+ # Mine profiles (save/load custom flag combinations)
+ codeprobe mine --save-profile my-setup --count 10 --org-scale .
+ codeprobe mine --profile my-setup . # Load saved flags
+ codeprobe mine --list-profiles # Show available profiles

  # Experiment configs
  codeprobe experiment add-config . --preamble sourcegraph # Attach MCP preamble
  codeprobe experiment add-config . --mcp-config config.json # Attach MCP server

+ # Diagnostics
+ codeprobe doctor # Check agents, API keys, git, Python
+ codeprobe preambles list # Show available preambles at all levels
+
  # Output
  codeprobe interpret . --format csv # Export for pivot tables
  codeprobe interpret . --format html # Self-contained HTML report
@@ -175,14 +192,9 @@ GitHub, GitLab, Bitbucket, Azure DevOps, Gitea/Forgejo, and local repos.

  ## Configuration

- Create a `.evalrc.yaml` in your repo root:
+ Configuration lives in `experiment.json` (created by `codeprobe init` or `codeprobe experiment init`). CLI flags override experiment.json values — precedence: built-in defaults < experiment.json < CLI flags.

- ```yaml
- name: my-experiment
- agents: [claude, copilot]
- models: [claude-sonnet-4-6, claude-opus-4-6]
- tasks_dir: .codeprobe/tasks
- ```
+ Run-time observability is on by default: Rich Live dashboard in TTY, JSON event lines with `--log-format json` for CI. Cost budget warnings at 80% and 100% thresholds are always visible on stderr.

  ## License

{codeprobe-0.2.8 → codeprobe-0.3.1}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "codeprobe"
- version = "0.2.8"
+ version = "0.3.1"
  description = "Benchmark AI coding agents against your own codebase. Mine real tasks from repo history, run agents, interpret results."
  readme = "README.md"
  license = "Apache-2.0"
@@ -25,6 +25,7 @@ dependencies = [
  "openai>=1.66",
  "tiktoken>=0.7,<1",
  "scipy>=1.11,<2",
+ "rich>=13.7,<14",
  ]

  [project.urls]
@@ -46,17 +47,21 @@ dev = [
  codeprobe = "codeprobe.cli:main"

  [project.entry-points."codeprobe.agents"]
- aider = "codeprobe.adapters.aider:AiderAdapter"
  claude = "codeprobe.adapters.claude:ClaudeAdapter"
  codex = "codeprobe.adapters.codex:CodexAdapter"
  copilot = "codeprobe.adapters.copilot:CopilotAdapter"
- openai = "codeprobe.adapters.openai_compat:OpenAICompatAdapter"

  [project.entry-points."codeprobe.sessions"]
  claude = "codeprobe.adapters.session:ClaudeSessionCollector"
  codex = "codeprobe.adapters.session:CodexSessionCollector"
  copilot = "codeprobe.adapters.session:CopilotSessionCollector"

+ [project.entry-points."codeprobe.scorers"]
+ binary = "codeprobe.core.scoring:BinaryScorer"
+ continuous = "codeprobe.core.scoring:ContinuousScorer"
+ checkpoint = "codeprobe.core.scoring:CheckpointScorer"
+ test_ratio = "codeprobe.core.scoring:ContinuousScorer"
+
  [build-system]
  requires = ["setuptools>=68", "wheel"]
  build-backend = "setuptools.build_meta"
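The new `codeprobe.scorers` entry-point group makes scorers pluggable. As a hedged sketch of how such a group is typically consumed (this is standard `importlib.metadata` usage on Python 3.10+, not code from this package; codeprobe's own lookup lives in core/registry.py, which this diff only shows in part):

```python
# Discover the scorer classes registered above under "codeprobe.scorers".
# Illustrative host-side lookup, not codeprobe's actual registry code.
from importlib.metadata import entry_points

def discover_scorers() -> dict[str, object]:
    """Map scorer names (binary, continuous, ...) to their loaded classes."""
    return {ep.name: ep.load() for ep in entry_points(group="codeprobe.scorers")}
```

Note that `test_ratio` and `continuous` both point at `ContinuousScorer`, so the group also acts as an aliasing layer.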
{codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/__init__.py
@@ -1,3 +1,3 @@
  """codeprobe — Benchmark AI coding agents against your own codebase."""

- __version__ = "0.2.8"
+ __version__ = "0.3.1"
{codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/assess/heuristics.py
@@ -142,7 +142,12 @@ def _run_git(args: list[str], cwd: Path) -> str:
  timeout=30,
  )
  if result.returncode != 0:
- logger.debug("git %s exited %d: %s", " ".join(args), result.returncode, result.stderr.strip())
+ logger.debug(
+ "git %s exited %d: %s",
+ " ".join(args),
+ result.returncode,
+ result.stderr.strip(),
+ )
  return ""
  return result.stdout.strip()
  except (subprocess.TimeoutExpired, OSError) as exc:
@@ -307,7 +312,9 @@ def gather_heuristics(repo_path: Path) -> RepoHeuristics:
  history, CI presence, test coverage, languages, and activity.
  """
  total_commits_str = _run_git(["rev-list", "--count", "HEAD"], cwd=repo_path)
- merge_commits_str = _run_git(["rev-list", "--merges", "--count", "HEAD"], cwd=repo_path)
+ merge_commits_str = _run_git(
+ ["rev-list", "--merges", "--count", "HEAD"], cwd=repo_path
+ )
  contributors_str = _run_git(["shortlog", "-sn", "HEAD"], cwd=repo_path)
  file_list = _run_git(["ls-files"], cwd=repo_path)

@@ -354,7 +361,10 @@ def score_repo_heuristic(heuristics: RepoHeuristics) -> AssessmentScore:
  has_ci = heuristics.has_ci
  has_fw = len(heuristics.test_frameworks) > 0
  if has_tests and has_ci and has_fw:
- tc_score, tc_reason = 1.0, f"Tests + CI + framework ({', '.join(heuristics.test_frameworks)})"
+ tc_score, tc_reason = (
+ 1.0,
+ f"Tests + CI + framework ({', '.join(heuristics.test_frameworks)})",
+ )
  elif has_tests and (has_ci or has_fw):
  tc_score, tc_reason = 0.7, "Tests present with partial CI/framework support"
  elif has_tests:
@@ -409,15 +419,29 @@ def score_repo_heuristic(heuristics: RepoHeuristics) -> AssessmentScore:
  DimensionScore(name="ci_maturity", score=ci_score, reasoning=ci_reason),
  )

- # Equal weights for heuristic path (model path lets the model weight them).
- overall = sum(d.score for d in dimensions) / len(dimensions)
+ # Weighted average: ci_maturity is a weak signal because CI configs are
+ # often absent in shallow clones / Sourcegraph views, and codeprobe
+ # validates via mined test.sh scripts, not CI pipelines.
+ _WEIGHTS: dict[str, float] = {
+ "task_richness": 0.25,
+ "test_coverage": 0.25,
+ "complexity": 0.20,
+ "activity": 0.15,
+ "documentation": 0.10,
+ "ci_maturity": 0.05,
+ }
+ overall = sum(d.score * _WEIGHTS[d.name] for d in dimensions)

  if overall >= 0.7:
  recommendation = "Excellent benchmarking candidate — rich history with tests"
  elif overall >= 0.5:
- recommendation = "Good candidate — may need more merge history for diverse tasks"
+ recommendation = (
+ "Good candidate — may need more merge history for diverse tasks"
+ )
  elif overall >= 0.3:
- recommendation = "Fair candidate — limited test coverage may reduce task quality"
+ recommendation = (
+ "Fair candidate — limited test coverage may reduce task quality"
+ )
  else:
  recommendation = "Poor candidate — consider a repo with more history and tests"

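The new weights sum to 1.0, and a quick worked example (sample dimension scores invented for illustration) shows what the reweighting changes:

```python
# Worked example of the weighted overall score introduced above.
# The sample dimension scores are made up for illustration only.
weights = {"task_richness": 0.25, "test_coverage": 0.25, "complexity": 0.20,
           "activity": 0.15, "documentation": 0.10, "ci_maturity": 0.05}
assert abs(sum(weights.values()) - 1.0) < 1e-9

scores = {"task_richness": 1.0, "test_coverage": 0.7, "complexity": 0.5,
          "activity": 0.5, "documentation": 0.5, "ci_maturity": 0.0}
overall = sum(w * scores[name] for name, w in weights.items())
print(round(overall, 2))  # 0.65 -> "Good candidate" band (0.5 <= overall < 0.7)
```

Under the old equal weighting the same repo would have scored (1.0 + 0.7 + 0.5 + 0.5 + 0.5 + 0.0) / 6 ≈ 0.53, so a missing CI config no longer drags an otherwise solid repo toward the 0.5 cutoff.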
@@ -458,11 +482,15 @@ def _parse_model_assessment(
  score_val = float(item.get("score", 0))
  score_val = max(0.0, min(1.0, score_val))
  reasoning = str(item.get("reasoning", ""))
- dim_by_name[name] = DimensionScore(name=name, score=score_val, reasoning=reasoning)
+ dim_by_name[name] = DimensionScore(
+ name=name, score=score_val, reasoning=reasoning
+ )

  missing = set(RUBRIC_V1) - set(dim_by_name)
  if missing:
- raise LLMParseError(f"Model response missing dimensions: {', '.join(sorted(missing))}")
+ raise LLMParseError(
+ f"Model response missing dimensions: {', '.join(sorted(missing))}"
+ )

  dimensions = tuple(dim_by_name[name] for name in RUBRIC_V1)

@@ -498,6 +526,11 @@ def score_repo_with_model(heuristics: RepoHeuristics) -> AssessmentScore:
  "You are evaluating a code repository's suitability for AI agent benchmarking.\n\n"
  f"Here are the raw repository statistics:\n{stats_json}\n\n"
  f"Score this repository on each of these dimensions (0.0 to 1.0):\n{rubric_list}\n\n"
+ "Weighting guidance for the overall score: task_richness and test_coverage "
+ "are the most important (~25% each), followed by complexity (~20%), "
+ "activity (~15%), documentation (~10%). ci_maturity should be a minor "
+ "signal (~5%) because CI configs are often absent in cloned repos and "
+ "codeprobe validates via mined test scripts, not CI pipelines.\n\n"
  "Respond with ONLY valid JSON matching this exact schema:\n"
  "{\n"
  ' "overall": <float 0.0-1.0>,\n'
{codeprobe-0.2.8 → codeprobe-0.3.1}/src/codeprobe/cli/__init__.py
@@ -84,6 +84,10 @@ def main(verbose: int, quiet: bool, log_format: str) -> None:
  and interpret the results to find which setup works best for YOUR code.
  """
  _configure_logging(verbose=verbose, quiet=quiet, log_format=log_format)
+ ctx = click.get_current_context()
+ ctx.ensure_object(dict)
+ ctx.obj["log_format"] = log_format
+ ctx.obj["quiet"] = quiet


  @main.command()
@@ -101,6 +105,40 @@ def init(path: str) -> None:

  @main.command()
  @click.argument("path", default=".")
+ @click.option(
+ "--preset",
+ type=click.Choice(["quick", "mcp"], case_sensitive=False),
+ default=None,
+ help="Apply a named preset: 'quick' (count=3) or 'mcp' (org-scale + MCP families).",
+ )
+ @click.option(
+ "--goal",
+ type=click.Choice(
+ ["quality", "navigation", "mcp", "general"], case_sensitive=False
+ ),
+ default=None,
+ help="Eval goal: quality, navigation, mcp, general. Skips interactive goal prompt.",
+ )
+ @click.option(
+ "--profile",
+ "profile_name",
+ default=None,
+ help="Load a user-defined profile from ~/.codeprobe/mine-profiles.json "
+ "or .codeprobe/mine-profiles.json. Explicit flags override profile values.",
+ )
+ @click.option(
+ "--save-profile",
+ "save_profile_name",
+ default=None,
+ help="Save current flag values as a named profile to ~/.codeprobe/mine-profiles.json.",
+ )
+ @click.option(
+ "--list-profiles",
+ "list_profiles_flag",
+ is_flag=True,
+ default=False,
+ help="Show available profiles from user and project levels.",
+ )
  @click.option("--count", default=5, help="Number of tasks to mine (3-20).")
  @click.option(
  "--source",
@@ -206,8 +244,15 @@
  "(e.g. github.com/sg-evals/numpy). Defaults to github.com/sg-evals/{repo_name} "
  "when --mcp-families is used. Requires SOURCEGRAPH_TOKEN env var.",
  )
+ @click.pass_context
  def mine(
+ ctx: click.Context,
  path: str,
+ preset: str | None,
+ goal: str | None,
+ profile_name: str | None,
+ save_profile_name: str | None,
+ list_profiles_flag: bool,
  count: int,
  source: str,
  min_files: int,
@@ -232,6 +277,21 @@ def mine(
  Extracts real code-change tasks from merged PRs/MRs with ground truth,
  test scripts, and scoring rubrics.

+ \b
+ Presets (--preset):
+ quick — Fast scan: count=3, default SDLC mode
+ mcp — MCP eval: count=8, org-scale + MCP families + enrich
+
+ \b
+ Profiles (--profile / --save-profile / --list-profiles):
+ Save: codeprobe mine --save-profile my-setup --count 10 --org-scale .
+ Load: codeprobe mine --profile my-setup /path/to/repo
+ List: codeprobe mine --list-profiles
+
+ \b
+ Precedence: built-in defaults < profile < --preset < explicit CLI flags.
+
+ \b
  Use --org-scale to mine comprehension/IR tasks with oracle verification
  instead of SDLC code-change tasks.

@@ -242,10 +302,100 @@ def mine(
  choosing an eval goal, task count, and git host before mining.
  Use --no-interactive to skip the prompts and use defaults/flags directly.
  """
- from codeprobe.cli.mine_cmd import run_mine
+ from pathlib import Path as _Path
+
+ from codeprobe.cli.mine_cmd import (
+ list_profiles,
+ load_profile,
+ run_mine,
+ save_profile,
+ )
+
+ # --list-profiles: show and exit
+ if list_profiles_flag:
+ repo_path = _Path(path).resolve() if path != "." else _Path.cwd()
+ entries = list_profiles(repo_path)
+ if not entries:
+ click.echo("No profiles found.")
+ else:
+ click.echo(f"{'Name':<20s} {'Source':<10s} {'Settings'}")
+ click.echo("-" * 60)
+ for name, source_label, prof in entries:
+ summary = ", ".join(f"{k}={v}" for k, v in sorted(prof.items()))
+ click.echo(f"{name:<20s} {source_label:<10s} {summary}")
+ return
+
+ # --save-profile: save current flags and exit
+ if save_profile_name is not None:
+ # Collect all current param values, keeping only those that differ
+ # from Click defaults.
+ param_defaults = {p.name: p.default for p in ctx.command.params}
+ # Exclude meta-params that aren't mining flags
+ _EXCLUDE_FROM_PROFILE = frozenset(
+ {
+ "path",
+ "profile_name",
+ "save_profile_name",
+ "list_profiles_flag",
+ }
+ )
+ values = {
+ k: (list(v) if isinstance(v, tuple) else v)
+ for k, v in ctx.params.items()
+ if k not in _EXCLUDE_FROM_PROFILE and v != param_defaults.get(k)
+ }
+ saved_path = save_profile(save_profile_name, values)
+ click.echo(f"Profile '{save_profile_name}' saved to {saved_path}")
+ return
+
+ # --profile: load profile values as defaults, then apply preset and CLI overrides
+ if profile_name is not None:
+ repo_path = _Path(path).resolve() if path != "." else _Path.cwd()
+ prof = load_profile(profile_name, repo_path)
+
+ # Determine which params were explicitly set on the CLI
+ explicitly_set = {
+ p.name
+ for p in ctx.command.params
+ if ctx.get_parameter_source(p.name) is not None
+ and ctx.get_parameter_source(p.name).name == "COMMANDLINE"
+ }
+
+ # Apply profile values for params NOT explicitly set on CLI.
+ # Tuple-typed params (click multiple=True) need list→tuple coercion.
+ _TUPLE_PARAMS = frozenset({"subsystem", "family", "repos", "backends"})
+
+ def _prof_val(key: str, current: object) -> object:
+ if key in explicitly_set or key not in prof:
+ return current
+ v = prof[key]
+ return tuple(v) if key in _TUPLE_PARAMS else v
+
+ count = _prof_val("count", count) # type: ignore[assignment]
+ source = _prof_val("source", source) # type: ignore[assignment]
+ min_files = _prof_val("min_files", min_files) # type: ignore[assignment]
+ enrich = _prof_val("enrich", enrich) # type: ignore[assignment]
+ org_scale = _prof_val("org_scale", org_scale) # type: ignore[assignment]
+ mcp_families = _prof_val("mcp_families", mcp_families) # type: ignore[assignment]
+ no_llm = _prof_val("no_llm", no_llm) # type: ignore[assignment]
+ discover_subsystems = _prof_val("discover_subsystems", discover_subsystems) # type: ignore[assignment]
+ scan_timeout = _prof_val("scan_timeout", scan_timeout) # type: ignore[assignment]
+ validate_flag = _prof_val("validate_flag", validate_flag) # type: ignore[assignment]
+ curate = _prof_val("curate", curate) # type: ignore[assignment]
+ verify_curation_flag = _prof_val("verify_curation_flag", verify_curation_flag) # type: ignore[assignment]
+ sg_repo = _prof_val("sg_repo", sg_repo) # type: ignore[assignment]
+ subsystem = _prof_val("subsystem", subsystem) # type: ignore[assignment]
+ family = _prof_val("family", family) # type: ignore[assignment]
+ repos = _prof_val("repos", repos) # type: ignore[assignment]
+ backends = _prof_val("backends", backends) # type: ignore[assignment]
+ interactive = _prof_val("interactive", interactive) # type: ignore[assignment]
+ preset = _prof_val("preset", preset) # type: ignore[assignment]
+ goal = _prof_val("goal", goal) # type: ignore[assignment]

  run_mine(
  path,
+ preset=preset,
+ goal=goal,
  count=count,
  source=source,
  min_files=min_files,
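Given that the `--save-profile` branch above persists only values that differ from Click defaults (coercing tuples to lists), a saved `~/.codeprobe/mine-profiles.json` plausibly maps profile names to flag dictionaries. The exact on-disk schema lives in `mine_cmd.py` and is not shown here, so the shape below is an assumption:

```python
# Assumed shape of ~/.codeprobe/mine-profiles.json: {profile: {flag: value}},
# with multiple=True flags stored as lists. Illustrative only.
import json

profiles = {
    "my-setup": {"count": 10, "org_scale": True, "subsystem": ["auth", "billing"]}
}
print(json.dumps(profiles, indent=2))
```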
@@ -294,7 +444,46 @@
  default=False,
  help="Print estimated resource requirements without executing any agents.",
  )
+ @click.option(
+ "--force-plain",
+ is_flag=True,
+ default=False,
+ help="Force plain-text output even in a TTY (disable Rich dashboard).",
+ )
+ @click.option(
+ "--force-rich",
+ is_flag=True,
+ default=False,
+ help="Force Rich Live dashboard even in non-TTY environments.",
+ )
+ @click.option(
+ "--timeout",
+ default=None,
+ type=int,
+ help="Timeout in seconds per task (overrides experiment.json extra.timeout_seconds).",
+ )
+ @click.option(
+ "--repeats",
+ default=None,
+ type=int,
+ help="Number of repeats per task (overrides default of 1).",
+ )
+ @click.option(
+ "--show-prompt",
+ is_flag=True,
+ default=False,
+ help="Print the fully-resolved prompt for the first task and exit (no agent spawned).",
+ )
+ @click.option(
+ "--suite",
+ "suite_path",
+ default=None,
+ type=click.Path(exists=True),
+ help="Path to a suite.toml manifest to filter tasks by type, difficulty, and tags.",
+ )
+ @click.pass_context
  def run(
+ ctx: click.Context,
  path: str,
  agent: str,
  model: str | None,
@@ -302,6 +491,12 @@
  max_cost_usd: float | None,
  parallel: int,
  dry_run: bool,
+ force_plain: bool,
+ force_rich: bool,
+ timeout: int | None,
+ repeats: int | None,
+ show_prompt: bool,
+ suite_path: str | None,
  ) -> None:
  """Run eval tasks against an AI coding agent.

@@ -310,6 +505,16 @@
  """
  from codeprobe.cli.run_cmd import run_eval

+ ctx.ensure_object(dict)
+ log_format = ctx.obj.get("log_format", "text")
+ quiet = ctx.obj.get("quiet", False)
+
+ if show_prompt:
+ from codeprobe.cli.run_cmd import show_prompt_and_exit
+
+ show_prompt_and_exit(path, config=config, agent=agent, model=model)
+ return
+
  run_eval(
  path,
  agent=agent,
@@ -318,6 +523,13 @@
  max_cost_usd=max_cost_usd,
  parallel=parallel,
  dry_run=dry_run,
+ log_format=log_format,
+ quiet=quiet,
+ force_plain=force_plain,
+ force_rich=force_rich,
+ timeout=timeout,
+ repeats=repeats if repeats is not None else 1,
+ suite_path=suite_path,
  )


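The new `--suite` option above points at a `suite.toml` manifest that filters tasks by type, difficulty, and tags. The real schema lives in the new `src/codeprobe/models/suite.py` and `src/codeprobe/loaders/suite.py` (not shown in this diff), so the manifest below is a hypothetical sketch consistent only with the help text:

```python
# Hypothetical suite.toml shape, inferred solely from the --suite help text;
# codeprobe's actual schema may differ. tomllib is stdlib on Python 3.11+.
import tomllib

manifest = tomllib.loads("""
name = "smoke"

[filter]
type = ["sdlc"]
difficulty = ["easy", "medium"]
tags = ["auth"]
""")
print(manifest["filter"]["tags"])  # ['auth']
```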
@@ -488,3 +700,18 @@ main.add_command(scaffold)
  from codeprobe.cli.probe_cmd import probe # noqa: E402

  main.add_command(probe)
+
+ # Register the preambles subcommand group
+ from codeprobe.cli.preamble_cmd import preambles # noqa: E402
+
+ main.add_command(preambles)
+
+ # Register the doctor command
+ from codeprobe.cli.doctor_cmd import doctor # noqa: E402
+
+ main.add_command(doctor)
+
+ # Register the validate command
+ from codeprobe.cli.validate_cmd import validate # noqa: E402
+
+ main.add_command(validate)
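Finally, the README changes above advertise JSON event lines via `--log-format json` for CI. A hedged consumer sketch, assuming one self-contained JSON object per output line; the real event schema is defined in the new `src/codeprobe/core/events.py`, which this diff does not show, so the field names below are guesses:

```python
# Tail `codeprobe run . --log-format json` and react to events in CI.
# The "event" / "task_id" keys are hypothetical placeholders.
import json
import subprocess

proc = subprocess.Popen(
    ["codeprobe", "run", ".", "--log-format", "json"],
    stdout=subprocess.PIPE,
    text=True,
)
assert proc.stdout is not None
for line in proc.stdout:
    try:
        event = json.loads(line)
    except json.JSONDecodeError:
        continue  # skip any non-JSON noise (budget warnings go to stderr)
    print(event.get("event"), event.get("task_id"))
proc.wait()
```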