codeprobe 0.5.4__tar.gz → 0.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (324) hide show
  1. codeprobe-0.7.0/.claude/skills/acceptance-loop/SKILL.md +278 -0
  2. codeprobe-0.7.0/.claude/skills/assess-codebase/SKILL.md +95 -0
  3. codeprobe-0.7.0/.claude/skills/codeprobe-calibrate/SKILL.md +87 -0
  4. codeprobe-0.7.0/.claude/skills/codeprobe-check-infra/SKILL.md +106 -0
  5. codeprobe-0.7.0/.claude/skills/codeprobe-interpret/SKILL.md +80 -0
  6. codeprobe-0.7.0/.claude/skills/codeprobe-mine/SKILL.md +98 -0
  7. codeprobe-0.7.0/.claude/skills/codeprobe-run/SKILL.md +133 -0
  8. codeprobe-0.7.0/.claude/skills/experiment/SKILL.md +313 -0
  9. codeprobe-0.7.0/.claude/skills/integration-test/SKILL.md +242 -0
  10. codeprobe-0.7.0/.claude/skills/interpret/SKILL.md +172 -0
  11. codeprobe-0.7.0/.claude/skills/mine-tasks/SKILL.md +277 -0
  12. codeprobe-0.7.0/.claude/skills/probe/SKILL.md +162 -0
  13. codeprobe-0.7.0/.claude/skills/ratings/SKILL.md +164 -0
  14. codeprobe-0.7.0/.claude/skills/run-eval/SKILL.md +169 -0
  15. codeprobe-0.7.0/.claude/skills/scaffold/SKILL.md +154 -0
  16. codeprobe-0.7.0/MANIFEST.in +5 -0
  17. {codeprobe-0.5.4 → codeprobe-0.7.0}/PKG-INFO +2 -1
  18. {codeprobe-0.5.4 → codeprobe-0.7.0}/pyproject.toml +32 -2
  19. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/adapters/claude.py +69 -11
  20. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/adapters/telemetry.py +27 -0
  21. codeprobe-0.7.0/src/codeprobe/analysis/interpret.py +241 -0
  22. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/analysis/report.py +119 -3
  23. codeprobe-0.7.0/src/codeprobe/calibration/__init__.py +37 -0
  24. codeprobe-0.7.0/src/codeprobe/calibration/gate.py +235 -0
  25. codeprobe-0.7.0/src/codeprobe/calibration/profile.py +104 -0
  26. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/cli/__init__.py +346 -15
  27. codeprobe-0.7.0/src/codeprobe/cli/_error_handler.py +239 -0
  28. codeprobe-0.7.0/src/codeprobe/cli/_output_helpers.py +188 -0
  29. codeprobe-0.7.0/src/codeprobe/cli/_output_mode.py +143 -0
  30. codeprobe-0.7.0/src/codeprobe/cli/_sandbox.py +65 -0
  31. codeprobe-0.7.0/src/codeprobe/cli/_tenant.py +151 -0
  32. codeprobe-0.7.0/src/codeprobe/cli/assess_cmd.py +122 -0
  33. codeprobe-0.7.0/src/codeprobe/cli/cache_cmd.py +51 -0
  34. codeprobe-0.7.0/src/codeprobe/cli/calibrate_cmd.py +151 -0
  35. codeprobe-0.7.0/src/codeprobe/cli/check_infra.py +507 -0
  36. codeprobe-0.7.0/src/codeprobe/cli/doctor_cmd.py +268 -0
  37. codeprobe-0.7.0/src/codeprobe/cli/envelope.py +108 -0
  38. codeprobe-0.7.0/src/codeprobe/cli/error_codes.json +277 -0
  39. codeprobe-0.7.0/src/codeprobe/cli/errors.py +143 -0
  40. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/cli/init_cmd.py +30 -1
  41. codeprobe-0.7.0/src/codeprobe/cli/interpret_cmd.py +185 -0
  42. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/cli/mine_cmd.py +626 -61
  43. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/cli/run_cmd.py +381 -62
  44. codeprobe-0.7.0/src/codeprobe/cli/snapshot_cmd.py +439 -0
  45. codeprobe-0.7.0/src/codeprobe/cli/trace_cmd.py +46 -0
  46. codeprobe-0.7.0/src/codeprobe/config/__init__.py +42 -0
  47. codeprobe-0.7.0/src/codeprobe/config/defaults.py +501 -0
  48. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/core/executor.py +19 -2
  49. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/core/llm.py +16 -2
  50. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/core/preamble.py +17 -2
  51. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/core/scoring.py +17 -1
  52. codeprobe-0.7.0/src/codeprobe/llm/__init__.py +182 -0
  53. codeprobe-0.7.0/src/codeprobe/llm/backends/__init__.py +57 -0
  54. codeprobe-0.7.0/src/codeprobe/llm/backends/anthropic.py +86 -0
  55. codeprobe-0.7.0/src/codeprobe/llm/backends/azure_openai.py +109 -0
  56. codeprobe-0.7.0/src/codeprobe/llm/backends/base.py +84 -0
  57. codeprobe-0.7.0/src/codeprobe/llm/backends/bedrock.py +125 -0
  58. codeprobe-0.7.0/src/codeprobe/llm/backends/openai_compat.py +109 -0
  59. codeprobe-0.7.0/src/codeprobe/llm/backends/vertex.py +99 -0
  60. codeprobe-0.7.0/src/codeprobe/llm/model_registry.yaml +102 -0
  61. codeprobe-0.7.0/src/codeprobe/mcp/__init__.py +25 -0
  62. codeprobe-0.7.0/src/codeprobe/mcp/capabilities.py +163 -0
  63. codeprobe-0.7.0/src/codeprobe/mcp/fixtures/__init__.py +7 -0
  64. codeprobe-0.7.0/src/codeprobe/mcp/fixtures/fixture_server.py +57 -0
  65. codeprobe-0.7.0/src/codeprobe/mining/adapters/__init__.py +36 -0
  66. codeprobe-0.7.0/src/codeprobe/mining/adapters/commit.py +63 -0
  67. codeprobe-0.7.0/src/codeprobe/mining/adapters/pr.py +115 -0
  68. codeprobe-0.7.0/src/codeprobe/mining/adapters/rfc.py +107 -0
  69. codeprobe-0.7.0/src/codeprobe/mining/ast_scan.py +283 -0
  70. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/comprehension.py +67 -1
  71. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/comprehension_writer.py +6 -0
  72. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/curator_backends.py +90 -0
  73. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/curator_tiers.py +215 -1
  74. codeprobe-0.7.0/src/codeprobe/mining/dependency_upgrade.py +354 -0
  75. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/extractor.py +133 -46
  76. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/multi_repo.py +366 -15
  77. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/org_scale.py +77 -5
  78. codeprobe-0.7.0/src/codeprobe/mining/refresh.py +447 -0
  79. codeprobe-0.7.0/src/codeprobe/mining/retry.py +158 -0
  80. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/sg_ground_truth.py +7 -0
  81. codeprobe-0.7.0/src/codeprobe/mining/sources.py +270 -0
  82. codeprobe-0.7.0/src/codeprobe/mining/state.py +344 -0
  83. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/task_types.py +15 -1
  84. codeprobe-0.7.0/src/codeprobe/mining/trackers/__init__.py +6 -0
  85. codeprobe-0.7.0/src/codeprobe/mining/trackers/base.py +38 -0
  86. codeprobe-0.7.0/src/codeprobe/mining/trackers/jira.py +156 -0
  87. codeprobe-0.7.0/src/codeprobe/mining/vcs/__init__.py +19 -0
  88. codeprobe-0.7.0/src/codeprobe/mining/vcs/_http.py +65 -0
  89. codeprobe-0.7.0/src/codeprobe/mining/vcs/base.py +162 -0
  90. codeprobe-0.7.0/src/codeprobe/mining/vcs/gitlab.py +194 -0
  91. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/writer.py +509 -51
  92. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/models/task.py +17 -0
  93. codeprobe-0.7.0/src/codeprobe/net/__init__.py +46 -0
  94. codeprobe-0.7.0/src/codeprobe/net/credential_ttl.py +155 -0
  95. codeprobe-0.7.0/src/codeprobe/net/offline.py +71 -0
  96. codeprobe-0.7.0/src/codeprobe/paths.py +155 -0
  97. codeprobe-0.7.0/src/codeprobe/preambles/custom.md.j2 +37 -0
  98. codeprobe-0.7.0/src/codeprobe/preambles/generator.py +146 -0
  99. codeprobe-0.7.0/src/codeprobe/preambles/github.md +76 -0
  100. codeprobe-0.7.0/src/codeprobe/preambles/templates/__init__.py +63 -0
  101. codeprobe-0.7.0/src/codeprobe/sandbox/__init__.py +28 -0
  102. codeprobe-0.7.0/src/codeprobe/sandbox/runner.py +236 -0
  103. codeprobe-0.7.0/src/codeprobe/snapshot/__init__.py +111 -0
  104. codeprobe-0.7.0/src/codeprobe/snapshot/canary.py +156 -0
  105. codeprobe-0.7.0/src/codeprobe/snapshot/create.py +346 -0
  106. codeprobe-0.7.0/src/codeprobe/snapshot/exporters/__init__.py +28 -0
  107. codeprobe-0.7.0/src/codeprobe/snapshot/exporters/_common.py +105 -0
  108. codeprobe-0.7.0/src/codeprobe/snapshot/exporters/browse.py +195 -0
  109. codeprobe-0.7.0/src/codeprobe/snapshot/exporters/datadog.py +148 -0
  110. codeprobe-0.7.0/src/codeprobe/snapshot/exporters/sheets.py +75 -0
  111. codeprobe-0.7.0/src/codeprobe/snapshot/exporters/sigma.py +117 -0
  112. codeprobe-0.7.0/src/codeprobe/snapshot/manifest.py +228 -0
  113. codeprobe-0.7.0/src/codeprobe/snapshot/redact.py +465 -0
  114. codeprobe-0.7.0/src/codeprobe/snapshot/scanners.py +315 -0
  115. codeprobe-0.7.0/src/codeprobe/snapshot/verify.py +189 -0
  116. codeprobe-0.7.0/src/codeprobe/tenant.py +376 -0
  117. codeprobe-0.7.0/src/codeprobe/trace/__init__.py +26 -0
  118. codeprobe-0.7.0/src/codeprobe/trace/content_policy.py +108 -0
  119. codeprobe-0.7.0/src/codeprobe/trace/recorder.py +500 -0
  120. codeprobe-0.7.0/src/codeprobe/trace/store.py +159 -0
  121. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe.egg-info/PKG-INFO +2 -1
  122. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe.egg-info/SOURCES.txt +92 -0
  123. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe.egg-info/requires.txt +1 -0
  124. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_acceptance_compiler_integration.py +0 -2
  125. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_adapters.py +13 -7
  126. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_analysis.py +8 -0
  127. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_curator_integration.py +6 -5
  128. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_doctor_cmd.py +7 -3
  129. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_init_wizard.py +21 -7
  130. codeprobe-0.7.0/tests/test_lint_zfc.py +375 -0
  131. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_loader.py +0 -2
  132. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_mine_goals.py +10 -8
  133. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_mine_profiles.py +5 -3
  134. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_mining.py +3 -2
  135. codeprobe-0.7.0/tests/test_paths.py +112 -0
  136. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_preamble.py +9 -2
  137. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_stats.py +1 -2
  138. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_task_model.py +1 -0
  139. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_verify.py +0 -3
  140. codeprobe-0.5.4/src/codeprobe/cli/assess_cmd.py +0 -41
  141. codeprobe-0.5.4/src/codeprobe/cli/doctor_cmd.py +0 -114
  142. codeprobe-0.5.4/src/codeprobe/cli/interpret_cmd.py +0 -84
  143. codeprobe-0.5.4/src/codeprobe/config/__init__.py +0 -5
  144. codeprobe-0.5.4/src/codeprobe/mining/sources.py +0 -118
  145. codeprobe-0.5.4/src/codeprobe/preambles/github.md +0 -21
  146. {codeprobe-0.5.4 → codeprobe-0.7.0}/LICENSE +0 -0
  147. {codeprobe-0.5.4 → codeprobe-0.7.0}/README.md +0 -0
  148. {codeprobe-0.5.4 → codeprobe-0.7.0}/setup.cfg +0 -0
  149. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/__init__.py +0 -0
  150. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/__main__.py +0 -0
  151. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/acceptance_compiler.py +0 -0
  152. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/adapters/__init__.py +0 -0
  153. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/adapters/_base.py +0 -0
  154. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/adapters/codex.py +0 -0
  155. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/adapters/copilot.py +0 -0
  156. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/adapters/openai_compat.py +0 -0
  157. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/adapters/protocol.py +0 -0
  158. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/adapters/session.py +0 -0
  159. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/analysis/__init__.py +0 -0
  160. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/analysis/dual.py +0 -0
  161. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/analysis/ranking.py +0 -0
  162. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/analysis/stats.py +0 -0
  163. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/api.py +0 -0
  164. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/assess/__init__.py +0 -0
  165. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/assess/heuristics.py +0 -0
  166. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/assess/oracle_diff.py +0 -0
  167. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/cli/auth_cmd.py +0 -0
  168. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/cli/experiment_cmd.py +0 -0
  169. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/cli/json_display.py +0 -0
  170. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/cli/preamble_cmd.py +0 -0
  171. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/cli/probe_cmd.py +0 -0
  172. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/cli/ratings_cmd.py +0 -0
  173. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/cli/rich_display.py +0 -0
  174. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/cli/scaffold_cmd.py +0 -0
  175. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/cli/validate_cmd.py +0 -0
  176. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/cli/wizard.py +0 -0
  177. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/cli/yaml_writer.py +0 -0
  178. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/config/loader.py +0 -0
  179. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/config/redact.py +0 -0
  180. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/contrib/__init__.py +0 -0
  181. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/contrib/_shared.py +0 -0
  182. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/contrib/adaptive.py +0 -0
  183. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/contrib/counterfactual.py +0 -0
  184. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/contrib/debate.py +0 -0
  185. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/contrib/decision_tree.py +0 -0
  186. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/contrib/elo.py +0 -0
  187. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/contrib/fingerprint.py +0 -0
  188. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/contrib/mutation.py +0 -0
  189. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/contrib/pareto.py +0 -0
  190. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/contrib/sprt.py +0 -0
  191. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/contrib/tournament.py +0 -0
  192. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/core/__init__.py +0 -0
  193. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/core/__main__.py +0 -0
  194. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/core/checkpoint.py +0 -0
  195. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/core/events.py +0 -0
  196. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/core/experiment.py +0 -0
  197. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/core/isolation.py +0 -0
  198. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/core/mcp_discovery.py +0 -0
  199. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/core/registry.py +0 -0
  200. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/core/repo_hygiene.py +0 -0
  201. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/core/sandbox.py +0 -0
  202. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/loaders/__init__.py +0 -0
  203. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/loaders/suite.py +0 -0
  204. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/__init__.py +0 -0
  205. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/_graph.py +0 -0
  206. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/_lang.py +0 -0
  207. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/curator.py +0 -0
  208. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/org_scale_families.py +0 -0
  209. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/org_scale_oracle.py +0 -0
  210. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/org_scale_scanner.py +0 -0
  211. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/org_scale_validate.py +0 -0
  212. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/mining/sg_auth.py +0 -0
  213. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/models/__init__.py +0 -0
  214. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/models/evalrc.py +0 -0
  215. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/models/experiment.py +0 -0
  216. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/models/preamble.py +0 -0
  217. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/models/suite.py +0 -0
  218. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/preambles/__init__.py +0 -0
  219. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/preambles/sourcegraph.md +0 -0
  220. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/probe/__init__.py +0 -0
  221. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/probe/adapter.py +0 -0
  222. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/probe/generator.py +0 -0
  223. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/probe/writer.py +0 -0
  224. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/ratings/__init__.py +0 -0
  225. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/ratings/collector.py +0 -0
  226. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/scaffold/__init__.py +0 -0
  227. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/scaffold/writer.py +0 -0
  228. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/templates/__init__.py +0 -0
  229. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/templates/evalrc-mcp-comparison.yaml +0 -0
  230. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/templates/evalrc-model-comparison.yaml +0 -0
  231. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe/templates/evalrc-prompt-comparison.yaml +0 -0
  232. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe.egg-info/dependency_links.txt +0 -0
  233. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe.egg-info/entry_points.txt +0 -0
  234. {codeprobe-0.5.4 → codeprobe-0.7.0}/src/codeprobe.egg-info/top_level.txt +0 -0
  235. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_acceptance_compiler.py +0 -0
  236. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_adapter_contracts.py +0 -0
  237. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_api.py +0 -0
  238. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_artifact_scorer.py +0 -0
  239. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_assess.py +0 -0
  240. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_auth_cmd.py +0 -0
  241. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_changed_symbols.py +0 -0
  242. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_checkpoint.py +0 -0
  243. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_checkpoint_scoring.py +0 -0
  244. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_cli.py +0 -0
  245. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_comprehension.py +0 -0
  246. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_config_loader.py +0 -0
  247. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_contrib.py +0 -0
  248. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_convergence.py +0 -0
  249. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_criteria_loader.py +0 -0
  250. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_ctrlc_integration.py +0 -0
  251. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_curator_backends.py +0 -0
  252. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_curator_core.py +0 -0
  253. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_curator_tiers.py +0 -0
  254. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_dual_adversarial_fixes.py +0 -0
  255. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_dual_composite.py +0 -0
  256. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_dual_e2e.py +0 -0
  257. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_dual_matrix.py +0 -0
  258. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_dual_scorer.py +0 -0
  259. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_dual_scoring_details.py +0 -0
  260. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_events.py +0 -0
  261. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_examples_dual.py +0 -0
  262. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_executor.py +0 -0
  263. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_executor_dual_isolation.py +0 -0
  264. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_executor_events.py +0 -0
  265. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_experiment_cmd.py +0 -0
  266. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_experiment_core.py +0 -0
  267. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_ground_truth_schema.py +0 -0
  268. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_isolation.py +0 -0
  269. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_json_display.py +0 -0
  270. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_listeners_dual.py +0 -0
  271. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_llm.py +0 -0
  272. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_loaders.py +0 -0
  273. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_loaders_dual.py +0 -0
  274. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_mcp_families_mining.py +0 -0
  275. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_mcp_validate.py +0 -0
  276. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_mine_cli.py +0 -0
  277. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_mine_presets.py +0 -0
  278. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_mining_dual.py +0 -0
  279. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_models.py +0 -0
  280. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_multi_repo_e2e.py +0 -0
  281. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_multi_repo_mining.py +0 -0
  282. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_new_families.py +0 -0
  283. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_openai_compat.py +0 -0
  284. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_oracle_diff.py +0 -0
  285. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_oracle_registry.py +0 -0
  286. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_oracle_types.py +0 -0
  287. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_org_scale.py +0 -0
  288. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_pipeline_integration.py +0 -0
  289. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_preamble_cmd.py +0 -0
  290. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_probe.py +0 -0
  291. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_probe_adapter.py +0 -0
  292. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_ratings.py +0 -0
  293. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_ratings_cmd.py +0 -0
  294. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_registry.py +0 -0
  295. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_regression_gate.py +0 -0
  296. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_release_gate.py +0 -0
  297. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_repo_hygiene.py +0 -0
  298. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_report_dual.py +0 -0
  299. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_run_config_resolution.py +0 -0
  300. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_safe_leg_score.py +0 -0
  301. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_scaffold.py +0 -0
  302. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_scaffold_upgrade.py +0 -0
  303. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_scanner_refactor.py +0 -0
  304. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_score_result.py +0 -0
  305. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_scoring.py +0 -0
  306. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_scoring_extended.py +0 -0
  307. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_scoring_v2.py +0 -0
  308. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_sdlc_ground_truth.py +0 -0
  309. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_secret_redaction.py +0 -0
  310. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_session.py +0 -0
  311. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_sg_auth.py +0 -0
  312. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_sg_ground_truth.py +0 -0
  313. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_shell_shim.py +0 -0
  314. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_show_prompt.py +0 -0
  315. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_suite.py +0 -0
  316. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_suite_manifest.py +0 -0
  317. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_task_types.py +0 -0
  318. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_telemetry.py +0 -0
  319. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_validate_cmd.py +0 -0
  320. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_validate_dual.py +0 -0
  321. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_verifier.py +0 -0
  322. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_weighted_checklist.py +0 -0
  323. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_weighted_f1.py +0 -0
  324. {codeprobe-0.5.4 → codeprobe-0.7.0}/tests/test_writer_dual.py +0 -0
@@ -0,0 +1,278 @@
1
+ ---
2
+ name: acceptance-loop
3
+ description: Orchestrate the continuous Test→Verify→Fix→Release acceptance loop for codeprobe. Spawns a Test Agent to produce a workspace, runs the Verifier to produce verdict.json, feeds verdicts into the convergence controller, spawns a Fix Agent when failures remain, runs the regression gate after every fix, and promotes to the release gate after two consecutive green verdicts. Triggers on acceptance loop, convergence loop, test verify fix, /acceptance-loop.
4
+ user-invocable: false
5
+ ---
6
+
7
+ # Acceptance Loop: Continuous Test→Verify→Fix→Release
8
+
9
+ ## Purpose
10
+
11
+ Drive codeprobe toward a releasable state by repeatedly spawning a Test Agent to exercise the tool, running the behavioral Verifier against the produced workspace, and spawning a Fix Agent when the verdict contains failures. Every fix is gated by `acceptance/regression.py` (pytest + ruff + mypy with auto-revert), every verdict is fed into `acceptance/converge.py` for a deterministic CONTINUE / HALT / RELEASE / ESCALATE decision, and release promotion is gated by `acceptance/release.py` (wheel build + staged smoke test + version bump + tag). The loop is ZFC-compliant: all policy decisions are structured-data policy, not model judgment.
12
+
13
+ This SKILL is the single entry point. Sub-skills for spawning each agent live in [`test-agent.md`](./test-agent.md) and [`fix-agent.md`](./fix-agent.md) — do **not** inline their prompts here; read them from disk and substitute parameters.
14
+
15
+ ---
16
+
17
+ ## Parameters
18
+
19
+ | Name | Required | Default | Description |
20
+ |------|----------|---------|-------------|
21
+ | `target_repo` | yes | — | Absolute path to the frozen test repo the Test Agent exercises. |
22
+ | `pinned_sha` | yes | — | Expected git SHA of `target_repo`. Mismatch halts the loop before iteration 1. |
23
+ | `max_iterations` | no | `5` | Hard cap on loop iterations. Passed to `ConvergenceController(max_iterations=...)`. |
24
+ | `eval_mode` | no | `dry-run` | `dry-run` (no agent calls) or `real` (cost-bounded). Forwarded to the Test Agent. |
25
+ | `repo_root` | no | `/home/ds/projects/codeprobe` | codeprobe repo the Fix Agent edits. Also the regression-gate target. |
26
+
27
+ Reject the invocation if `target_repo` or `pinned_sha` is missing. Do not prompt interactively: this skill assumes it is invoked programmatically by `/acceptance-loop` with fully bound parameters.
28
+
29
+ ---
30
+
31
+ ## Phase 0: Configure
32
+
33
+ ### 0.1 Parse and validate parameters
34
+
35
+ Bind the parameters above into shell variables (`TARGET_REPO`, `PINNED_SHA`, `MAX_ITERATIONS`, `EVAL_MODE`, `REPO_ROOT`). Fail fast with a `FAILURE: <reason>` line if any required value is missing or if `target_repo` is not an absolute path.
36
+
37
+ ### 0.2 Stale workspace cleanup
38
+
39
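+ Remove any stale `/tmp/codeprobe-loop-*` directory so long-running sessions don't fill `/tmp`. The intent is to drop directories older than 24 hours; note that `find -mtime +1` counts age in whole 24-hour periods and discards the remainder, so a directory is only matched once it is at least 48 hours old (use `-mtime +0` to match anything older than 24 hours):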
40
+
41
+ ```bash
42
+ find /tmp -maxdepth 1 -type d -name 'codeprobe-loop-*' -mtime +1 -print -exec rm -rf {} +
43
+ ```
44
+
45
+ ### 0.3 Disk space pre-check
46
+
47
+ Refuse to start if `/tmp` has less than 2 GB free — the Test Agent captures full CLI output and the wheel staging step creates a venv:
48
+
49
+ ```bash
50
+ FREE_KB=$(df -Pk /tmp | awk 'NR==2 {print $4}')
51
+ if [ "$FREE_KB" -lt 2097152 ]; then
52
+ echo "FAILURE: /tmp has <2GB free ($FREE_KB KB); aborting acceptance loop"
53
+ exit 1
54
+ fi
55
+ ```
56
+
57
+ ### 0.4 Concurrent-run lock (git tag)
58
+
59
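+ Use a local-only git tag `codeprobe-loop-running` as a mutex. Stale locks older than 4 hours are auto-removed; fresh locks block. Caveat: the snippet below creates a lightweight tag and derives the lock age from `git log -1 --format=%ct`, which is the committer timestamp of the commit the tag points to, not the time the tag was created — so the computed age reflects how old that commit is rather than how long the lock has been held. The tag is removed in the Cleanup section regardless of how the loop exits: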
60
+
61
+ ```bash
62
+ cd "$REPO_ROOT"
63
+ if git rev-parse -q --verify refs/tags/codeprobe-loop-running >/dev/null; then
64
+ LOCK_EPOCH=$(git log -1 --format=%ct refs/tags/codeprobe-loop-running 2>/dev/null || echo 0)
65
+ AGE=$(( $(date +%s) - LOCK_EPOCH ))
66
+ if [ "$AGE" -gt 14400 ]; then
67
+ git tag -d codeprobe-loop-running
68
+ else
69
+ echo "FAILURE: another acceptance-loop run holds codeprobe-loop-running (age ${AGE}s)"
70
+ exit 1
71
+ fi
72
+ fi
73
+ git tag codeprobe-loop-running
74
+ ```
75
+
76
+ ### 0.5 Loop workspace root
77
+
78
+ ```bash
79
+ LOOP_ROOT=/tmp/codeprobe-loop-$(date +%Y%m%d-%H%M%S)
80
+ mkdir -p "$LOOP_ROOT"
81
+ CONVERGE_DB="$LOOP_ROOT/converge.db"
82
+ VERDICT_HISTORY=()
83
+ ```
84
+
85
+ Each iteration gets its own subdirectory `$LOOP_ROOT/iter-<N>/` that the Test Agent uses as its workspace and that holds that iteration's `verdict.json`.
86
+
87
+ ---
88
+
89
+ ## Phase 1: Test & Verify (per iteration)
90
+
91
+ For each `ITER` in `1..MAX_ITERATIONS`:
92
+
93
+ ### 1.1 Per-iteration workspace
94
+
95
+ ```bash
96
+ WORKSPACE="$LOOP_ROOT/iter-$ITER"
97
+ mkdir -p "$WORKSPACE"
98
+ ```
99
+
100
+ ### 1.2 Spawn the Test Agent sub-agent
101
+
102
+ #### 1.2a Compile criterion-driven actions
103
+
104
+ Before reading the Test Agent prompt, compile the Phase 5 pipeline steps from `acceptance/criteria.toml`. This replaces the old hardcoded 5a-5e pipeline with one step per criterion that the Verifier can actually check:
105
+
106
+ ```bash
107
+ COMPILED_ACTIONS=$(python3 -c "
108
+ import pathlib
109
+ from acceptance.loader import load_criteria
110
+ from codeprobe.acceptance_compiler import compile_actions
111
+
112
+ criteria = load_criteria()
113
+ actions = compile_actions(
114
+ criteria,
115
+ target_repo=pathlib.Path('$TARGET_REPO'),
116
+ workspace=pathlib.Path('$WORKSPACE'),
117
+ project_root=pathlib.Path('$REPO_ROOT'),
118
+ )
119
+
120
+ for i, a in enumerate(actions, start=1):
121
+ print(f'### 5.{i}. {a.description}')
122
+ print()
123
+ print('\`\`\`')
124
+ print(a.shell_snippet)
125
+ print('\`\`\`')
126
+ print()
127
+ ")
128
+ ```
129
+
130
+ #### 1.2b Bind and spawn
131
+
132
+ Read `./.claude/skills/acceptance-loop/test-agent.md`, substitute the five `{{PARAM}}` tokens (`{{ITERATION}}`, `{{TARGET_REPO}}`, `{{PINNED_SHA}}`, `{{EVAL_MODE}}`, `{{COMPILED_ACTIONS}}`), and hand the bound prompt to a `general-purpose` sub-agent via the Agent tool. Also pass `{{WORKSPACE}} = $WORKSPACE` if the sub-skill references it.
133
+
134
+ Wait for the sub-agent to exit. It MUST produce `$WORKSPACE/workspace-manifest.json`. If the manifest is missing, jump to the ESCALATE handler with reason `test_agent_no_manifest`.
135
+
136
+ ### 1.3 Run the Verifier
137
+
138
+ The verifier has no argparse CLI, so drive it via a one-shot `python3 -c` that imports `Verifier`, runs it, and writes the verdict to `$WORKSPACE/verdict.json`:
139
+
140
+ ```bash
141
+ python3 -c "
142
+ import pathlib
143
+ from acceptance.verify import Verifier
144
+ v = Verifier(pathlib.Path('$REPO_ROOT/acceptance/criteria.toml'),
145
+ project_root=pathlib.Path('$REPO_ROOT'))
146
+ verdict = v.run(pathlib.Path('$WORKSPACE'), iteration=$ITER)
147
+ v.write_verdict(verdict, pathlib.Path('$WORKSPACE/verdict.json'))
148
+ " || { echo 'FAILURE: verifier crashed'; exit 3; }
149
+ VERDICT_HISTORY+=("$WORKSPACE/verdict.json")
150
+ ```
151
+
152
+ ### 1.4 Record the verdict with the convergence controller
153
+
154
+ ```bash
155
+ python3 -c "
156
+ import json, pathlib
157
+ from acceptance.converge import ConvergenceController
158
+ cc = ConvergenceController(pathlib.Path('$CONVERGE_DB'), max_iterations=$MAX_ITERATIONS)
159
+ cc.record_verdict(json.loads(pathlib.Path('$WORKSPACE/verdict.json').read_text()))
160
+ "
161
+ ```
162
+
163
+ ### 1.5 Ask for the decision
164
+
165
+ ```bash
166
+ DECISION=$(python3 -c "
167
+ import pathlib
168
+ from acceptance.converge import ConvergenceController
169
+ cc = ConvergenceController(pathlib.Path('$CONVERGE_DB'), max_iterations=$MAX_ITERATIONS)
170
+ print(cc.decide().decision.value)
171
+ ")
172
+ ```
173
+
174
+ Branch on `$DECISION`:
175
+ - `release` → jump to **Phase 3: Release**.
176
+ - `continue` → if the verdict has failures, proceed to **Phase 2: Fix**; otherwise increment `ITER` and loop back to 1.1.
177
+ - `halt_max_iterations` | `halt_regression` | `halt_stuck` | `escalate` → jump to **Halt Conditions**.
178
+
179
+ ---
180
+
181
+ ## Phase 2: Fix (conditional)
182
+
183
+ Only entered when `$DECISION == continue` AND the verdict has `fail_count > 0`. If `fail_count == 0` but the controller still says `continue`, skip directly to the next iteration (the loop is waiting for the second green in a row).
184
+
185
+ ### 2.1 Spawn the Fix Agent sub-agent
186
+
187
+ Read `./.claude/skills/acceptance-loop/fix-agent.md`, substitute its parameters (`{{ITERATION}}`, `{{REPO_ROOT}}`, `{{VERDICT_PATH}}`), and hand the bound prompt to a fresh `general-purpose` sub-agent. The Fix Agent is contractually constrained to produce exactly ONE commit or print `FAILURE: <criterion_id>` on stdout.
188
+
189
+ ### 2.2 Regression gate after every fix
190
+
191
+ Run the regression gate against `$REPO_ROOT`. It runs pytest, ruff, and mypy, and automatically reverts HEAD if any of them fails:
192
+
193
+ ```bash
194
+ python3 -m acceptance.regression --repo-root "$REPO_ROOT"
195
+ RC=$?
196
+ if [ $RC -ne 0 ]; then
197
+ echo "regression gate FAILED at iteration $ITER — commit reverted"
198
+ fi
199
+ ```
200
+
201
+ A regression-gate failure is **not** an automatic halt — the Test Agent re-runs on the reverted tree in the next iteration. The convergence controller halts the loop on its own via `HALT_REGRESSION` if `pass_count` drops between consecutive verdicts.
202
+
203
+ ### 2.3 Loop
204
+
205
+ `ITER=$((ITER+1))` and jump back to **Phase 1: Test & Verify**. Do not clear `$CONVERGE_DB` — it is the source of truth for the two-green-in-a-row release check.
206
+
207
+ ---
208
+
209
+ ## Phase 3: Release (conditional)
210
+
211
+ Entered exactly once, when `cc.decide().decision == Decision.RELEASE`. Release is all-or-nothing: any sub-step failure aborts with an escalation report, leaves the lock tag in place until Cleanup, and returns non-zero.
212
+
213
+ ```bash
214
+ python3 -c "
215
+ import pathlib, sys
216
+ from acceptance.release import ReleaseGate
217
+ gate = ReleaseGate(pathlib.Path('$REPO_ROOT'))
218
+ verdicts = [pathlib.Path(p) for p in '''${VERDICT_HISTORY[@]}'''.split()]
219
+ if not gate.check_ready(verdicts):
220
+ print('FAILURE: release gate refused — verdict history not ready'); sys.exit(2)
221
+ staging = gate.build_and_stage()
222
+ if staging.error:
223
+ print(f'FAILURE: staging failed — {staging.error}'); sys.exit(3)
224
+ new_version = gate.bump_version('patch')
225
+ tag = gate.prepare_tag(new_version)
226
+ print(f'RELEASE_READY version={new_version} tag={tag}')
227
+ " || { echo 'release gate failed'; exit 4; }
228
+ ```
229
+
230
+ Show the user the `RELEASE_READY` line plus the staged wheel path. The actual `git push --tags` is a human action — this loop stops at "tag prepared locally".
231
+
232
+ ---
233
+
234
+ ## Halt Conditions
235
+
236
+ When `cc.decide()` returns any decision other than `continue` or `release`, render and surface the escalation report before cleaning up:
237
+
238
+ ```bash
239
+ python3 -c "
240
+ import pathlib
241
+ from acceptance.converge import ConvergenceController
242
+ cc = ConvergenceController(pathlib.Path('$CONVERGE_DB'), max_iterations=$MAX_ITERATIONS)
243
+ print(cc.get_escalation_report())
244
+ " > "$LOOP_ROOT/escalation.md"
245
+ cat "$LOOP_ROOT/escalation.md"
246
+ ```
247
+
248
+ Decision-specific user messaging:
249
+
250
+ - **`halt_max_iterations`** — Loop cap hit without reaching two-green. Report iteration count, latest `pass_count/fail_count`, and the escalation markdown. Exit code 10.
251
+ - **`halt_regression`** — A fix made things worse (`pass_count` dropped). Report the regression delta from the decision context. The offending commit was already reverted by the regression gate; point the user at `$LOOP_ROOT/iter-<N>/verdict.json`. Exit code 11.
252
+ - **`halt_stuck`** / **`escalate`** — Three-strike rule triggered: same criterion failed 3 iterations in a row with identical evidence. Report the stuck criterion IDs, their evidence, and recommend human investigation. Exit code 12.
253
+
254
+ In every halt path, preserve `$LOOP_ROOT` for post-mortem — do NOT delete it in Cleanup.
255
+
256
+ ---
257
+
258
+ ## Cleanup
259
+
260
+ Always executed, even on failure, via a `trap` at the top of the loop or an explicit final block:
261
+
262
+ 1. Remove the concurrent-run lock: `cd "$REPO_ROOT" && git tag -d codeprobe-loop-running 2>/dev/null || true`.
263
+ 2. On the RELEASE path, optionally prune `$LOOP_ROOT`. On every halt path — including `halt_max_iterations`, even when the last verdict was green — preserve it and print the path so the user can inspect iteration workspaces and `escalation.md`.
264
+ 3. Print a one-line summary: `acceptance-loop done: iterations=N decision=$DECISION workspace=$LOOP_ROOT`.
265
+
266
+ ---
267
+
268
+ ## References
269
+
270
+ - `acceptance/criteria.toml` — 25 seed criteria in TOML.
271
+ - `acceptance/loader.py::load_criteria()` — parsed into `Criterion` objects.
272
+ - `src/codeprobe/acceptance_compiler.py::compile_actions()` — compiles criteria into Test Agent shell actions.
273
+ - `acceptance/verify.py::Verifier.run()` / `.write_verdict()` — produces `verdict.json`.
274
+ - `acceptance/converge.py::ConvergenceController` — `record_verdict`, `decide`, `is_release_ready`, `get_escalation_report`.
275
+ - `acceptance/regression.py` — `python3 -m acceptance.regression --repo-root <path>`.
276
+ - `acceptance/release.py::ReleaseGate` — `check_ready`, `build_and_stage`, `bump_version`, `prepare_tag`.
277
+ - [`test-agent.md`](./test-agent.md) — Test Agent sub-skill prompt (do not inline).
278
+ - [`fix-agent.md`](./fix-agent.md) — Fix Agent sub-skill prompt (do not inline).
@@ -0,0 +1,95 @@
1
+ ---
2
+ name: assess-codebase
3
+ description: Assess a codebase for AI agent benchmarking potential. Analyzes repo structure, complexity, and history to estimate how well-suited it is for meaningful agent evaluation. Triggers on assess codebase, codebase assessment, evaluate codebase, codebase readiness, benchmark potential.
4
+ user-invocable: true
5
+ ---
6
+
7
+ # Assess Codebase
8
+
9
+ Analyze a codebase to determine how well-suited it is for meaningful AI agent benchmarking. Produces a readiness report covering repo structure, complexity, history depth, test infrastructure, and task mining potential.
10
+
11
+ Invokes `codeprobe assess` under the hood -- all analysis runs through the CLI, not Python imports.
12
+
13
+ ---
14
+
15
+ ## Phase 0: Assessment Goals
16
+
17
+ Ask the user:
18
+
19
+ **Question 1** -- Header: "Target codebase"
20
+ - Question: "Which codebase should I assess?"
21
+ - Options:
22
+ - **Current directory** -- "Assess the repo in the current working directory"
23
+ - **Specific path** -- "I'll provide a path to a local repo"
24
+
25
+ If **Current directory**, set `REPO_PATH=.`.
26
+ If **Specific path**, prompt for the absolute path and set `REPO_PATH={user_input}`.
27
+
28
+ ### Validate Path
29
+
30
+ Before proceeding, confirm the path is a valid git repo:
31
+
32
+ ```bash
33
+ git -C {REPO_PATH} rev-parse --git-dir 2>/dev/null && echo "valid" || echo "not a git repo"
34
+ ```
35
+
36
+ If not a git repo, ask the user for a different path.
37
+
38
+ ---
39
+
40
+ ## Phase 1: Run Assessment
41
+
42
+ Execute the codeprobe CLI:
43
+
44
+ ```bash
45
+ codeprobe assess {REPO_PATH}
46
+ ```
47
+
48
+ This analyzes:
49
+ - Repository structure and size
50
+ - Language distribution
51
+ - Code complexity signals
52
+ - Git history depth and merge activity
53
+ - Test infrastructure coverage
54
+ - Build system and CI presence
55
+
56
+ ---
57
+
58
+ ## Phase 2: Present Results
59
+
60
+ Display the assessment output to the user. Highlight:
61
+
62
+ 1. **Benchmarking potential** -- Is this repo a good candidate for agent evaluation?
63
+ 2. **Task mining readiness** -- Does the repo have enough merge history and test coverage for `/mine-tasks`?
64
+ 3. **Key strengths** -- What makes this repo good for benchmarking (e.g., rich PR history, strong test suite)
65
+ 4. **Gaps** -- What's missing that would improve benchmarking quality (e.g., no CI, sparse test coverage)
66
+
67
+ ---
68
+
69
+ ## Phase 3: Next Steps
70
+
71
+ Based on the assessment, suggest concrete follow-up actions:
72
+
73
+ ```
74
+ Suggested next steps:
75
+
76
+ 1. {If repo scores well}: Run `codeprobe mine {REPO_PATH}` to extract eval tasks
77
+ from merged PRs.
78
+
79
+ 2. {If test coverage is low}: Consider adding tests before benchmarking --
80
+ agents can't be scored without a ground truth.
81
+
82
+ 3. {If history is shallow}: The repo needs more merged PRs for meaningful
83
+ task mining. Consider using a more active repo.
84
+ ```
85
+
86
+ ---
87
+
88
+ ## Quick Reference
89
+
90
+ | User says | What happens |
91
+ |-----------|-------------|
92
+ | `/assess-codebase` | Assess current directory |
93
+ | `/assess-codebase /path/to/repo` | Assess specific repo |
94
+ | "is this repo good for benchmarking?" | Same as `/assess-codebase` |
95
+ | "evaluate my codebase" | Same as `/assess-codebase` |
@@ -0,0 +1,87 @@
1
+ ---
2
+ name: codeprobe-calibrate
3
+ description: Run the codeprobe calibration gate and emit a curator profile when the R11 validity thresholds are met. Compares two curators over a holdout and enforces minimum tasks, minimum repos, and Pearson correlation before accepting. Triggers on calibrate curator, calibration gate, validity gate, curator profile, r11 gate, pearson correlation. Use this when a new curator version needs to be qualified before it is used in mining or scoring pipelines.
4
+ user-invocable: false
5
+ ---
6
+
7
+ # codeprobe calibrate (autonomous agent contract)
8
+
9
+ Gate a curator version against a holdout set. A profile is emitted only when
10
+ three validity conditions are met: holdout size, repo diversity, and Pearson
11
+ correlation against the reference curator. Any failure exits non-zero without
12
+ writing a profile.
13
+
14
+ ## Environment (pre-loaded)
15
+
16
+ - !`codeprobe doctor --json`
17
+
18
+ If doctor reports provider-related failures (e.g. `LLM_UNAVAILABLE`), calibrate
19
+ will almost certainly fail as well. Resolve the doctor failures first.
20
+
21
+ ## Bare invocation
22
+
23
+ Minimum viable call. `--curator-version` is required:
24
+
25
+ ```bash
26
+ codeprobe calibrate <holdout_path> --json --curator-version <id>
27
+ ```
28
+
29
+ Emit the profile to a specific path:
30
+
31
+ ```bash
32
+ codeprobe calibrate <holdout_path> --json --curator-version <id> --out <profile.json>
33
+ ```
34
+
35
+ Adjust acceptance thresholds for an exploratory run (defaults are the R11
36
+ thresholds of 0.6 correlation / 100 tasks / 3 repos, which the example below spells out — do NOT relax them in CI):
37
+
38
+ ```bash
39
+ codeprobe calibrate <holdout_path> --json --curator-version <id> --threshold 0.6 --min-tasks 100 --min-repos 3
40
+ ```
41
+
42
+ ## JSON fields to parse
43
+
44
+ ```json
45
+ {
46
+ "status": "ok" | "error",
47
+ "command": "calibrate",
48
+ "exit_code": 0,
49
+ "data": {
50
+ "curator_version": "...",
51
+ "holdout_tasks": <int>,
52
+ "holdout_repos": <int>,
53
+ "pearson_correlation": <float>,
54
+ "thresholds": { "min_tasks": <int>, "min_repos": <int>, "threshold": <float> },
55
+ "profile_path": "<abs-path | null>",
56
+ "passed": <bool>
57
+ },
58
+ "errors": [ { "code": "<CODE>", "message": "...", "remediation": "...", "terminal": <bool> } ]
59
+ }
60
+ ```
61
+
62
+ `profile_path` is `null` unless `passed == true`. A passed gate is the only
63
+ condition under which any profile artifact exists.
64
+
65
+ ## Error handling
66
+
67
+ Only the codes below may surface. Cross-reference `src/codeprobe/cli/error_codes.json`.
68
+
69
+ | Code | Kind | Retryable? | Action |
70
+ |---|---|---|---|
71
+ | CALIBRATION_REJECTED | diagnostic | no | Increase holdout size / repo diversity, or accept that the curator is not qualified. Do not auto-retry with a lowered threshold — that defeats the gate. |
72
+ | METADATA_INVALID | diagnostic | no | Holdout rows are malformed; fix data and re-run. |
73
+ | METADATA_MISSING | diagnostic | no | Required metadata columns are missing from the holdout. |
74
+ | LLM_UNAVAILABLE | diagnostic | yes (bounded) | Provider outage; one retry permitted. |
75
+ | INTERRUPTED | diagnostic | **TERMINAL — do not retry** | Signal halted the run; stop. |
76
+
77
+ ## Retry policy
78
+
79
+ - Maximum retry depth per error chain: **2**. After two consecutive errors
80
+ sharing the same code, stop and surface the envelope to the caller.
81
+ - Terminal errors (INTERRUPTED) are **never** retried.
82
+ - CALIBRATION_REJECTED is a validity signal, not a transient error. Treat it
83
+ as terminal-for-this-holdout even though the error code itself is diagnostic
84
+ — retrying the same inputs will produce the same rejection.
85
+ - Never mutate `--threshold`, `--min-tasks`, or `--min-repos` on retry.
86
+ Those values encode the R11 validity contract; changing them is a human
87
+ decision that must live in configuration, not in retry logic.
@@ -0,0 +1,106 @@
1
+ ---
2
+ name: codeprobe-check-infra
3
+ description: Diagnose mined-task infrastructure for drift and offline readiness. Compares metadata.json capability snapshots to live capabilities and runs credential-TTL preflight for airgapped runs. Triggers on check infra, capability drift, preamble drift, offline preflight, credential ttl, airgapped run readiness. Use this before running mined tasks that were produced on a different machine or weeks ago.
4
+ user-invocable: false
5
+ ---
6
+
7
+ # codeprobe check-infra (autonomous agent contract)
8
+
9
+ Pre-run diagnostics for mined task directories and airgapped environments.
10
+ Splits into two primary subcommands: `drift` (capability snapshot vs live) and
11
+ `offline` (credential TTL vs expected run duration).
12
+
13
+ ## Environment (pre-loaded)
14
+
15
+ - !`codeprobe doctor --json`
16
+ - !`codeprobe check-infra offline --json`
17
+
18
+ `doctor` gives the overall readiness state; `check-infra offline --json`
19
+ pre-warms the credential-TTL surface so the agent can decide up front whether
20
+ an offline run is viable. If the offline envelope reports `status == "error"`
21
+ with `OFFLINE_PREFLIGHT_FAILED`, do NOT attempt an offline run until that failure is resolved.
22
+
23
+ ## Bare invocation
24
+
25
+ Capability drift against a specific task directory:
26
+
27
+ ```bash
28
+ codeprobe check-infra drift <task_dir> --json
29
+ ```
30
+
31
+ Tolerate drift (emit warning instead of failing):
32
+
33
+ ```bash
34
+ codeprobe check-infra drift <task_dir> --json --allow-capability-drift
35
+ ```
36
+
37
+ Offline credential preflight for an anticipated 2-hour run:
38
+
39
+ ```bash
40
+ codeprobe check-infra offline --json --expected-run-duration 2h
41
+ ```
42
+
43
+ Restrict the offline check to a single backend:
44
+
45
+ ```bash
46
+ codeprobe check-infra offline --json --backend claude
47
+ ```
48
+
49
+ ## JSON fields to parse
50
+
51
+ Drift:
52
+
53
+ ```json
54
+ {
55
+ "status": "ok" | "error",
56
+ "command": "check-infra drift",
57
+ "exit_code": 0,
58
+ "data": {
59
+ "task_dir": "<abs-path>",
60
+ "drift_detected": <bool>,
61
+ "snapshot_capabilities": [ "..." ],
62
+ "live_capabilities": [ "..." ],
63
+ "added": [ "..." ],
64
+ "removed": [ "..." ]
65
+ },
66
+ "errors": [ { "code": "<CODE>", "message": "...", "remediation": "...", "terminal": <bool> } ]
67
+ }
68
+ ```
69
+
70
+ Offline:
71
+
72
+ ```json
73
+ {
74
+ "status": "ok" | "error",
75
+ "command": "check-infra offline",
76
+ "exit_code": 0,
77
+ "data": {
78
+ "expected_run_duration_seconds": <int>,
79
+ "backends": [ { "name": "...", "ttl_seconds": <int | null>, "ok": <bool> } ]
80
+ },
81
+ "errors": [ { "code": "<CODE>", "message": "...", "remediation": "...", "terminal": <bool> } ]
82
+ }
83
+ ```
84
+
85
+ ## Error handling
86
+
87
+ Only the codes below may surface. Cross-reference `src/codeprobe/cli/error_codes.json`.
88
+
89
+ | Code | Kind | Retryable? | Action |
90
+ |---|---|---|---|
91
+ | CAPABILITY_DRIFT | diagnostic | no | Run `codeprobe doctor --capabilities`; re-mine or re-baseline if intentional. |
92
+ | METADATA_MISSING | diagnostic | no | Target task_dir has no metadata.json; stop. |
93
+ | OFFLINE_PREFLIGHT_FAILED | diagnostic | no | At least one backend's credential TTL is too short; rotate/refresh credentials. |
94
+ | OFFLINE_NET_ATTEMPT | diagnostic | no | Component attempted network IO while offline; fix config. |
95
+ | STALE_USER_HOME_SKILL | diagnostic | yes (with fix) | Re-install the referenced skill bundle per remediation. |
96
+ | DOCTOR_CHECKS_FAILED | diagnostic | no | Cross-surfaced from doctor; resolve those checks first. |
97
+ | INTERRUPTED | diagnostic | **TERMINAL — do not retry** | Signal halted the command; stop. |
98
+
99
+ ## Retry policy
100
+
101
+ - Maximum retry depth per error chain: **2**. After two consecutive errors
102
+ sharing the same code, stop and surface the envelope to the caller.
103
+ - Terminal errors (INTERRUPTED) are **never** retried.
104
+ - Drift errors almost always need a human decision (re-mine vs accept-drift).
105
+ Do not auto-retry with `--allow-capability-drift` unless the caller asked
106
+ for it — that flag changes semantics, not transient state.
@@ -0,0 +1,80 @@
1
+ ---
2
+ name: codeprobe-interpret
3
+ description: Analyze eval results from codeprobe runs. Compares configurations statistically, ranks by score and cost-efficiency, and produces actionable recommendations in JSON or pretty text. Triggers on interpret results, analyze eval results, compare configurations, rank agents, score regression, plot regression. Use this when the agent needs to turn a `codeprobe run` output directory into structured analysis.
4
+ user-invocable: false
5
+ ---
6
+
7
+ # codeprobe interpret (autonomous agent contract)
8
+
9
+ Turn a results directory (or mined-tasks directory in `--regression` mode) into
10
+ a structured analysis envelope. Reporting-only: no side effects on the target
11
+ data.
12
+
13
+ ## Environment (pre-loaded)
14
+
15
+ - !`codeprobe doctor --json`
16
+
17
+ `doctor` is the single source of truth for environment readiness. Interpret is
18
+ read-only, so most doctor failures (missing backends, credentials) do NOT block
19
+ this command. Still, if doctor reports a corrupt `.codeprobe` state, resolve it
20
+ before interpreting.
21
+
22
+ ## Bare invocation
23
+
24
+ ```bash
25
+ codeprobe interpret <results_path> --json
26
+ ```
27
+
28
+ Regression mode (per-task score over commit history from `codeprobe mine --refresh`):
29
+
30
+ ```bash
31
+ codeprobe interpret <tasks_path> --json --regression --results <results_path>
32
+ ```
33
+
34
+ Alternative serialization via `--format` (applies only when `--json` is not set):
35
+
36
+ ```bash
37
+ codeprobe interpret <results_path> --format csv
38
+ ```
39
+
40
+ ## JSON fields to parse
41
+
42
+ ```json
43
+ {
44
+ "status": "ok" | "error",
45
+ "command": "interpret",
46
+ "exit_code": 0,
47
+ "data": {
48
+ "configs": [
49
+ { "id": "...", "score_mean": <float>, "cost_mean_usd": <float>, "rank": <int> }
50
+ ],
51
+ "recommendations": [ { "text": "...", "confidence": <float> } ],
52
+ "regression": { "task_id": "...", "series": [ { "sha": "...", "score": <float> } ] }
53
+ },
54
+ "errors": [ { "code": "<CODE>", "message": "...", "remediation": "...", "terminal": <bool> } ]
55
+ }
56
+ ```
57
+
58
+ `data.regression` is only present when `--regression` is passed. `data.configs`
59
+ is always a sorted list; `rank == 1` is the top config.
60
+
61
+ ## Error handling
62
+
63
+ Interpret is reporting-only, so the error surface is small. Only the codes
64
+ below may surface. Cross-reference `src/codeprobe/cli/error_codes.json`.
65
+
66
+ | Code | Kind | Retryable? | Action |
67
+ |---|---|---|---|
68
+ | NO_TASKS | diagnostic | no | Target results dir has no tasks; check the path. |
69
+ | METADATA_MISSING | diagnostic | no | Structural integrity problem; stop and surface. |
70
+ | METADATA_INVALID | diagnostic | no | Structural integrity problem; run `codeprobe validate --strict` first. |
71
+ | INTERRUPTED | diagnostic | **TERMINAL — do not retry** | Signal halted the run; stop. |
72
+
73
+ ## Retry policy
74
+
75
+ - Maximum retry depth per error chain: **2**. After two consecutive errors
76
+ sharing the same code, stop and surface the envelope to the caller.
77
+ - Terminal errors (INTERRUPTED) are **never** retried.
78
+ - Because interpret is read-only, an error that looks retryable almost always means the upstream data
79
+ is wrong. Fix the data (re-run `codeprobe run` or `codeprobe validate`)
80
+ rather than loop on the same inputs.