oh-my-opencode 4.3.1 → 4.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (222) hide show
  1. package/.agents/command/get-unpublished-changes.md +148 -0
  2. package/.agents/command/omomomo.md +37 -0
  3. package/.agents/command/publish.md +376 -0
  4. package/.agents/command/remove-deadcode.md +221 -0
  5. package/.agents/command/security-research.md +16 -0
  6. package/.agents/skills/get-unpublished-changes/SKILL.md +24 -0
  7. package/.agents/skills/github-triage/SKILL.md +587 -0
  8. package/.agents/skills/github-triage/scripts/gh_fetch.py +398 -0
  9. package/.agents/skills/hyperplan/SKILL.md +450 -0
  10. package/.agents/skills/omomomo/SKILL.md +36 -0
  11. package/.agents/skills/pre-publish-review/SKILL.md +407 -0
  12. package/.agents/skills/publish/SKILL.md +428 -0
  13. package/.agents/skills/remove-deadcode/SKILL.md +216 -0
  14. package/.agents/skills/security-research/SKILL.md +204 -0
  15. package/.agents/skills/work-with-pr/SKILL.md +360 -0
  16. package/.agents/skills/work-with-pr-workspace/evals/evals.json +76 -0
  17. package/.agents/skills/work-with-pr-workspace/iteration-1/benchmark.json +138 -0
  18. package/.agents/skills/work-with-pr-workspace/iteration-1/benchmark.md +42 -0
  19. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/eval_metadata.json +57 -0
  20. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/grading.json +15 -0
  21. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/outputs/code-changes.md +454 -0
  22. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/outputs/execution-plan.md +136 -0
  23. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/outputs/pr-description.md +47 -0
  24. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/outputs/verification-strategy.md +163 -0
  25. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/timing.json +1 -0
  26. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/grading.json +15 -0
  27. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/outputs/code-changes.md +615 -0
  28. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/outputs/execution-plan.md +99 -0
  29. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/outputs/pr-description.md +50 -0
  30. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/outputs/verification-strategy.md +111 -0
  31. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/timing.json +1 -0
  32. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/eval_metadata.json +37 -0
  33. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/grading.json +11 -0
  34. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/outputs/code-changes.md +205 -0
  35. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/outputs/execution-plan.md +78 -0
  36. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/outputs/pr-description.md +42 -0
  37. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/outputs/verification-strategy.md +87 -0
  38. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/timing.json +1 -0
  39. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/grading.json +11 -0
  40. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/outputs/code-changes.md +334 -0
  41. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/outputs/execution-plan.md +86 -0
  42. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/outputs/pr-description.md +23 -0
  43. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/outputs/verification-strategy.md +119 -0
  44. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/timing.json +1 -0
  45. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/eval_metadata.json +32 -0
  46. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/grading.json +10 -0
  47. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/outputs/code-changes.md +221 -0
  48. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/outputs/execution-plan.md +104 -0
  49. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/outputs/pr-description.md +41 -0
  50. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/outputs/verification-strategy.md +84 -0
  51. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/timing.json +1 -0
  52. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/grading.json +10 -0
  53. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/outputs/code-changes.md +342 -0
  54. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/outputs/execution-plan.md +131 -0
  55. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/outputs/pr-description.md +39 -0
  56. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/outputs/verification-strategy.md +128 -0
  57. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/timing.json +1 -0
  58. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/eval_metadata.json +32 -0
  59. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/grading.json +10 -0
  60. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/outputs/code-changes.md +143 -0
  61. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/outputs/execution-plan.md +82 -0
  62. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/outputs/pr-description.md +51 -0
  63. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/outputs/verification-strategy.md +69 -0
  64. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/timing.json +1 -0
  65. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/grading.json +10 -0
  66. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/outputs/code-changes.md +252 -0
  67. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/outputs/execution-plan.md +83 -0
  68. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/outputs/pr-description.md +33 -0
  69. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/outputs/verification-strategy.md +101 -0
  70. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/timing.json +1 -0
  71. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/eval_metadata.json +32 -0
  72. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/grading.json +10 -0
  73. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/outputs/code-changes.md +387 -0
  74. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/outputs/execution-plan.md +112 -0
  75. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/outputs/pr-description.md +51 -0
  76. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/outputs/verification-strategy.md +75 -0
  77. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/timing.json +1 -0
  78. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/grading.json +10 -0
  79. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/outputs/code-changes.md +529 -0
  80. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/outputs/execution-plan.md +127 -0
  81. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/outputs/pr-description.md +42 -0
  82. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/outputs/verification-strategy.md +120 -0
  83. package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/timing.json +1 -0
  84. package/.agents/skills/work-with-pr-workspace/iteration-1/review.html +1326 -0
  85. package/.opencode/command/get-unpublished-changes.md +148 -0
  86. package/.opencode/command/omomomo.md +37 -0
  87. package/.opencode/command/publish.md +376 -0
  88. package/.opencode/command/remove-deadcode.md +221 -0
  89. package/.opencode/command/security-research.md +16 -0
  90. package/.opencode/skills/github-triage/SKILL.md +587 -0
  91. package/.opencode/skills/github-triage/scripts/gh_fetch.py +398 -0
  92. package/.opencode/skills/hyperplan/SKILL.md +450 -0
  93. package/.opencode/skills/pre-publish-review/SKILL.md +407 -0
  94. package/.opencode/skills/work-with-pr/SKILL.md +360 -0
  95. package/.opencode/skills/work-with-pr-workspace/evals/evals.json +76 -0
  96. package/.opencode/skills/work-with-pr-workspace/iteration-1/benchmark.json +138 -0
  97. package/.opencode/skills/work-with-pr-workspace/iteration-1/benchmark.md +42 -0
  98. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/eval_metadata.json +57 -0
  99. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/grading.json +15 -0
  100. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/outputs/code-changes.md +454 -0
  101. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/outputs/execution-plan.md +136 -0
  102. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/outputs/pr-description.md +47 -0
  103. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/outputs/verification-strategy.md +163 -0
  104. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/timing.json +1 -0
  105. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/grading.json +15 -0
  106. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/outputs/code-changes.md +615 -0
  107. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/outputs/execution-plan.md +99 -0
  108. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/outputs/pr-description.md +50 -0
  109. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/outputs/verification-strategy.md +111 -0
  110. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/timing.json +1 -0
  111. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/eval_metadata.json +37 -0
  112. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/grading.json +11 -0
  113. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/outputs/code-changes.md +205 -0
  114. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/outputs/execution-plan.md +78 -0
  115. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/outputs/pr-description.md +42 -0
  116. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/outputs/verification-strategy.md +87 -0
  117. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/timing.json +1 -0
  118. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/grading.json +11 -0
  119. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/outputs/code-changes.md +334 -0
  120. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/outputs/execution-plan.md +86 -0
  121. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/outputs/pr-description.md +23 -0
  122. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/outputs/verification-strategy.md +119 -0
  123. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/timing.json +1 -0
  124. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/eval_metadata.json +32 -0
  125. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/grading.json +10 -0
  126. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/outputs/code-changes.md +221 -0
  127. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/outputs/execution-plan.md +104 -0
  128. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/outputs/pr-description.md +41 -0
  129. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/outputs/verification-strategy.md +84 -0
  130. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/timing.json +1 -0
  131. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/grading.json +10 -0
  132. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/outputs/code-changes.md +342 -0
  133. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/outputs/execution-plan.md +131 -0
  134. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/outputs/pr-description.md +39 -0
  135. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/outputs/verification-strategy.md +128 -0
  136. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/timing.json +1 -0
  137. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/eval_metadata.json +32 -0
  138. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/grading.json +10 -0
  139. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/outputs/code-changes.md +143 -0
  140. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/outputs/execution-plan.md +82 -0
  141. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/outputs/pr-description.md +51 -0
  142. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/outputs/verification-strategy.md +69 -0
  143. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/timing.json +1 -0
  144. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/grading.json +10 -0
  145. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/outputs/code-changes.md +252 -0
  146. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/outputs/execution-plan.md +83 -0
  147. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/outputs/pr-description.md +33 -0
  148. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/outputs/verification-strategy.md +101 -0
  149. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/timing.json +1 -0
  150. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/eval_metadata.json +32 -0
  151. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/grading.json +10 -0
  152. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/outputs/code-changes.md +387 -0
  153. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/outputs/execution-plan.md +112 -0
  154. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/outputs/pr-description.md +51 -0
  155. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/outputs/verification-strategy.md +75 -0
  156. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/timing.json +1 -0
  157. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/grading.json +10 -0
  158. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/outputs/code-changes.md +529 -0
  159. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/outputs/execution-plan.md +127 -0
  160. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/outputs/pr-description.md +42 -0
  161. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/outputs/verification-strategy.md +120 -0
  162. package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/timing.json +1 -0
  163. package/.opencode/skills/work-with-pr-workspace/iteration-1/review.html +1326 -0
  164. package/README.ja.md +1 -1
  165. package/README.ko.md +1 -1
  166. package/README.md +1 -1
  167. package/README.ru.md +1 -1
  168. package/README.zh-cn.md +1 -1
  169. package/dist/agents/atlas/agent.d.ts +6 -6
  170. package/dist/agents/prometheus/gemini.d.ts +0 -11
  171. package/dist/agents/prometheus/gpt.d.ts +0 -10
  172. package/dist/agents/prometheus/system-prompt.d.ts +2 -20
  173. package/dist/agents/types.d.ts +1 -16
  174. package/dist/cli/index.js +60 -20
  175. package/dist/config/schema/agent-names.d.ts +3 -3
  176. package/dist/config/schema/agent-overrides.d.ts +208 -208
  177. package/dist/config/schema/categories.d.ts +28 -28
  178. package/dist/config/schema/fallback-models.d.ts +20 -20
  179. package/dist/config/schema/oh-my-opencode-config.d.ts +208 -208
  180. package/dist/features/background-agent/parent-wake-dedupe.d.ts +19 -0
  181. package/dist/features/background-agent/parent-wake-notifier.d.ts +8 -19
  182. package/dist/help/schema/acp.d.ts +95 -0
  183. package/dist/help/schema/doctor.d.ts +147 -0
  184. package/dist/help/schema/sandbox.d.ts +74 -0
  185. package/dist/help/schema/status.d.ts +139 -0
  186. package/dist/hooks/keyword-detector/analyze/default.d.ts +1 -1
  187. package/dist/hooks/keyword-detector/hyperplan/default.d.ts +1 -1
  188. package/dist/hooks/keyword-detector/search/default.d.ts +1 -1
  189. package/dist/hooks/keyword-detector/team/default.d.ts +2 -7
  190. package/dist/hooks/keyword-detector/ultrawork/default.d.ts +1 -9
  191. package/dist/hooks/keyword-detector/ultrawork/gemini.d.ts +1 -16
  192. package/dist/hooks/keyword-detector/ultrawork/gpt.d.ts +1 -10
  193. package/dist/hooks/keyword-detector/ultrawork/planner.d.ts +1 -5
  194. package/dist/hooks/ralph-loop/no-progress-turn-detector.d.ts +7 -0
  195. package/dist/hooks/ralph-loop/pending-verification-handler.d.ts +1 -0
  196. package/dist/hooks/ralph-loop/types.d.ts +1 -0
  197. package/dist/hooks/runtime-fallback/error-classifier.d.ts +1 -0
  198. package/dist/hooks/tool-pair-validator/hook.d.ts +6 -1
  199. package/dist/index.js +51976 -50299
  200. package/dist/plugin-handlers/provider-config-handler.d.ts +1 -0
  201. package/dist/shared/migration/model-versions.d.ts +6 -0
  202. package/dist/shared/prompt-async-gate/pending-tool-turn.d.ts +1 -0
  203. package/dist/shared/prompt-async-gate/types.d.ts +4 -3
  204. package/package.json +19 -13
  205. package/dist/agents/atlas/default-prompt-sections.d.ts +0 -6
  206. package/dist/agents/atlas/default.d.ts +0 -2
  207. package/dist/agents/atlas/gemini-prompt-sections.d.ts +0 -6
  208. package/dist/agents/atlas/gemini.d.ts +0 -2
  209. package/dist/agents/atlas/gpt-prompt-sections.d.ts +0 -6
  210. package/dist/agents/atlas/gpt.d.ts +0 -2
  211. package/dist/agents/atlas/kimi-prompt-sections.d.ts +0 -6
  212. package/dist/agents/atlas/kimi.d.ts +0 -2
  213. package/dist/agents/atlas/opus-4-7-prompt-sections.d.ts +0 -6
  214. package/dist/agents/atlas/opus-4-7.d.ts +0 -2
  215. package/dist/agents/atlas/shared-prompt.d.ts +0 -9
  216. package/dist/agents/prometheus/behavioral-summary.d.ts +0 -6
  217. package/dist/agents/prometheus/high-accuracy-mode.d.ts +0 -6
  218. package/dist/agents/prometheus/identity-constraints.d.ts +0 -7
  219. package/dist/agents/prometheus/interview-mode.d.ts +0 -7
  220. package/dist/agents/prometheus/plan-generation.d.ts +0 -7
  221. package/dist/agents/prometheus/plan-template.d.ts +0 -7
  222. package/dist/agents/prometheus/spec-driven-mode.d.ts +0 -7
@@ -0,0 +1,360 @@
1
+ ---
2
+ name: work-with-pr
3
+ description: "Full PR lifecycle: git worktree → implement → atomic commits → PR creation → verification loop (CI + review-work + Cubic approval) → merge. Keeps iterating until ALL gates pass and PR is merged. Worktree auto-cleanup after merge. Use whenever implementation work needs to land as a PR. Triggers: 'create a PR', 'implement and PR', 'work on this and make a PR', 'implement issue', 'land this as a PR', 'work-with-pr', 'PR workflow', 'implement end to end', even when user just says 'implement X' if the context implies PR delivery."
4
+ ---
5
+
6
+ # Work With PR — Full PR Lifecycle
7
+
8
+ You are executing a complete PR lifecycle: from isolated worktree setup through implementation, PR creation, and an unbounded verification loop until the PR is merged. The loop has three gates — CI, review-work, and Cubic — and you keep fixing and pushing until all three pass simultaneously.
9
+
10
+ <architecture>
11
+
12
+ ```
13
+ Phase 0: Setup → Branch + worktree in sibling directory
14
+ Phase 1: Implement → Do the work, atomic commits
15
+ Phase 2: PR Creation → Push, create PR targeting dev
16
+ Phase 3: Verify Loop → Unbounded iteration until ALL gates pass:
17
+ ├─ Gate A: CI → gh pr checks (bun test, typecheck, build)
18
+ ├─ Gate B: review-work → 5-agent parallel review
19
+ └─ Gate C: Cubic → cubic-dev-ai[bot] "No issues found"
20
+ Phase 4: Merge → Squash merge, worktree cleanup
21
+ ```
22
+
23
+ </architecture>
24
+
25
+ ---
26
+
27
+ ## Phase 0: Setup
28
+
29
+ Create an isolated worktree so the user's main working directory stays clean. This matters because the user may have uncommitted work, and switching branches in place would either be blocked by those changes or risk clobbering them.
30
+
31
+ <setup>
32
+
33
+ ### 1. Resolve repository context
34
+
35
+ ```bash
36
+ REPO=$(gh repo view --json nameWithOwner -q .nameWithOwner)
37
+ REPO_NAME=$(basename "$PWD")
38
+ BASE_BRANCH="dev" # CI blocks PRs to master
39
+ ```
40
+
41
+ ### 2. Create branch
42
+
43
+ If the user provides a branch name, use it. Otherwise, derive one from the task:
44
+
45
+ ```bash
46
+ # Auto-generate: feature/short-description or fix/short-description
47
+ BRANCH_NAME="feature/$(echo "$TASK_SUMMARY" | tr '[:upper:] ' '[:lower:]-' | head -c 50)"
48
+ git fetch origin "$BASE_BRANCH"
49
+ git branch "$BRANCH_NAME" "origin/$BASE_BRANCH"
50
+ ```
51
+
52
+ ### 3. Create worktree
53
+
54
+ Place worktrees as siblings to the repo — not inside it. This avoids nested-repository issues in git and keeps the working tree clean.
55
+
56
+ ```bash
57
+ WORKTREE_PATH="../${REPO_NAME}-wt/${BRANCH_NAME}"
58
+ mkdir -p "$(dirname "$WORKTREE_PATH")"
59
+ git worktree add "$WORKTREE_PATH" "$BRANCH_NAME"
60
+ ```
61
+
62
+ ### 4. Set working context
63
+
64
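+ All subsequent work happens inside the worktree. Before changing directories, record the starting directory (e.g. `ORIGINAL_DIR="$PWD"`) — Phase 4 relies on `$ORIGINAL_DIR` to sync state and clean up, and the relative `WORKTREE_PATH` only resolves from there. Then install dependencies if needed: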
65
+
66
+ ```bash
67
+ cd "$WORKTREE_PATH"
68
+ # If bun project:
69
+ [ -f "bun.lock" ] && bun install
70
+ ```
71
+
72
+ </setup>
73
+
74
+ ---
75
+
76
+ ## Phase 1: Implement
77
+
78
+ Do the actual implementation work inside the worktree. The agent using this skill does the work directly — no subagent delegation for the implementation itself.
79
+
80
+ **Scope discipline**: For bug fixes, stay minimal. Fix the bug, add a test for it, done. Do not refactor surrounding code, add config options, or "improve" things that aren't broken. The verification loop will catch regressions — trust the process.
81
+
82
+ <implementation>
83
+
84
+ ### Commit strategy
85
+
86
+ Use the git-master skill's atomic commit principles. The reason for atomic commits: if CI fails on one change, you can isolate and fix it without unwinding everything.
87
+
88
+ ```
89
+ 3+ files changed → 2+ commits minimum
90
+ 5+ files changed → 3+ commits minimum
91
+ 10+ files changed → 5+ commits minimum
92
+ ```
93
+
94
+ Each commit should pair implementation with its tests. Load `git-master` skill when committing:
95
+
96
+ ```
97
+ task(category="quick", load_skills=["git-master"], prompt="Commit the changes atomically following git-master conventions. Repository is at {WORKTREE_PATH}.")
98
+ ```
99
+
100
+ ### Pre-push local validation
101
+
102
+ Before pushing, run the same checks CI will run. Catching failures locally saves a full CI round-trip (~3-5 min):
103
+
104
+ ```bash
105
+ bun run typecheck
106
+ bun test
107
+ bun run build
108
+ ```
109
+
110
+ Fix any failures before pushing. Each fix-commit cycle should be atomic.
111
+
112
+ </implementation>
113
+
114
+ ---
115
+
116
+ ## Phase 2: PR Creation
117
+
118
+ <pr_creation>
119
+
120
+ ### Push and create PR
121
+
122
+ ```bash
123
+ git push -u origin "$BRANCH_NAME"
124
+ ```
125
+
126
+ Create the PR using the project's template structure:
127
+
128
+ ```bash
129
+ gh pr create \
130
+ --base "$BASE_BRANCH" \
131
+ --head "$BRANCH_NAME" \
132
+ --title "$PR_TITLE" \
133
+ --body "$(cat <<'EOF'
134
+ ## Summary
135
+ [1-3 sentences describing what this PR does and why]
136
+
137
+ ## Changes
138
+ [Bullet list of key changes]
139
+
140
+ ## Testing
141
+ - `bun run typecheck` ✅
142
+ - `bun test` ✅
143
+ - `bun run build` ✅
144
+
145
+ ## Related Issues
146
+ [Link to issue if applicable]
147
+ EOF
148
+ )"
149
+ ```
150
+
151
+ Capture the PR number:
152
+
153
+ ```bash
154
+ PR_NUMBER=$(gh pr view --json number -q .number)
155
+ ```
156
+
157
+ </pr_creation>
158
+
159
+ ---
160
+
161
+ ## Phase 3: Verification Loop
162
+
163
+ This is the core of the skill. Three gates must ALL pass for the PR to be ready. The loop has no iteration cap — keep going until done. Gate ordering is intentional: CI is cheapest/fastest, review-work is most thorough, Cubic is external and asynchronous.
164
+
165
+ <verify_loop>
166
+
167
+ ```
168
+ while true:
169
+ 1. Wait for CI → Gate A
170
+ 2. If CI fails → read logs, fix, commit, push, continue
171
+ 3. Run review-work → Gate B
172
+ 4. If review fails → fix blocking issues, commit, push, continue
173
+ 5. Check Cubic → Gate C
174
+ 6. If Cubic has issues → fix issues, commit, push, continue
175
+ 7. All three pass → break
176
+ ```
177
+
178
+ ### Gate A: CI Checks
179
+
180
+ CI is the fastest feedback loop. Wait for it to complete, then parse results.
181
+
182
+ ```bash
183
+ # Wait for checks to start (GitHub needs a moment after push)
184
+ # Then watch for completion
185
+ gh pr checks "$PR_NUMBER" --watch --fail-fast
186
+ ```
187
+
188
+ **On failure**: Get the failed run logs to understand what broke:
189
+
190
+ ```bash
191
+ # Find the failed run
192
+ RUN_ID=$(gh run list --branch "$BRANCH_NAME" --status failure --json databaseId --jq '.[0].databaseId')
193
+
194
+ # Get failed job logs
195
+ gh run view "$RUN_ID" --log-failed
196
+ ```
197
+
198
+ Read the logs, fix the issue, commit atomically, push, and re-enter the loop.
199
+
200
+ ### Gate B: review-work
201
+
202
+ The review-work skill launches 5 parallel sub-agents (goal verification, QA, code quality, security, context mining). All 5 must pass.
203
+
204
+ Invoke review-work after CI passes — there's no point reviewing code that doesn't build:
205
+
206
+ ```
207
+ task(
208
+ category="unspecified-high",
209
+ load_skills=["review-work"],
210
+ run_in_background=false,
211
+ description="Post-implementation review of PR changes",
212
+ prompt="Review the implementation work on branch {BRANCH_NAME}. The worktree is at {WORKTREE_PATH}. Goal: {ORIGINAL_GOAL}. Constraints: {CONSTRAINTS}. Run command: bun run dev (or as appropriate)."
213
+ )
214
+ ```
215
+
216
+ **On failure**: review-work reports blocking issues with specific files and line numbers. Fix each blocking issue, commit, push, and re-enter the loop from Gate A (since code changed, CI must re-run).
217
+
218
+ ### Gate C: Cubic Approval
219
+
220
+ Cubic (`cubic-dev-ai[bot]`) is an automated review bot that comments on PRs. It does NOT use GitHub's APPROVED review state — instead it posts comments with issue counts and confidence scores.
221
+
222
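+ **Approval signal**: The latest Cubic review body contains `**No issues found**` and confidence `**5/5**`. The snippet below greps only for `No issues found`, so confirm the confidence score separately when reading the review.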
223
+
224
+ **Issue signal**: The comment lists issues with file-level detail.
225
+
226
+ ```bash
227
+ # Get the latest Cubic review
228
+ CUBIC_REVIEW=$(gh api "repos/${REPO}/pulls/${PR_NUMBER}/reviews" \
229
+ --jq '[.[] | select(.user.login == "cubic-dev-ai[bot]")] | last | .body')
230
+
231
+ # Check if approved
232
+ if echo "$CUBIC_REVIEW" | grep -q "No issues found"; then
233
+ echo "Cubic: APPROVED"
234
+ else
235
+ echo "Cubic: ISSUES FOUND"
236
+ echo "$CUBIC_REVIEW"
237
+ fi
238
+ ```
239
+
240
+ **On issues**: Cubic's review body contains structured issue descriptions. Parse them, determine which are valid (some may be false positives), fix the valid ones, commit, push, re-enter from Gate A.
241
+
242
+ Cubic reviews are triggered automatically on PR updates. After pushing a fix, wait for the new review to appear before checking again. Use `gh api` polling with a conditional loop:
243
+
244
+ ```bash
245
+ # Wait for new Cubic review after push
246
+ PUSH_TIME=$(date -u +%Y-%m-%dT%H:%M:%SZ)
247
+ while true; do
248
+ LATEST_REVIEW_TIME=$(gh api "repos/${REPO}/pulls/${PR_NUMBER}/reviews" \
249
+ --jq '[.[] | select(.user.login == "cubic-dev-ai[bot]")] | last | .submitted_at')
250
+ if [[ "$LATEST_REVIEW_TIME" > "$PUSH_TIME" ]]; then
251
+ break
252
+ fi
253
+ # Use gh api call itself as the delay mechanism — each call takes ~1-2s
254
+ # For longer waits, use: timeout 30 gh pr checks "$PR_NUMBER" --watch 2>/dev/null || true
255
+ done
256
+ ```
257
+
258
+ ### Iteration discipline
259
+
260
+ Each iteration through the loop:
261
+ 1. Fix ONLY the issues identified by the failing gate
262
+ 2. Commit atomically (one logical fix per commit)
263
+ 3. Push
264
+ 4. Re-enter from Gate A (code changed → full re-verification)
265
+
266
+ Avoid the temptation to "improve" unrelated code during fix iterations. Scope creep in the fix loop makes debugging harder and can introduce new failures.
267
+
268
+ </verify_loop>
269
+
270
+ ---
271
+
272
+ ## Phase 4: Merge & Cleanup
273
+
274
+ Once all three gates pass:
275
+
276
+ <merge_cleanup>
277
+
278
+ ### Merge the PR
279
+
280
+ ```bash
281
+ # Squash merge to keep history clean
282
+ gh pr merge "$PR_NUMBER" --squash --delete-branch
283
+ ```
284
+
285
+ ### Sync .omo state back to main repo
286
+
287
+ Before removing the worktree, copy `.omo/` state back. When `.omo/` is gitignored, files written there during worktree execution are not committed or merged — they would be lost on worktree removal.
288
+
289
+ ```bash
290
+ # Sync .omo state from worktree to main repo (preserves task state, plans, notepads)
291
+ if [ -d "$WORKTREE_PATH/.omo" ]; then
292
+ mkdir -p "$ORIGINAL_DIR/.omo"
293
+ cp -r "$WORKTREE_PATH/.omo/"* "$ORIGINAL_DIR/.omo/" 2>/dev/null || true
294
+ fi
295
+ ```
296
+
297
+ ### Clean up the worktree
298
+
299
+ The worktree served its purpose — remove it to avoid disk bloat:
300
+
301
+ ```bash
302
+ cd "$ORIGINAL_DIR" # Return to original working directory
303
+ git worktree remove "$WORKTREE_PATH"
304
+ # Prune any stale worktree references
305
+ git worktree prune
306
+ ```
307
+
308
+ ### Report completion
309
+
310
+ Summarize what happened:
311
+
312
+ ```
313
+ ## PR Merged ✅
314
+
315
+ - **PR**: #{PR_NUMBER} — {PR_TITLE}
316
+ - **Branch**: {BRANCH_NAME} → {BASE_BRANCH}
317
+ - **Iterations**: {N} verification loops
318
+ - **Gates passed**: CI ✅ | review-work ✅ | Cubic ✅
319
+ - **Worktree**: cleaned up
320
+ ```
321
+
322
+ </merge_cleanup>
323
+
324
+ ---
325
+
326
+ ## Failure Recovery
327
+
328
+ <failure_recovery>
329
+
330
+ If you hit an unrecoverable error (e.g., merge conflict with base branch, infrastructure failure):
331
+
332
+ 1. **Do NOT delete the worktree** — the user may want to inspect or continue manually
333
+ 2. Report what happened, what was attempted, and where things stand
334
+ 3. Include the worktree path so the user can resume
335
+
336
+ For merge conflicts:
337
+
338
+ ```bash
339
+ cd "$WORKTREE_PATH"
340
+ git fetch origin "$BASE_BRANCH"
341
+ git rebase "origin/$BASE_BRANCH"
342
+ # Resolve conflicts and finish the rebase, then push with --force-with-lease (the rebase rewrote history) and continue the loop
343
+ ```
344
+
345
+ </failure_recovery>
346
+
347
+ ---
348
+
349
+ ## Anti-Patterns
350
+
351
+ | Violation | Why it fails | Severity |
352
+ |-----------|-------------|----------|
353
+ | Working in main worktree instead of isolated worktree | Pollutes user's working directory, may destroy uncommitted work | CRITICAL |
354
+ | Pushing directly to dev/master | Bypasses review entirely | CRITICAL |
355
+ | Skipping CI gate after code changes | review-work and Cubic may pass on stale code | CRITICAL |
356
+ | Fixing unrelated code during verification loop | Scope creep causes new failures | HIGH |
357
+ | Deleting worktree on failure | User loses ability to inspect/resume | HIGH |
358
+ | Dismissing Cubic issues as false positives without justification | Cubic issues should be evaluated, not blindly dismissed | MEDIUM |
359
+ | Giant single commits | Harder to isolate failures, violates git-master principles | MEDIUM |
360
+ | Not running local checks before push | Wastes CI time on obvious failures | MEDIUM |
@@ -0,0 +1,76 @@
1
+ {
2
+ "skill_name": "work-with-pr",
3
+ "evals": [
4
+ {
5
+ "id": 1,
6
+ "prompt": "I need to add a `max_background_agents` config option to oh-my-opencode that limits how many background agents can run simultaneously. It should be in the plugin config schema with a default of 5. Add validation and make sure the background manager respects it. Create a PR for this.",
7
+ "expected_output": "Agent creates worktree, implements config option with schema validation, adds tests, creates PR, iterates through verification gates until merged",
8
+ "files": [],
9
+ "assertions": [
10
+ {"id": "worktree-isolation", "text": "Plan uses git worktree in a sibling directory (not main working directory)"},
11
+ {"id": "branch-from-dev", "text": "Branch is created from origin/dev (not master/main)"},
12
+ {"id": "atomic-commits", "text": "Plan specifies multiple atomic commits for multi-file changes"},
13
+ {"id": "local-validation", "text": "Runs bun run typecheck, bun test, and bun run build before pushing"},
14
+ {"id": "pr-targets-dev", "text": "PR is created targeting dev branch (not master)"},
15
+ {"id": "three-gates", "text": "Verification loop includes all 3 gates: CI, review-work, and Cubic"},
16
+ {"id": "gate-ordering", "text": "Gates are checked in order: CI first, then review-work, then Cubic"},
17
+ {"id": "cubic-check-method", "text": "Cubic check uses gh api to check cubic-dev-ai[bot] reviews for 'No issues found'"},
18
+ {"id": "worktree-cleanup", "text": "Plan includes worktree cleanup after merge"},
19
+ {"id": "real-file-references", "text": "Code changes reference actual files in the codebase (config schema, background manager)"}
20
+ ]
21
+ },
22
+ {
23
+ "id": 2,
24
+ "prompt": "The atlas hook has a bug where it crashes when boulder.json is missing the worktree_path field. Fix it and land the fix as a PR. Make sure CI passes.",
25
+ "expected_output": "Agent creates worktree for the fix branch, adds null check and test for missing worktree_path, creates PR, iterates verification loop",
26
+ "files": [],
27
+ "assertions": [
28
+ {"id": "worktree-isolation", "text": "Plan uses git worktree in a sibling directory"},
29
+ {"id": "minimal-fix", "text": "Fix is minimal — adds null check, doesn't refactor unrelated code"},
30
+ {"id": "test-added", "text": "Test case added for the missing worktree_path scenario"},
31
+ {"id": "three-gates", "text": "Verification loop includes all 3 gates: CI, review-work, Cubic"},
32
+ {"id": "real-atlas-files", "text": "References actual atlas hook files in src/hooks/atlas/"},
33
+ {"id": "fix-branch-naming", "text": "Branch name follows fix/ prefix convention"}
34
+ ]
35
+ },
36
+ {
37
+ "id": 3,
38
+ "prompt": "Refactor src/tools/delegate-task/constants.ts to split DEFAULT_CATEGORIES and CATEGORY_MODEL_REQUIREMENTS into separate files. Keep backward compatibility with the barrel export. Make a PR.",
39
+ "expected_output": "Agent creates worktree, splits file with atomic commits, ensures imports still work via barrel, creates PR, runs through all gates",
40
+ "files": [],
41
+ "assertions": [
42
+ {"id": "worktree-isolation", "text": "Plan uses git worktree in a sibling directory"},
43
+ {"id": "multiple-atomic-commits", "text": "Uses 2+ commits for the multi-file refactor"},
44
+ {"id": "barrel-export", "text": "Maintains backward compatibility via barrel re-export in constants.ts or index.ts"},
45
+ {"id": "three-gates", "text": "Verification loop includes all 3 gates"},
46
+ {"id": "real-constants-file", "text": "References actual src/tools/delegate-task/constants.ts file and its exports"}
47
+ ]
48
+ },
49
+ {
50
+ "id": 4,
51
+ "prompt": "implement issue #100 - we need to add a new built-in MCP for arxiv paper search. just the basic search endpoint, nothing fancy. pr it",
52
+ "expected_output": "Agent creates worktree, implements arxiv MCP following existing MCP patterns (websearch, context7, grep_app), creates PR with proper template, verification loop runs",
53
+ "files": [],
54
+ "assertions": [
55
+ {"id": "worktree-isolation", "text": "Plan uses git worktree in a sibling directory"},
56
+ {"id": "follows-mcp-pattern", "text": "New MCP follows existing pattern from src/mcp/ (websearch, context7, grep_app)"},
57
+ {"id": "three-gates", "text": "Verification loop includes all 3 gates"},
58
+ {"id": "pr-targets-dev", "text": "PR targets dev branch"},
59
+ {"id": "local-validation", "text": "Runs local checks before pushing"}
60
+ ]
61
+ },
62
+ {
63
+ "id": 5,
64
+ "prompt": "The comment-checker hook is too aggressive - it's flagging legitimate comments that happen to contain 'Note:' as AI slop. Relax the regex pattern and add test cases for the false positives. Work on a separate branch and make a PR.",
65
+ "expected_output": "Agent creates worktree, fixes regex, adds specific test cases for false positive scenarios, creates PR, all three gates pass",
66
+ "files": [],
67
+ "assertions": [
68
+ {"id": "worktree-isolation", "text": "Plan uses git worktree in a sibling directory"},
69
+ {"id": "real-comment-checker-files", "text": "References actual comment-checker hook files in the codebase"},
70
+ {"id": "regression-tests", "text": "Adds test cases specifically for 'Note:' false positive scenarios"},
71
+ {"id": "three-gates", "text": "Verification loop includes all 3 gates"},
72
+ {"id": "minimal-change", "text": "Only modifies regex and adds tests — no unrelated changes"}
73
+ ]
74
+ }
75
+ ]
76
+ }
@@ -0,0 +1,138 @@
1
+ {
2
+ "skill_name": "work-with-pr",
3
+ "iteration": 1,
4
+ "summary": {
5
+ "with_skill": {
6
+ "pass_rate": 0.968,
7
+ "mean_duration_seconds": 340.2,
8
+ "stddev_duration_seconds": 169.3
9
+ },
10
+ "without_skill": {
11
+ "pass_rate": 0.516,
12
+ "mean_duration_seconds": 303.0,
13
+ "stddev_duration_seconds": 77.8
14
+ },
15
+ "delta": {
16
+ "pass_rate": 0.452,
17
+ "mean_duration_seconds": 37.2,
18
+ "stddev_duration_seconds": 91.5
19
+ }
20
+ },
21
+ "evals": [
22
+ {
23
+ "eval_name": "happy-path-feature-config-option",
24
+ "with_skill": {
25
+ "pass_rate": 1.0,
26
+ "passed": 10,
27
+ "total": 10,
28
+ "duration_seconds": 292,
29
+ "failed_assertions": []
30
+ },
31
+ "without_skill": {
32
+ "pass_rate": 0.4,
33
+ "passed": 4,
34
+ "total": 10,
35
+ "duration_seconds": 365,
36
+ "failed_assertions": [
37
+ {"assertion": "Plan uses git worktree in a sibling directory", "reason": "Uses git checkout -b, no worktree isolation"},
38
+ {"assertion": "Plan specifies multiple atomic commits for multi-file changes", "reason": "Steps listed sequentially but no atomic commit strategy mentioned"},
39
+ {"assertion": "Verification loop includes all 3 gates: CI, review-work, and Cubic", "reason": "Only mentions CI pipeline in step 6. No review-work or Cubic."},
40
+ {"assertion": "Gates are checked in order: CI first, then review-work, then Cubic", "reason": "No gate ordering - only CI mentioned"},
41
+ {"assertion": "Cubic check uses gh api to check cubic-dev-ai[bot] reviews", "reason": "No mention of Cubic at all"},
42
+ {"assertion": "Plan includes worktree cleanup after merge", "reason": "No worktree used, no cleanup needed"}
43
+ ]
44
+ }
45
+ },
46
+ {
47
+ "eval_name": "bugfix-atlas-null-check",
48
+ "with_skill": {
49
+ "pass_rate": 1.0,
50
+ "passed": 6,
51
+ "total": 6,
52
+ "duration_seconds": 506,
53
+ "failed_assertions": []
54
+ },
55
+ "without_skill": {
56
+ "pass_rate": 0.667,
57
+ "passed": 4,
58
+ "total": 6,
59
+ "duration_seconds": 325,
60
+ "failed_assertions": [
61
+ {"assertion": "Plan uses git worktree in a sibling directory", "reason": "No worktree. Steps go directly to creating branch and modifying files."},
62
+ {"assertion": "Verification loop includes all 3 gates", "reason": "Only mentions CI pipeline (step 5). No review-work or Cubic."}
63
+ ]
64
+ }
65
+ },
66
+ {
67
+ "eval_name": "refactor-split-constants",
68
+ "with_skill": {
69
+ "pass_rate": 1.0,
70
+ "passed": 5,
71
+ "total": 5,
72
+ "duration_seconds": 181,
73
+ "failed_assertions": []
74
+ },
75
+ "without_skill": {
76
+ "pass_rate": 0.4,
77
+ "passed": 2,
78
+ "total": 5,
79
+ "duration_seconds": 229,
80
+ "failed_assertions": [
81
+ {"assertion": "Plan uses git worktree in a sibling directory", "reason": "git checkout -b only, no worktree"},
82
+ {"assertion": "Uses 2+ commits for the multi-file refactor", "reason": "Single atomic commit: 'refactor: split delegate-task constants and category model requirements'"},
83
+ {"assertion": "Verification loop includes all 3 gates", "reason": "Only mentions typecheck/test/build. No review-work or Cubic."}
84
+ ]
85
+ }
86
+ },
87
+ {
88
+ "eval_name": "new-mcp-arxiv-casual",
89
+ "with_skill": {
90
+ "pass_rate": 1.0,
91
+ "passed": 5,
92
+ "total": 5,
93
+ "duration_seconds": 152,
94
+ "failed_assertions": []
95
+ },
96
+ "without_skill": {
97
+ "pass_rate": 0.6,
98
+ "passed": 3,
99
+ "total": 5,
100
+ "duration_seconds": 197,
101
+ "failed_assertions": [
102
+ {"assertion": "Verification loop includes all 3 gates", "reason": "Only mentions bun test/typecheck/build. No review-work or Cubic."}
103
+ ]
104
+ }
105
+ },
106
+ {
107
+ "eval_name": "regex-fix-false-positive",
108
+ "with_skill": {
109
+ "pass_rate": 0.8,
110
+ "passed": 4,
111
+ "total": 5,
112
+ "duration_seconds": 570,
113
+ "failed_assertions": [
114
+ {"assertion": "Only modifies regex and adds tests — no unrelated changes", "reason": "Also proposes config schema change (exclude_patterns) and Go binary update — goes beyond minimal fix"}
115
+ ]
116
+ },
117
+ "without_skill": {
118
+ "pass_rate": 0.6,
119
+ "passed": 3,
120
+ "total": 5,
121
+ "duration_seconds": 399,
122
+ "failed_assertions": [
123
+ {"assertion": "Plan uses git worktree in a sibling directory", "reason": "git checkout -b, no worktree"},
124
+ {"assertion": "Verification loop includes all 3 gates", "reason": "Only bun test and typecheck. No review-work or Cubic."}
125
+ ]
126
+ }
127
+ }
128
+ ],
129
+ "analyst_observations": [
130
+ "Three-gates assertion (CI + review-work + Cubic) is the strongest discriminator: 5/5 with-skill vs 0/5 without-skill. Without the skill, agents never know about Cubic or review-work gates.",
131
+ "Worktree isolation is nearly as discriminating (5/5 vs 1/5). One without-skill run (eval-4) independently chose worktree, suggesting some agents already know worktree patterns, but the skill makes it consistent.",
132
+ "The skill's only failure (eval-5 minimal-change) reveals a potential over-engineering tendency: the skill-guided agent proposed config schema changes and Go binary updates for what should have been a minimal regex fix. Consider adding explicit guidance for fix-type tasks to stay minimal.",
133
+ "Duration tradeoff: with-skill is 12% slower on average (340s vs 303s), driven mainly by eval-2 (bugfix) and eval-5 (regex fix) where the skill's thorough verification planning adds overhead. For eval-1, eval-3, and eval-4, with-skill was actually faster.",
134
+ "Without-skill duration has lower variance (stddev 78s vs 169s), suggesting the skill introduces more variable execution paths depending on task complexity.",
135
+ "Non-discriminating assertions: 'References actual files', 'PR targets dev', 'Runs local checks' — these pass regardless of skill. They validate baseline agent competence, not skill value. Consider removing or downweighting in future iterations.",
136
+ "Atomic commits assertion discriminates moderately (2/2 with-skill tested vs 0/2 without-skill tested). Without the skill, agents default to single commits even for multi-file refactors."
137
+ ]
138
+ }
@@ -0,0 +1,42 @@
1
+ # Benchmark: work-with-pr (Iteration 1)
2
+
3
+ ## Summary
4
+
5
+ | Metric | With Skill | Without Skill | Delta |
6
+ |--------|-----------|---------------|-------|
7
+ | Pass Rate | 96.8% (30/31) | 51.6% (16/31) | +45.2 pp |
8
+ | Mean Duration | 340.2s | 303.0s | +37.2s |
9
+ | Duration Stddev | 169.3s | 77.8s | +91.5s |
10
+
11
+ ## Per-Eval Breakdown
12
+
13
+ | Eval | With Skill | Without Skill | Delta |
14
+ |------|-----------|---------------|-------|
15
+ | happy-path-feature-config-option | 100% (10/10) | 40% (4/10) | +60 pp |
15
+ | bugfix-atlas-null-check | 100% (6/6) | 67% (4/6) | +33 pp |
16
+ | refactor-split-constants | 100% (5/5) | 40% (2/5) | +60 pp |
17
+ | new-mcp-arxiv-casual | 100% (5/5) | 60% (3/5) | +40 pp |
18
+ | regex-fix-false-positive | 80% (4/5) | 60% (3/5) | +20 pp |
20
+
21
+ ## Key Discriminators
22
+
23
+ - **three-gates** (CI + review-work + Cubic): 5/5 vs 0/5 — strongest signal
24
+ - **worktree-isolation**: 5/5 vs 1/5
25
+ - **atomic-commits**: 2/2 vs 0/2
26
+ - **cubic-check-method**: 1/1 vs 0/1
27
+
28
+ ## Non-Discriminating Assertions
29
+
30
+ - References actual files: passes in both conditions
31
+ - PR targets dev: passes in both conditions
32
+ - Runs local checks before pushing: passes in both conditions
33
+
34
+ ## Only With-Skill Failure
35
+
36
+ - **eval-5 minimal-change**: Skill-guided agent proposed config schema changes and Go binary update for a minimal regex fix. The skill may encourage over-engineering in fix scenarios.
37
+
38
+ ## Analyst Notes
39
+
40
+ - The skill adds most value for procedural knowledge (verification gates, worktree workflow) that agents cannot infer from codebase alone.
41
+ - Duration cost is modest (+12%) and acceptable given the +45 percentage-point pass rate improvement.
42
+ - Consider adding explicit "fix-type tasks: stay minimal" guidance in iteration 2.
@@ -0,0 +1,57 @@
1
+ {
2
+ "eval_id": 1,
3
+ "eval_name": "happy-path-feature-config-option",
4
+ "prompt": "I need to add a `max_background_agents` config option to oh-my-opencode that limits how many background agents can run simultaneously. It should be in the plugin config schema with a default of 5. Add validation and make sure the background manager respects it. Create a PR for this.",
5
+ "assertions": [
6
+ {
7
+ "id": "worktree-isolation",
8
+ "text": "Plan uses git worktree in a sibling directory (not main working directory)",
9
+ "type": "manual"
10
+ },
11
+ {
12
+ "id": "branch-from-dev",
13
+ "text": "Branch is created from origin/dev (not master/main)",
14
+ "type": "manual"
15
+ },
16
+ {
17
+ "id": "atomic-commits",
18
+ "text": "Plan specifies multiple atomic commits for multi-file changes",
19
+ "type": "manual"
20
+ },
21
+ {
22
+ "id": "local-validation",
23
+ "text": "Runs bun run typecheck, bun test, and bun run build before pushing",
24
+ "type": "manual"
25
+ },
26
+ {
27
+ "id": "pr-targets-dev",
28
+ "text": "PR is created targeting dev branch (not master)",
29
+ "type": "manual"
30
+ },
31
+ {
32
+ "id": "three-gates",
33
+ "text": "Verification loop includes all 3 gates: CI, review-work, and Cubic",
34
+ "type": "manual"
35
+ },
36
+ {
37
+ "id": "gate-ordering",
38
+ "text": "Gates are checked in order: CI first, then review-work, then Cubic",
39
+ "type": "manual"
40
+ },
41
+ {
42
+ "id": "cubic-check-method",
43
+ "text": "Cubic check uses gh api to check cubic-dev-ai[bot] reviews for 'No issues found'",
44
+ "type": "manual"
45
+ },
46
+ {
47
+ "id": "worktree-cleanup",
48
+ "text": "Plan includes worktree cleanup after merge",
49
+ "type": "manual"
50
+ },
51
+ {
52
+ "id": "real-file-references",
53
+ "text": "Code changes reference actual files in the codebase (config schema, background manager)",
54
+ "type": "manual"
55
+ }
56
+ ]
57
+ }