oh-my-opencode 4.4.0 → 4.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/command/get-unpublished-changes.md +148 -0
- package/.agents/command/omomomo.md +37 -0
- package/.agents/command/publish.md +376 -0
- package/.agents/command/remove-deadcode.md +221 -0
- package/.agents/command/security-research.md +16 -0
- package/.agents/skills/get-unpublished-changes/SKILL.md +24 -0
- package/.agents/skills/github-triage/SKILL.md +587 -0
- package/.agents/skills/github-triage/scripts/gh_fetch.py +398 -0
- package/.agents/skills/hyperplan/SKILL.md +450 -0
- package/.agents/skills/omomomo/SKILL.md +36 -0
- package/.agents/skills/pre-publish-review/SKILL.md +407 -0
- package/.agents/skills/publish/SKILL.md +428 -0
- package/.agents/skills/remove-deadcode/SKILL.md +216 -0
- package/.agents/skills/security-research/SKILL.md +204 -0
- package/.agents/skills/work-with-pr/SKILL.md +360 -0
- package/.agents/skills/work-with-pr-workspace/evals/evals.json +76 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/benchmark.json +138 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/benchmark.md +42 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/eval_metadata.json +57 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/grading.json +15 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/outputs/code-changes.md +454 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/outputs/execution-plan.md +136 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/outputs/pr-description.md +47 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/outputs/verification-strategy.md +163 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/timing.json +1 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/grading.json +15 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/outputs/code-changes.md +615 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/outputs/execution-plan.md +99 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/outputs/pr-description.md +50 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/outputs/verification-strategy.md +111 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/timing.json +1 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/eval_metadata.json +37 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/grading.json +11 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/outputs/code-changes.md +205 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/outputs/execution-plan.md +78 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/outputs/pr-description.md +42 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/outputs/verification-strategy.md +87 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/timing.json +1 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/grading.json +11 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/outputs/code-changes.md +334 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/outputs/execution-plan.md +86 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/outputs/pr-description.md +23 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/outputs/verification-strategy.md +119 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/timing.json +1 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/eval_metadata.json +32 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/grading.json +10 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/outputs/code-changes.md +221 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/outputs/execution-plan.md +104 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/outputs/pr-description.md +41 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/outputs/verification-strategy.md +84 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/timing.json +1 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/grading.json +10 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/outputs/code-changes.md +342 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/outputs/execution-plan.md +131 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/outputs/pr-description.md +39 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/outputs/verification-strategy.md +128 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/timing.json +1 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/eval_metadata.json +32 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/grading.json +10 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/outputs/code-changes.md +143 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/outputs/execution-plan.md +82 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/outputs/pr-description.md +51 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/outputs/verification-strategy.md +69 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/timing.json +1 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/grading.json +10 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/outputs/code-changes.md +252 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/outputs/execution-plan.md +83 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/outputs/pr-description.md +33 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/outputs/verification-strategy.md +101 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/timing.json +1 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/eval_metadata.json +32 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/grading.json +10 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/outputs/code-changes.md +387 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/outputs/execution-plan.md +112 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/outputs/pr-description.md +51 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/outputs/verification-strategy.md +75 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/timing.json +1 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/grading.json +10 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/outputs/code-changes.md +529 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/outputs/execution-plan.md +127 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/outputs/pr-description.md +42 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/outputs/verification-strategy.md +120 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/timing.json +1 -0
- package/.agents/skills/work-with-pr-workspace/iteration-1/review.html +1326 -0
- package/.opencode/command/get-unpublished-changes.md +148 -0
- package/.opencode/command/omomomo.md +37 -0
- package/.opencode/command/publish.md +376 -0
- package/.opencode/command/remove-deadcode.md +221 -0
- package/.opencode/command/security-research.md +16 -0
- package/.opencode/skills/github-triage/SKILL.md +587 -0
- package/.opencode/skills/github-triage/scripts/gh_fetch.py +398 -0
- package/.opencode/skills/hyperplan/SKILL.md +450 -0
- package/.opencode/skills/pre-publish-review/SKILL.md +407 -0
- package/.opencode/skills/work-with-pr/SKILL.md +360 -0
- package/.opencode/skills/work-with-pr-workspace/evals/evals.json +76 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/benchmark.json +138 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/benchmark.md +42 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/eval_metadata.json +57 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/grading.json +15 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/outputs/code-changes.md +454 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/outputs/execution-plan.md +136 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/outputs/pr-description.md +47 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/outputs/verification-strategy.md +163 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/with_skill/timing.json +1 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/grading.json +15 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/outputs/code-changes.md +615 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/outputs/execution-plan.md +99 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/outputs/pr-description.md +50 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/outputs/verification-strategy.md +111 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-1/without_skill/timing.json +1 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/eval_metadata.json +37 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/grading.json +11 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/outputs/code-changes.md +205 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/outputs/execution-plan.md +78 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/outputs/pr-description.md +42 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/outputs/verification-strategy.md +87 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/with_skill/timing.json +1 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/grading.json +11 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/outputs/code-changes.md +334 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/outputs/execution-plan.md +86 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/outputs/pr-description.md +23 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/outputs/verification-strategy.md +119 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-2/without_skill/timing.json +1 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/eval_metadata.json +32 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/grading.json +10 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/outputs/code-changes.md +221 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/outputs/execution-plan.md +104 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/outputs/pr-description.md +41 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/outputs/verification-strategy.md +84 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/with_skill/timing.json +1 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/grading.json +10 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/outputs/code-changes.md +342 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/outputs/execution-plan.md +131 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/outputs/pr-description.md +39 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/outputs/verification-strategy.md +128 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-3/without_skill/timing.json +1 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/eval_metadata.json +32 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/grading.json +10 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/outputs/code-changes.md +143 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/outputs/execution-plan.md +82 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/outputs/pr-description.md +51 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/outputs/verification-strategy.md +69 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/with_skill/timing.json +1 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/grading.json +10 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/outputs/code-changes.md +252 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/outputs/execution-plan.md +83 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/outputs/pr-description.md +33 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/outputs/verification-strategy.md +101 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-4/without_skill/timing.json +1 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/eval_metadata.json +32 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/grading.json +10 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/outputs/code-changes.md +387 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/outputs/execution-plan.md +112 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/outputs/pr-description.md +51 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/outputs/verification-strategy.md +75 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/with_skill/timing.json +1 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/grading.json +10 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/outputs/code-changes.md +529 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/outputs/execution-plan.md +127 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/outputs/pr-description.md +42 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/outputs/verification-strategy.md +120 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/eval-5/without_skill/timing.json +1 -0
- package/.opencode/skills/work-with-pr-workspace/iteration-1/review.html +1326 -0
- package/README.ja.md +1 -1
- package/README.ko.md +1 -1
- package/README.md +1 -1
- package/README.ru.md +1 -1
- package/README.zh-cn.md +1 -1
- package/dist/agents/atlas/agent.d.ts +6 -6
- package/dist/agents/prometheus/gemini.d.ts +0 -11
- package/dist/agents/prometheus/gpt.d.ts +0 -10
- package/dist/agents/prometheus/system-prompt.d.ts +2 -20
- package/dist/agents/types.d.ts +1 -16
- package/dist/cli/index.js +50 -17
- package/dist/config/schema/agent-names.d.ts +3 -3
- package/dist/config/schema/agent-overrides.d.ts +208 -208
- package/dist/config/schema/categories.d.ts +28 -28
- package/dist/config/schema/fallback-models.d.ts +20 -20
- package/dist/config/schema/oh-my-opencode-config.d.ts +208 -208
- package/dist/features/background-agent/parent-wake-notifier.d.ts +8 -1
- package/dist/help/schema/acp.d.ts +95 -0
- package/dist/help/schema/doctor.d.ts +147 -0
- package/dist/help/schema/sandbox.d.ts +74 -0
- package/dist/help/schema/status.d.ts +139 -0
- package/dist/hooks/keyword-detector/analyze/default.d.ts +1 -1
- package/dist/hooks/keyword-detector/hyperplan/default.d.ts +1 -1
- package/dist/hooks/keyword-detector/search/default.d.ts +1 -1
- package/dist/hooks/keyword-detector/team/default.d.ts +2 -7
- package/dist/hooks/keyword-detector/ultrawork/default.d.ts +1 -9
- package/dist/hooks/keyword-detector/ultrawork/gemini.d.ts +1 -16
- package/dist/hooks/keyword-detector/ultrawork/gpt.d.ts +1 -10
- package/dist/hooks/keyword-detector/ultrawork/planner.d.ts +1 -5
- package/dist/hooks/ralph-loop/no-progress-turn-detector.d.ts +7 -0
- package/dist/hooks/ralph-loop/pending-verification-handler.d.ts +1 -0
- package/dist/hooks/ralph-loop/types.d.ts +1 -0
- package/dist/hooks/runtime-fallback/error-classifier.d.ts +1 -0
- package/dist/index.js +51910 -50310
- package/dist/shared/prompt-async-gate/pending-tool-turn.d.ts +1 -0
- package/dist/shared/prompt-async-gate/types.d.ts +4 -3
- package/package.json +19 -13
- package/dist/agents/atlas/default-prompt-sections.d.ts +0 -6
- package/dist/agents/atlas/default.d.ts +0 -2
- package/dist/agents/atlas/gemini-prompt-sections.d.ts +0 -6
- package/dist/agents/atlas/gemini.d.ts +0 -2
- package/dist/agents/atlas/gpt-prompt-sections.d.ts +0 -6
- package/dist/agents/atlas/gpt.d.ts +0 -2
- package/dist/agents/atlas/kimi-prompt-sections.d.ts +0 -6
- package/dist/agents/atlas/kimi.d.ts +0 -2
- package/dist/agents/atlas/opus-4-7-prompt-sections.d.ts +0 -6
- package/dist/agents/atlas/opus-4-7.d.ts +0 -2
- package/dist/agents/atlas/shared-prompt.d.ts +0 -9
- package/dist/agents/prometheus/behavioral-summary.d.ts +0 -6
- package/dist/agents/prometheus/high-accuracy-mode.d.ts +0 -6
- package/dist/agents/prometheus/identity-constraints.d.ts +0 -7
- package/dist/agents/prometheus/interview-mode.d.ts +0 -7
- package/dist/agents/prometheus/plan-generation.d.ts +0 -7
- package/dist/agents/prometheus/plan-template.d.ts +0 -7
- package/dist/agents/prometheus/spec-driven-mode.d.ts +0 -7
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill_name": "work-with-pr",
|
|
3
|
+
"iteration": 1,
|
|
4
|
+
"summary": {
|
|
5
|
+
"with_skill": {
|
|
6
|
+
"pass_rate": 0.968,
|
|
7
|
+
"mean_duration_seconds": 340.2,
|
|
8
|
+
"stddev_duration_seconds": 169.3
|
|
9
|
+
},
|
|
10
|
+
"without_skill": {
|
|
11
|
+
"pass_rate": 0.516,
|
|
12
|
+
"mean_duration_seconds": 303.0,
|
|
13
|
+
"stddev_duration_seconds": 77.8
|
|
14
|
+
},
|
|
15
|
+
"delta": {
|
|
16
|
+
"pass_rate": 0.452,
|
|
17
|
+
"mean_duration_seconds": 37.2,
|
|
18
|
+
"stddev_duration_seconds": 91.5
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"evals": [
|
|
22
|
+
{
|
|
23
|
+
"eval_name": "happy-path-feature-config-option",
|
|
24
|
+
"with_skill": {
|
|
25
|
+
"pass_rate": 1.0,
|
|
26
|
+
"passed": 10,
|
|
27
|
+
"total": 10,
|
|
28
|
+
"duration_seconds": 292,
|
|
29
|
+
"failed_assertions": []
|
|
30
|
+
},
|
|
31
|
+
"without_skill": {
|
|
32
|
+
"pass_rate": 0.4,
|
|
33
|
+
"passed": 4,
|
|
34
|
+
"total": 10,
|
|
35
|
+
"duration_seconds": 365,
|
|
36
|
+
"failed_assertions": [
|
|
37
|
+
{"assertion": "Plan uses git worktree in a sibling directory", "reason": "Uses git checkout -b, no worktree isolation"},
|
|
38
|
+
{"assertion": "Plan specifies multiple atomic commits for multi-file changes", "reason": "Steps listed sequentially but no atomic commit strategy mentioned"},
|
|
39
|
+
{"assertion": "Verification loop includes all 3 gates: CI, review-work, and Cubic", "reason": "Only mentions CI pipeline in step 6. No review-work or Cubic."},
|
|
40
|
+
{"assertion": "Gates are checked in order: CI first, then review-work, then Cubic", "reason": "No gate ordering - only CI mentioned"},
|
|
41
|
+
{"assertion": "Cubic check uses gh api to check cubic-dev-ai[bot] reviews", "reason": "No mention of Cubic at all"},
|
|
42
|
+
{"assertion": "Plan includes worktree cleanup after merge", "reason": "No worktree used, no cleanup needed"}
|
|
43
|
+
]
|
|
44
|
+
}
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
"eval_name": "bugfix-atlas-null-check",
|
|
48
|
+
"with_skill": {
|
|
49
|
+
"pass_rate": 1.0,
|
|
50
|
+
"passed": 6,
|
|
51
|
+
"total": 6,
|
|
52
|
+
"duration_seconds": 506,
|
|
53
|
+
"failed_assertions": []
|
|
54
|
+
},
|
|
55
|
+
"without_skill": {
|
|
56
|
+
"pass_rate": 0.667,
|
|
57
|
+
"passed": 4,
|
|
58
|
+
"total": 6,
|
|
59
|
+
"duration_seconds": 325,
|
|
60
|
+
"failed_assertions": [
|
|
61
|
+
{"assertion": "Plan uses git worktree in a sibling directory", "reason": "No worktree. Steps go directly to creating branch and modifying files."},
|
|
62
|
+
{"assertion": "Verification loop includes all 3 gates", "reason": "Only mentions CI pipeline (step 5). No review-work or Cubic."}
|
|
63
|
+
]
|
|
64
|
+
}
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
"eval_name": "refactor-split-constants",
|
|
68
|
+
"with_skill": {
|
|
69
|
+
"pass_rate": 1.0,
|
|
70
|
+
"passed": 5,
|
|
71
|
+
"total": 5,
|
|
72
|
+
"duration_seconds": 181,
|
|
73
|
+
"failed_assertions": []
|
|
74
|
+
},
|
|
75
|
+
"without_skill": {
|
|
76
|
+
"pass_rate": 0.4,
|
|
77
|
+
"passed": 2,
|
|
78
|
+
"total": 5,
|
|
79
|
+
"duration_seconds": 229,
|
|
80
|
+
"failed_assertions": [
|
|
81
|
+
{"assertion": "Plan uses git worktree in a sibling directory", "reason": "git checkout -b only, no worktree"},
|
|
82
|
+
{"assertion": "Uses 2+ commits for the multi-file refactor", "reason": "Single atomic commit: 'refactor: split delegate-task constants and category model requirements'"},
|
|
83
|
+
{"assertion": "Verification loop includes all 3 gates", "reason": "Only mentions typecheck/test/build. No review-work or Cubic."}
|
|
84
|
+
]
|
|
85
|
+
}
|
|
86
|
+
},
|
|
87
|
+
{
|
|
88
|
+
"eval_name": "new-mcp-arxiv-casual",
|
|
89
|
+
"with_skill": {
|
|
90
|
+
"pass_rate": 1.0,
|
|
91
|
+
"passed": 5,
|
|
92
|
+
"total": 5,
|
|
93
|
+
"duration_seconds": 152,
|
|
94
|
+
"failed_assertions": []
|
|
95
|
+
},
|
|
96
|
+
"without_skill": {
|
|
97
|
+
"pass_rate": 0.6,
|
|
98
|
+
"passed": 3,
|
|
99
|
+
"total": 5,
|
|
100
|
+
"duration_seconds": 197,
|
|
101
|
+
"failed_assertions": [
|
|
102
|
+
{"assertion": "Verification loop includes all 3 gates", "reason": "Only mentions bun test/typecheck/build. No review-work or Cubic."}
|
|
103
|
+
]
|
|
104
|
+
}
|
|
105
|
+
},
|
|
106
|
+
{
|
|
107
|
+
"eval_name": "regex-fix-false-positive",
|
|
108
|
+
"with_skill": {
|
|
109
|
+
"pass_rate": 0.8,
|
|
110
|
+
"passed": 4,
|
|
111
|
+
"total": 5,
|
|
112
|
+
"duration_seconds": 570,
|
|
113
|
+
"failed_assertions": [
|
|
114
|
+
{"assertion": "Only modifies regex and adds tests — no unrelated changes", "reason": "Also proposes config schema change (exclude_patterns) and Go binary update — goes beyond minimal fix"}
|
|
115
|
+
]
|
|
116
|
+
},
|
|
117
|
+
"without_skill": {
|
|
118
|
+
"pass_rate": 0.6,
|
|
119
|
+
"passed": 3,
|
|
120
|
+
"total": 5,
|
|
121
|
+
"duration_seconds": 399,
|
|
122
|
+
"failed_assertions": [
|
|
123
|
+
{"assertion": "Plan uses git worktree in a sibling directory", "reason": "git checkout -b, no worktree"},
|
|
124
|
+
{"assertion": "Verification loop includes all 3 gates", "reason": "Only bun test and typecheck. No review-work or Cubic."}
|
|
125
|
+
]
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
],
|
|
129
|
+
"analyst_observations": [
|
|
130
|
+
"Three-gates assertion (CI + review-work + Cubic) is the strongest discriminator: 5/5 with-skill vs 0/5 without-skill. Without the skill, agents never know about Cubic or review-work gates.",
|
|
131
|
+
"Worktree isolation is nearly as discriminating (5/5 vs 1/5). One without-skill run (eval-4) independently chose worktree, suggesting some agents already know worktree patterns, but the skill makes it consistent.",
|
|
132
|
+
"The skill's only failure (eval-5 minimal-change) reveals a potential over-engineering tendency: the skill-guided agent proposed config schema changes and Go binary updates for what should have been a minimal regex fix. Consider adding explicit guidance for fix-type tasks to stay minimal.",
|
|
133
|
+
"Duration tradeoff: with-skill is 12% slower on average (340s vs 303s), driven mainly by eval-2 (bugfix) and eval-5 (regex fix) where the skill's thorough verification planning adds overhead. For eval-1 and eval-3-4, with-skill was actually faster.",
|
|
134
|
+
"Without-skill duration has lower variance (stddev 78s vs 169s), suggesting the skill introduces more variable execution paths depending on task complexity.",
|
|
135
|
+
"Non-discriminating assertions: 'References actual files', 'PR targets dev', 'Runs local checks' — these pass regardless of skill. They validate baseline agent competence, not skill value. Consider removing or downweighting in future iterations.",
|
|
136
|
+
"Atomic commits assertion discriminates moderately (2/2 with-skill tested vs 0/2 without-skill tested). Without the skill, agents default to single commits even for multi-file refactors."
|
|
137
|
+
]
|
|
138
|
+
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Benchmark: work-with-pr (Iteration 1)
|
|
2
|
+
|
|
3
|
+
## Summary
|
|
4
|
+
|
|
5
|
+
| Metric | With Skill | Without Skill | Delta |
|
|
6
|
+
|--------|-----------|---------------|-------|
|
|
7
|
+
| Pass Rate | 96.8% (30/31) | 51.6% (16/31) | +45.2% |
|
|
8
|
+
| Mean Duration | 340.2s | 303.0s | +37.2s |
|
|
9
|
+
| Duration Stddev | 169.3s | 77.8s | +91.5s |
|
|
10
|
+
|
|
11
|
+
## Per-Eval Breakdown
|
|
12
|
+
|
|
13
|
+
| Eval | With Skill | Without Skill | Delta |
|
|
14
|
+
|------|-----------|---------------|-------|
|
|
15
|
+
| happy-path-feature-config-option | 100% (10/10) | 40% (4/10) | +60% |
|
|
16
|
+
| bugfix-atlas-null-check | 100% (6/6) | 67% (4/6) | +33% |
|
|
17
|
+
| refactor-split-constants | 100% (5/5) | 40% (2/5) | +60% |
|
|
18
|
+
| new-mcp-arxiv-casual | 100% (5/5) | 60% (3/5) | +40% |
|
|
19
|
+
| regex-fix-false-positive | 80% (4/5) | 60% (3/5) | +20% |
|
|
20
|
+
|
|
21
|
+
## Key Discriminators
|
|
22
|
+
|
|
23
|
+
- **three-gates** (CI + review-work + Cubic): 5/5 vs 0/5 — strongest signal
|
|
24
|
+
- **worktree-isolation**: 5/5 vs 1/5
|
|
25
|
+
- **atomic-commits**: 2/2 vs 0/2
|
|
26
|
+
- **cubic-check-method**: 1/1 vs 0/1
|
|
27
|
+
|
|
28
|
+
## Non-Discriminating Assertions
|
|
29
|
+
|
|
30
|
+
- References actual files: passes in both conditions
|
|
31
|
+
- PR targets dev: passes in both conditions
|
|
32
|
+
- Runs local checks before pushing: passes in both conditions
|
|
33
|
+
|
|
34
|
+
## Only With-Skill Failure
|
|
35
|
+
|
|
36
|
+
- **eval-5 minimal-change**: Skill-guided agent proposed config schema changes and Go binary update for a minimal regex fix. The skill may encourage over-engineering in fix scenarios.
|
|
37
|
+
|
|
38
|
+
## Analyst Notes
|
|
39
|
+
|
|
40
|
+
- The skill adds most value for procedural knowledge (verification gates, worktree workflow) that agents cannot infer from codebase alone.
|
|
41
|
+
- Duration cost is modest (+12%) and acceptable given the +45% pass rate improvement.
|
|
42
|
+
- Consider adding explicit "fix-type tasks: stay minimal" guidance in iteration 2.
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
{
|
|
2
|
+
"eval_id": 1,
|
|
3
|
+
"eval_name": "happy-path-feature-config-option",
|
|
4
|
+
"prompt": "I need to add a `max_background_agents` config option to oh-my-opencode that limits how many background agents can run simultaneously. It should be in the plugin config schema with a default of 5. Add validation and make sure the background manager respects it. Create a PR for this.",
|
|
5
|
+
"assertions": [
|
|
6
|
+
{
|
|
7
|
+
"id": "worktree-isolation",
|
|
8
|
+
"text": "Plan uses git worktree in a sibling directory (not main working directory)",
|
|
9
|
+
"type": "manual"
|
|
10
|
+
},
|
|
11
|
+
{
|
|
12
|
+
"id": "branch-from-dev",
|
|
13
|
+
"text": "Branch is created from origin/dev (not master/main)",
|
|
14
|
+
"type": "manual"
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
"id": "atomic-commits",
|
|
18
|
+
"text": "Plan specifies multiple atomic commits for multi-file changes",
|
|
19
|
+
"type": "manual"
|
|
20
|
+
},
|
|
21
|
+
{
|
|
22
|
+
"id": "local-validation",
|
|
23
|
+
"text": "Runs bun run typecheck, bun test, and bun run build before pushing",
|
|
24
|
+
"type": "manual"
|
|
25
|
+
},
|
|
26
|
+
{
|
|
27
|
+
"id": "pr-targets-dev",
|
|
28
|
+
"text": "PR is created targeting dev branch (not master)",
|
|
29
|
+
"type": "manual"
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"id": "three-gates",
|
|
33
|
+
"text": "Verification loop includes all 3 gates: CI, review-work, and Cubic",
|
|
34
|
+
"type": "manual"
|
|
35
|
+
},
|
|
36
|
+
{
|
|
37
|
+
"id": "gate-ordering",
|
|
38
|
+
"text": "Gates are checked in order: CI first, then review-work, then Cubic",
|
|
39
|
+
"type": "manual"
|
|
40
|
+
},
|
|
41
|
+
{
|
|
42
|
+
"id": "cubic-check-method",
|
|
43
|
+
"text": "Cubic check uses gh api to check cubic-dev-ai[bot] reviews for 'No issues found'",
|
|
44
|
+
"type": "manual"
|
|
45
|
+
},
|
|
46
|
+
{
|
|
47
|
+
"id": "worktree-cleanup",
|
|
48
|
+
"text": "Plan includes worktree cleanup after merge",
|
|
49
|
+
"type": "manual"
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
"id": "real-file-references",
|
|
53
|
+
"text": "Code changes reference actual files in the codebase (config schema, background manager)",
|
|
54
|
+
"type": "manual"
|
|
55
|
+
}
|
|
56
|
+
]
|
|
57
|
+
}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"run_id": "eval-1-with_skill",
|
|
3
|
+
"expectations": [
|
|
4
|
+
{"text": "Plan uses git worktree in a sibling directory", "passed": true, "evidence": "Uses ../omo-wt/feat-max-background-agents"},
|
|
5
|
+
{"text": "Branch is created from origin/dev", "passed": true, "evidence": "git checkout dev && git pull origin dev, then branch"},
|
|
6
|
+
{"text": "Plan specifies multiple atomic commits for multi-file changes", "passed": true, "evidence": "2 commits: schema+tests, then concurrency+manager"},
|
|
7
|
+
{"text": "Runs bun run typecheck, bun test, and bun run build before pushing", "passed": true, "evidence": "Explicit pre-push section with all 3 commands"},
|
|
8
|
+
{"text": "PR is created targeting dev branch", "passed": true, "evidence": "--base dev in gh pr create"},
|
|
9
|
+
{"text": "Verification loop includes all 3 gates: CI, review-work, and Cubic", "passed": true, "evidence": "Gate A (CI), Gate B (review-work 5 agents), Gate C (Cubic)"},
|
|
10
|
+
{"text": "Gates are checked in order: CI first, then review-work, then Cubic", "passed": true, "evidence": "Explicit ordering in verify loop pseudocode"},
|
|
11
|
+
{"text": "Cubic check uses gh api to check cubic-dev-ai[bot] reviews", "passed": true, "evidence": "Mentions cubic-dev-ai[bot] and 'No issues found' signal"},
|
|
12
|
+
{"text": "Plan includes worktree cleanup after merge", "passed": true, "evidence": "Phase 4: git worktree remove ../omo-wt/feat-max-background-agents"},
|
|
13
|
+
{"text": "Code changes reference actual files in the codebase", "passed": true, "evidence": "References src/config/schema/background-task.ts, src/features/background-agent/concurrency.ts, manager.ts"}
|
|
14
|
+
]
|
|
15
|
+
}
|