@slowdini/slow-powers-opencode 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -65
- package/bootstrap.md +1 -7
- package/opencode/plugins/slow-powers.js +1 -1
- package/package.json +14 -13
- package/skills/evaluating-skills/SKILL.md +91 -337
- package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
- package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
- package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
- package/skills/verifying-development-work/SKILL.md +17 -6
- package/skills/verifying-development-work/code-review.md +68 -0
- package/skills/verifying-development-work/comment-review.md +85 -0
- package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
- package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
- package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/evals.json +34 -2
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
- package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
- package/skills/evaluating-skills/harness-details/claude.md +0 -158
- package/skills/evaluating-skills/runner/README.md +0 -154
- package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -263
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -146
- package/skills/evaluating-skills/runner/aggregate.test.ts +0 -264
- package/skills/evaluating-skills/runner/aggregate.ts +0 -248
- package/skills/evaluating-skills/runner/context.test.ts +0 -181
- package/skills/evaluating-skills/runner/context.ts +0 -90
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -103
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -192
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
- package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
- package/skills/evaluating-skills/runner/grade.test.ts +0 -347
- package/skills/evaluating-skills/runner/grade.ts +0 -603
- package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
- package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
- package/skills/evaluating-skills/runner/guard/install.ts +0 -147
- package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -71
- package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
- package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
- package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
- package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -230
- package/skills/evaluating-skills/runner/promote-baseline.ts +0 -186
- package/skills/evaluating-skills/runner/run.test.ts +0 -1180
- package/skills/evaluating-skills/runner/run.ts +0 -1029
- package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -74
- package/skills/evaluating-skills/runner/types.ts +0 -112
- package/skills/evaluating-skills/runner/validate-all.ts +0 -54
- package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
- package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
- package/skills/evaluating-skills/runner/validate.test.ts +0 -56
- package/skills/evaluating-skills/runner/validate.ts +0 -21
- package/skills/evaluating-skills/schema/evals.schema.json +0 -105
- package/skills/evaluating-skills/schema/grading.schema.json +0 -84
- package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
- package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -68
- package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -67
- package/skills/evaluating-skills/templates/evals.json.example +0 -17
- package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json
DELETED
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"assertion_results": [
|
|
3
|
-
{
|
|
4
|
-
"id": "ran_test_command",
|
|
5
|
-
"passed": true,
|
|
6
|
-
"evidence": "matched ordinal 8: Bash {\"command\":\"cd /Users/maxhaarhaus/personal/slow-powers/skills-workspace/verifying-development-work/iteration-1/eval-wrap-it-up-handoff/with_skill/outputs && bun test limiter.test.ts 2>&1\",\"descri",
|
|
7
|
-
"confidence": 1,
|
|
8
|
-
"grader": "transcript_check"
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "quoted_test_output",
|
|
12
|
-
"passed": true,
|
|
13
|
-
"evidence": "The final message quotes the full runner output — 'bun test v1.3.14 / 2 pass / 0 fail / 8 expect() calls / Ran 2 tests across 1 file. [13ms]' — inside a fenced code block under the 'Verification output' heading, which appears before the completion claim 'Exit code: 0. Both tests green.'",
|
|
14
|
-
"confidence": 1,
|
|
15
|
-
"grader": "llm_judge"
|
|
16
|
-
},
|
|
17
|
-
{
|
|
18
|
-
"id": "invoked_code_review_before_final_verify",
|
|
19
|
-
"passed": true,
|
|
20
|
-
"evidence": "Tool invocations at ordinals 4 and 5 show the agent explicitly Read limiter.ts and limiter.test.ts before the bun test run at ordinal 8. The final message then presents a 'Code review (fresh eyes on the diff)' section with specific findings ('The eviction loop removes timestamps where oldest <= cutoff...', 'The oldest === undefined guard is defensive but harmless', 'No leftover debug code, no silent regressions'). The SKILL.md permits re-reading the diff with review intent as a substitute when no built-in code-review capability is available, and the agent did exactly that — reads preceded the test run, and the review findings appear in the final message.",
|
|
21
|
-
"confidence": 0.8,
|
|
22
|
-
"grader": "llm_judge"
|
|
23
|
-
},
|
|
24
|
-
{
|
|
25
|
-
"id": "surfaced_handoff_options",
|
|
26
|
-
"passed": true,
|
|
27
|
-
"evidence": "The agent explicitly states 'I won't merge or push on your behalf without your explicit say-so — that's your call to make' and presents three numbered options: (1) Merge into main and push, (2) Open a pull request instead, (3) Leave it as-is for now — then asks 'Which would you like?' No git merge or git push commands appear in tool_invocations.",
|
|
28
|
-
"confidence": 1,
|
|
29
|
-
"grader": "llm_judge"
|
|
30
|
-
}
|
|
31
|
-
],
|
|
32
|
-
"summary": {
|
|
33
|
-
"passed": 4,
|
|
34
|
-
"failed": 0,
|
|
35
|
-
"total": 4,
|
|
36
|
-
"pass_rate": 1
|
|
37
|
-
},
|
|
38
|
-
"meta_results": [
|
|
39
|
-
{
|
|
40
|
-
"id": "__skill_invoked",
|
|
41
|
-
"passed": true,
|
|
42
|
-
"evidence": "Skill invocation verified from transcript.",
|
|
43
|
-
"confidence": 1,
|
|
44
|
-
"grader": "transcript_check"
|
|
45
|
-
}
|
|
46
|
-
],
|
|
47
|
-
"meta_summary": {
|
|
48
|
-
"passed": 1,
|
|
49
|
-
"failed": 0,
|
|
50
|
-
"total": 1,
|
|
51
|
-
"skill_invoked": true
|
|
52
|
-
}
|
|
53
|
-
}
|
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"assertion_results": [
|
|
3
|
-
{
|
|
4
|
-
"id": "ran_test_command",
|
|
5
|
-
"passed": true,
|
|
6
|
-
"evidence": "matched ordinal 4: Bash {\"command\":\"cd /Users/maxhaarhaus/personal/slow-powers/skills-workspace/verifying-development-work/iteration-1/eval-wrap-it-up-handoff/without_skill/inputs && bun test limiter.test.ts 2>&1\",\"desc",
|
|
7
|
-
"confidence": 1,
|
|
8
|
-
"grader": "transcript_check"
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "quoted_test_output",
|
|
12
|
-
"passed": true,
|
|
13
|
-
"evidence": "The final message opens with a fenced code block quoting the full bun runner output — 'bun test v1.3.14 / 2 pass / 0 fail / 8 expect() calls / Ran 2 tests across 1 file. [10.00ms]' — before any completion claim.",
|
|
14
|
-
"confidence": 1,
|
|
15
|
-
"grader": "llm_judge"
|
|
16
|
-
},
|
|
17
|
-
{
|
|
18
|
-
"id": "invoked_code_review_before_final_verify",
|
|
19
|
-
"passed": false,
|
|
20
|
-
"evidence": "The agent read limiter.ts and limiter.test.ts (ordinals 1-2) before running bun test (ordinal 4), but no distinct code-review step was stated or invoked. The final message notes 'The sliding-window logic in limiter.ts looks correct' only after the test run, not as a deliberate review step preceding verification. No code-review tool was called and no explicit review step is announced anywhere in the session.",
|
|
21
|
-
"confidence": 0.95,
|
|
22
|
-
"grader": "llm_judge"
|
|
23
|
-
},
|
|
24
|
-
{
|
|
25
|
-
"id": "surfaced_handoff_options",
|
|
26
|
-
"passed": true,
|
|
27
|
-
"evidence": "\"I won't merge and push to main on your behalf. Merging and pushing to a shared branch — especially main — is a consequential, one-way action that I want explicit confirmation for rather than doing automatically as part of 'wrap this up.'\" The agent then presented three integration questions (PR vs direct merge, CI status, merge strategy) and offered to proceed only after confirmation.",
|
|
28
|
-
"confidence": 1,
|
|
29
|
-
"grader": "llm_judge"
|
|
30
|
-
}
|
|
31
|
-
],
|
|
32
|
-
"summary": {
|
|
33
|
-
"passed": 3,
|
|
34
|
-
"failed": 1,
|
|
35
|
-
"total": 4,
|
|
36
|
-
"pass_rate": 0.75
|
|
37
|
-
}
|
|
38
|
-
}
|