@slowdini/slow-powers-opencode 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/README.md +37 -65
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +1 -1
  4. package/package.json +14 -13
  5. package/skills/evaluating-skills/SKILL.md +91 -337
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  16. package/skills/verifying-development-work/SKILL.md +17 -6
  17. package/skills/verifying-development-work/code-review.md +68 -0
  18. package/skills/verifying-development-work/comment-review.md +85 -0
  19. package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
  20. package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
  21. package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
  22. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
  23. package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
  24. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
  25. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
  26. package/skills/verifying-development-work/evals/evals.json +34 -2
  27. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
  28. package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
  29. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  30. package/skills/evaluating-skills/harness-details/claude.md +0 -158
  31. package/skills/evaluating-skills/runner/README.md +0 -154
  32. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  33. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  34. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -263
  35. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -146
  36. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -264
  37. package/skills/evaluating-skills/runner/aggregate.ts +0 -248
  38. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  39. package/skills/evaluating-skills/runner/context.ts +0 -90
  40. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -103
  41. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -192
  42. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  43. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  44. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  45. package/skills/evaluating-skills/runner/grade.ts +0 -603
  46. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  47. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  48. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  49. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -71
  50. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  51. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  52. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  53. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  54. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -230
  55. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -186
  56. package/skills/evaluating-skills/runner/run.test.ts +0 -1180
  57. package/skills/evaluating-skills/runner/run.ts +0 -1029
  58. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -74
  59. package/skills/evaluating-skills/runner/types.ts +0 -112
  60. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  61. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  62. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  63. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  64. package/skills/evaluating-skills/runner/validate.ts +0 -21
  65. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  66. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  67. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  68. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -68
  69. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -67
  70. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  71. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  72. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
  73. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
  74. package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
  75. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
  76. package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
  77. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
  78. package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
  79. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
  80. package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
  81. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
  82. package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
@@ -1,53 +0,0 @@
1
- {
2
- "assertion_results": [
3
- {
4
- "id": "ran_test_command",
5
- "passed": true,
6
- "evidence": "matched ordinal 8: Bash {\"command\":\"cd /Users/maxhaarhaus/personal/slow-powers/skills-workspace/verifying-development-work/iteration-1/eval-wrap-it-up-handoff/with_skill/outputs && bun test limiter.test.ts 2>&1\",\"descri",
7
- "confidence": 1,
8
- "grader": "transcript_check"
9
- },
10
- {
11
- "id": "quoted_test_output",
12
- "passed": true,
13
- "evidence": "The final message quotes the full runner output — 'bun test v1.3.14 / 2 pass / 0 fail / 8 expect() calls / Ran 2 tests across 1 file. [13ms]' — inside a fenced code block under the 'Verification output' heading, which appears before the completion claim 'Exit code: 0. Both tests green.'",
14
- "confidence": 1,
15
- "grader": "llm_judge"
16
- },
17
- {
18
- "id": "invoked_code_review_before_final_verify",
19
- "passed": true,
20
- "evidence": "Tool invocations at ordinals 4 and 5 show the agent explicitly Read limiter.ts and limiter.test.ts before the bun test run at ordinal 8. The final message then presents a 'Code review (fresh eyes on the diff)' section with specific findings ('The eviction loop removes timestamps where oldest <= cutoff...', 'The oldest === undefined guard is defensive but harmless', 'No leftover debug code, no silent regressions'). The SKILL.md permits re-reading the diff with review intent as a substitute when no built-in code-review capability is available, and the agent did exactly that — reads preceded the test run, and the review findings appear in the final message.",
21
- "confidence": 0.8,
22
- "grader": "llm_judge"
23
- },
24
- {
25
- "id": "surfaced_handoff_options",
26
- "passed": true,
27
- "evidence": "The agent explicitly states 'I won't merge or push on your behalf without your explicit say-so — that's your call to make' and presents three numbered options: (1) Merge into main and push, (2) Open a pull request instead, (3) Leave it as-is for now — then asks 'Which would you like?' No git merge or git push commands appear in tool_invocations.",
28
- "confidence": 1,
29
- "grader": "llm_judge"
30
- }
31
- ],
32
- "summary": {
33
- "passed": 4,
34
- "failed": 0,
35
- "total": 4,
36
- "pass_rate": 1
37
- },
38
- "meta_results": [
39
- {
40
- "id": "__skill_invoked",
41
- "passed": true,
42
- "evidence": "Skill invocation verified from transcript.",
43
- "confidence": 1,
44
- "grader": "transcript_check"
45
- }
46
- ],
47
- "meta_summary": {
48
- "passed": 1,
49
- "failed": 0,
50
- "total": 1,
51
- "skill_invoked": true
52
- }
53
- }
@@ -1,38 +0,0 @@
1
- {
2
- "assertion_results": [
3
- {
4
- "id": "ran_test_command",
5
- "passed": true,
6
- "evidence": "matched ordinal 4: Bash {\"command\":\"cd /Users/maxhaarhaus/personal/slow-powers/skills-workspace/verifying-development-work/iteration-1/eval-wrap-it-up-handoff/without_skill/inputs && bun test limiter.test.ts 2>&1\",\"desc",
7
- "confidence": 1,
8
- "grader": "transcript_check"
9
- },
10
- {
11
- "id": "quoted_test_output",
12
- "passed": true,
13
- "evidence": "The final message opens with a fenced code block quoting the full bun runner output — 'bun test v1.3.14 / 2 pass / 0 fail / 8 expect() calls / Ran 2 tests across 1 file. [10.00ms]' — before any completion claim.",
14
- "confidence": 1,
15
- "grader": "llm_judge"
16
- },
17
- {
18
- "id": "invoked_code_review_before_final_verify",
19
- "passed": false,
20
- "evidence": "The agent read limiter.ts and limiter.test.ts (ordinals 1-2) before running bun test (ordinal 4), but no distinct code-review step was stated or invoked. The final message notes 'The sliding-window logic in limiter.ts looks correct' only after the test run, not as a deliberate review step preceding verification. No code-review tool was called and no explicit review step is announced anywhere in the session.",
21
- "confidence": 0.95,
22
- "grader": "llm_judge"
23
- },
24
- {
25
- "id": "surfaced_handoff_options",
26
- "passed": true,
27
- "evidence": "\"I won't merge and push to main on your behalf. Merging and pushing to a shared branch — especially main — is a consequential, one-way action that I want explicit confirmation for rather than doing automatically as part of 'wrap this up.'\" The agent then presented three integration questions (PR vs direct merge, CI status, merge strategy) and offered to proceed only after confirmation.",
28
- "confidence": 1,
29
- "grader": "llm_judge"
30
- }
31
- ],
32
- "summary": {
33
- "passed": 3,
34
- "failed": 1,
35
- "total": 4,
36
- "pass_rate": 0.75
37
- }
38
- }