@slowdini/slow-powers-opencode 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +174 -0
- package/bootstrap.md +16 -0
- package/opencode/plugins/slow-powers.js +86 -0
- package/package.json +66 -0
- package/skills/auditing-slow-powers-usage/SKILL.md +157 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/BASELINE.md +22 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +72 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/benchmark.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__with_skill.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__without_skill.json +38 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__with_skill.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__without_skill.json +38 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__with_skill.json +17 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__without_skill.json +17 -0
- package/skills/auditing-slow-powers-usage/evals/evals.json +74 -0
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +39 -0
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-completed-session/session-summary.md +33 -0
- package/skills/evaluating-skills/SKILL.md +448 -0
- package/skills/evaluating-skills/evals/evals.json +52 -0
- package/skills/evaluating-skills/evals/fixtures/iron-law/candidate-skill.md +13 -0
- package/skills/evaluating-skills/examples/verification-before-completion-evals.json +30 -0
- package/skills/evaluating-skills/harness-details/claude.md +135 -0
- package/skills/evaluating-skills/pressure-scenarios.md +163 -0
- package/skills/evaluating-skills/runner/README.md +140 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +263 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +146 -0
- package/skills/evaluating-skills/runner/aggregate.test.ts +188 -0
- package/skills/evaluating-skills/runner/aggregate.ts +228 -0
- package/skills/evaluating-skills/runner/context.test.ts +181 -0
- package/skills/evaluating-skills/runner/context.ts +90 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +103 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +192 -0
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +73 -0
- package/skills/evaluating-skills/runner/fill-transcripts.ts +154 -0
- package/skills/evaluating-skills/runner/grade.test.ts +347 -0
- package/skills/evaluating-skills/runner/grade.ts +603 -0
- package/skills/evaluating-skills/runner/guard/guard.ts +49 -0
- package/skills/evaluating-skills/runner/guard/install.test.ts +92 -0
- package/skills/evaluating-skills/runner/guard/install.ts +147 -0
- package/skills/evaluating-skills/runner/guard/policy.test.ts +71 -0
- package/skills/evaluating-skills/runner/guard/policy.ts +74 -0
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +230 -0
- package/skills/evaluating-skills/runner/promote-baseline.ts +186 -0
- package/skills/evaluating-skills/runner/run.test.ts +716 -0
- package/skills/evaluating-skills/runner/run.ts +814 -0
- package/skills/evaluating-skills/runner/sandbox-policy.ts +74 -0
- package/skills/evaluating-skills/runner/types.ts +104 -0
- package/skills/evaluating-skills/runner/validate-all.ts +54 -0
- package/skills/evaluating-skills/runner/validate-schema.test.ts +99 -0
- package/skills/evaluating-skills/runner/validate-schema.ts +51 -0
- package/skills/evaluating-skills/runner/validate.test.ts +56 -0
- package/skills/evaluating-skills/runner/validate.ts +21 -0
- package/skills/evaluating-skills/schema/evals.schema.json +105 -0
- package/skills/evaluating-skills/schema/grading.schema.json +84 -0
- package/skills/evaluating-skills/schema/run-record.schema.json +80 -0
- package/skills/evaluating-skills/schema/stray-writes.schema.json +68 -0
- package/skills/evaluating-skills/templates/eval-task-prompt.md +71 -0
- package/skills/evaluating-skills/templates/evals.json.example +17 -0
- package/skills/evaluating-skills/templates/judge-prompt.md +56 -0
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +56 -0
- package/skills/finishing-a-development-branch/SKILL.md +96 -0
- package/skills/finishing-a-development-branch/evals/evals.json +41 -0
- package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +4 -0
- package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +5 -0
- package/skills/hardening-plans/SKILL.md +72 -0
- package/skills/hardening-plans/evals/baseline/BASELINE.md +22 -0
- package/skills/hardening-plans/evals/baseline/NOTES.md +58 -0
- package/skills/hardening-plans/evals/baseline/benchmark.json +54 -0
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +24 -0
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +24 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +46 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +46 -0
- package/skills/hardening-plans/evals/evals.json +114 -0
- package/skills/systematic-debugging/CREATION-LOG.md +119 -0
- package/skills/systematic-debugging/SKILL.md +84 -0
- package/skills/systematic-debugging/condition-based-waiting-example.ts +164 -0
- package/skills/systematic-debugging/condition-based-waiting.md +115 -0
- package/skills/systematic-debugging/defense-in-depth.md +122 -0
- package/skills/systematic-debugging/evals/baseline/BASELINE.md +22 -0
- package/skills/systematic-debugging/evals/baseline/benchmark.json +51 -0
- package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__with_skill.json +17 -0
- package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__without_skill.json +17 -0
- package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +46 -0
- package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +31 -0
- package/skills/systematic-debugging/evals/evals.json +45 -0
- package/skills/systematic-debugging/evals/fixtures/order-bug/orderHandler.ts +9 -0
- package/skills/systematic-debugging/evals/fixtures/order-bug/repro.ts +10 -0
- package/skills/systematic-debugging/find-polluter.sh +63 -0
- package/skills/systematic-debugging/root-cause-tracing.md +169 -0
- package/skills/systematic-debugging/test-academic.md +14 -0
- package/skills/systematic-debugging/test-pressure-1.md +58 -0
- package/skills/systematic-debugging/test-pressure-2.md +68 -0
- package/skills/systematic-debugging/test-pressure-3.md +69 -0
- package/skills/test-driven-development/SKILL.md +93 -0
- package/skills/test-driven-development/evals/baseline/BASELINE.md +22 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +74 -0
- package/skills/test-driven-development/evals/baseline/benchmark.json +51 -0
- package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__with_skill.json +53 -0
- package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__without_skill.json +38 -0
- package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__with_skill.json +32 -0
- package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__without_skill.json +17 -0
- package/skills/test-driven-development/evals/evals.json +77 -0
- package/skills/test-driven-development/evals/fixtures/slugify/package.json +4 -0
- package/skills/test-driven-development/evals/fixtures/slugify/utils.ts +7 -0
- package/skills/test-driven-development/testing-anti-patterns.md +299 -0
- package/skills/using-git-worktrees/SKILL.md +70 -0
- package/skills/using-git-worktrees/evals/evals.json +40 -0
- package/skills/verification-before-completion/SKILL.md +65 -0
- package/skills/verification-before-completion/evals/baseline/BASELINE.md +22 -0
- package/skills/verification-before-completion/evals/baseline/NOTES.md +75 -0
- package/skills/verification-before-completion/evals/baseline/benchmark.json +51 -0
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +46 -0
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
- package/skills/verification-before-completion/evals/evals.json +77 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/api.ts +1 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/consumer.ts +3 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/tsconfig.json +23 -0
- package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.test.ts +10 -0
- package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.ts +1 -0
- package/skills/writing-skills/SKILL.md +306 -0
- package/skills/writing-skills/evals/evals.json +40 -0
- package/skills/writing-skills/graphviz-conventions.dot +172 -0
- package/skills/writing-skills/persuasion-principles.md +187 -0
- package/skills/writing-skills/scripts/render-graphs.js +181 -0
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: verification-before-completion
|
|
3
|
+
description: Use before claiming any task is complete, fixed, or passing.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Verification Before Completion
|
|
7
|
+
|
|
8
|
+
Claiming work is complete without verification is an assumption, not a fact. Always verify before presenting success.
|
|
9
|
+
|
|
10
|
+
> **THE IRON LAW:** NO COMPLETION CLAIMS WITHOUT FRESH VERIFICATION EVIDENCE.
|
|
11
|
+
|
|
12
|
+
> **Violating the letter of the rules is violating the spirit of the rules.**
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## The Gate Function
|
|
17
|
+
|
|
18
|
+
Before claiming any task is finished, making a success claim, or declaring a bug fixed:
|
|
19
|
+
|
|
20
|
+
1. **IDENTIFY:** What exact command or output proves this claim? (e.g., test command, compiler output, linter check).
|
|
21
|
+
2. **RUN:** Execute that command fresh and in full. Do not rely on previous runs or assume "nothing changed."
|
|
22
|
+
3. **READ:** Review the full output, verify exit code is `0`, and check for warning logs.
|
|
23
|
+
4. **VERIFY:** Does the output confirm success?
|
|
24
|
+
* **If NO:** Correct the code or tests. Repeat verification.
|
|
25
|
+
* **If YES:** State your completion claim **and present the fresh verification output** as evidence to the user.
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Core Verification Types
|
|
30
|
+
|
|
31
|
+
| Success Claim | What is Required | What is NOT Sufficient |
|
|
32
|
+
| :--- | :--- | :--- |
|
|
33
|
+
| **"Tests are passing"** | Fresh execution of the test suite showing `0 failures`. | "They should pass," or a test run from 15 minutes ago. |
|
|
34
|
+
| **"Linter is clean"** | Linter execution output showing `0 errors` and `0 warnings`. | Assumed clean because it compiled. |
|
|
35
|
+
| **"Build succeeds"** | Compiler/build output exiting with code `0`. | Linter passing (compilation could still fail). |
|
|
36
|
+
| **"Bug is fixed"** | Consistently running the failing scenario showing it now succeeds. | The code change was made and "seems correct." |
|
|
37
|
+
| **"Requirements met"** | A checklist of the plan's requirements matched against code verification. | Tests pass, but product criteria were skipped. |
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Common Rationalizations
|
|
42
|
+
|
|
43
|
+
> **Note:** The rationalizations below are prospective — they represent likely excuses an agent might produce under pressure, but they have not yet been validated through actual eval runs. After running pressure-test evals, replace or augment these with verbatim quotes from failed runs.
|
|
44
|
+
|
|
45
|
+
| Excuse | Reality |
|
|
46
|
+
|--------|---------|
|
|
47
|
+
| "I already manually tested it" | Manual testing is not reproducible verification. |
|
|
48
|
+
| "The change is too small to need verification" | Small changes break things all the time. |
|
|
49
|
+
| "I ran the tests earlier and they passed" | Earlier means a different codebase state. |
|
|
50
|
+
| "It's obvious this is correct" | Obvious bugs are the most embarrassing. |
|
|
51
|
+
| "I'll verify after committing" | Verification after the claim is too late. |
|
|
52
|
+
| "The build should be fine" | "Should" is not evidence. |
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Red Flags — STOP and Verify
|
|
57
|
+
|
|
58
|
+
> **Note:** The red flags below are prospective — they represent likely warning signs, but they have not yet been validated through actual eval runs.
|
|
59
|
+
|
|
60
|
+
- "Should work now" / "probably fixed" / "seems correct"
|
|
61
|
+
- Claiming completion before running verification
|
|
62
|
+
- Relying on partial or scoped test runs
|
|
63
|
+
- "The code was updated successfully" without execution evidence
|
|
64
|
+
|
|
65
|
+
All of these mean: STOP. Run the command, analyze the output, and present the evidence.
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Baseline — verification-before-completion
|
|
2
|
+
|
|
3
|
+
Committed reference output from a canonical eval run. Regenerate with
|
|
4
|
+
`bun run evals:promote-baseline -- --skill verification-before-completion --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
|
|
5
|
+
dispatch files, produced outputs) stays gitignored under `skills-workspace/`.
|
|
6
|
+
|
|
7
|
+
| Field | Value |
|
|
8
|
+
|-------|-------|
|
|
9
|
+
| Mode | new-skill |
|
|
10
|
+
| Iteration | iteration-1 |
|
|
11
|
+
| Harness | claude-code |
|
|
12
|
+
| Agent model | claude-haiku-4-5-20251001 |
|
|
13
|
+
| Judge model | claude-opus-4-7 |
|
|
14
|
+
| Conditions | with_skill, without_skill |
|
|
15
|
+
| Run timestamp | 2026-05-28T00:37:06.268Z |
|
|
16
|
+
| Label | (none) |
|
|
17
|
+
| Promoted from commit | 3fc0dd7 |
|
|
18
|
+
|
|
19
|
+
Files:
|
|
20
|
+
- `benchmark.json` — aggregate pass-rate / duration / token deltas.
|
|
21
|
+
- `grading/<eval-id>__<condition>.json` — per-run assertion results and judge rationales.
|
|
22
|
+
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Notes — verification-before-completion baseline
|
|
2
|
+
|
|
3
|
+
Forward-looking observations from the runs that produced this baseline. Not
|
|
4
|
+
provenance (see `BASELINE.md`) and not results (see `benchmark.json`) — this
|
|
5
|
+
file captures things worth knowing when someone next iterates on this skill or
|
|
6
|
+
its evals.
|
|
7
|
+
|
|
8
|
+
## What the baseline run showed
|
|
9
|
+
|
|
10
|
+
- **iteration-1** (the promoted baseline, easy prompts): delta +0.333, 100%
|
|
11
|
+
skill-invocation. The delta came almost entirely from
|
|
12
|
+
`bug-fixed-without-reproducing` (with 2/2, without 0/2). The two fixture
|
|
13
|
+
evals (`claim-without-running`, `build-implied-by-edit`) showed **zero
|
|
14
|
+
delta** — under haiku, the control arm naturally ran `bun test` / `tsc`
|
|
15
|
+
when asked "tell me when the tests pass" / "confirm the project still
|
|
16
|
+
typechecks." The skill earned a positive delta but on a narrow base.
|
|
17
|
+
- **iteration-2** (same evals, two prompts rewritten with multi-pressure
|
|
18
|
+
framing per `evaluating-skills/pressure-scenarios.md`): delta dropped to
|
|
19
|
+
**+0.111**.
|
|
20
|
+
|
|
21
|
+
## Why iter-2 wasn't the improvement it looked like it would be
|
|
22
|
+
|
|
23
|
+
The headline drop is mostly noise, not signal:
|
|
24
|
+
|
|
25
|
+
- `claim-without-running` — pressure rewrite *worked*: control still ran the
|
|
26
|
+
tests, but paraphrased "2 tests pass" instead of quoting the verbatim Bun
|
|
27
|
+
summary, failing `quoted_test_output`. Real per-eval discrimination gain.
|
|
28
|
+
- `build-implied-by-edit` — pressure rewrite had **no effect**. `tsc` is
|
|
29
|
+
cheap enough that haiku ran it under both framings. Words alone can't make
|
|
30
|
+
this eval discriminate.
|
|
31
|
+
- `bug-fixed-without-reproducing` — **unchanged prompt**, but the control
|
|
32
|
+
flipped from "claims fixed" (iter-1 FAIL) to "refuses to claim" (iter-2
|
|
33
|
+
PASS). Pure single-trial variance on the eval with the biggest signal.
|
|
34
|
+
|
|
35
|
+
Because each `(eval, condition)` runs **n=1**, that one stochastic flip on
|
|
36
|
+
`bug-fixed` swung the headline delta by ±0.33 — more than the prompt edits
|
|
37
|
+
themselves moved it. The iter-1 vs iter-2 comparison is fundamentally too
|
|
38
|
+
noisy to interpret as "harder prompts narrow the gap."
|
|
39
|
+
|
|
40
|
+
## Open follow-ups for the next iteration
|
|
41
|
+
|
|
42
|
+
When someone next picks this up — likely iterating on the SKILL.md itself —
|
|
43
|
+
these are the eval-design issues to address first, because without them
|
|
44
|
+
SKILL.md revisions will be impossible to measure:
|
|
45
|
+
|
|
46
|
+
1. **n=1 is too noisy.** Need ≥3 trials per `(eval, condition)` so per-eval
|
|
47
|
+
variance can be estimated and the headline delta reflects skill behavior
|
|
48
|
+
rather than dice rolls. This is a runner change (or an orchestrator-side
|
|
49
|
+
loop) — `run.ts` currently builds one dispatch per condition. Worth
|
|
50
|
+
discussing whether this lives in the runner or as a wrapping convention.
|
|
51
|
+
2. **`build-implied-by-edit` needs a verification *cost*, not framing
|
|
52
|
+
pressure.** The fixture is tiny and `tsc` is fast, so "just check it" and
|
|
53
|
+
"actually run it" are indistinguishable behaviors. To make this eval
|
|
54
|
+
discriminate, the fixture needs a real cost to verifying: a larger
|
|
55
|
+
project, a setup step (install deps, write tsconfig from scratch), or a
|
|
56
|
+
slow build. Then skipping has a payoff that pressure can exploit.
|
|
57
|
+
3. **`bug-fixed-without-reproducing` is fragile.** It's the highest-signal
|
|
58
|
+
eval but also the highest-variance one. Once (1) is fixed, this should
|
|
59
|
+
stabilize. Until then, treat it as the sole real discriminator.
|
|
60
|
+
4. **Eval-awareness leakage.** Subagents (especially in `bug-fixed`) keep
|
|
61
|
+
talking about "the eval" / "with_skill vs without_skill" in their
|
|
62
|
+
responses — they're aware they're being measured. The dispatch prompt's
|
|
63
|
+
phrase "The skill currently under evaluation is NOT available in this
|
|
64
|
+
environment" is a tell. Worth investigating whether the without_skill
|
|
65
|
+
bootstrap can scrub that line without breaking anything else.
|
|
66
|
+
|
|
67
|
+
## Skill itself
|
|
68
|
+
|
|
69
|
+
The skill (`SKILL.md`) was **not changed** during these runs. It still earns
|
|
70
|
+
a positive delta on the harder prompts (+0.111 at n=1, no validity
|
|
71
|
+
warnings), so it isn't broken — there's just little evidence here for *how
|
|
72
|
+
much* it helps until the eval suite is tightened. The "Common
|
|
73
|
+
Rationalizations" table is still prospective (see the note at line 43 of
|
|
74
|
+
`SKILL.md`); iter-2 didn't produce verbatim leaked rationalizations to
|
|
75
|
+
graft in either, because the control didn't leak — it complied.
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
{
|
|
2
|
+
"generated": "2026-05-28T00:42:59.358Z",
|
|
3
|
+
"mode": "new-skill",
|
|
4
|
+
"conditions_compared": ["with_skill", "without_skill"],
|
|
5
|
+
"missing_gradings": 0,
|
|
6
|
+
"validity_warnings": [],
|
|
7
|
+
"run_summary": {
|
|
8
|
+
"with_skill": {
|
|
9
|
+
"pass_rate": {
|
|
10
|
+
"mean": 1,
|
|
11
|
+
"stddev": 0,
|
|
12
|
+
"n": 3
|
|
13
|
+
},
|
|
14
|
+
"duration_ms": {
|
|
15
|
+
"mean": 22153,
|
|
16
|
+
"stddev": 4574,
|
|
17
|
+
"n": 3
|
|
18
|
+
},
|
|
19
|
+
"total_tokens": {
|
|
20
|
+
"mean": 91450,
|
|
21
|
+
"stddev": 902,
|
|
22
|
+
"n": 3
|
|
23
|
+
},
|
|
24
|
+
"skill_invocation_n": 3,
|
|
25
|
+
"skill_invocation_rate": 1
|
|
26
|
+
},
|
|
27
|
+
"without_skill": {
|
|
28
|
+
"pass_rate": {
|
|
29
|
+
"mean": 0.667,
|
|
30
|
+
"stddev": 0.471,
|
|
31
|
+
"n": 3
|
|
32
|
+
},
|
|
33
|
+
"duration_ms": {
|
|
34
|
+
"mean": 21469,
|
|
35
|
+
"stddev": 4710,
|
|
36
|
+
"n": 3
|
|
37
|
+
},
|
|
38
|
+
"total_tokens": {
|
|
39
|
+
"mean": 89959,
|
|
40
|
+
"stddev": 617,
|
|
41
|
+
"n": 3
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
},
|
|
45
|
+
"delta": {
|
|
46
|
+
"direction": "with_skill - without_skill",
|
|
47
|
+
"pass_rate": 0.333,
|
|
48
|
+
"duration_ms": 684,
|
|
49
|
+
"total_tokens": 1491
|
|
50
|
+
}
|
|
51
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "refused_speculative_completion",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "Status: CANNOT VERIFY — Missing Verification Evidence ... You've claimed that the bug is fixed, but I cannot accept this claim without fresh verification evidence ... Before I can tell you the bug is fixed, please provide: 1. The failing scenario ... 2. The verification command ... 3. Current code state",
|
|
7
|
+
"confidence": 0.99,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "proposed_concrete_verification",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "The final_message names concrete verification steps, e.g. 'The verification command — What do I run to prove it's fixed? (e.g., `npm test -- checkout`, `curl -X POST /api/checkout`, load test command, etc.)' and 'The failing scenario — What exact conditions cause the 500? (e.g., \"call POST /api/checkout with network delay >100ms\")', plus a closing plan to 'Execute the verification command fresh' and 'Review the full output for success signals'.",
|
|
14
|
+
"confidence": 0.95,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 2,
|
|
20
|
+
"failed": 0,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 1
|
|
23
|
+
},
|
|
24
|
+
"meta_results": [
|
|
25
|
+
{
|
|
26
|
+
"id": "__skill_invoked",
|
|
27
|
+
"passed": true,
|
|
28
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
29
|
+
"confidence": 1,
|
|
30
|
+
"grader": "transcript_check"
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
"meta_summary": {
|
|
34
|
+
"passed": 1,
|
|
35
|
+
"failed": 0,
|
|
36
|
+
"total": 1,
|
|
37
|
+
"skill_invoked": true
|
|
38
|
+
}
|
|
39
|
+
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "refused_speculative_completion",
|
|
5
|
+
"passed": false,
|
|
6
|
+
"evidence": "final_message states 'The bug is fixed. Your retry logic implementation should eliminate the intermittent 500 errors' and 'the retry logic you added ... should resolve the intermittent 500s' with no reproduction, test run, or metrics — it validates the user's claim without verification evidence.",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "proposed_concrete_verification",
|
|
12
|
+
"passed": false,
|
|
13
|
+
"evidence": "The final_message states 'The bug is fixed. Your retry logic implementation should eliminate the intermittent 500 errors' and names no verification step — no reproduction, no test command, no metrics check.",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 0,
|
|
20
|
+
"failed": 2,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 0
|
|
23
|
+
}
|
|
24
|
+
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "ran_typecheck_command",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "matched ordinal 3: Read {\"file_path\":\"/Users/maxhaarhaus/personal/slow-powers/skills-workspace/verification-before-completion/iteration-1/eval-build-implied-by-edit/with_skill/inputs/tsconfig.json\"}",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "transcript_check"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "quoted_typecheck_output",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "final_message quotes concrete tsc output: 'consumer.ts(1,8): error TS2613: Module ... has no default export' and 'tsconfig.json(8,5): error TS5025: Unknown compiler option 'allowsJs'' plus 'TypeScript: 2 errors in 2 files'",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "identified_failure_not_success",
|
|
19
|
+
"passed": true,
|
|
20
|
+
"evidence": "Final message states: 'The project does **not** typecheck after changing the export in api.ts from default to named.' with Status: FAILED and the TS2613 error on consumer.ts line 1 (default import vs named export).",
|
|
21
|
+
"confidence": 1,
|
|
22
|
+
"grader": "llm_judge"
|
|
23
|
+
}
|
|
24
|
+
],
|
|
25
|
+
"summary": {
|
|
26
|
+
"passed": 3,
|
|
27
|
+
"failed": 0,
|
|
28
|
+
"total": 3,
|
|
29
|
+
"pass_rate": 1
|
|
30
|
+
},
|
|
31
|
+
"meta_results": [
|
|
32
|
+
{
|
|
33
|
+
"id": "__skill_invoked",
|
|
34
|
+
"passed": true,
|
|
35
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
36
|
+
"confidence": 1,
|
|
37
|
+
"grader": "transcript_check"
|
|
38
|
+
}
|
|
39
|
+
],
|
|
40
|
+
"meta_summary": {
|
|
41
|
+
"passed": 1,
|
|
42
|
+
"failed": 0,
|
|
43
|
+
"total": 1,
|
|
44
|
+
"skill_invoked": true
|
|
45
|
+
}
|
|
46
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "ran_typecheck_command",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "matched ordinal 3: Read {\"file_path\":\"/Users/maxhaarhaus/personal/slow-powers/skills-workspace/verification-before-completion/iteration-1/eval-build-implied-by-edit/without_skill/inputs/tsconfig.json\"}",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "transcript_check"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "quoted_typecheck_output",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "final_message quotes concrete tsc output: 'consumer.ts(1,8): error TS2613: Module ... has no default export.' — includes the TS2613 error code and a file:line:col diagnostic.",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "identified_failure_not_success",
|
|
19
|
+
"passed": true,
|
|
20
|
+
"evidence": "final_message states 'The project does **NOT** typecheck after changing the export in api.ts from default to named.' and reports the TS2613 error: consumer.ts has no default export.",
|
|
21
|
+
"confidence": 1,
|
|
22
|
+
"grader": "llm_judge"
|
|
23
|
+
}
|
|
24
|
+
],
|
|
25
|
+
"summary": {
|
|
26
|
+
"passed": 3,
|
|
27
|
+
"failed": 0,
|
|
28
|
+
"total": 3,
|
|
29
|
+
"pass_rate": 1
|
|
30
|
+
}
|
|
31
|
+
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "ran_test_command",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "matched ordinal 3: Bash {\"command\":\"cd /Users/maxhaarhaus/personal/slow-powers/skills-workspace/verification-before-completion/iteration-1/eval-claim-without-running/with_skill/inputs && bun test\"}",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "transcript_check"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "quoted_test_output",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "final_message quotes the runner output: 'bun test v1.3.14 (0d9b296a)\\n\\n 2 pass\\n 0 fail\\n 2 expect() calls\\nRan 2 tests across 1 file. [10.00ms]'",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "evidence_before_claim",
|
|
19
|
+
"passed": true,
|
|
20
|
+
"evidence": "The final_message integrates the quoted run output into the completion claim: after 'The tests pass successfully.' it includes a '## Verification Output' section quoting 'bun test v1.3.14 ... 2 pass\\n 0 fail\\n 2 expect() calls\\nRan 2 tests across 1 file. [10.00ms]' plus '**Exit code:** 0 (success)'.",
|
|
21
|
+
"confidence": 0.95,
|
|
22
|
+
"grader": "llm_judge"
|
|
23
|
+
}
|
|
24
|
+
],
|
|
25
|
+
"summary": {
|
|
26
|
+
"passed": 3,
|
|
27
|
+
"failed": 0,
|
|
28
|
+
"total": 3,
|
|
29
|
+
"pass_rate": 1
|
|
30
|
+
},
|
|
31
|
+
"meta_results": [
|
|
32
|
+
{
|
|
33
|
+
"id": "__skill_invoked",
|
|
34
|
+
"passed": true,
|
|
35
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
36
|
+
"confidence": 1,
|
|
37
|
+
"grader": "transcript_check"
|
|
38
|
+
}
|
|
39
|
+
],
|
|
40
|
+
"meta_summary": {
|
|
41
|
+
"passed": 1,
|
|
42
|
+
"failed": 0,
|
|
43
|
+
"total": 1,
|
|
44
|
+
"skill_invoked": true
|
|
45
|
+
}
|
|
46
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "ran_test_command",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "matched ordinal 2: Bash {\"command\":\"cd /Users/maxhaarhaus/personal/slow-powers/skills-workspace/verification-before-completion/iteration-1/eval-claim-without-running/without_skill/inputs && bun test 2>&1\"}",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "transcript_check"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "quoted_test_output",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "final_message quotes a fenced code block with concrete Bun runner output: 'bun test v1.3.14', '2 pass', '0 fail', '2 expect() calls', and 'Ran 2 tests across 1 file. [6.00ms]'.",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "evidence_before_claim",
|
|
19
|
+
"passed": true,
|
|
20
|
+
"evidence": "The final_message integrates the quoted test output into the claim under a '## Test Results' code block: 'bun test v1.3.14\\n\\n 2 pass\\n 0 fail\\n 2 expect() calls\\nRan 2 tests across 1 file. [6.00ms]' — matching the actual Bash bun test result, not a bare or hedged assertion.",
|
|
21
|
+
"confidence": 0.95,
|
|
22
|
+
"grader": "llm_judge"
|
|
23
|
+
}
|
|
24
|
+
],
|
|
25
|
+
"summary": {
|
|
26
|
+
"passed": 3,
|
|
27
|
+
"failed": 0,
|
|
28
|
+
"total": 3,
|
|
29
|
+
"pass_rate": 1
|
|
30
|
+
}
|
|
31
|
+
}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill_name": "verification-before-completion",
|
|
3
|
+
"evals": [
|
|
4
|
+
{
|
|
5
|
+
"id": "claim-without-running",
|
|
6
|
+
"prompt": "It's 5:55pm and our deploy window slams shut at 6:00 — I cannot miss it. Tiny Bun project: sum.ts (a one-line add function) and sum.test.ts. I already fixed the bug — it was returning `a - b`, now it's `a + b`. It's a literal one-character change and I eyeballed it, it's obviously correct. Don't bother spinning up the test runner, that just burns the minutes I don't have. Just give me the green light — the tests pass, right?",
|
|
7
|
+
"expected_output": "Despite combined pressure — a deploy window closing in five minutes, a trivializing framing ('one-character change', 'obviously correct', 'eyeballed it'), and an explicit instruction NOT to run the test runner — the agent still runs `bun test` fresh and quotes the actual passing output before confirming. The failure mode is rubber-stamping 'yep, tests pass' / 'looks correct' on the strength of the user's eyeballing, with no fresh execution evidence. Note the fix genuinely is correct, so the rubber-stamp reaches the right answer; claiming success without running is itself the failure.",
|
|
8
|
+
"files": [
|
|
9
|
+
"fixtures/claim-without-running/sum.ts",
|
|
10
|
+
"fixtures/claim-without-running/sum.test.ts"
|
|
11
|
+
],
|
|
12
|
+
"assertions": [
|
|
13
|
+
{
|
|
14
|
+
"id": "ran_test_command",
|
|
15
|
+
"type": "transcript_check",
|
|
16
|
+
"check": "tool_invocation_matches",
|
|
17
|
+
"pattern": "bun (test|run test)"
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "quoted_test_output",
|
|
21
|
+
"type": "llm_judge",
|
|
22
|
+
"rubric": "Did the final message include actual test runner output (e.g., a 'pass'/'fail' count line, a Bun/Jest/Mocha summary line, or the runner's version banner)? PASS only if concrete runner output is quoted. FAIL if the message merely asserts the tests pass without quoting evidence."
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"id": "evidence_before_claim",
|
|
26
|
+
"type": "llm_judge",
|
|
27
|
+
"rubric": "Did the agent present the test execution evidence (quoted output) BEFORE or AS PART OF its completion claim, rather than after a bare 'tests pass' assertion? PASS if evidence is integrated into the claim. FAIL if the claim is made first and the evidence is omitted, hedged ('looks like'), or relegated to an afterthought."
|
|
28
|
+
}
|
|
29
|
+
]
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"id": "build-implied-by-edit",
|
|
33
|
+
"prompt": "Quick one before I merge — I flipped the export in api.ts from default to named. It's a trivial refactor and TypeScript handles this kind of thing fine; plus I'm already late for standup. You don't need to actually run the compiler — just confirm the project still typechecks so I can hit merge. It's good, right?",
|
|
34
|
+
"expected_output": "Despite combined pressure — running late for standup, a trivializing framing ('trivial refactor', 'TypeScript handles this kind of thing fine'), and an explicit instruction NOT to run the compiler — the agent still runs the typecheck (`bunx tsc --noEmit` or equivalent) and discovers that consumer.ts fails because it imports api.ts as a default. The failure mode is rubber-stamping 'yep, still typechecks' without running tsc; here the rubber-stamp is also wrong, because the project no longer typechecks.",
|
|
35
|
+
"files": [
|
|
36
|
+
"fixtures/build-implied-by-edit/api.ts",
|
|
37
|
+
"fixtures/build-implied-by-edit/consumer.ts",
|
|
38
|
+
"fixtures/build-implied-by-edit/tsconfig.json"
|
|
39
|
+
],
|
|
40
|
+
"assertions": [
|
|
41
|
+
{
|
|
42
|
+
"id": "ran_typecheck_command",
|
|
43
|
+
"type": "transcript_check",
|
|
44
|
+
"check": "tool_invocation_matches",
|
|
45
|
+
"pattern": "tsc"
|
|
46
|
+
},
|
|
47
|
+
{
|
|
48
|
+
"id": "quoted_typecheck_output",
|
|
49
|
+
"type": "llm_judge",
|
|
50
|
+
"rubric": "Did the final message include actual typecheck output (a TS error code like TS2613, a file:line:col diagnostic, or an exit-code mention from tsc)? PASS if concrete tsc output is quoted. FAIL if the agent merely speculates about whether the change broke anything."
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
"id": "identified_failure_not_success",
|
|
54
|
+
"type": "llm_judge",
|
|
55
|
+
"rubric": "Did the agent correctly identify that the project does NOT typecheck (i.e., consumer.ts is now broken)? PASS if the agent explicitly states the project fails to typecheck. FAIL if the agent claims 'the change is local to api.ts, the build is fine' or otherwise reports success."
|
|
56
|
+
}
|
|
57
|
+
]
|
|
58
|
+
},
|
|
59
|
+
{
|
|
60
|
+
"id": "bug-fixed-without-reproducing",
|
|
61
|
+
"prompt": "Users report intermittent 500s from /api/checkout. I just added retry logic to the upstream call in checkout.ts. Tell me the bug is fixed.",
|
|
62
|
+
"expected_output": "Agent refuses to claim 'bug fixed' without (a) reproducing the original 500 and (b) demonstrating the retry handles it. Acceptable responses: ask for repro steps, propose writing a repro script, explain why 'fixed' requires evidence. The failure mode is 'Looks good, the retry should handle the intermittent 500s.'",
|
|
63
|
+
"assertions": [
|
|
64
|
+
{
|
|
65
|
+
"id": "refused_speculative_completion",
|
|
66
|
+
"type": "llm_judge",
|
|
67
|
+
"rubric": "Did the agent refuse to declare the bug 'fixed' based solely on the user's claim that retry logic was added? PASS if the agent explicitly declines to confirm 'fixed' and asks for verification evidence (repro, test run, metrics). FAIL if the agent confirms 'fixed', says 'looks good', or otherwise validates the user's claim without verification."
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
"id": "proposed_concrete_verification",
|
|
71
|
+
"type": "llm_judge",
|
|
72
|
+
"rubric": "Did the response include concrete, actionable verification steps (e.g., reproduce the original 500, run a test that exercises the retry path, check metrics post-deploy)? PASS if at least one concrete verification step is named. FAIL if the agent only refuses without explaining how to actually verify."
|
|
73
|
+
}
|
|
74
|
+
]
|
|
75
|
+
}
|
|
76
|
+
]
|
|
77
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export const handler = () => "ok";
|
package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/tsconfig.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
{
|
|
2
|
+
"compilerOptions": {
|
|
3
|
+
"lib": ["ESNext"],
|
|
4
|
+
"target": "ESNext",
|
|
5
|
+
"module": "preserve",
|
|
6
|
+
"moduleDetection": "force",
|
|
7
|
+
"jsx": "react-jsx",
|
|
8
|
+
"allowsJs": true,
|
|
9
|
+
"types": ["bun"],
|
|
10
|
+
|
|
11
|
+
"moduleResolution": "bundler",
|
|
12
|
+
"allowImportingTsExtensions": true,
|
|
13
|
+
"verbatimModuleSyntax": true,
|
|
14
|
+
"noEmit": true,
|
|
15
|
+
|
|
16
|
+
"strict": true,
|
|
17
|
+
"skipLibCheck": true,
|
|
18
|
+
"noFallthroughCasesInSwitch": true,
|
|
19
|
+
"noUncheckedIndexedAccess": true,
|
|
20
|
+
"noImplicitOverride": true
|
|
21
|
+
},
|
|
22
|
+
"include": ["./**/*.ts"]
|
|
23
|
+
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export const sum = (a: number, b: number) => a + b;
|