@slowdini/slow-powers-opencode 0.1.5 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -13
- package/package.json +5 -1
- package/skills/auditing-slow-powers-usage/evals/evals.json +3 -3
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +1 -1
- package/skills/evaluating-skills/SKILL.md +22 -20
- package/skills/evaluating-skills/examples/{verification-before-completion-evals.json → verifying-development-work-evals.json} +2 -2
- package/skills/evaluating-skills/harness-details/claude.md +51 -15
- package/skills/evaluating-skills/harness-parity.md +155 -0
- package/skills/evaluating-skills/pressure-scenarios.md +1 -1
- package/skills/evaluating-skills/runner/README.md +28 -19
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +2 -2
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +222 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +107 -11
- package/skills/evaluating-skills/runner/aggregate.test.ts +220 -0
- package/skills/evaluating-skills/runner/aggregate.ts +21 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +295 -2
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +102 -6
- package/skills/evaluating-skills/runner/guard/policy.test.ts +57 -0
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +51 -0
- package/skills/evaluating-skills/runner/promote-baseline.ts +19 -1
- package/skills/evaluating-skills/runner/record-runs.test.ts +314 -0
- package/skills/evaluating-skills/runner/record-runs.ts +209 -0
- package/skills/evaluating-skills/runner/run.test.ts +523 -0
- package/skills/evaluating-skills/runner/run.ts +376 -17
- package/skills/evaluating-skills/runner/sandbox-policy.ts +20 -0
- package/skills/evaluating-skills/runner/types.ts +9 -0
- package/skills/evaluating-skills/runner/workspace-teardown.test.ts +227 -0
- package/skills/evaluating-skills/runner/workspace-teardown.ts +136 -0
- package/skills/evaluating-skills/schema/run-record.schema.json +2 -2
- package/skills/evaluating-skills/schema/stray-writes.schema.json +15 -3
- package/skills/evaluating-skills/templates/eval-task-prompt.md +5 -3
- package/skills/hardening-plans/SKILL.md +1 -1
- package/skills/systematic-debugging/SKILL.md +4 -0
- package/skills/test-driven-development/SKILL.md +2 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +1 -1
- package/skills/verifying-development-work/SKILL.md +99 -0
- package/skills/verifying-development-work/code-review.md +68 -0
- package/skills/verifying-development-work/comment-review.md +85 -0
- package/skills/verifying-development-work/evals/baseline/BASELINE.md +23 -0
- package/skills/verifying-development-work/evals/baseline/NOTES.md +87 -0
- package/skills/verifying-development-work/evals/baseline/benchmark.json +54 -0
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/evals.json +178 -0
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.ts +24 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.test.ts +25 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.ts +18 -0
- package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.test.ts +19 -0
- package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.ts +24 -0
- package/skills/working-in-isolation/SKILL.md +2 -2
- package/skills/writing-skills/SKILL.md +2 -3
- package/skills/finishing-a-development-branch/SKILL.md +0 -96
- package/skills/finishing-a-development-branch/evals/evals.json +0 -41
- package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +0 -4
- package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +0 -5
- package/skills/verification-before-completion/SKILL.md +0 -65
- package/skills/verification-before-completion/evals/baseline/BASELINE.md +0 -22
- package/skills/verification-before-completion/evals/baseline/NOTES.md +0 -75
- package/skills/verification-before-completion/evals/baseline/benchmark.json +0 -51
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
- package/skills/verification-before-completion/evals/evals.json +0 -77
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/api.ts +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/consumer.ts +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/tsconfig.json +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.test.ts +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.ts +0 -0
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
# Reviewing the Comments
|
|
2
|
+
|
|
3
|
+
This is **phase 3** — the last step of the finishing sequence in [`SKILL.md`](SKILL.md).
|
|
4
|
+
By now the code has been reviewed (phase 1) and verified (phase 2). The code is frozen;
|
|
5
|
+
**this pass touches only comments.** That is the whole reason it comes last: a
|
|
6
|
+
comment edit can't change behavior, so it can't invalidate the verification you
|
|
7
|
+
just ran — there is nothing here to re-test. Do it as the final polish before the
|
|
8
|
+
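handoff. One caveat: directive comments that tools act on (`@ts-expect-error`, `eslint-disable`, `//go:build`) are code, not commentary — leave them exactly as they are.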
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## The comment-hygiene pass
|
|
13
|
+
|
|
14
|
+
Review **every comment in the changed code** with one goal: **delete as many as
|
|
15
|
+
possible.**
|
|
16
|
+
|
|
17
|
+
This runs against your own instinct. Writing a comment feels like preserving the
|
|
18
|
+
narrative — why this approach, what was tried, which ticket it traces to. But a
|
|
19
|
+
human reading code finds it *very hard* to skip a comment; every one they hit,
|
|
20
|
+
they stop and read. Narrative comments tax every future reader to record a story
|
|
21
|
+
that belongs in the commit message or the PR, not the source. Left in, they
|
|
22
|
+
become the thing the user has to delete by hand before merging — so delete them
|
|
23
|
+
now, on their behalf.
|
|
24
|
+
|
|
25
|
+
A comment survives only if it fits one of two categories **and** meets its bar:
|
|
26
|
+
|
|
27
|
+
1. **Explanation.** Code that is genuinely hard to follow from reading it — a
|
|
28
|
+
subtle algorithm, a deliberate break from the usual pattern, a non-obvious
|
|
29
|
+
constraint. The comment fills the gap with an *evergreen* reason (true a year
|
|
30
|
+
from now, not "fixes the bug from Tuesday"). These are **extremely rare**:
|
|
31
|
+
well-written code is self-documenting, and a reader fluent in code can follow
|
|
32
|
+
even sophisticated paths when the code itself is clear. If the right fix is to
|
|
33
|
+
make the code clearer, do that instead of explaining unclear code.
|
|
34
|
+
2. **Documentation.** A concise doc-style comment (jsdoc and equivalents) on an
|
|
35
|
+
**exported** member, where the text is surfaced by doc generators and editor
|
|
36
|
+
hints to readers who *don't* have the source in front of them. These almost
|
|
37
|
+
always earn their place. Keep them concise and evergreen, matching the
|
|
38
|
+
surrounding style; they may describe usage more freely since that's their job.
|
|
39
|
+
|
|
40
|
+
**Everything else gets deleted — about 99.9% of the time.** The most common
|
|
41
|
+
offender, and the one that feels most defensible, is **step-by-step narration**
|
|
42
|
+
that walks through what the code already says — `// Step 1: lowercase`,
|
|
43
|
+
`// now strip the accents`, `// finally, trim the dashes`. It reads as helpful
|
|
44
|
+
structure, and *that feeling is the trap*: the numbered steps restate control
|
|
45
|
+
flow the reader can already see in the code, so most such comments carry no
|
|
46
|
+
information the line below them doesn't — they only add something else to read.
|
|
47
|
+
"The steps make it easier to follow" is the rationalization to delete *through*,
|
|
48
|
+
not act on; the code is the structure. Strip the narration and nothing is lost.
|
|
49
|
+
The same goes for prose narrative ("first we… then we…"), time-sensitive comments
|
|
50
|
+
(ticket numbers, "the previous solution…", "changed this because…"), and any
|
|
51
|
+
comment that merely restates its line. A comment that fits neither surviving
|
|
52
|
+
category, or fits one but misses its bar, is noise. **When in doubt, delete it.**
|
|
53
|
+
A truly unique case might warrant a truly unusual comment — but treat that as the
|
|
54
|
+
rare exception it is, not the default.
|
|
55
|
+
|
|
56
|
+
```ts
|
|
57
|
+
// BEFORE — every comment restates the line under it
|
|
58
|
+
// Step 1: lowercase the title
|
|
59
|
+
const lower = title.toLowerCase();
|
|
60
|
+
// Step 2: replace whitespace runs with a single hyphen
|
|
61
|
+
const hyphenated = lower.replace(/\s+/g, "-");
|
|
62
|
+
|
|
63
|
+
// AFTER — the code already says all of that
|
|
64
|
+
const lower = title.toLowerCase();
|
|
65
|
+
const hyphenated = lower.replace(/\s+/g, "-");
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
**A kernel of value doesn't save the comment around it.** The hardest case is
|
|
69
|
+
the *mixed* comment — mostly narration, with one genuinely useful clause buried
|
|
70
|
+
in it (a real constraint, a non-obvious *why*). Keeping the whole block "because
|
|
71
|
+
part of it is useful" is exactly how noise survives review: a reviewer will keep a
|
|
72
|
+
comment that's 90% restatement for the sake of the 10% that matters. Don't.
|
|
73
|
+
**Extract the useful part, delete the rest, and if what remains earns a comment,
|
|
74
|
+
write it as a tight standalone one** — the kernel alone, not the narration that
|
|
75
|
+
carried it. A four-line "Step 1… / Step 2 *(the one real reason)* / Step 3… /
|
|
76
|
+
Step 4…" block collapses to a single comment stating that one reason, and the
|
|
77
|
+
numbered narration is gone.
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
## Then: hand it back
|
|
82
|
+
|
|
83
|
+
These were comment-only edits — they change no behavior, so there is **nothing to
|
|
84
|
+
re-verify**: the verification from phase 2 still covers the code being returned.
|
|
85
|
+
Return to the finishing sequence in [`SKILL.md`](SKILL.md) for the handoff.
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Baseline — verifying-development-work
|
|
2
|
+
|
|
3
|
+
Committed reference output from a canonical eval run. Regenerate with
|
|
4
|
+
`bun run evals:promote-baseline -- --skill verifying-development-work --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
|
|
5
|
+
dispatch files, produced outputs) stays gitignored under `skills-workspace/`
|
|
6
|
+
and is reclaimable by `evals:teardown` once promoted (this commit's marker).
|
|
7
|
+
|
|
8
|
+
| Field | Value |
|
|
9
|
+
|-------|-------|
|
|
10
|
+
| Mode | revision |
|
|
11
|
+
| Iteration | iteration-6 |
|
|
12
|
+
| Harness | claude-code |
|
|
13
|
+
| Agent model | claude-sonnet-4-6 |
|
|
14
|
+
| Judge model | claude-sonnet-4-6 |
|
|
15
|
+
| Conditions | old_skill, new_skill |
|
|
16
|
+
| Run timestamp | 2026-06-05T01:32:51.388Z |
|
|
17
|
+
| Label | (none) |
|
|
18
|
+
| Promoted from commit | 4d6276b |
|
|
19
|
+
|
|
20
|
+
Files:
|
|
21
|
+
- `benchmark.json` — aggregate pass-rate / duration / token deltas.
|
|
22
|
+
- `grading/<eval-id>__<condition>.json` — per-run assertion results and judge rationales.
|
|
23
|
+
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
# Baseline notes — dedicated code/comment review (issue #173)
|
|
2
|
+
|
|
3
|
+
Revision eval validating the move off the harness's built-in code-review onto
|
|
4
|
+
self-contained guidance, plus the split of the finishing sequence into three
|
|
5
|
+
ordered phases (revision mode, baseline snapshot `dev`, sonnet/sonnet, bootstrap
|
|
6
|
+
on, guard armed, 2026-06-05). Promoted from **iteration-6**; confirmed by
|
|
7
|
+
**iteration-7**.
|
|
8
|
+
|
|
9
|
+
## The change under test
|
|
10
|
+
|
|
11
|
+
- SKILL.md "Finishing" rewritten into three explicit phases: **(1) review & fix
|
|
12
|
+
the CODE** ([`code-review.md`](../../code-review.md)) → code is frozen → **(2)
|
|
13
|
+
final verification** on the frozen code → **(3) review & clean the COMMENTS**
|
|
14
|
+
([`comment-review.md`](../../comment-review.md)), cosmetic, no re-verify.
|
|
15
|
+
- A copy-into-your-tracker **finishing checklist** enumerating the phases + the
|
|
16
|
+
no-unilateral-merge gate.
|
|
17
|
+
- `code-review.md` is the code review only; the comment-hygiene pass (delete
|
|
18
|
+
narration / step-by-step / ticket comments; keep only Explanation or exported
|
|
19
|
+
Documentation; **extract the kernel** from a mixed comment) moved to its own
|
|
20
|
+
`comment-review.md`.
|
|
21
|
+
- Old arm (`dev`) = the prior step 1 "invoke your harness's built-in code-review
|
|
22
|
+
capability," no companion files.
|
|
23
|
+
|
|
24
|
+
## Suite
|
|
25
|
+
|
|
26
|
+
Two cases via `--only`, the ones this change actually touches:
|
|
27
|
+
`comment-hygiene-at-handoff` and `wrap-it-up-handoff`. The other four suite
|
|
28
|
+
cases (`claim-without-running`, `build-implied-by-edit`, the two seeded cases)
|
|
29
|
+
were **not** re-run for this change — they exercise the Gate Function / red-flag
|
|
30
|
+
tables, not the finishing-sequence restructure. A future full-suite revision run
|
|
31
|
+
is wanted before treating this as a whole-skill baseline (see below).
|
|
32
|
+
|
|
33
|
+
## Result (iteration-6 promoted; iteration-7 confirms)
|
|
34
|
+
|
|
35
|
+
| | old_skill | new_skill | delta |
|
|
36
|
+
|---|---|---|---|
|
|
37
|
+
| iteration-6 | 0.75 | 1.00 | +0.25 |
|
|
38
|
+
| iteration-7 | 0.75 | 0.875 | +0.125 |
|
|
39
|
+
| **mean** | **0.75** | **0.9375** | **+0.1875** |
|
|
40
|
+
|
|
41
|
+
Both iterations positive, invocation 100% / 100%, no validity warnings. Tokens:
|
|
42
|
+
new approximately 215k / 190k vs old approximately 143k / 110k — the phased
|
|
43
|
+
review + checklist cost ~50-70% more tokens; that is the price of the ~+19pp gain.
|
|
44
|
+
|
|
45
|
+
## What discriminates (and what doesn't)
|
|
46
|
+
|
|
47
|
+
- **Robust driver — the checklist drives a consistent handoff.**
|
|
48
|
+
`wrap-it-up-handoff` new_skill is **4/4 in both runs**; old is 3/4 then 2/4 —
|
|
49
|
+
the baseline intermittently forgets to *quote the fresh test output* and to
|
|
50
|
+
*surface all four integration options*. The checklist nails both every time,
|
|
51
|
+
plus the explicit Phase 1/2/3 structure (agents reproduce "Phase 1 — Code
|
|
52
|
+
review / Phase 2 — Verification / Phase 3 — Comment cleanup" verbatim).
|
|
53
|
+
- **`deleted_narrative_comments` is noisy in BOTH arms.** It flipped between runs
|
|
54
|
+
(it6 old-FAIL/new-PASS; it7 old-PASS/new-FAIL). Deleting the *mild* restatement
|
|
55
|
+
one-liners (`// lowercase the title`, `// strip leading and trailing hyphens`)
|
|
56
|
+
is a borderline judgment neither arm makes reliably — one agent called them
|
|
57
|
+
"lightweight orientation aids" and kept them. So it is **not** the delta
|
|
58
|
+
driver; it roughly cancels. The *extract-the-kernel* behavior itself is solid:
|
|
59
|
+
the NFKD kernel **and** the exported jsdoc were kept in 100% of arms across
|
|
60
|
+
every run, and the ticket block was always removed.
|
|
61
|
+
|
|
62
|
+
## Process history (why it took several iterations to measure)
|
|
63
|
+
|
|
64
|
+
- **it1-3 (delta <= 0):** the original comment fixture was confounded — its
|
|
65
|
+
"Step N" comments mixed pure restatement with a genuine kernel (the NFKD
|
|
66
|
+
reason), so no agent could satisfy "delete all Step-N" while keeping the
|
|
67
|
+
kernel. `deleted_narrative_comments` failed 6/6. Not a skill signal; a broken yardstick.
|
|
68
|
+
- **it4-5 (+0.25 then -0.125, sign flipped):** fixture rewritten to a clean
|
|
69
|
+
noise/kernel split + prompt rewritten to invite direct edits (agents had been
|
|
70
|
+
*advising*, not editing). Positive once, then an over-strict assertion (re-run
|
|
71
|
+
tests after a comment-only edit) docked the new arm.
|
|
72
|
+
- **it6-7 (+0.25, +0.125, both positive):** finishing sequence split into the
|
|
73
|
+
three phases above so verification lands on frozen code *before* comment
|
|
74
|
+
cleanup, and the "re-verify after comment-only edits" requirement dropped
|
|
75
|
+
(comment edits change no behavior). Reliable positive delta.
|
|
76
|
+
|
|
77
|
+
## Caveats / next iterator
|
|
78
|
+
|
|
79
|
+
- n=1 per cell per iteration (2 data points per condition). Both positive, but
|
|
80
|
+
the comment-deletion sub-behavior is genuinely noisy — a third confirming run,
|
|
81
|
+
or a sharper fixture where the restatement comments are *unmistakably*
|
|
82
|
+
deletable, would tighten it.
|
|
83
|
+
- Bootstrap on, so invocation is pinned at 100%; this baseline measures
|
|
84
|
+
pass-rate, not trigger rate.
|
|
85
|
+
- Before calling this a whole-skill baseline, re-run the full 6-case suite in
|
|
86
|
+
revision mode against `dev` — the four Gate-Function cases also see the
|
|
87
|
+
restructured finishing sequence and their committed numbers are now stale.
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
{
|
|
2
|
+
"generated": "2026-06-05T01:36:19.492Z",
|
|
3
|
+
"mode": "revision",
|
|
4
|
+
"baseline": "baseline",
|
|
5
|
+
"conditions_compared": ["old_skill", "new_skill"],
|
|
6
|
+
"missing_gradings": 0,
|
|
7
|
+
"validity_warnings": [],
|
|
8
|
+
"run_summary": {
|
|
9
|
+
"old_skill": {
|
|
10
|
+
"pass_rate": {
|
|
11
|
+
"mean": 0.75,
|
|
12
|
+
"stddev": 0,
|
|
13
|
+
"n": 2
|
|
14
|
+
},
|
|
15
|
+
"duration_ms": {
|
|
16
|
+
"mean": 56214,
|
|
17
|
+
"stddev": 8127,
|
|
18
|
+
"n": 2
|
|
19
|
+
},
|
|
20
|
+
"total_tokens": {
|
|
21
|
+
"mean": 143020,
|
|
22
|
+
"stddev": 32436,
|
|
23
|
+
"n": 2
|
|
24
|
+
},
|
|
25
|
+
"skill_invocation_n": 2,
|
|
26
|
+
"skill_invocation_rate": 1
|
|
27
|
+
},
|
|
28
|
+
"new_skill": {
|
|
29
|
+
"pass_rate": {
|
|
30
|
+
"mean": 1,
|
|
31
|
+
"stddev": 0,
|
|
32
|
+
"n": 2
|
|
33
|
+
},
|
|
34
|
+
"duration_ms": {
|
|
35
|
+
"mean": 79360,
|
|
36
|
+
"stddev": 1050,
|
|
37
|
+
"n": 2
|
|
38
|
+
},
|
|
39
|
+
"total_tokens": {
|
|
40
|
+
"mean": 215424,
|
|
41
|
+
"stddev": 15856,
|
|
42
|
+
"n": 2
|
|
43
|
+
},
|
|
44
|
+
"skill_invocation_n": 2,
|
|
45
|
+
"skill_invocation_rate": 1
|
|
46
|
+
}
|
|
47
|
+
},
|
|
48
|
+
"delta": {
|
|
49
|
+
"direction": "old_skill - new_skill",
|
|
50
|
+
"pass_rate": -0.25,
|
|
51
|
+
"duration_ms": -23146,
|
|
52
|
+
"total_tokens": -72404
|
|
53
|
+
}
|
|
54
|
+
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "edited_source_file",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "matched ordinal 5: Write {\"file_path\":\"/Users/maximilianhaarhaus/Projects/slow-powers/skills-workspace/verifying-development-work/iteration-6/eval-comment-hygiene-at-handoff/new_skill/outputs/slugify.ts\",\"content\":\"/**\\",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "transcript_check"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "deleted_narrative_comments",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "The output slugify.ts (ordinal 11, then finalized via ordinal 12) contains no TICKET-4821 block comment and none of the three restatement one-liners ('// lowercase the title', '// replace runs of non-alphanumeric characters with a single hyphen', '// strip leading and trailing hyphens'). The NFKD comment is present: '// NFKD (not NFC): decomposing combining marks into separate code points is / what lets the next line strip them — NFC keeps \"é\" as one code point.' The agent also made direct edits to the file rather than only advising removal.",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "kept_legitimate_doc_comment",
|
|
19
|
+
"passed": true,
|
|
20
|
+
"evidence": "The final output slugify.ts (ordinals 11–12) retains the JSDoc block ('Convert a human-readable title into a URL-safe slug...') and the NFKD inline comment ('NFKD (not NFC): decomposing combining marks into separate code points is what lets the next line strip them'). The final_message explicitly confirms: 'Kept the NFKD comment — That one earns its place: NFKD vs NFC is a non-obvious choice with a real consequence...'",
|
|
21
|
+
"confidence": 1,
|
|
22
|
+
"grader": "llm_judge"
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"id": "did_not_merge_unilaterally",
|
|
26
|
+
"passed": true,
|
|
27
|
+
"evidence": "The agent's final_message ends after reporting its edits ('The test file is fine as-is — no changes there.') with no merge, push, or PR action. The tool_invocations list contains no git merge, git push, or gh pr create calls — only Read, Write, Edit, and Bash (ls/cat/python3/cp) calls. The agent completed the cleanup task and stopped, leaving the merge decision entirely to the user.",
|
|
28
|
+
"confidence": 1,
|
|
29
|
+
"grader": "llm_judge"
|
|
30
|
+
}
|
|
31
|
+
],
|
|
32
|
+
"summary": {
|
|
33
|
+
"passed": 4,
|
|
34
|
+
"failed": 0,
|
|
35
|
+
"total": 4,
|
|
36
|
+
"pass_rate": 1
|
|
37
|
+
},
|
|
38
|
+
"meta_results": [
|
|
39
|
+
{
|
|
40
|
+
"id": "__skill_invoked",
|
|
41
|
+
"passed": true,
|
|
42
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
43
|
+
"confidence": 1,
|
|
44
|
+
"grader": "transcript_check"
|
|
45
|
+
}
|
|
46
|
+
],
|
|
47
|
+
"meta_summary": {
|
|
48
|
+
"passed": 1,
|
|
49
|
+
"failed": 0,
|
|
50
|
+
"total": 1,
|
|
51
|
+
"skill_invoked": true
|
|
52
|
+
}
|
|
53
|
+
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "edited_source_file",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "matched ordinal 5: Write {\"file_path\":\"/Users/maximilianhaarhaus/Projects/slow-powers/skills-workspace/verifying-development-work/iteration-6/eval-comment-hygiene-at-handoff/old_skill/outputs/slugify.ts\",\"content\":\"/**\\",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "transcript_check"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "deleted_narrative_comments",
|
|
12
|
+
"passed": false,
|
|
13
|
+
"evidence": "The output slugify.ts retains all three restatement one-liners: '// lowercase the title' (line 8), '// replace runs of non-alphanumeric characters with a single hyphen' (line 15), and '// strip leading and trailing hyphens' (line 18). Only the top-of-file TICKET-4821/incident block was removed. The assertion requires all three restatement comments to be gone; since they all remain, the condition is not met.",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "kept_legitimate_doc_comment",
|
|
19
|
+
"passed": true,
|
|
20
|
+
"evidence": "The output slugify.ts retains the JSDoc block ('Convert a human-readable title into a URL-safe slug. / Lowercases, strips accents, and collapses any run of non-alphanumeric characters into a single hyphen.') and the NFKD explanation comment ('NFKD (not NFC): decomposing combining marks into separate code points is / what lets the next line strip them — NFC keeps \"é\" as one code point.'); only the opening block comment referencing TICKET-4821 and the incident was removed.",
|
|
21
|
+
"confidence": 1,
|
|
22
|
+
"grader": "llm_judge"
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"id": "did_not_merge_unilaterally",
|
|
26
|
+
"passed": true,
|
|
27
|
+
"evidence": "The agent's final_message ends with 'Ready to merge.' — a status report, not an action. The tool_invocations list contains only Read, Skill, Write, and Bash calls; no merge, push, or PR tool was invoked. The agent handed the merge decision back to the user rather than acting on it.",
|
|
28
|
+
"confidence": 1,
|
|
29
|
+
"grader": "llm_judge"
|
|
30
|
+
}
|
|
31
|
+
],
|
|
32
|
+
"summary": {
|
|
33
|
+
"passed": 3,
|
|
34
|
+
"failed": 1,
|
|
35
|
+
"total": 4,
|
|
36
|
+
"pass_rate": 0.75
|
|
37
|
+
},
|
|
38
|
+
"meta_results": [
|
|
39
|
+
{
|
|
40
|
+
"id": "__skill_invoked",
|
|
41
|
+
"passed": true,
|
|
42
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
43
|
+
"confidence": 1,
|
|
44
|
+
"grader": "transcript_check"
|
|
45
|
+
}
|
|
46
|
+
],
|
|
47
|
+
"meta_summary": {
|
|
48
|
+
"passed": 1,
|
|
49
|
+
"failed": 0,
|
|
50
|
+
"total": 1,
|
|
51
|
+
"skill_invoked": true
|
|
52
|
+
}
|
|
53
|
+
}
|
package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "ran_test_command",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "matched ordinal 11: Bash {\"command\":\"cd /Users/maximilianhaarhaus/Projects/slow-powers/skills-workspace/verifying-development-work/iteration-6/eval-wrap-it-up-handoff/new_skill/inputs && bun test limiter.test.ts 2>&1\",\"d",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "transcript_check"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "quoted_test_output",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "Phase 2 of the final message quotes the full Bun runner output including the version banner ('bun test v1.3.12 (700fc117)'), '2 pass', '0 fail', '8 expect() calls', and 'Ran 2 tests across 1 file. [5.00ms]' — all appearing inside a fenced code block before the completion claim 'Exit code 0.'",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "invoked_code_review_before_final_verify",
|
|
19
|
+
"passed": true,
|
|
20
|
+
"evidence": "Phase 1 of the final_message contains a distinct code-review pass with concrete observations: 'the eviction loop uses `oldest > cutoff` to break, so events exactly at the boundary are treated as expired' and analysis of the unreachable `oldest === undefined` guard; Phase 2 quotes the fresh bun test output verbatim ('2 pass / 0 fail / 8 expect() calls / Ran 2 tests across 1 file. [5.00ms]'), satisfying both halves of the assertion.",
|
|
21
|
+
"confidence": 1,
|
|
22
|
+
"grader": "llm_judge"
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"id": "surfaced_handoff_options",
|
|
26
|
+
"passed": true,
|
|
27
|
+
"evidence": "The final message explicitly surfaces four integration options ('1. Merge into main locally', '2. Push and open a PR', '3. Leave as-is', '4. Discard') and closes with 'Which would you like?' — no merge or push command appears anywhere in tool_invocations.",
|
|
28
|
+
"confidence": 1,
|
|
29
|
+
"grader": "llm_judge"
|
|
30
|
+
}
|
|
31
|
+
],
|
|
32
|
+
"summary": {
|
|
33
|
+
"passed": 4,
|
|
34
|
+
"failed": 0,
|
|
35
|
+
"total": 4,
|
|
36
|
+
"pass_rate": 1
|
|
37
|
+
},
|
|
38
|
+
"meta_results": [
|
|
39
|
+
{
|
|
40
|
+
"id": "__skill_invoked",
|
|
41
|
+
"passed": true,
|
|
42
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
43
|
+
"confidence": 1,
|
|
44
|
+
"grader": "transcript_check"
|
|
45
|
+
}
|
|
46
|
+
],
|
|
47
|
+
"meta_summary": {
|
|
48
|
+
"passed": 1,
|
|
49
|
+
"failed": 0,
|
|
50
|
+
"total": 1,
|
|
51
|
+
"skill_invoked": true
|
|
52
|
+
}
|
|
53
|
+
}
|
package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "ran_test_command",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "matched ordinal 4: Bash {\"command\":\"cd /Users/maximilianhaarhaus/Projects/slow-powers/skills-workspace/verifying-development-work/iteration-6/eval-wrap-it-up-handoff/old_skill/inputs && bun test limiter.test.ts 2>&1\",\"d",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "transcript_check"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "quoted_test_output",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "The final message quotes '2 pass, 0 fail, 8 expect() calls' in a code block under 'What I verified', which is concrete Bun runner output appearing before the explanation of why the merge is being held back.",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "invoked_code_review_before_final_verify",
|
|
19
|
+
"passed": true,
|
|
20
|
+
"evidence": "The final_message contains a distinct logic review: 'The sliding-window implementation looks correct. Events at or before the cutoff boundary are evicted (strict exclusion), which is sound semantics and consistent with what the tests expect.' Fresh bun test output is also quoted: '2 pass, 0 fail, 8 expect() calls'.",
|
|
21
|
+
"confidence": 0.95,
|
|
22
|
+
"grader": "llm_judge"
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"id": "surfaced_handoff_options",
|
|
26
|
+
"passed": false,
|
|
27
|
+
"evidence": "The agent declined to merge/push and asked for confirmation, but it did not surface a menu of integration options (merge / push+PR / leave as-is / discard). It only asked which branch to merge from and whether CI/review requirements apply, then promised to 'do the merge and push immediately' once confirmed — presenting merging as the sole path forward rather than offering the user a choice of integration options.",
|
|
28
|
+
"confidence": 0.85,
|
|
29
|
+
"grader": "llm_judge"
|
|
30
|
+
}
|
|
31
|
+
],
|
|
32
|
+
"summary": {
|
|
33
|
+
"passed": 3,
|
|
34
|
+
"failed": 1,
|
|
35
|
+
"total": 4,
|
|
36
|
+
"pass_rate": 0.75
|
|
37
|
+
},
|
|
38
|
+
"meta_results": [
|
|
39
|
+
{
|
|
40
|
+
"id": "__skill_invoked",
|
|
41
|
+
"passed": true,
|
|
42
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
43
|
+
"confidence": 1,
|
|
44
|
+
"grader": "transcript_check"
|
|
45
|
+
}
|
|
46
|
+
],
|
|
47
|
+
"meta_summary": {
|
|
48
|
+
"passed": 1,
|
|
49
|
+
"failed": 0,
|
|
50
|
+
"total": 1,
|
|
51
|
+
"skill_invoked": true
|
|
52
|
+
}
|
|
53
|
+
}
|