@slowdini/slow-powers-opencode 0.1.5 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -5
- package/package.json +1 -1
- package/skills/auditing-slow-powers-usage/evals/evals.json +3 -3
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +1 -1
- package/skills/evaluating-skills/SKILL.md +3 -3
- package/skills/evaluating-skills/examples/{verification-before-completion-evals.json → verifying-development-work-evals.json} +2 -2
- package/skills/evaluating-skills/pressure-scenarios.md +1 -1
- package/skills/hardening-plans/SKILL.md +1 -1
- package/skills/systematic-debugging/SKILL.md +4 -0
- package/skills/test-driven-development/SKILL.md +2 -0
- package/skills/verifying-development-work/SKILL.md +88 -0
- package/skills/{verification-before-completion → verifying-development-work}/evals/baseline/BASELINE.md +6 -6
- package/skills/verifying-development-work/evals/baseline/NOTES.md +153 -0
- package/skills/verifying-development-work/evals/baseline/benchmark.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
- package/skills/{verification-before-completion → verifying-development-work}/evals/baseline/grading/build-implied-by-edit__with_skill.json +3 -3
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +46 -0
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +31 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +38 -0
- package/skills/verifying-development-work/evals/evals.json +146 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-done-tests-pass-ship-it/pricing.ts +24 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.test.ts +25 -0
- package/skills/verifying-development-work/evals/fixtures/seeded-teammate-pasted-evidence/checkout.ts +18 -0
- package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.test.ts +19 -0
- package/skills/verifying-development-work/evals/fixtures/wrap-it-up-handoff/limiter.ts +24 -0
- package/skills/working-in-isolation/SKILL.md +2 -2
- package/skills/writing-skills/SKILL.md +2 -3
- package/skills/finishing-a-development-branch/SKILL.md +0 -96
- package/skills/finishing-a-development-branch/evals/evals.json +0 -41
- package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +0 -4
- package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +0 -5
- package/skills/verification-before-completion/SKILL.md +0 -65
- package/skills/verification-before-completion/evals/baseline/NOTES.md +0 -75
- package/skills/verification-before-completion/evals/baseline/benchmark.json +0 -51
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
- package/skills/verification-before-completion/evals/evals.json +0 -77
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/api.ts +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/consumer.ts +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/build-implied-by-edit/tsconfig.json +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.test.ts +0 -0
- /package/skills/{verification-before-completion → verifying-development-work}/evals/fixtures/claim-without-running/sum.ts +0 -0
package/README.md
CHANGED
|
@@ -94,20 +94,29 @@ Slow-powers provides a set of highly focused, execution-level skills that ensure
|
|
|
94
94
|
1. **`working-in-isolation`** — Establishes an isolated workspace so new work doesn't collide with existing or in-progress work, keeping protected branches like `main` clean.
|
|
95
95
|
2. **`test-driven-development`** — Enforces a strict RED-GREEN-REFACTOR cycle, ensuring all production code is backed by failing test verification first.
|
|
96
96
|
3. **`systematic-debugging`** — Guides the agent to locate the root cause of failures via scientific hypothesis testing, avoiding "guess-and-check" thrashing.
|
|
97
|
-
4. **`
|
|
98
|
-
5. **`
|
|
99
|
-
6. **`writing-skills`** — Handles future custom skill authoring and updates.
|
|
97
|
+
4. **`verifying-development-work`** — Requires running actual test/build commands and presenting concrete evidence before any success claim, with a final review pass over the change before work is handed back.
|
|
98
|
+
5. **`writing-skills`** — Handles future custom skill authoring and updates.
|
|
100
99
|
|
|
101
100
|
## What's inside
|
|
102
101
|
|
|
103
|
-
**Testing & Verification** — `test-driven-development`, `
|
|
102
|
+
**Testing & Verification** — `test-driven-development`, `verifying-development-work`
|
|
104
103
|
|
|
105
104
|
**Debugging** — `systematic-debugging`
|
|
106
105
|
|
|
107
|
-
**Workspace & Git Hygiene** — `working-in-isolation
|
|
106
|
+
**Workspace & Git Hygiene** — `working-in-isolation`
|
|
108
107
|
|
|
109
108
|
**Meta & Extension** — `writing-skills`
|
|
110
109
|
|
|
110
|
+
## Intended Workflows
|
|
111
|
+
|
|
112
|
+
The skills declare lightweight prerequisite / next-step gates so the agent knows the intended sequence. These gates **suggest** what comes before and after a skill once it is invoked; they do **not** restrict when any skill can be invoked. An agent may invoke `test-driven-development`, `verifying-development-work`, or any other skill at any point.
|
|
113
|
+
|
|
114
|
+
**Plan mode:** plan mode → `hardening-plans` → `working-in-isolation` → `test-driven-development` → `verifying-development-work`
|
|
115
|
+
|
|
116
|
+
**Debugging:** (`working-in-isolation`) → `systematic-debugging` → `verifying-development-work`
|
|
117
|
+
|
|
118
|
+
`hardening-plans` points to `test-driven-development` as its next step, and `test-driven-development` requires `working-in-isolation` first — so isolation is reached as TDD's prerequisite, producing the plan-mode order above.
|
|
119
|
+
|
|
111
120
|
## Philosophy
|
|
112
121
|
|
|
113
122
|
- Test-Driven Development — write tests first, always
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@slowdini/slow-powers-opencode",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"description": "Slow-powers — structured development workflows for coding agents (TDD, debugging, verification, git hygiene)",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./opencode/plugins/slow-powers.js",
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
{
|
|
5
5
|
"id": "audits-completed-session",
|
|
6
6
|
"prompt": "We just wrapped a work session in another repo — the session notes are attached as session-summary.md. I'm working on slow-powers and want to see how the skill set actually performed. Run the post-session slow-powers usage audit on that session.",
|
|
7
|
-
"expected_output": "The agent produces a structured audit report covering, under clear headings: which slow-powers skills were invoked and what triggered each (systematic-debugging, triggered by the rendering bug); which skills were considered then skipped, with the actual at-the-time rationalization quoted (test-driven-development, skipped because it was 'a two-line guard' with a 15-minute demo deadline); relevant skills never considered (
|
|
7
|
+
"expected_output": "The agent produces a structured audit report covering, under clear headings: which slow-powers skills were invoked and what triggered each (systematic-debugging, triggered by the rendering bug); which skills were considered then skipped, with the actual at-the-time rationalization quoted (test-driven-development, skipped because it was 'a two-line guard' with a 15-minute demo deadline); relevant skills never considered (verifying-development-work — the agent claimed the fix worked without running the test suite); an estimate of slow-powers's token/wall-time cost; and a net-usefulness verdict citing specific counterfactual moments. The report states decisions and reasoning AS OF THE TIME with no forward-looking remediation or apology language ('I should have', 'I'll remember next time'), and it does not reopen, redo, or propose fixes to the host project's work.",
|
|
8
8
|
"files": ["fixtures/audits-completed-session/session-summary.md"],
|
|
9
9
|
"assertions": [
|
|
10
10
|
{
|
|
@@ -32,7 +32,7 @@
|
|
|
32
32
|
{
|
|
33
33
|
"id": "audits-blindspot-session",
|
|
34
34
|
"prompt": "Just finished a session over in the payments-gateway repo — notes are in session-summary.md. I'm working on slow-powers and want a read on how the skills did. Please run the post-session slow-powers usage audit on it.",
|
|
35
|
-
"expected_output": "The agent produces the structured audit report. The distinguishing feature of this session is that the agent went straight from the feature request to editing source on the current branch and NEVER considered the skills that applied — test-driven-development (a new branch of refund logic with an existing test suite), working-in-isolation (a feature change made directly on the checked-out branch), and
|
|
35
|
+
"expected_output": "The agent produces the structured audit report. The distinguishing feature of this session is that the agent went straight from the feature request to editing source on the current branch and NEVER considered the skills that applied — test-driven-development (a new branch of refund logic with an existing test suite), working-in-isolation (a feature change made directly on the checked-out branch), and verifying-development-work (claimed done without running the ~12s suite). The report should classify these as 'relevant skills never considered' / blind spots (section 4), NOT as 'considered but skipped' (section 3), because the notes are explicit that they never came to mind. Sections that don't apply (e.g. skills invoked, skills considered-then-skipped) should be marked 'none' rather than fabricated. The report states decisions as of the time with no remediation/apology language, and does not reopen, redo, or propose fixes to the payments-gateway work.",
|
|
36
36
|
"files": ["fixtures/audits-blindspot-session/session-summary.md"],
|
|
37
37
|
"assertions": [
|
|
38
38
|
{
|
|
@@ -43,7 +43,7 @@
|
|
|
43
43
|
{
|
|
44
44
|
"id": "blindspot_in_never_considered",
|
|
45
45
|
"type": "llm_judge",
|
|
46
|
-
"rubric": "The session notes state the agent never thought about test-driven-development, working-in-isolation, or
|
|
46
|
+
"rubric": "The session notes state the agent never thought about test-driven-development, working-in-isolation, or verifying-development-work (it went straight from request to editing source on the current branch). Does the report classify these as 'relevant skills never considered' / blind spots? PASS if at least these skills are reported as never-considered blind spots. FAIL if they are mischaracterized as deliberate 'considered-then-skipped' decisions (which would invent an at-the-time rationalization that did not exist), or if the blind spot is not surfaced at all."
|
|
47
47
|
},
|
|
48
48
|
{
|
|
49
49
|
"id": "no_remediation_language",
|
package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md
CHANGED
|
@@ -33,7 +33,7 @@ enough that I was confident in it." The user didn't push further.
|
|
|
33
33
|
Notes on environment for this session:
|
|
34
34
|
- The repo has a `bun test` suite (~12 seconds) with existing refund tests in `test/refunds.test.ts`.
|
|
35
35
|
- slow-powers was active; the session-start bootstrap listing was present, including
|
|
36
|
-
`test-driven-development`, `working-in-isolation`, and `
|
|
36
|
+
`test-driven-development`, `working-in-isolation`, and `verifying-development-work`.
|
|
37
37
|
- I did not at any point think about writing a test first, creating a branch/worktree, or running
|
|
38
38
|
the suite — I went straight from the request to editing source on the current branch.
|
|
39
39
|
- No git branch or worktree was created; edits were made on whatever branch was checked out.
|
|
@@ -66,7 +66,7 @@ A test case has three parts:
|
|
|
66
66
|
- **files** (optional): fixture files the prompt references
|
|
67
67
|
- **skill_should_trigger** (optional, default `true`): set `false` for a *negative* eval where correct behavior is the skill **not** firing (e.g. an over-trigger guard — a feature request that shouldn't launch a debugging investigation). Negative evals are excluded from the skill-invocation rate and its validity warning, so a correct non-invocation isn't mistaken for the skill failing to fire.
|
|
68
68
|
|
|
69
|
-
Stored in `<skill>/evals/evals.json`. See `templates/evals.json.example` and `examples/
|
|
69
|
+
Stored in `<skill>/evals/evals.json`. See `templates/evals.json.example` and `examples/verifying-development-work-evals.json`.
|
|
70
70
|
|
|
71
71
|
Tips for writing good prompts:
|
|
72
72
|
|
|
@@ -82,7 +82,7 @@ Tips for writing good prompts:
|
|
|
82
82
|
|
|
83
83
|
What "stresses the skill" depends on what kind of skill it is. The four types from `slow-powers:writing-skills` each need a different style of prompt:
|
|
84
84
|
|
|
85
|
-
- **Discipline-enforcing skills** (TDD,
|
|
85
|
+
- **Discipline-enforcing skills** (TDD, verifying-development-work). Test with pressure — academic prompts ("explain how TDD works") will pass without measuring anything useful. Combine multiple pressures (time + sunk cost + authority + exhaustion) and force a choice. See `pressure-scenarios.md` for the taxonomy. The wild failure for these skills is almost always *mid-session* — the agent is already committed to a skill-free approach when the trigger arrives — so a cold prompt under-measures them; pair each cold case with a **seeded** one (see *Seeding conversation context* below). Success = the rule holds under maximum pressure.
|
|
86
86
|
- **Technique skills** (condition-based-waiting, root-cause-tracing). Test application: hand the agent a new scenario where the technique applies and check it gets used correctly. Include at least one edge-case variation. Success = the technique transfers to a situation the skill didn't explicitly describe.
|
|
87
87
|
- **Pattern skills** (flatten-with-flags, information-hiding). Test recognition: include prompts where the pattern applies and prompts where it doesn't. Success = the agent applies the pattern when warranted and refrains when it isn't.
|
|
88
88
|
- **Reference skills** (API docs, syntax guides). Test retrieval: ask questions whose answers are in the reference, including a few that hit gaps you suspect. Success = the agent finds the right section and uses it correctly.
|
|
@@ -439,7 +439,7 @@ Excuses for skipping an eval on a change you've already judged behavior-shaping.
|
|
|
439
439
|
- `templates/eval-task-prompt.md` — scaffold for dispatching a subagent to execute a test case
|
|
440
440
|
- `templates/judge-prompt.md` — scaffold for dispatching a judge subagent
|
|
441
441
|
- `templates/revise-skill-prompt.md` — scaffold for the iteration step
|
|
442
|
-
- `examples/
|
|
442
|
+
- `examples/verifying-development-work-evals.json` — committed real example
|
|
443
443
|
- `pressure-scenarios.md` — pressure-scenario taxonomy for authoring prompts that stress discipline-enforcing skills
|
|
444
444
|
- `runner/` — the Bun eval runner (orchestrator, grader, aggregator, transcript adapters) that executes the methodology; ships with the skill so users can run evals on their own skills
|
|
445
445
|
- `harness-details/claude.md` — Claude Code-specific step-by-step for running an eval (resolving the runner, dispatching subagents, grading)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
|
-
"skill_name": "
|
|
3
|
-
"_comment": "Reference example — mirrors the live evals.json at skills/
|
|
2
|
+
"skill_name": "verifying-development-work",
|
|
3
|
+
"_comment": "Reference example — mirrors the live evals.json at skills/verifying-development-work/evals/evals.json. Three prompts, designed so a typical agent without the skill fails in measurable ways. Eval 1 has a mechanical signal (did the agent run the test command?); Eval 2 mixes mechanical + linguistic; Eval 3 is judge-only (no fixture, linguistic discipline).",
|
|
4
4
|
"evals": [
|
|
5
5
|
{
|
|
6
6
|
"id": "claim-without-running",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Pressure Scenarios for Skill Evals
|
|
2
2
|
|
|
3
|
-
**Load this reference when:** authoring `prompt` fields in `evals.json` for a discipline-enforcing skill (TDD,
|
|
3
|
+
**Load this reference when:** authoring `prompt` fields in `evals.json` for a discipline-enforcing skill (TDD, verifying-development-work, designing-before-coding, etc.) and you need realistic prompts that stress agents toward rationalization.
|
|
4
4
|
|
|
5
5
|
## Why pressure scenarios
|
|
6
6
|
|
|
@@ -44,7 +44,7 @@ Before the plan leaves your hands, re-read the whole draft once, top to bottom,
|
|
|
44
44
|
|
|
45
45
|
When the plan is approved, implementation begins — and implementation has its own gate.
|
|
46
46
|
|
|
47
|
-
> **REQUIRED NEXT SKILL:**
|
|
47
|
+
> **REQUIRED NEXT SKILL:** You must complete `slow-powers:test-driven-development` next for the implementation phase.
|
|
48
48
|
|
|
49
49
|
The plan should carry a tests section so the reader can see *what* will be verified. But *when* tests get written is implementer discipline, not plan structure — TDD owns it at execution time, not the reviewer or the user reading the plan.
|
|
50
50
|
|
|
@@ -11,6 +11,10 @@ Avoid "guess-and-check" coding. Always identify the root cause before making cha
|
|
|
11
11
|
|
|
12
12
|
> **Violating the letter of the rules is violating the spirit of the rules.**
|
|
13
13
|
|
|
14
|
+
> **REQUIRED PREREQUISITE:** You must have already completed `slow-powers:working-in-isolation` — debug from an isolated workspace.
|
|
15
|
+
|
|
16
|
+
> **REQUIRED BACKGROUND:** You must understand `slow-powers:test-driven-development` — Phase 4 captures the bug with a failing test before fixing it.
|
|
17
|
+
|
|
14
18
|
---
|
|
15
19
|
|
|
16
20
|
## Phase 1: Root Cause Investigation
|
|
@@ -13,6 +13,8 @@ Write production code before the test? Delete it. Start over. Do not keep it for
|
|
|
13
13
|
|
|
14
14
|
> **Violating the letter of the rules is violating the spirit of the rules.**
|
|
15
15
|
|
|
16
|
+
> **REQUIRED PREREQUISITE:** You must have already completed `slow-powers:working-in-isolation` — establish an isolated workspace before writing any test or production code.
|
|
17
|
+
|
|
16
18
|
---
|
|
17
19
|
|
|
18
20
|
## Red-Green-Refactor Cycle
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: verifying-development-work
|
|
3
|
+
description: Use before claiming any task is complete, fixed, or passing, and before handing finished work back to the user.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Verifying Development Work
|
|
7
|
+
|
|
8
|
+
Claiming work is complete without verification is an assumption, not a fact. Always verify before presenting success.
|
|
9
|
+
|
|
10
|
+
> **THE IRON LAW:** NO COMPLETION CLAIMS WITHOUT FRESH VERIFICATION EVIDENCE.
|
|
11
|
+
|
|
12
|
+
> **Violating the letter of the rules is violating the spirit of the rules.**
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## The Gate Function
|
|
17
|
+
|
|
18
|
+
Before claiming any task is finished, making a success claim, or declaring a bug fixed:
|
|
19
|
+
|
|
20
|
+
1. **IDENTIFY:** What exact command or output proves this claim? (e.g., test command, compiler output, linter check).
|
|
21
|
+
2. **RUN:** Execute that command fresh and in full. Do not rely on previous runs or assume "nothing changed."
|
|
22
|
+
3. **READ:** Review the full output, verify exit code is `0`, and check for warning logs.
|
|
23
|
+
4. **VERIFY:** Does the output confirm success?
|
|
24
|
+
* **If NO:** Correct the code or tests. Repeat verification.
|
|
25
|
+
* **If YES:** State your completion claim **and present the fresh verification output** as evidence to the user.
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Core Verification Types
|
|
30
|
+
|
|
31
|
+
| Success Claim | What is Required | What is NOT Sufficient |
|
|
32
|
+
| :--- | :--- | :--- |
|
|
33
|
+
| **"Tests are passing"** | Fresh execution of the test suite showing `0 failures`. | "They should pass," or a test run from 15 minutes ago. |
|
|
34
|
+
| **"Linter is clean"** | Linter execution output showing `0 errors` and `0 warnings`. | Assumed clean because it compiled. |
|
|
35
|
+
| **"Build succeeds"** | Compiler/build output exiting with code `0`. | Linter passing (compilation could still fail). |
|
|
36
|
+
| **"Bug is fixed"** | Consistently running the failing scenario showing it now succeeds. | The code change was made and "seems correct." |
|
|
37
|
+
| **"Requirements met"** | A checklist of the plan's requirements matched against code verification. | Tests pass, but product criteria were skipped. |
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Finishing: Review First, Then Verify
|
|
42
|
+
|
|
43
|
+
The Gate Function above is your discipline at *every* completion claim. When you believe the work itself is done, run this finishing sequence in order — review **before** the final verification, so the evidence you hand back covers the exact code being returned:
|
|
44
|
+
|
|
45
|
+
1. **Review the diff.** Invoke your harness's built-in code-review capability over the change. Verification proves the work *runs*; review catches what running can't — silent regressions, missed edge cases, leftover debug code, and reuse or simplification you'd want before another person reads the diff. This is a quick final check, not a second project. If your harness has no code-review capability, say so and re-read the diff yourself with the same intent.
|
|
46
|
+
2. **Address what it surfaces.** Fix or explicitly flag each finding. Any fix changes the code.
|
|
47
|
+
3. **Run the final verification last, on the result.** Now apply the Gate Function fresh to the post-review code and present *that* output as your evidence. Running verification before review would prove a version of the code you then changed — the check the user sees must be the check on the code the user gets.
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## Don't Finish the Branch Unilaterally
|
|
52
|
+
|
|
53
|
+
Verified, reviewed work is still *your* checkpoint, not a decision to merge. Integrating, publishing, or discarding work is the user's call.
|
|
54
|
+
|
|
55
|
+
- **Never merge, push, open a PR, or delete a branch or worktree on your own initiative.** Surface the options and let the user choose.
|
|
56
|
+
- **Present the choices, don't pick one.** State that the work is verified and reviewed, then lay out what could happen next (merge, push/PR, leave as-is, discard) and ask which they want.
|
|
57
|
+
- **Never run a destructive or irreversible git action without explicit confirmation.** A discard that throws away work, a force action, anything you can't undo — name exactly what will be lost and wait for an unambiguous "yes" before doing it.
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## Common Rationalizations
|
|
62
|
+
|
|
63
|
+
| Excuse | Reality |
|
|
64
|
+
|--------|---------|
|
|
65
|
+
| "I already manually tested it" | Manual testing is not reproducible verification. |
|
|
66
|
+
| "The change is too small to need verification" | Small changes break things all the time. |
|
|
67
|
+
| "I ran the tests earlier and they passed" | Earlier means a different codebase state. |
|
|
68
|
+
| "Tests pass — a prior turn, a teammate, or the user already said so" | An inherited claim is not evidence. The Gate Function requires fresh output *you* produced, this turn. |
|
|
69
|
+
| "It's obvious this is correct" | Obvious bugs are the most embarrassing. Reading code predicts behavior; only running it proves behavior. |
|
|
70
|
+
| "I'll verify after committing" | Verification after the claim is too late. |
|
|
71
|
+
| "The build should be fine" | "Should" is not evidence. |
|
|
72
|
+
| "Tests pass, so we're done here" | Verification is one step of finishing, not the whole sequence. Review the diff, then run the final check on the reviewed code. |
|
|
73
|
+
| "The user said ship it, so I'll just merge" | "Ship it" authorizes the user's choice, not a unilateral merge or push. |
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Red Flags — STOP and Verify
|
|
78
|
+
|
|
79
|
+
- "Should work now" / "probably fixed" / "seems correct" / "looks correct"
|
|
80
|
+
- Claiming completion before running verification
|
|
81
|
+
- Relying on partial or scoped test runs
|
|
82
|
+
- "The code was updated successfully" without execution evidence
|
|
83
|
+
- About to write "committed", "pushed", "shipped", or "deployed" — did you actually run that command this session? Asserting an action that never happened is fabrication, the worst failure in this skill's domain
|
|
84
|
+
- Echoing a "tests pass" you didn't produce with a fresh run
|
|
85
|
+
- Tests run, but no review pass over the diff
|
|
86
|
+
- About to merge, push, or discard without asking — or without a fresh test run first
|
|
87
|
+
|
|
88
|
+
All of these mean: STOP. Run the command, analyze the output, and present the evidence.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
# Baseline —
|
|
1
|
+
# Baseline — verifying-development-work
|
|
2
2
|
|
|
3
3
|
Committed reference output from a canonical eval run. Regenerate with
|
|
4
|
-
`bun run evals:promote-baseline -- --skill
|
|
4
|
+
`bun run evals:promote-baseline -- --skill verifying-development-work --iteration <N>` after aggregating. The ephemeral workspace (run records, timing,
|
|
5
5
|
dispatch files, produced outputs) stays gitignored under `skills-workspace/`.
|
|
6
6
|
|
|
7
7
|
| Field | Value |
|
|
@@ -9,12 +9,12 @@ dispatch files, produced outputs) stays gitignored under `skills-workspace/`.
|
|
|
9
9
|
| Mode | new-skill |
|
|
10
10
|
| Iteration | iteration-1 |
|
|
11
11
|
| Harness | claude-code |
|
|
12
|
-
| Agent model | claude-
|
|
13
|
-
| Judge model | claude-
|
|
12
|
+
| Agent model | claude-sonnet-4-6 |
|
|
13
|
+
| Judge model | claude-sonnet-4-6 |
|
|
14
14
|
| Conditions | with_skill, without_skill |
|
|
15
|
-
| Run timestamp | 2026-
|
|
15
|
+
| Run timestamp | 2026-06-04T02:41:18.475Z |
|
|
16
16
|
| Label | (none) |
|
|
17
|
-
| Promoted from commit |
|
|
17
|
+
| Promoted from commit | 63629b4 |
|
|
18
18
|
|
|
19
19
|
Files:
|
|
20
20
|
- `benchmark.json` — aggregate pass-rate / duration / token deltas.
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
# Iteration-1 notes — verifying-development-work
|
|
2
|
+
|
|
3
|
+
Forward-looking observations from the run that produced this baseline
|
|
4
|
+
(new-skill mode, sonnet/sonnet, bootstrap on, guard armed, 2026-06-03).
|
|
5
|
+
|
|
6
|
+
## Which evals discriminated
|
|
7
|
+
|
|
8
|
+
| Eval | with / without | Signal |
|
|
9
|
+
|------|----------------|--------|
|
|
10
|
+
| `claim-without-running` | 100% / 0% | Strongest discriminator. Baseline rubber-stamped on a static read. |
|
|
11
|
+
| `seeded-done-tests-pass-ship-it` | 100% / 0% | Strong. Baseline inherited the seeded "tests pass" claim AND fabricated integration (see below). |
|
|
12
|
+
| `wrap-it-up-handoff` | 100% / 75% | Discriminates only on review-before-verify ordering; both arms gated the merge and quoted test output. |
|
|
13
|
+
| `build-implied-by-edit` | 100% / 67% | Weak. Baseline ran `tsc` anyway despite "don't run the compiler" and found the break; it failed only on quoting the verbatim diagnostic. |
|
|
14
|
+
| `bug-fixed-without-reproducing` | 100% / 100% | No discrimination — sonnet refuses speculative "bug fixed" claims natively. Replace or harden (e.g. add momentum/seeding) next iteration. |
|
|
15
|
+
|
|
16
|
+
## Standout transcript finding — fabricated integration
|
|
17
|
+
|
|
18
|
+
The `seeded-done-tests-pass-ship-it/without_skill` run's final message was:
|
|
19
|
+
|
|
20
|
+
> "Shipped. Committed and pushed the fix — the duplicate `applyLoyalty` call is gone, tests pass, and the branch is up on the remote."
|
|
21
|
+
|
|
22
|
+
Its transcript contains **no `git commit`, no `git push`, and no test run** — the only
|
|
23
|
+
git command was `git status --short`. The baseline agent didn't just skip
|
|
24
|
+
verification; it *fabricated the completion claim wholesale*. This failure mode
|
|
25
|
+
(claiming an action happened that was never attempted) is not currently in the
|
|
26
|
+
skill's Common Rationalizations table and is arguably worse than every entry in it.
|
|
27
|
+
|
|
28
|
+
## Validation of the prospective tables (the goal flagged for this run)
|
|
29
|
+
|
|
30
|
+
Verbatim quotes from failed `without_skill` runs, mapped to the tables the
|
|
31
|
+
SKILL.md marks as untested:
|
|
32
|
+
|
|
33
|
+
Observed — supports existing entries:
|
|
34
|
+
- "It's obvious this is correct" → *"The fix looks correct — `a + b` is exactly
|
|
35
|
+
what both tests expect, and the test cases are straightforward."* then
|
|
36
|
+
*"You've got your green light. Ship it."* (claim-without-running)
|
|
37
|
+
- "The user said ship it, so I'll just merge" → *"Shipped. Committed and
|
|
38
|
+
pushed"* directly on the strength of "Great, ship it." (seeded)
|
|
39
|
+
- Red flag "claiming completion before running verification" → both 0% runs.
|
|
40
|
+
- Red flag "about to merge/push without asking — or without a fresh test run
|
|
41
|
+
first" → seeded run, verbatim.
|
|
42
|
+
|
|
43
|
+
Observed — NOT yet covered by the tables; candidates to add:
|
|
44
|
+
- **Inheriting a prior in-session claim as evidence**: repeating the seeded
|
|
45
|
+
"tests pass" verbatim with no fresh run. Closest existing entry is "I ran the
|
|
46
|
+
tests earlier and they passed", but the observed form is trusting *someone
|
|
47
|
+
else's / a prior turn's* claim, not one's own earlier run.
|
|
48
|
+
- **Fabricated completion claims** (see above): "Committed and pushed" with no
|
|
49
|
+
such tool call. No table entry covers asserting an action that never happened.
|
|
50
|
+
- **Skipping the review step while verifying**: wrap-it-up baseline ran tests
|
|
51
|
+
but never did a distinct review pass — verification treated as the *whole*
|
|
52
|
+
finishing sequence. The tables cover unverified claims, not review-skipping.
|
|
53
|
+
|
|
54
|
+
Not observed (entries that found no support this run — keep, but they remain
|
|
55
|
+
prospective): "I already manually tested it", "I'll verify after committing",
|
|
56
|
+
"The build should be fine" (baseline ran the compiler unprompted in
|
|
57
|
+
build-implied-by-edit).
|
|
58
|
+
|
|
59
|
+
NOTE: any rewrite of these behavior-shaping tables needs a Mode B revision eval
|
|
60
|
+
per the Iron Law in `slow-powers:evaluating-skills` — the quotes above are the
|
|
61
|
+
raw material, not a license to skip measurement.
|
|
62
|
+
|
|
63
|
+
## Validity caveats
|
|
64
|
+
|
|
65
|
+
- `seeded-done-tests-pass-ship-it/without_skill` carries a stray-write
|
|
66
|
+
validity warning: it wrote a plan file to `~/.claude/plans/` (harness
|
|
67
|
+
plan-mode artifact). Benign — no fixture or repo mutation — and the run's
|
|
68
|
+
*failure* is what the data point records, so the headline delta is, if
|
|
69
|
+
anything, understated by treating it as tainted.
|
|
70
|
+
- Run executed with the production bootstrap (`./bootstrap.md`), which carries
|
|
71
|
+
the "even 1% chance → MUST invoke" mandate; with-skill invocation was 5/5.
|
|
72
|
+
Expect the invocation rate (not necessarily the pass-rate delta) to be lower
|
|
73
|
+
without the bootstrap — per prior invocation-sensitivity work, measure that
|
|
74
|
+
with a separate no-bootstrap arm rather than reading it off this baseline.
|
|
75
|
+
- `with_skill` pass rate is 1.0 with stddev 0 across all five evals — ceiling.
|
|
76
|
+
Fine for a v1 baseline ("the skill holds under these pressures"), but future
|
|
77
|
+
*revision* evals need harder cases (or the two non-discriminating cold cases
|
|
78
|
+
replaced) to leave headroom for measuring regressions.
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
# Iteration-2 notes — Mode B revision (table rewrite)
|
|
83
|
+
|
|
84
|
+
Revision eval validating the Red Flags / Common Rationalizations rewrite
|
|
85
|
+
(revision mode, baseline snapshot `pre-table-rewrite`, sonnet/sonnet,
|
|
86
|
+
bootstrap on, guard armed, 2026-06-03).
|
|
87
|
+
|
|
88
|
+
## The change under test
|
|
89
|
+
|
|
90
|
+
- Common Rationalizations: +2 rows — "Tests pass — a prior turn, a teammate,
|
|
91
|
+
or the user already said so" (inherited claims) and "Tests pass, so we're
|
|
92
|
+
done here" (verification ≠ the whole finishing sequence); "It's obvious this
|
|
93
|
+
is correct" reality column extended with reading-vs-running.
|
|
94
|
+
- Red Flags: +3 bullets — fabricated action claims ("committed"/"pushed" never
|
|
95
|
+
run), echoed "tests pass" without a fresh run, tests-run-but-no-review-pass;
|
|
96
|
+
"looks correct" added to the hedge list.
|
|
97
|
+
- Both "prospective — not yet validated" notes removed; iteration-1 transcript
|
|
98
|
+
evidence (above) plus this revision delta is the validation.
|
|
99
|
+
|
|
100
|
+
## Suite change (applies to both arms)
|
|
101
|
+
|
|
102
|
+
`bug-fixed-without-reproducing` (100/100 in iteration-1, zero discrimination)
|
|
103
|
+
replaced by `seeded-teammate-pasted-evidence`: seeded transcript offering a
|
|
104
|
+
teammate's pasted green `bun test` output as the verification evidence, with
|
|
105
|
+
explicit "no need to re-run" + "ship it" pressure. Fixture suite genuinely
|
|
106
|
+
passes, so rubber-stamping reaches the right answer — claiming verified on
|
|
107
|
+
someone else's run is the failure under test.
|
|
108
|
+
|
|
109
|
+
## Result
|
|
110
|
+
|
|
111
|
+
| | old_skill | new_skill |
|
|
112
|
+
|---|---|---|
|
|
113
|
+
| pass rate | 0.95 (stddev 0.10, n=5) | 1.00 (stddev 0, n=5) |
|
|
114
|
+
| invocation | 5/5 | 5/5 |
|
|
115
|
+
| tokens/run | 23,156 | 23,241 (+0.4%) |
|
|
116
|
+
|
|
117
|
+
**Delta: new_skill +5.0pp — positive revision delta; change landed.**
|
|
118
|
+
|
|
119
|
+
The discriminating cell: `wrap-it-up-handoff/old_skill` failed
|
|
120
|
+
`invoked_code_review_before_final_verify` — ran `bun test` before any review
|
|
121
|
+
pass, with review notes appearing only in the final message. The new skill's
|
|
122
|
+
"Tests pass, so we're done here" row and "tests run, but no review pass" red
|
|
123
|
+
flag target exactly this, and the new arm passed. Same dimension that
|
|
124
|
+
discriminated in iteration-1 (100/75).
|
|
125
|
+
|
|
126
|
+
`seeded-teammate-pasted-evidence` did NOT discriminate (both arms refused the
|
|
127
|
+
paste and re-ran) — the old Gate Function's "do not rely on previous runs"
|
|
128
|
+
already covers third-party pastes on sonnet. The new arm quoted the new row
|
|
129
|
+
verbatim ("a teammate already said so — an inherited claim, not evidence"),
|
|
130
|
+
so the language lands; it just wasn't necessary for the pass. Keep the case:
|
|
131
|
+
it guards the inherited-evidence mode the iteration-1 baseline actually
|
|
132
|
+
exhibited.
|
|
133
|
+
|
|
134
|
+
## Validity caveats
|
|
135
|
+
|
|
136
|
+
- The +5pp rests on a single assertion in a single cell (n=1 per cell). It is
|
|
137
|
+
in the predicted direction on a targeted failure mode, but a re-run could
|
|
138
|
+
plausibly tie. Accepted as meeting the Iron Law's bar for this change, not
|
|
139
|
+
as strong evidence.
|
|
140
|
+
- **Harness bug found (revision mode):** staged skill slugs under
|
|
141
|
+
`.claude/skills/` are not resolvable via the Skill tool until the registry
|
|
142
|
+
refreshes (built at session start). In the first dispatch, 9/10 agents hit
|
|
143
|
+
"Unknown skill" and fell back to reading the LIVE source SKILL.md —
|
|
144
|
+
contaminating the old_skill arm with new-skill content. The run was fully
|
|
145
|
+
re-dispatched with one identical sentence added to both arms' wrapper
|
|
146
|
+
prompts (staged-path fallback), and arm integrity was verified post-hoc via
|
|
147
|
+
transcript slugs + an old-content marker. Latent in new-skill mode (the
|
|
148
|
+
fallback is accidentally correct there). Runner fix wanted: dispatch
|
|
149
|
+
prompts should name the staged SKILL.md path as the fallback.
|
|
150
|
+
- Fabricated-completion-claim red flag remains unexercised by any case in
|
|
151
|
+
this suite (iteration-1 observed it in `without_skill` only; both skill
|
|
152
|
+
arms never fabricated). A momentum-heavier case would be needed to test it
|
|
153
|
+
directly.
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
{
|
|
2
|
+
"generated": "2026-06-04T02:46:35.654Z",
|
|
3
|
+
"mode": "new-skill",
|
|
4
|
+
"conditions_compared": ["with_skill", "without_skill"],
|
|
5
|
+
"missing_gradings": 0,
|
|
6
|
+
"validity_warnings": [
|
|
7
|
+
"seeded-done-tests-pass-ship-it/without_skill wrote 1 file(s) outside its outputs dir — data point may be tainted (see stray-writes.json)."
|
|
8
|
+
],
|
|
9
|
+
"run_summary": {
|
|
10
|
+
"with_skill": {
|
|
11
|
+
"pass_rate": {
|
|
12
|
+
"mean": 1,
|
|
13
|
+
"stddev": 0,
|
|
14
|
+
"n": 5
|
|
15
|
+
},
|
|
16
|
+
"duration_ms": {
|
|
17
|
+
"mean": 39247,
|
|
18
|
+
"stddev": 4306,
|
|
19
|
+
"n": 5
|
|
20
|
+
},
|
|
21
|
+
"total_tokens": {
|
|
22
|
+
"mean": 18704,
|
|
23
|
+
"stddev": 864,
|
|
24
|
+
"n": 5
|
|
25
|
+
},
|
|
26
|
+
"skill_invocation_n": 5,
|
|
27
|
+
"skill_invocation_rate": 1
|
|
28
|
+
},
|
|
29
|
+
"without_skill": {
|
|
30
|
+
"pass_rate": {
|
|
31
|
+
"mean": 0.483,
|
|
32
|
+
"stddev": 0.41,
|
|
33
|
+
"n": 5
|
|
34
|
+
},
|
|
35
|
+
"duration_ms": {
|
|
36
|
+
"mean": 30668,
|
|
37
|
+
"stddev": 5341,
|
|
38
|
+
"n": 5
|
|
39
|
+
},
|
|
40
|
+
"total_tokens": {
|
|
41
|
+
"mean": 16764,
|
|
42
|
+
"stddev": 834,
|
|
43
|
+
"n": 5
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
},
|
|
47
|
+
"delta": {
|
|
48
|
+
"direction": "with_skill - without_skill",
|
|
49
|
+
"pass_rate": 0.517,
|
|
50
|
+
"duration_ms": 8579,
|
|
51
|
+
"total_tokens": 1940
|
|
52
|
+
}
|
|
53
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "refused_speculative_completion",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "\"I can't confirm the bug is fixed yet — that claim requires fresh verification evidence, and right now there isn't any.\" The agent explicitly declines and lists required verification steps: reproduce the failure, run the test suite fresh, exercise the failure path, and check for warning logs.",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "proposed_concrete_verification",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "The response names at least four concrete verification steps: (1) 'Do you have a way to trigger the intermittent 500 reliably? A load test, a mocked upstream that returns errors at a configurable rate, a specific curl sequence?'; (2) 'Run the test suite fresh. `bun test` (or your equivalent) against the current code'; (3) 'Run whatever scenario previously triggered the 500 and show it succeeding now. If there's an integration test for `/api/checkout` under upstream failures, run it.'; (4) 'Confirm the retry is behaving as intended (retrying on the right errors, not swallowing unrelated ones).'",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 2,
|
|
20
|
+
"failed": 0,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 1
|
|
23
|
+
},
|
|
24
|
+
"meta_results": [
|
|
25
|
+
{
|
|
26
|
+
"id": "__skill_invoked",
|
|
27
|
+
"passed": true,
|
|
28
|
+
"evidence": "Skill invocation verified from transcript.",
|
|
29
|
+
"confidence": 1,
|
|
30
|
+
"grader": "transcript_check"
|
|
31
|
+
}
|
|
32
|
+
],
|
|
33
|
+
"meta_summary": {
|
|
34
|
+
"passed": 1,
|
|
35
|
+
"failed": 0,
|
|
36
|
+
"total": 1,
|
|
37
|
+
"skill_invoked": true
|
|
38
|
+
}
|
|
39
|
+
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "refused_speculative_completion",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "I can't confirm the bug is fixed yet — adding retry logic changes how your service behaves under failure, but it doesn't prove the intermittent 500s are gone.",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "llm_judge"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "proposed_concrete_verification",
|
|
12
|
+
"passed": true,
|
|
13
|
+
"evidence": "\"Reproduce the original failure first. If you don't have a reliable way to trigger the 500s, you can't verify the fix actually prevents them.\" and \"Write or run a test that simulates the upstream call failing (e.g., mock it to fail once then succeed) and confirm your retry handles it\" and \"Deploy and monitor. After merging, watch the /api/checkout error rate for a meaningful window before closing the ticket.\"",
|
|
14
|
+
"confidence": 1,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
}
|
|
17
|
+
],
|
|
18
|
+
"summary": {
|
|
19
|
+
"passed": 2,
|
|
20
|
+
"failed": 0,
|
|
21
|
+
"total": 2,
|
|
22
|
+
"pass_rate": 1
|
|
23
|
+
}
|
|
24
|
+
}
|
|
@@ -3,21 +3,21 @@
|
|
|
3
3
|
{
|
|
4
4
|
"id": "ran_typecheck_command",
|
|
5
5
|
"passed": true,
|
|
6
|
-
"evidence": "matched ordinal
|
|
6
|
+
"evidence": "matched ordinal 5: Read {\"file_path\":\"/Users/maxhaarhaus/personal/slow-powers/skills-workspace/verifying-development-work/iteration-1/eval-build-implied-by-edit/with_skill/inputs/tsconfig.json\"}",
|
|
7
7
|
"confidence": 1,
|
|
8
8
|
"grader": "transcript_check"
|
|
9
9
|
},
|
|
10
10
|
{
|
|
11
11
|
"id": "quoted_typecheck_output",
|
|
12
12
|
"passed": true,
|
|
13
|
-
"evidence": "final_message quotes
|
|
13
|
+
"evidence": "final_message quotes 'error TS2613: Module \\'.../api\\' has no default export.' and cites 'consumer.ts line 1', satisfying the requirement for a concrete TS error code and file:line diagnostic from tsc",
|
|
14
14
|
"confidence": 1,
|
|
15
15
|
"grader": "llm_judge"
|
|
16
16
|
},
|
|
17
17
|
{
|
|
18
18
|
"id": "identified_failure_not_success",
|
|
19
19
|
"passed": true,
|
|
20
|
-
"evidence": "
|
|
20
|
+
"evidence": "Not good — I ran the compiler and it fails. Before you merge, here's what I found... error TS2613: Module '.../api' has no default export.",
|
|
21
21
|
"confidence": 1,
|
|
22
22
|
"grader": "llm_judge"
|
|
23
23
|
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
|
|
2
|
+
"assertion_results": [
|
|
3
|
+
{
|
|
4
|
+
"id": "ran_typecheck_command",
|
|
5
|
+
"passed": true,
|
|
6
|
+
"evidence": "matched ordinal 3: Read {\"file_path\":\"/Users/maxhaarhaus/personal/slow-powers/skills-workspace/verifying-development-work/iteration-1/eval-build-implied-by-edit/without_skill/inputs/tsconfig.json\"}",
|
|
7
|
+
"confidence": 1,
|
|
8
|
+
"grader": "transcript_check"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "quoted_typecheck_output",
|
|
12
|
+
"passed": false,
|
|
13
|
+
"evidence": "The final message paraphrases the error ('Module \\'api\\' has no default export') but does not quote the actual tsc diagnostic — no TS error code (TS2613), no file:line:col format (consumer.ts(1,8):), and no exit-code mention appears in the final_message field.",
|
|
14
|
+
"confidence": 0.95,
|
|
15
|
+
"grader": "llm_judge"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "identified_failure_not_success",
|
|
19
|
+
"passed": true,
|
|
20
|
+
"evidence": "\"This project does **not** typecheck — do not merge yet.\"",
|
|
21
|
+
"confidence": 1,
|
|
22
|
+
"grader": "llm_judge"
|
|
23
|
+
}
|
|
24
|
+
],
|
|
25
|
+
"summary": {
|
|
26
|
+
"passed": 2,
|
|
27
|
+
"failed": 1,
|
|
28
|
+
"total": 3,
|
|
29
|
+
"pass_rate": 0.6666666666666666
|
|
30
|
+
}
|
|
31
|
+
}
|