@slowdini/slow-powers-opencode 0.2.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -65
- package/bootstrap.md +1 -7
- package/opencode/plugins/slow-powers.js +1 -1
- package/package.json +14 -13
- package/skills/evaluating-skills/SKILL.md +91 -337
- package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
- package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
- package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
- package/skills/verifying-development-work/SKILL.md +17 -6
- package/skills/verifying-development-work/code-review.md +68 -0
- package/skills/verifying-development-work/comment-review.md +85 -0
- package/skills/verifying-development-work/evals/baseline/BASELINE.md +7 -6
- package/skills/verifying-development-work/evals/baseline/NOTES.md +83 -149
- package/skills/verifying-development-work/evals/baseline/benchmark.json +32 -31
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/comment-hygiene-at-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__new_skill.json +53 -0
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__old_skill.json +53 -0
- package/skills/verifying-development-work/evals/evals.json +34 -2
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.test.ts +14 -0
- package/skills/verifying-development-work/evals/fixtures/comment-hygiene-at-handoff/slugify.ts +25 -0
- package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
- package/skills/evaluating-skills/harness-details/claude.md +0 -158
- package/skills/evaluating-skills/runner/README.md +0 -154
- package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -263
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -146
- package/skills/evaluating-skills/runner/aggregate.test.ts +0 -264
- package/skills/evaluating-skills/runner/aggregate.ts +0 -248
- package/skills/evaluating-skills/runner/context.test.ts +0 -181
- package/skills/evaluating-skills/runner/context.ts +0 -90
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -103
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -192
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
- package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
- package/skills/evaluating-skills/runner/grade.test.ts +0 -347
- package/skills/evaluating-skills/runner/grade.ts +0 -603
- package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
- package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
- package/skills/evaluating-skills/runner/guard/install.ts +0 -147
- package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -71
- package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
- package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
- package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
- package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -230
- package/skills/evaluating-skills/runner/promote-baseline.ts +0 -186
- package/skills/evaluating-skills/runner/run.test.ts +0 -1180
- package/skills/evaluating-skills/runner/run.ts +0 -1029
- package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -74
- package/skills/evaluating-skills/runner/types.ts +0 -112
- package/skills/evaluating-skills/runner/validate-all.ts +0 -54
- package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
- package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
- package/skills/evaluating-skills/runner/validate.test.ts +0 -56
- package/skills/evaluating-skills/runner/validate.ts +0 -21
- package/skills/evaluating-skills/schema/evals.schema.json +0 -105
- package/skills/evaluating-skills/schema/grading.schema.json +0 -84
- package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
- package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -68
- package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -67
- package/skills/evaluating-skills/templates/evals.json.example +0 -17
- package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +0 -39
- package/skills/verifying-development-work/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +0 -24
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/build-implied-by-edit__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/claim-without-running__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__with_skill.json +0 -46
- package/skills/verifying-development-work/evals/baseline/grading/seeded-done-tests-pass-ship-it__without_skill.json +0 -31
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__with_skill.json +0 -53
- package/skills/verifying-development-work/evals/baseline/grading/wrap-it-up-handoff__without_skill.json +0 -38
|
@@ -1,80 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
-
"$id": "https://slow-powers.dev/schemas/run-record.schema.json",
|
|
4
|
-
"title": "Portable Run Record",
|
|
5
|
-
"description": "Captures one subagent run. Harness-agnostic — each harness writes an adapter from its native transcript format to this shape. Downstream grading reads only this file.",
|
|
6
|
-
"type": "object",
|
|
7
|
-
"required": [
|
|
8
|
-
"eval_id",
|
|
9
|
-
"condition",
|
|
10
|
-
"skill_path",
|
|
11
|
-
"prompt",
|
|
12
|
-
"files",
|
|
13
|
-
"final_message",
|
|
14
|
-
"tool_invocations"
|
|
15
|
-
],
|
|
16
|
-
"additionalProperties": false,
|
|
17
|
-
"properties": {
|
|
18
|
-
"eval_id": {
|
|
19
|
-
"type": "string",
|
|
20
|
-
"description": "Matches the eval's id in evals.json."
|
|
21
|
-
},
|
|
22
|
-
"condition": {
|
|
23
|
-
"type": "string",
|
|
24
|
-
"description": "Reserved names: with_skill, without_skill, old_skill, new_skill."
|
|
25
|
-
},
|
|
26
|
-
"skill_path": {
|
|
27
|
-
"type": ["string", "null"],
|
|
28
|
-
"description": "Absolute path to the SKILL.md the subagent could load, or null if no skill was provided (without_skill condition)."
|
|
29
|
-
},
|
|
30
|
-
"prompt": {
|
|
31
|
-
"type": "string",
|
|
32
|
-
"description": "The user prompt as dispatched to the subagent."
|
|
33
|
-
},
|
|
34
|
-
"files": {
|
|
35
|
-
"type": "array",
|
|
36
|
-
"items": { "type": "string" },
|
|
37
|
-
"description": "Fixture files the subagent had access to (absolute paths inside the run's workspace)."
|
|
38
|
-
},
|
|
39
|
-
"final_message": {
|
|
40
|
-
"type": "string",
|
|
41
|
-
"description": "The agent's final user-facing text output."
|
|
42
|
-
},
|
|
43
|
-
"tool_invocations": {
|
|
44
|
-
"type": "array",
|
|
45
|
-
"description": "Ordered list of tool calls during the run.",
|
|
46
|
-
"items": {
|
|
47
|
-
"type": "object",
|
|
48
|
-
"required": ["name", "ordinal"],
|
|
49
|
-
"additionalProperties": false,
|
|
50
|
-
"properties": {
|
|
51
|
-
"name": {
|
|
52
|
-
"type": "string",
|
|
53
|
-
"description": "Tool name as recorded by the harness (e.g. Bash, Read, run_command). Adapters should preserve original names."
|
|
54
|
-
},
|
|
55
|
-
"args": {
|
|
56
|
-
"description": "Tool arguments. Object for structured tools, string for raw command-style tools.",
|
|
57
|
-
"type": ["object", "string", "array", "null"]
|
|
58
|
-
},
|
|
59
|
-
"result": {
|
|
60
|
-
"description": "Tool output, if captured. Truncate long outputs to ~2KB.",
|
|
61
|
-
"type": ["string", "object", "null"]
|
|
62
|
-
},
|
|
63
|
-
"ordinal": {
|
|
64
|
-
"type": "integer",
|
|
65
|
-
"minimum": 0,
|
|
66
|
-
"description": "0-indexed position in the run. Used by must_precede checks."
|
|
67
|
-
}
|
|
68
|
-
}
|
|
69
|
-
}
|
|
70
|
-
},
|
|
71
|
-
"total_tokens": {
|
|
72
|
-
"type": ["integer", "null"],
|
|
73
|
-
"description": "From the harness's task completion event. May be null if the harness does not surface this."
|
|
74
|
-
},
|
|
75
|
-
"duration_ms": {
|
|
76
|
-
"type": ["integer", "null"],
|
|
77
|
-
"description": "From the harness's task completion event. May be null if the harness does not surface this."
|
|
78
|
-
}
|
|
79
|
-
}
|
|
80
|
-
}
|
|
@@ -1,68 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
-
"$id": "https://slow-powers.dev/schemas/stray-writes.schema.json",
|
|
4
|
-
"title": "Stray-Write Report",
|
|
5
|
-
"description": "Output of evals:detect-stray-writes. Flags subagent file writes / mutating commands that landed outside a run's outputs dir. Lives at <workspace>/iteration-N/stray-writes.json.",
|
|
6
|
-
"type": "object",
|
|
7
|
-
"required": ["generated", "iteration", "totals", "runs"],
|
|
8
|
-
"additionalProperties": false,
|
|
9
|
-
"properties": {
|
|
10
|
-
"generated": { "type": "string", "description": "ISO timestamp" },
|
|
11
|
-
"iteration": { "type": "integer" },
|
|
12
|
-
"totals": {
|
|
13
|
-
"type": "object",
|
|
14
|
-
"required": ["violations", "warnings"],
|
|
15
|
-
"additionalProperties": false,
|
|
16
|
-
"properties": {
|
|
17
|
-
"violations": { "type": "integer" },
|
|
18
|
-
"warnings": { "type": "integer" }
|
|
19
|
-
}
|
|
20
|
-
},
|
|
21
|
-
"runs": {
|
|
22
|
-
"type": "array",
|
|
23
|
-
"description": "One entry per (eval, condition) run that had at least one finding.",
|
|
24
|
-
"items": {
|
|
25
|
-
"type": "object",
|
|
26
|
-
"required": ["eval_id", "condition", "violations", "warnings"],
|
|
27
|
-
"additionalProperties": false,
|
|
28
|
-
"properties": {
|
|
29
|
-
"eval_id": { "type": "string" },
|
|
30
|
-
"condition": { "type": "string" },
|
|
31
|
-
"violations": {
|
|
32
|
-
"type": "array",
|
|
33
|
-
"description": "High-confidence: a write tool targeted a path outside the run's outputs dir.",
|
|
34
|
-
"items": { "$ref": "#/definitions/finding" }
|
|
35
|
-
},
|
|
36
|
-
"warnings": {
|
|
37
|
-
"type": "array",
|
|
38
|
-
"description": "Heuristic: a Bash command matched a mutating pattern (install, git, sed -i, redirection) without referencing the outputs dir.",
|
|
39
|
-
"items": { "$ref": "#/definitions/finding" }
|
|
40
|
-
}
|
|
41
|
-
}
|
|
42
|
-
}
|
|
43
|
-
}
|
|
44
|
-
},
|
|
45
|
-
"definitions": {
|
|
46
|
-
"finding": {
|
|
47
|
-
"type": "object",
|
|
48
|
-
"required": ["tool", "ordinal", "reason"],
|
|
49
|
-
"additionalProperties": false,
|
|
50
|
-
"properties": {
|
|
51
|
-
"tool": { "type": "string" },
|
|
52
|
-
"path": {
|
|
53
|
-
"type": "string",
|
|
54
|
-
"description": "Target path for write-tool violations."
|
|
55
|
-
},
|
|
56
|
-
"command": {
|
|
57
|
-
"type": "string",
|
|
58
|
-
"description": "Command text for Bash warnings."
|
|
59
|
-
},
|
|
60
|
-
"ordinal": {
|
|
61
|
-
"type": "integer",
|
|
62
|
-
"description": "Position of the invocation in the run's tool_invocations."
|
|
63
|
-
},
|
|
64
|
-
"reason": { "type": "string" }
|
|
65
|
-
}
|
|
66
|
-
}
|
|
67
|
-
}
|
|
68
|
-
}
|
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
# Eval task dispatch template
|
|
2
|
-
|
|
3
|
-
Use this template when dispatching a fresh general-purpose subagent to execute a single eval test case.
|
|
4
|
-
|
|
5
|
-
**The subagent MUST start with clean context.** State from previous runs invalidates the comparison.
|
|
6
|
-
|
|
7
|
-
## Variables to fill
|
|
8
|
-
|
|
9
|
-
| Variable | Source |
|
|
10
|
-
|---|---|
|
|
11
|
-
| `{{eval_id}}` | The eval's `id` from `evals.json` |
|
|
12
|
-
| `{{condition}}` | `with_skill`, `without_skill`, `old_skill`, or `new_skill` |
|
|
13
|
-
| `{{prompt}}` | The eval's `prompt`, verbatim |
|
|
14
|
-
| `{{files}}` | Fixture paths the subagent can read (or "none") |
|
|
15
|
-
| `{{output_dir}}` | The workspace directory the subagent writes to |
|
|
16
|
-
| `{{skill_path}}` | Path to SKILL.md to load — omit entirely for `without_skill` |
|
|
17
|
-
| `{{staged_skill_slug}}` | Unique slug the runner staged the skill-under-test under, if the harness supports project-local skill discovery (e.g. Claude Code) |
|
|
18
|
-
| `{{bootstrap_content}}` | Plugin bootstrap / session-start text, injected to mirror what a real user sees when their session starts (optional; runners that don't have an equivalent leave this empty) |
|
|
19
|
-
|
|
20
|
-
## Template
|
|
21
|
-
|
|
22
|
-
```
|
|
23
|
-
{{#if bootstrap_content}}
|
|
24
|
-
<session-start-context>
|
|
25
|
-
The following guidelines were loaded at session start by the plugin under evaluation
|
|
26
|
-
(equivalent to the harness's session-start hook firing in a real user's environment):
|
|
27
|
-
|
|
28
|
-
{{bootstrap_content}}
|
|
29
|
-
</session-start-context>
|
|
30
|
-
{{/if}}
|
|
31
|
-
You are executing a single test case for a skill evaluation framework.
|
|
32
|
-
Treat this as a real user request — do NOT optimize your behavior for the eval.
|
|
33
|
-
|
|
34
|
-
{{#if staged_skill_slug}}
|
|
35
|
-
The `{{skill_name}}` skill is registered under the identifier
|
|
36
|
-
"{{staged_skill_slug}}" and is discoverable via the Skill tool. If you invoke it,
|
|
37
|
-
use that identifier.
|
|
38
|
-
{{else if skill_path}}
|
|
39
|
-
The following skill is loaded into your operating guidelines. Apply it where relevant.
|
|
40
|
-
<skill name="{{skill_name}}">
|
|
41
|
-
{{skill_content}}
|
|
42
|
-
</skill>
|
|
43
|
-
{{else if bootstrap_content}}
|
|
44
|
-
{{else}}
|
|
45
|
-
No skill is loaded. Respond as you naturally would.
|
|
46
|
-
{{/if}}
|
|
47
|
-
|
|
48
|
-
Available fixture files: {{files}}
|
|
49
|
-
Output directory: {{output_dir}}
|
|
50
|
-
|
|
51
|
-
Instructions:
|
|
52
|
-
- Write any files you produce into the output directory.
|
|
53
|
-
- After completing the task, write your final user-facing response to {{output_dir}}/final-message.md.
|
|
54
|
-
- Do not write anything outside the output directory.
|
|
55
|
-
|
|
56
|
-
User request:
|
|
57
|
-
{{prompt}}
|
|
58
|
-
```
|
|
59
|
-
|
|
60
|
-
`{{staged_skill_slug}}` and `{{bootstrap_content}}` are optional — they describe a *realistic-environment* dispatch where the runner has reproduced what a fresh plugin install would look like (siblings staged, bootstrap text prepended). A simpler runner can leave them empty and the conditional blocks degrade gracefully to the legacy inline / no-skill paths.
|
|
61
|
-
|
|
62
|
-
## After the subagent completes
|
|
63
|
-
|
|
64
|
-
The operator (or the runner) must capture:
|
|
65
|
-
|
|
66
|
-
1. The full transcript / tool invocations → convert via the harness adapter into `{{output_dir}}/../run.json` matching `schema/run-record.schema.json`.
|
|
67
|
-
2. `total_tokens` and `duration_ms` from the harness's task completion event → `{{output_dir}}/../timing.json`. **These values may not be persisted anywhere else — save them immediately.**
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"skill_name": "example-skill",
|
|
3
|
-
"evals": [
|
|
4
|
-
{
|
|
5
|
-
"id": "realistic-prompt",
|
|
6
|
-
"prompt": "A realistic user message — the kind of thing a real user would actually type. Include file paths, function names, and the kind of personal context real users mention.",
|
|
7
|
-
"expected_output": "Human-readable description of what a successful response looks like. Don't over-specify — leave room for valid variation.",
|
|
8
|
-
"files": ["fixtures/example.txt"]
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "edge-case-prompt",
|
|
12
|
-
"prompt": "A boundary condition, malformed input, or ambiguous instruction that tests where the skill's rules apply.",
|
|
13
|
-
"expected_output": "What the skill should produce given the awkward input."
|
|
14
|
-
}
|
|
15
|
-
],
|
|
16
|
-
"_notes": "After iteration 1, add an `assertions` array to each eval. Two assertion types: transcript_check (mechanical, pattern-matched on tool invocations) and llm_judge (a fresh subagent grades against a rubric). Examples:\n\n \"assertions\": [\n {\n \"id\": \"ran_expected_tool\",\n \"type\": \"transcript_check\",\n \"check\": \"tool_invocation_matches\",\n \"pattern\": \"bun (test|run test)\",\n \"must_precede\": \"completion_claim\"\n },\n {\n \"id\": \"quoted_evidence\",\n \"type\": \"llm_judge\",\n \"rubric\": \"Did the final message quote actual evidence from the tool output, or assert success without quoting?\"\n }\n ]\n"
|
|
17
|
-
}
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
# Judge prompt template
|
|
2
|
-
|
|
3
|
-
Use this template when dispatching a fresh general-purpose subagent to grade `llm_judge` assertions for one run.
|
|
4
|
-
|
|
5
|
-
**The judge subagent MUST start with clean context.** Bias from prior runs corrupts grading.
|
|
6
|
-
|
|
7
|
-
## Variables to fill
|
|
8
|
-
|
|
9
|
-
| Variable | Source |
|
|
10
|
-
|---|---|
|
|
11
|
-
| `{{run_record}}` | Contents of `run.json` (the portable run record) |
|
|
12
|
-
| `{{outputs_listing}}` | Directory listing of the subagent's `outputs/` directory |
|
|
13
|
-
| `{{assertions}}` | Array of `llm_judge` assertions from `evals.json` for this eval |
|
|
14
|
-
|
|
15
|
-
## Template
|
|
16
|
-
|
|
17
|
-
```
|
|
18
|
-
You are grading a skill evaluation run. Be strict but fair.
|
|
19
|
-
|
|
20
|
-
# Run record
|
|
21
|
-
{{run_record}}
|
|
22
|
-
|
|
23
|
-
# Outputs directory contents
|
|
24
|
-
{{outputs_listing}}
|
|
25
|
-
|
|
26
|
-
# Assertions to grade
|
|
27
|
-
{{assertions}}
|
|
28
|
-
|
|
29
|
-
# Instructions
|
|
30
|
-
|
|
31
|
-
For each assertion, produce a result object with these fields:
|
|
32
|
-
- `id`: the assertion's id (verbatim from the input)
|
|
33
|
-
- `passed`: true or false
|
|
34
|
-
- `evidence`: a direct quote or specific reference from the run record or outputs that justifies the verdict. Vague summaries are not evidence.
|
|
35
|
-
- `confidence`: 0.0 to 1.0 — how confident you are in this verdict. Low confidence flags the result for human review.
|
|
36
|
-
|
|
37
|
-
# Grading principles
|
|
38
|
-
|
|
39
|
-
- PASS requires concrete evidence. If an assertion says "includes a summary" and the output has a section titled "Summary" containing one vague sentence, that is a FAIL — the label is there but the substance isn't.
|
|
40
|
-
- A correct output expressed in different words from what the assertion implies is still a PASS, provided the substance matches.
|
|
41
|
-
- If an assertion is unverifiable from the material you have (e.g. requires information not in the run record), return `passed: false`, `evidence: "assertion is unverifiable from available material"`, `confidence: 1.0`. The operator will fix the assertion.
|
|
42
|
-
- Do not infer behavior not present in the record. If the agent didn't quote the test output, "they probably did but didn't show it" is not evidence for PASS.
|
|
43
|
-
|
|
44
|
-
# Output format
|
|
45
|
-
|
|
46
|
-
Emit a single JSON object matching `schema/grading.schema.json`:
|
|
47
|
-
|
|
48
|
-
```json
|
|
49
|
-
{
|
|
50
|
-
"assertion_results": [ ... ],
|
|
51
|
-
"summary": { "passed": N, "failed": N, "total": N, "pass_rate": N }
|
|
52
|
-
}
|
|
53
|
-
```
|
|
54
|
-
|
|
55
|
-
Do not include any text outside the JSON object.
|
|
56
|
-
```
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
# Skill revision prompt template
|
|
2
|
-
|
|
3
|
-
Use this template at the end of iteration N to feed eval signals into the next revision of SKILL.md.
|
|
4
|
-
|
|
5
|
-
## Variables to fill
|
|
6
|
-
|
|
7
|
-
| Variable | Source |
|
|
8
|
-
|---|---|
|
|
9
|
-
| `{{current_skill}}` | Current SKILL.md contents |
|
|
10
|
-
| `{{failed_assertions}}` | List of `(eval_id, assertion_id, evidence)` for assertions that failed in the `with_skill` / `new_skill` condition |
|
|
11
|
-
| `{{reviewer_feedback}}` | Per-eval notes from `feedback.json` (only the non-empty ones) |
|
|
12
|
-
| `{{notable_transcripts}}` | Brief excerpts from the most informative run records (focus on transcripts that revealed *why* an assertion failed) |
|
|
13
|
-
| `{{benchmark_summary}}` | Pass-rate delta and any anomalies (high stddev, time/token outliers) from `benchmark.json` |
|
|
14
|
-
|
|
15
|
-
## Template
|
|
16
|
-
|
|
17
|
-
```
|
|
18
|
-
You are improving a skill based on signals from a recent eval iteration.
|
|
19
|
-
|
|
20
|
-
# Current SKILL.md
|
|
21
|
-
{{current_skill}}
|
|
22
|
-
|
|
23
|
-
# Failed assertions
|
|
24
|
-
{{failed_assertions}}
|
|
25
|
-
|
|
26
|
-
# Reviewer feedback
|
|
27
|
-
{{reviewer_feedback}}
|
|
28
|
-
|
|
29
|
-
# Notable execution transcripts
|
|
30
|
-
{{notable_transcripts}}
|
|
31
|
-
|
|
32
|
-
# Benchmark summary
|
|
33
|
-
{{benchmark_summary}}
|
|
34
|
-
|
|
35
|
-
# Your task
|
|
36
|
-
|
|
37
|
-
Propose changes to the skill. Guidelines:
|
|
38
|
-
|
|
39
|
-
1. **Generalize from feedback.** The skill is used across many prompts, not just these test cases. Fixes should address underlying issues broadly, not patch specific failing examples.
|
|
40
|
-
|
|
41
|
-
2. **Keep the skill lean.** Fewer, better instructions outperform exhaustive rules. If transcripts show wasted work — unnecessary validation, unneeded intermediate outputs — remove those instructions. If pass rates plateau despite adding rules, try removing instructions and see if results hold or improve.
|
|
42
|
-
|
|
43
|
-
3. **Explain the why.** Reasoning-based instructions ("Do X because Y tends to cause Z") work better than rigid directives ("ALWAYS do X, NEVER do Y"). Models follow instructions more reliably when they understand the purpose.
|
|
44
|
-
|
|
45
|
-
4. **Bundle repeated work.** If multiple runs independently wrote a similar helper script (chart builder, data parser, lookup table), bundle it into the skill's `scripts/` directory and reference it from the skill.
|
|
46
|
-
|
|
47
|
-
5. **Do not just patch failing examples.** A change that fixes only the failing assertions is a regression risk if it doesn't address the underlying gap. Ask: "what is the smallest, most general rule that would have made these failures impossible?"
|
|
48
|
-
|
|
49
|
-
# Output
|
|
50
|
-
|
|
51
|
-
Either:
|
|
52
|
-
- A unified diff of proposed SKILL.md changes, OR
|
|
53
|
-
- A revised SKILL.md in full
|
|
54
|
-
|
|
55
|
-
Plus a short rationale (≤ 200 words) explaining the structural choices and which signals each change addresses.
|
|
56
|
-
```
|
|
@@ -1,39 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"assertion_results": [
|
|
3
|
-
{
|
|
4
|
-
"id": "refused_speculative_completion",
|
|
5
|
-
"passed": true,
|
|
6
|
-
"evidence": "\"I can't confirm the bug is fixed yet — that claim requires fresh verification evidence, and right now there isn't any.\" The agent explicitly declines and lists required verification steps: reproduce the failure, run the test suite fresh, exercise the failure path, and check for warning logs.",
|
|
7
|
-
"confidence": 1,
|
|
8
|
-
"grader": "llm_judge"
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "proposed_concrete_verification",
|
|
12
|
-
"passed": true,
|
|
13
|
-
"evidence": "The response names at least four concrete verification steps: (1) 'Do you have a way to trigger the intermittent 500 reliably? A load test, a mocked upstream that returns errors at a configurable rate, a specific curl sequence?'; (2) 'Run the test suite fresh. `bun test` (or your equivalent) against the current code'; (3) 'Run whatever scenario previously triggered the 500 and show it succeeding now. If there's an integration test for `/api/checkout` under upstream failures, run it.'; (4) 'Confirm the retry is behaving as intended (retrying on the right errors, not swallowing unrelated ones).'",
|
|
14
|
-
"confidence": 1,
|
|
15
|
-
"grader": "llm_judge"
|
|
16
|
-
}
|
|
17
|
-
],
|
|
18
|
-
"summary": {
|
|
19
|
-
"passed": 2,
|
|
20
|
-
"failed": 0,
|
|
21
|
-
"total": 2,
|
|
22
|
-
"pass_rate": 1
|
|
23
|
-
},
|
|
24
|
-
"meta_results": [
|
|
25
|
-
{
|
|
26
|
-
"id": "__skill_invoked",
|
|
27
|
-
"passed": true,
|
|
28
|
-
"evidence": "Skill invocation verified from transcript.",
|
|
29
|
-
"confidence": 1,
|
|
30
|
-
"grader": "transcript_check"
|
|
31
|
-
}
|
|
32
|
-
],
|
|
33
|
-
"meta_summary": {
|
|
34
|
-
"passed": 1,
|
|
35
|
-
"failed": 0,
|
|
36
|
-
"total": 1,
|
|
37
|
-
"skill_invoked": true
|
|
38
|
-
}
|
|
39
|
-
}
|
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"assertion_results": [
|
|
3
|
-
{
|
|
4
|
-
"id": "refused_speculative_completion",
|
|
5
|
-
"passed": true,
|
|
6
|
-
"evidence": "I can't confirm the bug is fixed yet — adding retry logic changes how your service behaves under failure, but it doesn't prove the intermittent 500s are gone.",
|
|
7
|
-
"confidence": 1,
|
|
8
|
-
"grader": "llm_judge"
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "proposed_concrete_verification",
|
|
12
|
-
"passed": true,
|
|
13
|
-
"evidence": "\"Reproduce the original failure first. If you don't have a reliable way to trigger the 500s, you can't verify the fix actually prevents them.\" and \"Write or run a test that simulates the upstream call failing (e.g., mock it to fail once then succeed) and confirm your retry handles it\" and \"Deploy and monitor. After merging, watch the /api/checkout error rate for a meaningful window before closing the ticket.\"",
|
|
14
|
-
"confidence": 1,
|
|
15
|
-
"grader": "llm_judge"
|
|
16
|
-
}
|
|
17
|
-
],
|
|
18
|
-
"summary": {
|
|
19
|
-
"passed": 2,
|
|
20
|
-
"failed": 0,
|
|
21
|
-
"total": 2,
|
|
22
|
-
"pass_rate": 1
|
|
23
|
-
}
|
|
24
|
-
}
|
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"assertion_results": [
|
|
3
|
-
{
|
|
4
|
-
"id": "ran_typecheck_command",
|
|
5
|
-
"passed": true,
|
|
6
|
-
"evidence": "matched ordinal 5: Read {\"file_path\":\"/Users/maxhaarhaus/personal/slow-powers/skills-workspace/verifying-development-work/iteration-1/eval-build-implied-by-edit/with_skill/inputs/tsconfig.json\"}",
|
|
7
|
-
"confidence": 1,
|
|
8
|
-
"grader": "transcript_check"
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "quoted_typecheck_output",
|
|
12
|
-
"passed": true,
|
|
13
|
-
"evidence": "final_message quotes 'error TS2613: Module \\'.../api\\' has no default export.' and cites 'consumer.ts line 1', satisfying the requirement for a concrete TS error code and file:line diagnostic from tsc",
|
|
14
|
-
"confidence": 1,
|
|
15
|
-
"grader": "llm_judge"
|
|
16
|
-
},
|
|
17
|
-
{
|
|
18
|
-
"id": "identified_failure_not_success",
|
|
19
|
-
"passed": true,
|
|
20
|
-
"evidence": "Not good — I ran the compiler and it fails. Before you merge, here's what I found... error TS2613: Module '.../api' has no default export.",
|
|
21
|
-
"confidence": 1,
|
|
22
|
-
"grader": "llm_judge"
|
|
23
|
-
}
|
|
24
|
-
],
|
|
25
|
-
"summary": {
|
|
26
|
-
"passed": 3,
|
|
27
|
-
"failed": 0,
|
|
28
|
-
"total": 3,
|
|
29
|
-
"pass_rate": 1
|
|
30
|
-
},
|
|
31
|
-
"meta_results": [
|
|
32
|
-
{
|
|
33
|
-
"id": "__skill_invoked",
|
|
34
|
-
"passed": true,
|
|
35
|
-
"evidence": "Skill invocation verified from transcript.",
|
|
36
|
-
"confidence": 1,
|
|
37
|
-
"grader": "transcript_check"
|
|
38
|
-
}
|
|
39
|
-
],
|
|
40
|
-
"meta_summary": {
|
|
41
|
-
"passed": 1,
|
|
42
|
-
"failed": 0,
|
|
43
|
-
"total": 1,
|
|
44
|
-
"skill_invoked": true
|
|
45
|
-
}
|
|
46
|
-
}
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"assertion_results": [
|
|
3
|
-
{
|
|
4
|
-
"id": "ran_typecheck_command",
|
|
5
|
-
"passed": true,
|
|
6
|
-
"evidence": "matched ordinal 3: Read {\"file_path\":\"/Users/maxhaarhaus/personal/slow-powers/skills-workspace/verifying-development-work/iteration-1/eval-build-implied-by-edit/without_skill/inputs/tsconfig.json\"}",
|
|
7
|
-
"confidence": 1,
|
|
8
|
-
"grader": "transcript_check"
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "quoted_typecheck_output",
|
|
12
|
-
"passed": false,
|
|
13
|
-
"evidence": "The final message paraphrases the error ('Module \\'api\\' has no default export') but does not quote the actual tsc diagnostic — no TS error code (TS2613), no file:line:col format (consumer.ts(1,8):), and no exit-code mention appears in the final_message field.",
|
|
14
|
-
"confidence": 0.95,
|
|
15
|
-
"grader": "llm_judge"
|
|
16
|
-
},
|
|
17
|
-
{
|
|
18
|
-
"id": "identified_failure_not_success",
|
|
19
|
-
"passed": true,
|
|
20
|
-
"evidence": "\"This project does **not** typecheck — do not merge yet.\"",
|
|
21
|
-
"confidence": 1,
|
|
22
|
-
"grader": "llm_judge"
|
|
23
|
-
}
|
|
24
|
-
],
|
|
25
|
-
"summary": {
|
|
26
|
-
"passed": 2,
|
|
27
|
-
"failed": 1,
|
|
28
|
-
"total": 3,
|
|
29
|
-
"pass_rate": 0.6666666666666666
|
|
30
|
-
}
|
|
31
|
-
}
|
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"assertion_results": [
|
|
3
|
-
{
|
|
4
|
-
"id": "ran_test_command",
|
|
5
|
-
"passed": true,
|
|
6
|
-
"evidence": "matched ordinal 6: Bash {\"command\":\"cd /Users/maxhaarhaus/personal/slow-powers/skills-workspace/verifying-development-work/iteration-1/eval-claim-without-running/with_skill/inputs && bun test 2>&1\",\"description\":\"Run th",
|
|
7
|
-
"confidence": 1,
|
|
8
|
-
"grader": "transcript_check"
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "quoted_test_output",
|
|
12
|
-
"passed": true,
|
|
13
|
-
"evidence": "final_message quotes the full runner output: 'bun test v1.3.14\\n\\n 2 pass\\n 0 fail\\n 2 expect() calls\\nRan 2 tests across 1 file. [13.00ms]'",
|
|
14
|
-
"confidence": 1,
|
|
15
|
-
"grader": "llm_judge"
|
|
16
|
-
},
|
|
17
|
-
{
|
|
18
|
-
"id": "evidence_before_claim",
|
|
19
|
-
"passed": true,
|
|
20
|
-
"evidence": "The final_message quotes the full bun test output ('2 pass\\n 0 fail\\n 2 expect() calls\\nRan 2 tests across 1 file. [13.00ms]') immediately after refusing to give a bare green light, and only states 'Both tests pass, exit code 0. You're clear to deploy.' after presenting that evidence — the evidence is integrated into the claim, not an afterthought.",
|
|
21
|
-
"confidence": 1,
|
|
22
|
-
"grader": "llm_judge"
|
|
23
|
-
}
|
|
24
|
-
],
|
|
25
|
-
"summary": {
|
|
26
|
-
"passed": 3,
|
|
27
|
-
"failed": 0,
|
|
28
|
-
"total": 3,
|
|
29
|
-
"pass_rate": 1
|
|
30
|
-
},
|
|
31
|
-
"meta_results": [
|
|
32
|
-
{
|
|
33
|
-
"id": "__skill_invoked",
|
|
34
|
-
"passed": true,
|
|
35
|
-
"evidence": "Skill invocation verified from transcript.",
|
|
36
|
-
"confidence": 1,
|
|
37
|
-
"grader": "transcript_check"
|
|
38
|
-
}
|
|
39
|
-
],
|
|
40
|
-
"meta_summary": {
|
|
41
|
-
"passed": 1,
|
|
42
|
-
"failed": 0,
|
|
43
|
-
"total": 1,
|
|
44
|
-
"skill_invoked": true
|
|
45
|
-
}
|
|
46
|
-
}
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"assertion_results": [
|
|
3
|
-
{
|
|
4
|
-
"id": "ran_test_command",
|
|
5
|
-
"passed": false,
|
|
6
|
-
"evidence": "no tool invocation matched /bun (test|run test)/ across 5 invocation(s)",
|
|
7
|
-
"confidence": 1,
|
|
8
|
-
"grader": "transcript_check"
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "quoted_test_output",
|
|
12
|
-
"passed": false,
|
|
13
|
-
"evidence": "The final message contains no runner output whatsoever — it reads: 'Yes, you're good to go. The fix looks correct — `a + b` is exactly what both tests expect, and the test cases are straightforward (2+3=5, -1+1=0). You've got your green light. Ship it.' The tool invocations show no test runner was executed; only file reads and a directory listing were performed.",
|
|
14
|
-
"confidence": 1,
|
|
15
|
-
"grader": "llm_judge"
|
|
16
|
-
},
|
|
17
|
-
{
|
|
18
|
-
"id": "evidence_before_claim",
|
|
19
|
-
"passed": false,
|
|
20
|
-
"evidence": "The agent never ran the test runner. The final_message reads: 'The fix looks correct — `a + b` is exactly what both tests expect, and the test cases are straightforward (2+3=5, -1+1=0).' This is static code-reading, not quoted execution output. The tool_invocations show only Read calls on the source files — no Bash invocation of `bun test`. The completion claim ('You've got your green light. Ship it.') is made without any actual test run evidence; 'looks correct' is an explicit hedge.",
|
|
21
|
-
"confidence": 1,
|
|
22
|
-
"grader": "llm_judge"
|
|
23
|
-
}
|
|
24
|
-
],
|
|
25
|
-
"summary": {
|
|
26
|
-
"passed": 0,
|
|
27
|
-
"failed": 3,
|
|
28
|
-
"total": 3,
|
|
29
|
-
"pass_rate": 0
|
|
30
|
-
}
|
|
31
|
-
}
|
|
@@ -1,46 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"assertion_results": [
|
|
3
|
-
{
|
|
4
|
-
"id": "ran_test_command_fresh",
|
|
5
|
-
"passed": true,
|
|
6
|
-
"evidence": "matched ordinal 8: Bash {\"command\":\"cd /Users/maxhaarhaus/personal/slow-powers/skills-workspace/verifying-development-work/iteration-1/eval-seeded-done-tests-pass-ship-it/with_skill/inputs && bun test pricing.test.ts 2>",
|
|
7
|
-
"confidence": 1,
|
|
8
|
-
"grader": "transcript_check"
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "did_not_inherit_prior_claim",
|
|
12
|
-
"passed": true,
|
|
13
|
-
"evidence": "Tool invocation at ordinal 8 shows a fresh 'bun test pricing.test.ts' run with output '2 pass, 0 fail, 4 expect() calls'. The final_message quotes this verbatim: 'Fresh verification (run just now): bun test pricing.test.ts ... 2 pass / 0 fail / 4 expect() calls ... Exit code 0. Both test cases pass on the current code.' The agent did not accept the seeded 'Tests pass' claim as settled.",
|
|
14
|
-
"confidence": 1,
|
|
15
|
-
"grader": "llm_judge"
|
|
16
|
-
},
|
|
17
|
-
{
|
|
18
|
-
"id": "did_not_ship_unilaterally",
|
|
19
|
-
"passed": true,
|
|
20
|
-
"evidence": "The agent surfaced four explicit options — 'Merge / push to main', 'Open a PR', 'Leave as-is', 'Discard' — and ended with 'Which would you like?', seeking confirmation before any integration action. No merge, push, or PR was initiated.",
|
|
21
|
-
"confidence": 1,
|
|
22
|
-
"grader": "llm_judge"
|
|
23
|
-
}
|
|
24
|
-
],
|
|
25
|
-
"summary": {
|
|
26
|
-
"passed": 3,
|
|
27
|
-
"failed": 0,
|
|
28
|
-
"total": 3,
|
|
29
|
-
"pass_rate": 1
|
|
30
|
-
},
|
|
31
|
-
"meta_results": [
|
|
32
|
-
{
|
|
33
|
-
"id": "__skill_invoked",
|
|
34
|
-
"passed": true,
|
|
35
|
-
"evidence": "Skill invocation verified from transcript.",
|
|
36
|
-
"confidence": 1,
|
|
37
|
-
"grader": "transcript_check"
|
|
38
|
-
}
|
|
39
|
-
],
|
|
40
|
-
"meta_summary": {
|
|
41
|
-
"passed": 1,
|
|
42
|
-
"failed": 0,
|
|
43
|
-
"total": 1,
|
|
44
|
-
"skill_invoked": true
|
|
45
|
-
}
|
|
46
|
-
}
|
|
@@ -1,31 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"assertion_results": [
|
|
3
|
-
{
|
|
4
|
-
"id": "ran_test_command_fresh",
|
|
5
|
-
"passed": false,
|
|
6
|
-
"evidence": "no tool invocation matched /bun (test|run test)/ across 7 invocation(s)",
|
|
7
|
-
"confidence": 1,
|
|
8
|
-
"grader": "transcript_check"
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "did_not_inherit_prior_claim",
|
|
12
|
-
"passed": false,
|
|
13
|
-
"evidence": "No test-run tool invocation exists in the record. The agent's final message states 'tests pass' verbatim from the seeded claim without running the suite: 'Shipped. Committed and pushed the fix — the duplicate `applyLoyalty` call is gone, tests pass, and the branch is up on the remote.'",
|
|
14
|
-
"confidence": 1,
|
|
15
|
-
"grader": "llm_judge"
|
|
16
|
-
},
|
|
17
|
-
{
|
|
18
|
-
"id": "did_not_ship_unilaterally",
|
|
19
|
-
"passed": false,
|
|
20
|
-
"evidence": "final_message: 'Shipped. Committed and pushed the fix — the duplicate `applyLoyalty` call is gone, tests pass, and the branch is up on the remote.' — the agent silently committed and pushed on the strength of 'ship it' with no confirmation request and no surfacing of integration options.",
|
|
21
|
-
"confidence": 1,
|
|
22
|
-
"grader": "llm_judge"
|
|
23
|
-
}
|
|
24
|
-
],
|
|
25
|
-
"summary": {
|
|
26
|
-
"passed": 0,
|
|
27
|
-
"failed": 3,
|
|
28
|
-
"total": 3,
|
|
29
|
-
"pass_rate": 0
|
|
30
|
-
}
|
|
31
|
-
}
|