@slowdini/slow-powers-opencode 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -72
- package/bootstrap.md +1 -7
- package/opencode/plugins/slow-powers.js +1 -1
- package/package.json +14 -17
- package/skills/evaluating-skills/SKILL.md +90 -338
- package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
- package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
- package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
- package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
- package/skills/evaluating-skills/harness-details/claude.md +0 -194
- package/skills/evaluating-skills/harness-parity.md +0 -155
- package/skills/evaluating-skills/runner/README.md +0 -163
- package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
- package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
- package/skills/evaluating-skills/runner/aggregate.ts +0 -269
- package/skills/evaluating-skills/runner/context.test.ts +0 -181
- package/skills/evaluating-skills/runner/context.ts +0 -90
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
- package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
- package/skills/evaluating-skills/runner/grade.test.ts +0 -347
- package/skills/evaluating-skills/runner/grade.ts +0 -603
- package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
- package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
- package/skills/evaluating-skills/runner/guard/install.ts +0 -147
- package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
- package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
- package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
- package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
- package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
- package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
- package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
- package/skills/evaluating-skills/runner/record-runs.ts +0 -209
- package/skills/evaluating-skills/runner/run.test.ts +0 -1703
- package/skills/evaluating-skills/runner/run.ts +0 -1388
- package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
- package/skills/evaluating-skills/runner/types.ts +0 -121
- package/skills/evaluating-skills/runner/validate-all.ts +0 -54
- package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
- package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
- package/skills/evaluating-skills/runner/validate.test.ts +0 -56
- package/skills/evaluating-skills/runner/validate.ts +0 -21
- package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
- package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
- package/skills/evaluating-skills/schema/evals.schema.json +0 -105
- package/skills/evaluating-skills/schema/grading.schema.json +0 -84
- package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
- package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
- package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
- package/skills/evaluating-skills/templates/evals.json.example +0 -17
- package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
|
@@ -1,80 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
-
"$id": "https://slow-powers.dev/schemas/stray-writes.schema.json",
|
|
4
|
-
"title": "Stray-Write Report",
|
|
5
|
-
"description": "Output of evals:detect-stray-writes. Flags subagent file writes / mutating commands that landed outside a run's outputs dir. Lives at <workspace>/iteration-N/stray-writes.json.",
|
|
6
|
-
"type": "object",
|
|
7
|
-
"required": ["generated", "iteration", "totals", "runs"],
|
|
8
|
-
"additionalProperties": false,
|
|
9
|
-
"properties": {
|
|
10
|
-
"generated": { "type": "string", "description": "ISO timestamp" },
|
|
11
|
-
"iteration": { "type": "integer" },
|
|
12
|
-
"totals": {
|
|
13
|
-
"type": "object",
|
|
14
|
-
"required": ["violations", "warnings", "live_source_reads"],
|
|
15
|
-
"additionalProperties": false,
|
|
16
|
-
"properties": {
|
|
17
|
-
"violations": { "type": "integer" },
|
|
18
|
-
"warnings": { "type": "integer" },
|
|
19
|
-
"live_source_reads": { "type": "integer" }
|
|
20
|
-
}
|
|
21
|
-
},
|
|
22
|
-
"runs": {
|
|
23
|
-
"type": "array",
|
|
24
|
-
"description": "One entry per (eval, condition) run that had at least one finding.",
|
|
25
|
-
"items": {
|
|
26
|
-
"type": "object",
|
|
27
|
-
"required": [
|
|
28
|
-
"eval_id",
|
|
29
|
-
"condition",
|
|
30
|
-
"violations",
|
|
31
|
-
"warnings",
|
|
32
|
-
"live_source_reads"
|
|
33
|
-
],
|
|
34
|
-
"additionalProperties": false,
|
|
35
|
-
"properties": {
|
|
36
|
-
"eval_id": { "type": "string" },
|
|
37
|
-
"condition": { "type": "string" },
|
|
38
|
-
"violations": {
|
|
39
|
-
"type": "array",
|
|
40
|
-
"description": "High-confidence: a write tool targeted a path outside the run's outputs dir.",
|
|
41
|
-
"items": { "$ref": "#/definitions/finding" }
|
|
42
|
-
},
|
|
43
|
-
"warnings": {
|
|
44
|
-
"type": "array",
|
|
45
|
-
"description": "Heuristic: a Bash command matched a mutating pattern (install, git, sed -i, redirection) without referencing the outputs dir.",
|
|
46
|
-
"items": { "$ref": "#/definitions/finding" }
|
|
47
|
-
},
|
|
48
|
-
"live_source_reads": {
|
|
49
|
-
"type": "array",
|
|
50
|
-
"description": "A read tool or Bash command accessed the live skill-under-test directory instead of the staged copy — the arm may be contaminated (staged-slug resolution race).",
|
|
51
|
-
"items": { "$ref": "#/definitions/finding" }
|
|
52
|
-
}
|
|
53
|
-
}
|
|
54
|
-
}
|
|
55
|
-
}
|
|
56
|
-
},
|
|
57
|
-
"definitions": {
|
|
58
|
-
"finding": {
|
|
59
|
-
"type": "object",
|
|
60
|
-
"required": ["tool", "ordinal", "reason"],
|
|
61
|
-
"additionalProperties": false,
|
|
62
|
-
"properties": {
|
|
63
|
-
"tool": { "type": "string" },
|
|
64
|
-
"path": {
|
|
65
|
-
"type": "string",
|
|
66
|
-
"description": "Target path for write-tool violations."
|
|
67
|
-
},
|
|
68
|
-
"command": {
|
|
69
|
-
"type": "string",
|
|
70
|
-
"description": "Command text for Bash warnings."
|
|
71
|
-
},
|
|
72
|
-
"ordinal": {
|
|
73
|
-
"type": "integer",
|
|
74
|
-
"description": "Position of the invocation in the run's tool_invocations."
|
|
75
|
-
},
|
|
76
|
-
"reason": { "type": "string" }
|
|
77
|
-
}
|
|
78
|
-
}
|
|
79
|
-
}
|
|
80
|
-
}
|
|
@@ -1,69 +0,0 @@
|
|
|
1
|
-
# Eval task dispatch template
|
|
2
|
-
|
|
3
|
-
Use this template when dispatching a fresh general-purpose subagent to execute a single eval test case.
|
|
4
|
-
|
|
5
|
-
**The subagent MUST start with clean context.** State from previous runs invalidates the comparison.
|
|
6
|
-
|
|
7
|
-
## Variables to fill
|
|
8
|
-
|
|
9
|
-
| Variable | Source |
|
|
10
|
-
|---|---|
|
|
11
|
-
| `{{eval_id}}` | The eval's `id` from `evals.json` |
|
|
12
|
-
| `{{condition}}` | `with_skill`, `without_skill`, `old_skill`, or `new_skill` |
|
|
13
|
-
| `{{prompt}}` | The eval's `prompt`, verbatim |
|
|
14
|
-
| `{{files}}` | Fixture paths the subagent can read (or "none") |
|
|
15
|
-
| `{{output_dir}}` | The workspace directory the subagent writes to |
|
|
16
|
-
| `{{skill_path}}` | Path to SKILL.md to load — omit entirely for `without_skill` |
|
|
17
|
-
| `{{staged_skill_slug}}` | Unique slug the runner staged the skill-under-test under, if the harness supports project-local skill discovery (e.g. Claude Code) |
|
|
18
|
-
| `{{bootstrap_content}}` | Plugin bootstrap / session-start text, injected to mirror what a real user sees when their session starts (optional; runners that don't have an equivalent leave this empty) |
|
|
19
|
-
|
|
20
|
-
## Template
|
|
21
|
-
|
|
22
|
-
```
|
|
23
|
-
{{#if bootstrap_content}}
|
|
24
|
-
<session-start-context>
|
|
25
|
-
The following guidelines were loaded at session start by the plugin under evaluation
|
|
26
|
-
(equivalent to the harness's session-start hook firing in a real user's environment):
|
|
27
|
-
|
|
28
|
-
{{bootstrap_content}}
|
|
29
|
-
</session-start-context>
|
|
30
|
-
{{/if}}
|
|
31
|
-
You are executing a single test case for a skill evaluation framework.
|
|
32
|
-
Treat this as a real user request — do NOT optimize your behavior for the eval.
|
|
33
|
-
|
|
34
|
-
{{#if staged_skill_slug}}
|
|
35
|
-
The `{{skill_name}}` skill is registered under the identifier
|
|
36
|
-
"{{staged_skill_slug}}" and is discoverable via the Skill tool. If you invoke it,
|
|
37
|
-
use that identifier.
|
|
38
|
-
{{else if skill_path}}
|
|
39
|
-
The following skill is loaded into your operating guidelines. Apply it where relevant.
|
|
40
|
-
<skill name="{{skill_name}}">
|
|
41
|
-
{{skill_content}}
|
|
42
|
-
</skill>
|
|
43
|
-
{{else if bootstrap_content}}
|
|
44
|
-
{{else}}
|
|
45
|
-
No skill is loaded. Respond as you naturally would.
|
|
46
|
-
{{/if}}
|
|
47
|
-
|
|
48
|
-
Available fixture files: {{files}}
|
|
49
|
-
Output directory: {{output_dir}}
|
|
50
|
-
|
|
51
|
-
Instructions:
|
|
52
|
-
- Write any files you produce into the output directory.
|
|
53
|
-
- After completing the task, write your final user-facing response to {{output_dir}}/final-message.md.
|
|
54
|
-
- Do not write anything outside the output directory.
|
|
55
|
-
|
|
56
|
-
User request:
|
|
57
|
-
{{prompt}}
|
|
58
|
-
```
|
|
59
|
-
|
|
60
|
-
`{{staged_skill_slug}}` and `{{bootstrap_content}}` are optional — they describe a *realistic-environment* dispatch where the runner has reproduced what a fresh plugin install would look like (siblings staged, bootstrap text prepended). A simpler runner can leave them empty and the conditional blocks degrade gracefully to the legacy inline / no-skill paths.
|
|
61
|
-
|
|
62
|
-
## After the subagent completes
|
|
63
|
-
|
|
64
|
-
Two records must exist per run: `{{output_dir}}/../run.json` (matching `schema/run-record.schema.json`) and `{{output_dir}}/../timing.json`.
|
|
65
|
-
|
|
66
|
-
- **Harnesses with persisted transcripts (Claude Code):** `record-runs` assembles both records from disk after all dispatches — carry-over fields from `dispatch.json`, `final_message` from `{{output_dir}}/final-message.md`, and `tool_invocations`/tokens/duration from the transcript. The operator does not need to capture anything per task. If completion-event timing was already written to `timing.json` at dispatch time (with `"source": "completion-event"`), it takes precedence — `record-runs` only backfills missing values and never overwrites existing ones.
|
|
67
|
-
- **Transcript-less harnesses:** the operator (or the runner) captures the following manually, as before:
|
|
68
|
-
1. The full transcript / tool invocations → convert via the harness adapter into `{{output_dir}}/../run.json`.
|
|
69
|
-
2. `total_tokens` and `duration_ms` from the harness's task completion event → `{{output_dir}}/../timing.json`. **These values may not be persisted anywhere else — save them immediately.**
|
|
@@ -1,17 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"skill_name": "example-skill",
|
|
3
|
-
"evals": [
|
|
4
|
-
{
|
|
5
|
-
"id": "realistic-prompt",
|
|
6
|
-
"prompt": "A realistic user message — the kind of thing a real user would actually type. Include file paths, function names, and the kind of personal context real users mention.",
|
|
7
|
-
"expected_output": "Human-readable description of what a successful response looks like. Don't over-specify — leave room for valid variation.",
|
|
8
|
-
"files": ["fixtures/example.txt"]
|
|
9
|
-
},
|
|
10
|
-
{
|
|
11
|
-
"id": "edge-case-prompt",
|
|
12
|
-
"prompt": "A boundary condition, malformed input, or ambiguous instruction that tests where the skill's rules apply.",
|
|
13
|
-
"expected_output": "What the skill should produce given the awkward input."
|
|
14
|
-
}
|
|
15
|
-
],
|
|
16
|
-
"_notes": "After iteration 1, add an `assertions` array to each eval. Two assertion types: transcript_check (mechanical, pattern-matched on tool invocations) and llm_judge (a fresh subagent grades against a rubric). Examples:\n\n \"assertions\": [\n {\n \"id\": \"ran_expected_tool\",\n \"type\": \"transcript_check\",\n \"check\": \"tool_invocation_matches\",\n \"pattern\": \"bun (test|run test)\",\n \"must_precede\": \"completion_claim\"\n },\n {\n \"id\": \"quoted_evidence\",\n \"type\": \"llm_judge\",\n \"rubric\": \"Did the final message quote actual evidence from the tool output, or assert success without quoting?\"\n }\n ]\n"
|
|
17
|
-
}
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
# Judge prompt template
|
|
2
|
-
|
|
3
|
-
Use this template when dispatching a fresh general-purpose subagent to grade `llm_judge` assertions for one run.
|
|
4
|
-
|
|
5
|
-
**The judge subagent MUST start with clean context.** Bias from prior runs corrupts grading.
|
|
6
|
-
|
|
7
|
-
## Variables to fill
|
|
8
|
-
|
|
9
|
-
| Variable | Source |
|
|
10
|
-
|---|---|
|
|
11
|
-
| `{{run_record}}` | Contents of `run.json` (the portable run record) |
|
|
12
|
-
| `{{outputs_listing}}` | Directory listing of the subagent's `outputs/` directory |
|
|
13
|
-
| `{{assertions}}` | Array of `llm_judge` assertions from `evals.json` for this eval |
|
|
14
|
-
|
|
15
|
-
## Template
|
|
16
|
-
|
|
17
|
-
```
|
|
18
|
-
You are grading a skill evaluation run. Be strict but fair.
|
|
19
|
-
|
|
20
|
-
# Run record
|
|
21
|
-
{{run_record}}
|
|
22
|
-
|
|
23
|
-
# Outputs directory contents
|
|
24
|
-
{{outputs_listing}}
|
|
25
|
-
|
|
26
|
-
# Assertions to grade
|
|
27
|
-
{{assertions}}
|
|
28
|
-
|
|
29
|
-
# Instructions
|
|
30
|
-
|
|
31
|
-
For each assertion, produce a result object with these fields:
|
|
32
|
-
- `id`: the assertion's id (verbatim from the input)
|
|
33
|
-
- `passed`: true or false
|
|
34
|
-
- `evidence`: a direct quote or specific reference from the run record or outputs that justifies the verdict. Vague summaries are not evidence.
|
|
35
|
-
- `confidence`: 0.0 to 1.0 — how confident you are in this verdict. Low confidence flags the result for human review.
|
|
36
|
-
|
|
37
|
-
# Grading principles
|
|
38
|
-
|
|
39
|
-
- PASS requires concrete evidence. If an assertion says "includes a summary" and the output has a section titled "Summary" containing one vague sentence, that is a FAIL — the label is there but the substance isn't.
|
|
40
|
-
- A correct output expressed in different words from what the assertion implies is still a PASS, provided the substance matches.
|
|
41
|
-
- If an assertion is unverifiable from the material you have (e.g. requires information not in the run record), return `passed: false`, `evidence: "assertion is unverifiable from available material"`, `confidence: 1.0`. The operator will fix the assertion.
|
|
42
|
-
- Do not infer behavior not present in the record. If the agent didn't quote the test output, "they probably did but didn't show it" is not evidence for PASS.
|
|
43
|
-
|
|
44
|
-
# Output format
|
|
45
|
-
|
|
46
|
-
Emit a single JSON object matching `schema/grading.schema.json`:
|
|
47
|
-
|
|
48
|
-
~~~json
|
|
49
|
-
{
|
|
50
|
-
"assertion_results": [ ... ],
|
|
51
|
-
"summary": { "passed": N, "failed": N, "total": N, "pass_rate": N }
|
|
52
|
-
}
|
|
53
|
-
~~~
|
|
54
|
-
|
|
55
|
-
Do not include any text outside the JSON object.
|
|
56
|
-
```
|
|
@@ -1,56 +0,0 @@
|
|
|
1
|
-
# Skill revision prompt template
|
|
2
|
-
|
|
3
|
-
Use this template at the end of iteration N to feed eval signals into the next revision of SKILL.md.
|
|
4
|
-
|
|
5
|
-
## Variables to fill
|
|
6
|
-
|
|
7
|
-
| Variable | Source |
|
|
8
|
-
|---|---|
|
|
9
|
-
| `{{current_skill}}` | Current SKILL.md contents |
|
|
10
|
-
| `{{failed_assertions}}` | List of `(eval_id, assertion_id, evidence)` for assertions that failed in the `with_skill` / `new_skill` condition |
|
|
11
|
-
| `{{reviewer_feedback}}` | Per-eval notes from `feedback.json` (only the non-empty ones) |
|
|
12
|
-
| `{{notable_transcripts}}` | Brief excerpts from the most informative run records (focus on transcripts that revealed *why* an assertion failed) |
|
|
13
|
-
| `{{benchmark_summary}}` | Pass-rate delta and any anomalies (high stddev, time/token outliers) from `benchmark.json` |
|
|
14
|
-
|
|
15
|
-
## Template
|
|
16
|
-
|
|
17
|
-
```
|
|
18
|
-
You are improving a skill based on signals from a recent eval iteration.
|
|
19
|
-
|
|
20
|
-
# Current SKILL.md
|
|
21
|
-
{{current_skill}}
|
|
22
|
-
|
|
23
|
-
# Failed assertions
|
|
24
|
-
{{failed_assertions}}
|
|
25
|
-
|
|
26
|
-
# Reviewer feedback
|
|
27
|
-
{{reviewer_feedback}}
|
|
28
|
-
|
|
29
|
-
# Notable execution transcripts
|
|
30
|
-
{{notable_transcripts}}
|
|
31
|
-
|
|
32
|
-
# Benchmark summary
|
|
33
|
-
{{benchmark_summary}}
|
|
34
|
-
|
|
35
|
-
# Your task
|
|
36
|
-
|
|
37
|
-
Propose changes to the skill. Guidelines:
|
|
38
|
-
|
|
39
|
-
1. **Generalize from feedback.** The skill is used across many prompts, not just these test cases. Fixes should address underlying issues broadly, not patch specific failing examples.
|
|
40
|
-
|
|
41
|
-
2. **Keep the skill lean.** Fewer, better instructions outperform exhaustive rules. If transcripts show wasted work — unnecessary validation, unneeded intermediate outputs — remove those instructions. If pass rates plateau despite adding rules, try removing instructions and see if results hold or improve.
|
|
42
|
-
|
|
43
|
-
3. **Explain the why.** Reasoning-based instructions ("Do X because Y tends to cause Z") work better than rigid directives ("ALWAYS do X, NEVER do Y"). Models follow instructions more reliably when they understand the purpose.
|
|
44
|
-
|
|
45
|
-
4. **Bundle repeated work.** If multiple runs independently wrote a similar helper script (chart builder, data parser, lookup table), bundle it into the skill's `scripts/` directory and reference it from the skill.
|
|
46
|
-
|
|
47
|
-
5. **Do not just patch failing examples.** A change that fixes only the failing assertions is a regression risk if it doesn't address the underlying gap. Ask: "what is the smallest, most general rule that would have made these failures impossible?"
|
|
48
|
-
|
|
49
|
-
# Output
|
|
50
|
-
|
|
51
|
-
Either:
|
|
52
|
-
- A unified diff of proposed SKILL.md changes, OR
|
|
53
|
-
- A revised SKILL.md in full
|
|
54
|
-
|
|
55
|
-
Plus a short rationale (≤ 200 words) explaining the structural choices and which signals each change addresses.
|
|
56
|
-
```
|