@slowdini/slow-powers-opencode 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/README.md +34 -72
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +1 -1
  4. package/package.json +14 -17
  5. package/skills/evaluating-skills/SKILL.md +90 -338
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  16. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  17. package/skills/evaluating-skills/harness-details/claude.md +0 -194
  18. package/skills/evaluating-skills/harness-parity.md +0 -155
  19. package/skills/evaluating-skills/runner/README.md +0 -163
  20. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  21. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  22. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
  23. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
  24. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
  25. package/skills/evaluating-skills/runner/aggregate.ts +0 -269
  26. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  27. package/skills/evaluating-skills/runner/context.ts +0 -90
  28. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
  29. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
  30. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  31. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  32. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  33. package/skills/evaluating-skills/runner/grade.ts +0 -603
  34. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  35. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  36. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  37. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
  38. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  39. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  40. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  41. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  42. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
  43. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
  44. package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
  45. package/skills/evaluating-skills/runner/record-runs.ts +0 -209
  46. package/skills/evaluating-skills/runner/run.test.ts +0 -1703
  47. package/skills/evaluating-skills/runner/run.ts +0 -1388
  48. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
  49. package/skills/evaluating-skills/runner/types.ts +0 -121
  50. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  51. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  52. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  53. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  54. package/skills/evaluating-skills/runner/validate.ts +0 -21
  55. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
  56. package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
  57. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  58. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  59. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  60. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
  61. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
  62. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  63. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  64. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
@@ -1,227 +0,0 @@
1
- import { afterAll, beforeAll, describe, expect, test } from "bun:test";
2
- import { existsSync, mkdirSync, rmSync, writeFileSync } from "node:fs";
3
- import { tmpdir } from "node:os";
4
- import { join } from "node:path";
5
- import {
6
- cleanupWorkspace,
7
- PROMOTED_MARKER,
8
- SNAPSHOT_META,
9
- } from "./workspace-teardown";
10
-
11
// Scratch root for every case in this file. The pid suffix gives each test
// process its own directory under the OS temp dir.
const FIXTURE_ROOT = join(
  tmpdir(),
  `slow-powers-workspace-teardown-test-${process.pid}`,
);

// Create the scratch root once before any case runs.
beforeAll(function createFixtureRoot() {
  mkdirSync(FIXTURE_ROOT, { recursive: true });
});

// Remove the scratch root (and anything a case left behind) when the file finishes.
afterAll(function removeFixtureRoot() {
  rmSync(FIXTURE_ROOT, { recursive: true, force: true });
});
23
-
24
// Monotonic counter so every call to freshWorkspace() gets its own case dir.
let caseSeq = 0;

/**
 * Create and return a brand-new `skills-workspace` root at
 * `<FIXTURE_ROOT>/case-<n>/skills-workspace`, where `<n>` increases by one on
 * every call.
 */
function freshWorkspace(): string {
  const caseDirName = `case-${++caseSeq}`;
  const workspaceRoot = join(FIXTURE_ROOT, caseDirName, "skills-workspace");
  mkdirSync(workspaceRoot, { recursive: true });
  return workspaceRoot;
}
35
-
36
/**
 * Write `value` to `path` as 2-space-indented JSON followed by a newline,
 * creating the parent directory chain first if it does not exist.
 */
function writeJson(path: string, value: unknown) {
  const parentDir = join(path, "..");
  mkdirSync(parentDir, { recursive: true });

  const serialized = JSON.stringify(value, null, 2);
  writeFileSync(path, `${serialized}\n`);
}
40
-
41
/**
 * Create `<workspaceRoot>/<skill>/<iteration>/` and fill it with the artifacts
 * selected by `opts`, then return the iteration dir path.
 *
 * - `scaffoldingOnly` writes an empty `dispatch.json`
 * - `benchmark` writes `benchmark.json`
 * - `runRecord` writes `eval-e1/with_skill/run.json`
 * - `grading` writes `eval-e1/with_skill/grading.json`
 * - `promoted` writes the `PROMOTED_MARKER` file
 */
function makeIteration(
  workspaceRoot: string,
  skill: string,
  iteration: string,
  opts: {
    promoted?: boolean;
    benchmark?: boolean;
    runRecord?: boolean;
    grading?: boolean;
    scaffoldingOnly?: boolean;
  },
): string {
  const { promoted, benchmark, runRecord, grading, scaffoldingOnly } = opts;
  const dir = join(workspaceRoot, skill, iteration);
  // Both per-run artifacts live under the same (eval, condition) directory.
  const conditionDir = join(dir, "eval-e1", "with_skill");

  mkdirSync(dir, { recursive: true });

  if (scaffoldingOnly) {
    writeFileSync(join(dir, "dispatch.json"), "[]\n");
  }
  if (benchmark) {
    writeJson(join(dir, "benchmark.json"), { delta: { pass_rate: 0.5 } });
  }
  if (runRecord) {
    writeJson(join(conditionDir, "run.json"), { eval_id: "e1" });
  }
  if (grading) {
    writeJson(join(conditionDir, "grading.json"), {
      summary: { pass_rate: 1 },
    });
  }
  if (promoted) {
    const marker = {
      promoted_at: "2026-06-04T00:00:00.000Z",
      baseline_dir: "/somewhere/evals/baseline",
      commit: "abc1234",
    };
    writeJson(join(dir, PROMOTED_MARKER), marker);
  }

  return dir;
}
81
-
82
/**
 * Create `<workspaceRoot>/<skill>/snapshots/<label>/` containing a `SKILL.md`
 * and return its path.
 *
 * When `source` is non-null a `SNAPSHOT_META` file recording it is written as
 * well (ref snapshots also record `ref: "HEAD~1"`); `null` produces a snapshot
 * with no meta file at all.
 */
function makeSnapshot(
  workspaceRoot: string,
  skill: string,
  label: string,
  source: "ref" | "working-tree" | null,
): string {
  const dir = join(workspaceRoot, skill, "snapshots", label);
  mkdirSync(dir, { recursive: true });
  writeFileSync(join(dir, "SKILL.md"), "snapshot body\n");

  if (source === null) {
    return dir;
  }

  const meta = source === "ref" ? { source, ref: "HEAD~1" } : { source };
  writeJson(join(dir, SNAPSHOT_META), meta);
  return dir;
}
99
-
100
// Iteration handling: promoted and scaffolding-only iterations are removed,
// unpromoted iterations that hold results are kept and reported.
describe("cleanupWorkspace — iterations", () => {
  const SKILL = "mr-review";

  test("removes a promoted iteration and prunes the emptied workspace", () => {
    const workspace = freshWorkspace();
    const iterationDir = makeIteration(workspace, SKILL, "iteration-1", {
      promoted: true,
      benchmark: true,
      grading: true,
    });

    const { removedIterations, workspaceRemoved } = cleanupWorkspace(
      workspace,
      SKILL,
    );

    expect(existsSync(iterationDir)).toBe(false);
    expect(removedIterations).toEqual(["iteration-1"]);
    expect(workspaceRemoved).toBe(true);
    // With its only iteration gone, the skill dir and the root are pruned too.
    expect(existsSync(join(workspace, SKILL))).toBe(false);
    expect(existsSync(workspace)).toBe(false);
  });

  test("keeps an unpromoted iteration that holds a benchmark, and reports it", () => {
    const workspace = freshWorkspace();
    const iterationDir = makeIteration(workspace, SKILL, "iteration-1", {
      benchmark: true,
    });

    const { removedIterations, keptIterations } = cleanupWorkspace(
      workspace,
      SKILL,
    );
    const keptNames = keptIterations.map(({ iteration }) => iteration);

    expect(existsSync(iterationDir)).toBe(true);
    expect(removedIterations).toEqual([]);
    expect(keptNames).toEqual(["iteration-1"]);
    // Nothing became empty, so the workspace root is left in place.
    expect(existsSync(workspace)).toBe(true);
  });

  test("keeps an unpromoted iteration that holds only a run record", () => {
    const workspace = freshWorkspace();
    const iterationDir = makeIteration(workspace, SKILL, "iteration-1", {
      runRecord: true,
    });

    const { keptIterations } = cleanupWorkspace(workspace, SKILL);
    const keptNames = keptIterations.map(({ iteration }) => iteration);

    expect(existsSync(iterationDir)).toBe(true);
    expect(keptNames).toEqual(["iteration-1"]);
  });

  test("removes an unpromoted scaffolding-only iteration (no captured results)", () => {
    const workspace = freshWorkspace();
    const iterationDir = makeIteration(workspace, SKILL, "iteration-1", {
      scaffoldingOnly: true,
    });

    const { removedIterations } = cleanupWorkspace(workspace, SKILL);

    expect(existsSync(iterationDir)).toBe(false);
    expect(removedIterations).toEqual(["iteration-1"]);
  });

  test("mixed: promoted removed, unpromoted-with-results kept, skill dir NOT pruned", () => {
    const workspace = freshWorkspace();
    const promotedDir = makeIteration(workspace, SKILL, "iteration-1", {
      promoted: true,
      benchmark: true,
    });
    const keptDir = makeIteration(workspace, SKILL, "iteration-2", {
      benchmark: true,
    });

    const { removedIterations, keptIterations, workspaceRemoved } =
      cleanupWorkspace(workspace, SKILL);
    const keptNames = keptIterations.map(({ iteration }) => iteration);

    expect(existsSync(promotedDir)).toBe(false);
    expect(existsSync(keptDir)).toBe(true);
    expect(removedIterations).toEqual(["iteration-1"]);
    expect(keptNames).toEqual(["iteration-2"]);
    expect(workspaceRemoved).toBe(false);
    expect(existsSync(join(workspace, SKILL))).toBe(true);
  });
});
184
-
185
// Snapshot handling: only snapshots whose meta says `source: "ref"` are reclaimed.
describe("cleanupWorkspace — snapshots", () => {
  test("removes ref snapshots, keeps working-tree and legacy (no-meta) snapshots", () => {
    const workspace = freshWorkspace();
    const skill = "mr-review";
    const fromRef = makeSnapshot(workspace, skill, "old-ref", "ref");
    const fromWorkingTree = makeSnapshot(workspace, skill, "wt", "working-tree");
    const withoutMeta = makeSnapshot(workspace, skill, "legacy", null);

    const { removedSnapshots, keptSnapshots } = cleanupWorkspace(
      workspace,
      skill,
    );

    expect(existsSync(fromRef)).toBe(false);
    expect(existsSync(fromWorkingTree)).toBe(true);
    expect(existsSync(withoutMeta)).toBe(true);
    expect(removedSnapshots).toEqual(["old-ref"]);
    // Directory listing order is not guaranteed, so compare sorted labels.
    expect(keptSnapshots.sort()).toEqual(["legacy", "wt"]);
  });
});
201
-
202
// Safety properties: cleanup is scoped to one skill and tolerates a missing workspace.
describe("cleanupWorkspace — safety", () => {
  test("never touches another skill's workspace, and leaves the root intact", () => {
    const workspace = freshWorkspace();
    makeIteration(workspace, "mr-review", "iteration-1", { promoted: true });
    const neighbourDir = makeIteration(workspace, "other-skill", "iteration-1", {
      benchmark: true,
    });

    cleanupWorkspace(workspace, "mr-review");

    expect(existsSync(join(workspace, "mr-review"))).toBe(false);
    expect(existsSync(neighbourDir)).toBe(true);
    // other-skill still occupies the root, so the root must not be pruned.
    expect(existsSync(workspace)).toBe(true);
  });

  test("returns an empty summary and does not throw when the skill has no workspace", () => {
    const workspace = freshWorkspace();

    const {
      removedIterations,
      keptIterations,
      removedSnapshots,
      keptSnapshots,
      workspaceRemoved,
    } = cleanupWorkspace(workspace, "never-ran");

    expect(removedIterations).toEqual([]);
    expect(keptIterations).toEqual([]);
    expect(removedSnapshots).toEqual([]);
    expect(keptSnapshots).toEqual([]);
    expect(workspaceRemoved).toBe(false);
  });
});
@@ -1,136 +0,0 @@
1
import {
  existsSync,
  readdirSync,
  readFileSync,
  rmdirSync,
  rmSync,
} from "node:fs";
import { join } from "node:path";
3
-
4
/**
 * Name of the marker file that `promote-baseline` leaves in an iteration dir
 * after that iteration's durable results (benchmark + gradings) have been
 * committed under the skill's `evals/baseline/`. Teardown reads its presence as
 * "safe to delete", because the data now lives in version control.
 */
export const PROMOTED_MARKER = ".promoted.json";

/**
 * Name of the provenance file the `snapshot` command writes into every
 * `snapshots/<label>/` dir. It records whether the snapshot was materialized
 * from a git ref (reproducible) or copied from the working tree (not
 * reproducible); teardown reclaims only the ref-sourced kind.
 */
export const SNAPSHOT_META = ".snapshot-meta.json";

/** Report of what a workspace cleanup removed and what it left in place. */
export type WorkspaceCleanupSummary = {
  /** Names of iteration dirs that were removed (promoted, or pure scaffolding). */
  removedIterations: Array<string>;
  /** Iterations left in place because they hold uncommitted results, with the reason. */
  keptIterations: Array<{ iteration: string; reason: string }>;
  /** Labels of removed snapshots (reproducible from a git ref). */
  removedSnapshots: Array<string>;
  /** Labels of kept snapshots (working-tree or legacy; cannot be regenerated). */
  keptSnapshots: Array<string>;
  /** True when the skill's whole workspace subtree was removed. */
  workspaceRemoved: boolean;
};
31
-
32
/**
 * Remove `dir` only if it exists and is empty.
 *
 * The removal uses `rmdirSync`, which refuses to delete a non-empty directory,
 * instead of a recursive `rmSync`. That way an entry created between the
 * emptiness check and the removal can never be deleted along with `dir`.
 *
 * @param dir - Directory to prune.
 * @throws Any filesystem error other than the directory having become
 *   non-empty or having disappeared in the meantime.
 */
function pruneIfEmpty(dir: string): void {
  if (!existsSync(dir) || readdirSync(dir).length > 0) return;
  try {
    rmdirSync(dir);
  } catch (err) {
    const code = (err as { code?: string }).code;
    // Lost a race: the directory gained an entry (ENOTEMPTY, or EEXIST on some
    // platforms) or was already removed (ENOENT). Either way there is nothing
    // left for us to prune, so treat it as a no-op.
    if (code !== "ENOTEMPTY" && code !== "EEXIST" && code !== "ENOENT") {
      throw err;
    }
  }
}
38
-
39
/**
 * Decide whether an iteration dir holds "captured results" worth preserving.
 *
 * An iteration counts as having results when it contains an aggregate
 * (`benchmark.json` at its top level) or when any
 * `eval-*` / `<condition>` directory holds a `run.json` or `grading.json`.
 * Anything short of that (e.g. a `--dry-run`, or a run staged but never
 * dispatched) is treated as reproducible scaffolding.
 */
function iterationHasResults(iterDir: string): boolean {
  if (existsSync(join(iterDir, "benchmark.json"))) {
    return true;
  }

  const perRunArtifacts = ["run.json", "grading.json"];

  return readdirSync(iterDir, { withFileTypes: true })
    .filter((child) => child.isDirectory() && child.name.startsWith("eval-"))
    .some((evalEntry) => {
      const evalDir = join(iterDir, evalEntry.name);
      return readdirSync(evalDir, { withFileTypes: true })
        .filter((child) => child.isDirectory())
        .some((condition) =>
          perRunArtifacts.some((artifact) =>
            existsSync(join(evalDir, condition.name, artifact)),
          ),
        );
    });
}
59
-
60
/**
 * Read the `source` field from a snapshot dir's `SNAPSHOT_META` file.
 *
 * Returns `null` when the meta file is missing, cannot be read or parsed, or
 * carries no `source` value.
 */
function snapshotSource(snapDir: string): string | null {
  const metaPath = join(snapDir, SNAPSHOT_META);
  if (existsSync(metaPath)) {
    try {
      const { source } = JSON.parse(readFileSync(metaPath, "utf8")) as {
        source?: string;
      };
      return source ?? null;
    } catch {
      // Unreadable or malformed meta is treated the same as no meta at all.
      return null;
    }
  }
  return null;
}
72
-
73
/**
 * End-of-run cleanup of one skill's `skills-workspace/<skill>/` subtree. The aim
 * is that a finished eval leaves nothing behind that wasn't meant to be
 * committed, while never destroying results the user has not yet moved into
 * version control.
 *
 * Iterations (`iteration-*` dirs):
 * - promoted (marker present) → removed
 * - unpromoted but holding captured results → kept and reported
 * - unpromoted scaffolding → removed
 *
 * Snapshots (`snapshots/<label>/` dirs):
 * - ref-sourced → removed
 * - working-tree or legacy (no meta) → kept
 *
 * Afterwards, empty parents (`snapshots/`, the skill dir, the workspace root)
 * are pruned; a non-empty one (e.g. holding another skill's artifacts) is
 * left untouched.
 *
 * @param workspaceRoot - Path of the `skills-workspace` root.
 * @param skillName - Name of the skill whose subtree should be cleaned.
 * @returns A summary of what was removed and what was kept.
 */
export function cleanupWorkspace(
  workspaceRoot: string,
  skillName: string,
): WorkspaceCleanupSummary {
  const summary: WorkspaceCleanupSummary = {
    removedIterations: [],
    keptIterations: [],
    removedSnapshots: [],
    keptSnapshots: [],
    workspaceRemoved: false,
  };

  const skillDir = join(workspaceRoot, skillName);
  if (!existsSync(skillDir)) {
    return summary;
  }

  const discard = (target: string): void => {
    rmSync(target, { recursive: true, force: true });
  };
  const subdirNames = (parent: string): string[] =>
    readdirSync(parent, { withFileTypes: true })
      .filter((child) => child.isDirectory())
      .map((child) => child.name);

  for (const name of subdirNames(skillDir)) {
    if (!name.startsWith("iteration-")) continue;

    const iterDir = join(skillDir, name);
    // Promoted iterations and result-free scaffolding are both disposable.
    const disposable =
      existsSync(join(iterDir, PROMOTED_MARKER)) ||
      !iterationHasResults(iterDir);

    if (disposable) {
      discard(iterDir);
      summary.removedIterations.push(name);
    } else {
      summary.keptIterations.push({
        iteration: name,
        reason: "uncommitted results — not promoted to evals/baseline/",
      });
    }
  }

  const snapshotsDir = join(skillDir, "snapshots");
  if (existsSync(snapshotsDir)) {
    for (const label of subdirNames(snapshotsDir)) {
      const snapDir = join(snapshotsDir, label);
      if (snapshotSource(snapDir) !== "ref") {
        summary.keptSnapshots.push(label);
        continue;
      }
      discard(snapDir);
      summary.removedSnapshots.push(label);
    }
    pruneIfEmpty(snapshotsDir);
  }

  // Prune bottom-up: the skill dir first, then the root it lived in.
  pruneIfEmpty(skillDir);
  summary.workspaceRemoved = !existsSync(skillDir);
  pruneIfEmpty(workspaceRoot);

  return summary;
}
@@ -1,105 +0,0 @@
1
- {
2
- "$schema": "http://json-schema.org/draft-07/schema#",
3
- "$id": "https://slow-powers.dev/schemas/evals.schema.json",
4
- "title": "Skill Evaluation Definition",
5
- "description": "Defines a set of test cases for evaluating a skill. Lives at <skill>/evals/evals.json.",
6
- "type": "object",
7
- "required": ["skill_name", "evals"],
8
- "additionalProperties": false,
9
- "properties": {
10
- "skill_name": {
11
- "type": "string",
12
- "description": "Name of the skill being evaluated. Should match the skill directory name."
13
- },
14
- "evals": {
15
- "type": "array",
16
- "minItems": 1,
17
- "items": { "$ref": "#/definitions/eval" }
18
- }
19
- },
20
- "definitions": {
21
- "eval": {
22
- "type": "object",
23
- "required": ["id", "prompt", "expected_output"],
24
- "additionalProperties": false,
25
- "properties": {
26
- "id": {
27
- "type": "string",
28
- "pattern": "^[a-z0-9][a-z0-9-]*$",
29
- "description": "Stable kebab-case identifier. Used as directory name in the workspace tree."
30
- },
31
- "prompt": {
32
- "type": "string",
33
- "minLength": 1,
34
- "description": "The user-facing message the subagent receives. Should read like a realistic user request."
35
- },
36
- "expected_output": {
37
- "type": "string",
38
- "minLength": 1,
39
- "description": "Human-readable description of what a successful response looks like."
40
- },
41
- "files": {
42
- "type": "array",
43
- "items": { "type": "string" },
44
- "description": "Fixture file paths relative to the skill's evals/ directory. Copied into the subagent's input directory before dispatch."
45
- },
46
- "skill_should_trigger": {
47
- "type": "boolean",
48
- "default": true,
49
- "description": "Whether the skill-under-test is expected to fire on this eval. Defaults to true. Set false for negative evals where correct behavior is NOT invoking the skill (e.g. an over-trigger guard); such evals are excluded from the skill-invocation rate and its validity warning."
50
- },
51
- "assertions": {
52
- "type": "array",
53
- "items": { "$ref": "#/definitions/assertion" },
54
- "description": "Pass/fail criteria, added after iteration 1 when you know what outputs look like."
55
- }
56
- }
57
- },
58
- "assertion": {
59
- "oneOf": [
60
- { "$ref": "#/definitions/transcriptCheck" },
61
- { "$ref": "#/definitions/llmJudge" }
62
- ]
63
- },
64
- "transcriptCheck": {
65
- "type": "object",
66
- "required": ["id", "type", "check"],
67
- "additionalProperties": false,
68
- "properties": {
69
- "id": { "type": "string" },
70
- "type": { "const": "transcript_check" },
71
- "check": {
72
- "type": "string",
73
- "description": "Name of a transcript-check kind handled by the runner's grader (runner/grade.ts), e.g. tool_invocation_matches."
74
- },
75
- "pattern": {
76
- "type": "string",
77
- "description": "Regex (or substring) the check uses to match tool invocations."
78
- },
79
- "must_precede": {
80
- "type": "string",
81
- "enum": ["completion_claim", "any"],
82
- "description": "Where in the run the matched invocation must occur. 'completion_claim' = before the final message. 'any' = anywhere in the run."
83
- }
84
- }
85
- },
86
- "llmJudge": {
87
- "type": "object",
88
- "required": ["id", "type", "rubric"],
89
- "additionalProperties": false,
90
- "properties": {
91
- "id": { "type": "string" },
92
- "type": { "const": "llm_judge" },
93
- "rubric": {
94
- "type": "string",
95
- "minLength": 1,
96
- "description": "The question the judge model answers. Should be answerable with PASS/FAIL + evidence."
97
- },
98
- "model": {
99
- "type": "string",
100
- "description": "Optional override. Defaults to whatever the harness operator configures for judge dispatches."
101
- }
102
- }
103
- }
104
- }
105
- }
@@ -1,84 +0,0 @@
1
- {
2
- "$schema": "http://json-schema.org/draft-07/schema#",
3
- "$id": "https://slow-powers.dev/schemas/grading.schema.json",
4
- "title": "Grading Result",
5
- "description": "Output of grading one (eval, condition) pair. Lives at <workspace>/iteration-N/eval-<id>/<condition>/grading.json.",
6
- "type": "object",
7
- "required": ["assertion_results", "summary"],
8
- "additionalProperties": false,
9
- "properties": {
10
- "assertion_results": {
11
- "type": "array",
12
- "items": {
13
- "type": "object",
14
- "required": ["id", "passed", "evidence"],
15
- "additionalProperties": false,
16
- "properties": {
17
- "id": {
18
- "type": "string",
19
- "description": "Matches the assertion id in evals.json."
20
- },
21
- "passed": { "type": "boolean" },
22
- "evidence": {
23
- "type": "string",
24
- "description": "Direct quote or specific reference from the run record. Vague summaries are not evidence."
25
- },
26
- "confidence": {
27
- "type": "number",
28
- "minimum": 0,
29
- "maximum": 1,
30
- "description": "Judge confidence. Low confidence (< 0.7) flags this result for human review. Always 1.0 for transcript_check results."
31
- },
32
- "grader": {
33
- "type": "string",
34
- "enum": ["transcript_check", "llm_judge"],
35
- "description": "Which grader produced this result."
36
- }
37
- }
38
- }
39
- },
40
- "summary": {
41
- "type": "object",
42
- "required": ["passed", "failed", "total", "pass_rate"],
43
- "additionalProperties": false,
44
- "properties": {
45
- "passed": { "type": "integer", "minimum": 0 },
46
- "failed": { "type": "integer", "minimum": 0 },
47
- "total": { "type": "integer", "minimum": 0 },
48
- "pass_rate": { "type": "number", "minimum": 0, "maximum": 1 }
49
- }
50
- },
51
- "meta_results": {
52
- "type": "array",
53
- "description": "Framework-injected meta-assertions (e.g. skill-invocation check). Reserved id prefix: __ (double underscore). Tracked separately from substantive assertion_results so they do not pollute the skill effectiveness pass_rate.",
54
- "items": {
55
- "type": "object",
56
- "required": ["id", "passed", "evidence"],
57
- "additionalProperties": false,
58
- "properties": {
59
- "id": { "type": "string" },
60
- "passed": { "type": "boolean" },
61
- "evidence": { "type": "string" },
62
- "confidence": { "type": "number", "minimum": 0, "maximum": 1 },
63
- "grader": {
64
- "type": "string",
65
- "enum": ["transcript_check", "llm_judge"]
66
- }
67
- }
68
- }
69
- },
70
- "meta_summary": {
71
- "type": "object",
72
- "additionalProperties": false,
73
- "properties": {
74
- "passed": { "type": "integer", "minimum": 0 },
75
- "failed": { "type": "integer", "minimum": 0 },
76
- "total": { "type": "integer", "minimum": 0 },
77
- "skill_invoked": {
78
- "description": "True when the skill-invocation meta-check passed; false when the judge found no evidence the skill influenced behavior; null when no skill was loaded for this run.",
79
- "type": ["boolean", "null"]
80
- }
81
- }
82
- }
83
- }
84
- }
@@ -1,80 +0,0 @@
1
- {
2
- "$schema": "http://json-schema.org/draft-07/schema#",
3
- "$id": "https://slow-powers.dev/schemas/run-record.schema.json",
4
- "title": "Portable Run Record",
5
- "description": "Captures one subagent run. Harness-agnostic — each harness writes an adapter from its native transcript format to this shape. Downstream grading reads only this file.",
6
- "type": "object",
7
- "required": [
8
- "eval_id",
9
- "condition",
10
- "skill_path",
11
- "prompt",
12
- "files",
13
- "final_message",
14
- "tool_invocations"
15
- ],
16
- "additionalProperties": false,
17
- "properties": {
18
- "eval_id": {
19
- "type": "string",
20
- "description": "Matches the eval's id in evals.json."
21
- },
22
- "condition": {
23
- "type": "string",
24
- "description": "Reserved names: with_skill, without_skill, old_skill, new_skill."
25
- },
26
- "skill_path": {
27
- "type": ["string", "null"],
28
- "description": "Absolute path to the SKILL.md the subagent could load, or null if no skill was provided (without_skill condition)."
29
- },
30
- "prompt": {
31
- "type": "string",
32
- "description": "The user prompt as dispatched to the subagent."
33
- },
34
- "files": {
35
- "type": "array",
36
- "items": { "type": "string" },
37
- "description": "Fixture files the subagent had access to (absolute paths inside the run's workspace)."
38
- },
39
- "final_message": {
40
- "type": "string",
41
- "description": "The agent's final user-facing text output."
42
- },
43
- "tool_invocations": {
44
- "type": "array",
45
- "description": "Ordered list of tool calls during the run.",
46
- "items": {
47
- "type": "object",
48
- "required": ["name", "ordinal"],
49
- "additionalProperties": false,
50
- "properties": {
51
- "name": {
52
- "type": "string",
53
- "description": "Tool name as recorded by the harness (e.g. Bash, Read, run_command). Adapters should preserve original names."
54
- },
55
- "args": {
56
- "description": "Tool arguments. Object for structured tools, string for raw command-style tools.",
57
- "type": ["object", "string", "array", "null"]
58
- },
59
- "result": {
60
- "description": "Tool output, if captured. Truncate long outputs to ~2KB.",
61
- "type": ["string", "object", "null"]
62
- },
63
- "ordinal": {
64
- "type": "integer",
65
- "minimum": 0,
66
- "description": "0-indexed position in the run. Used by must_precede checks."
67
- }
68
- }
69
- }
70
- },
71
- "total_tokens": {
72
- "type": ["integer", "null"],
73
- "description": "From the harness's task completion event, or derived from the persisted transcript by record-runs (usage summed across unique message ids, including cache creation/read tokens — a different accounting than the completion event). Canonical timing lives in the sibling timing.json, whose `source` field records which origin produced it. May be null if neither source is available."
74
- },
75
- "duration_ms": {
76
- "type": ["integer", "null"],
77
- "description": "From the harness's task completion event, or derived from the persisted transcript by record-runs (wall clock between the first and last transcript timestamps). Canonical timing lives in the sibling timing.json. May be null if neither source is available."
78
- }
79
- }
80
- }