@slowdini/slow-powers-opencode 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +174 -0
- package/bootstrap.md +16 -0
- package/opencode/plugins/slow-powers.js +86 -0
- package/package.json +66 -0
- package/skills/auditing-slow-powers-usage/SKILL.md +157 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/BASELINE.md +22 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +72 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/benchmark.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__with_skill.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__without_skill.json +38 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__with_skill.json +53 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__without_skill.json +38 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__with_skill.json +17 -0
- package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__without_skill.json +17 -0
- package/skills/auditing-slow-powers-usage/evals/evals.json +74 -0
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +39 -0
- package/skills/auditing-slow-powers-usage/evals/fixtures/audits-completed-session/session-summary.md +33 -0
- package/skills/evaluating-skills/SKILL.md +448 -0
- package/skills/evaluating-skills/evals/evals.json +52 -0
- package/skills/evaluating-skills/evals/fixtures/iron-law/candidate-skill.md +13 -0
- package/skills/evaluating-skills/examples/verification-before-completion-evals.json +30 -0
- package/skills/evaluating-skills/harness-details/claude.md +135 -0
- package/skills/evaluating-skills/pressure-scenarios.md +163 -0
- package/skills/evaluating-skills/runner/README.md +140 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +263 -0
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +146 -0
- package/skills/evaluating-skills/runner/aggregate.test.ts +188 -0
- package/skills/evaluating-skills/runner/aggregate.ts +228 -0
- package/skills/evaluating-skills/runner/context.test.ts +181 -0
- package/skills/evaluating-skills/runner/context.ts +90 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +103 -0
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +192 -0
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +73 -0
- package/skills/evaluating-skills/runner/fill-transcripts.ts +154 -0
- package/skills/evaluating-skills/runner/grade.test.ts +347 -0
- package/skills/evaluating-skills/runner/grade.ts +603 -0
- package/skills/evaluating-skills/runner/guard/guard.ts +49 -0
- package/skills/evaluating-skills/runner/guard/install.test.ts +92 -0
- package/skills/evaluating-skills/runner/guard/install.ts +147 -0
- package/skills/evaluating-skills/runner/guard/policy.test.ts +71 -0
- package/skills/evaluating-skills/runner/guard/policy.ts +74 -0
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +230 -0
- package/skills/evaluating-skills/runner/promote-baseline.ts +186 -0
- package/skills/evaluating-skills/runner/run.test.ts +716 -0
- package/skills/evaluating-skills/runner/run.ts +814 -0
- package/skills/evaluating-skills/runner/sandbox-policy.ts +74 -0
- package/skills/evaluating-skills/runner/types.ts +104 -0
- package/skills/evaluating-skills/runner/validate-all.ts +54 -0
- package/skills/evaluating-skills/runner/validate-schema.test.ts +99 -0
- package/skills/evaluating-skills/runner/validate-schema.ts +51 -0
- package/skills/evaluating-skills/runner/validate.test.ts +56 -0
- package/skills/evaluating-skills/runner/validate.ts +21 -0
- package/skills/evaluating-skills/schema/evals.schema.json +105 -0
- package/skills/evaluating-skills/schema/grading.schema.json +84 -0
- package/skills/evaluating-skills/schema/run-record.schema.json +80 -0
- package/skills/evaluating-skills/schema/stray-writes.schema.json +68 -0
- package/skills/evaluating-skills/templates/eval-task-prompt.md +71 -0
- package/skills/evaluating-skills/templates/evals.json.example +17 -0
- package/skills/evaluating-skills/templates/judge-prompt.md +56 -0
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +56 -0
- package/skills/finishing-a-development-branch/SKILL.md +96 -0
- package/skills/finishing-a-development-branch/evals/evals.json +41 -0
- package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +4 -0
- package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +5 -0
- package/skills/hardening-plans/SKILL.md +72 -0
- package/skills/hardening-plans/evals/baseline/BASELINE.md +22 -0
- package/skills/hardening-plans/evals/baseline/NOTES.md +58 -0
- package/skills/hardening-plans/evals/baseline/benchmark.json +54 -0
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +39 -0
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +24 -0
- package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +24 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +46 -0
- package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +46 -0
- package/skills/hardening-plans/evals/evals.json +114 -0
- package/skills/systematic-debugging/CREATION-LOG.md +119 -0
- package/skills/systematic-debugging/SKILL.md +84 -0
- package/skills/systematic-debugging/condition-based-waiting-example.ts +164 -0
- package/skills/systematic-debugging/condition-based-waiting.md +115 -0
- package/skills/systematic-debugging/defense-in-depth.md +122 -0
- package/skills/systematic-debugging/evals/baseline/BASELINE.md +22 -0
- package/skills/systematic-debugging/evals/baseline/benchmark.json +51 -0
- package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__with_skill.json +17 -0
- package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__without_skill.json +17 -0
- package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +46 -0
- package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +31 -0
- package/skills/systematic-debugging/evals/evals.json +45 -0
- package/skills/systematic-debugging/evals/fixtures/order-bug/orderHandler.ts +9 -0
- package/skills/systematic-debugging/evals/fixtures/order-bug/repro.ts +10 -0
- package/skills/systematic-debugging/find-polluter.sh +63 -0
- package/skills/systematic-debugging/root-cause-tracing.md +169 -0
- package/skills/systematic-debugging/test-academic.md +14 -0
- package/skills/systematic-debugging/test-pressure-1.md +58 -0
- package/skills/systematic-debugging/test-pressure-2.md +68 -0
- package/skills/systematic-debugging/test-pressure-3.md +69 -0
- package/skills/test-driven-development/SKILL.md +93 -0
- package/skills/test-driven-development/evals/baseline/BASELINE.md +22 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +74 -0
- package/skills/test-driven-development/evals/baseline/benchmark.json +51 -0
- package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__with_skill.json +53 -0
- package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__without_skill.json +38 -0
- package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__with_skill.json +32 -0
- package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__without_skill.json +17 -0
- package/skills/test-driven-development/evals/evals.json +77 -0
- package/skills/test-driven-development/evals/fixtures/slugify/package.json +4 -0
- package/skills/test-driven-development/evals/fixtures/slugify/utils.ts +7 -0
- package/skills/test-driven-development/testing-anti-patterns.md +299 -0
- package/skills/using-git-worktrees/SKILL.md +70 -0
- package/skills/using-git-worktrees/evals/evals.json +40 -0
- package/skills/verification-before-completion/SKILL.md +65 -0
- package/skills/verification-before-completion/evals/baseline/BASELINE.md +22 -0
- package/skills/verification-before-completion/evals/baseline/NOTES.md +75 -0
- package/skills/verification-before-completion/evals/baseline/benchmark.json +51 -0
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
- package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +46 -0
- package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
- package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
- package/skills/verification-before-completion/evals/evals.json +77 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/api.ts +1 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/consumer.ts +3 -0
- package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/tsconfig.json +23 -0
- package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.test.ts +10 -0
- package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.ts +1 -0
- package/skills/writing-skills/SKILL.md +306 -0
- package/skills/writing-skills/evals/evals.json +40 -0
- package/skills/writing-skills/graphviz-conventions.dot +172 -0
- package/skills/writing-skills/persuasion-principles.md +187 -0
- package/skills/writing-skills/scripts/render-graphs.js +181 -0
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import { isAbsolute, resolve, sep } from "node:path";
|
|
2
|
+
|
|
3
|
+
/**
 * Names of the tools that modify the filesystem and receive the path they
 * modify as one of their arguments.
 */
export const WRITE_TOOLS = new Set(
  ["Write", "Edit", "MultiEdit", "NotebookEdit"],
);
|
|
10
|
+
|
|
11
|
+
/**
 * Bash command patterns that mutate state outside an eval's sandbox. These are
 * heuristics — Bash is too flexible to parse exactly. `detect-stray-writes`
 * surfaces matches as warnings; the opt-in guard denies them. Each pattern is
 * meaningful only when the command does not reference an allowed root (see
 * `classifyBash`).
 *
 * Every entry pairs a regex (`re`) with a human-readable `reason`. None of the
 * regexes carries the `g` or `y` flag, so `re.test` keeps no state between
 * calls.
 */
export const BASH_MUTATION_PATTERNS: Array<{ re: RegExp; reason: string }> = [
  {
    re: /\b(npm|pnpm|yarn|bun)\s+(install|add|ci|i)\b/,
    reason: "package install/add",
  },
  { re: /\bpip3?\s+install\b/, reason: "pip install" },
  // Covers the short form (`-i`, `-i.bak`) and the GNU long form (`--in-place`).
  {
    re: /\bsed\s+(-i\b|--in-place\b)/,
    reason: "in-place file edit (sed -i)",
  },
  {
    re: /\bgit\s+(commit|add|push|checkout|reset|restore|merge|rebase)\b/,
    reason: "git mutation",
  },
  // `>` / `>>` preceded by start-of-string or whitespace, with or without a
  // space before the target (`> out`, `>out`, `>>log`). The lookahead skips
  // descriptor duplication such as `>&2`, which writes no file. `tee` still
  // requires trailing whitespace so words like `teeth` do not match.
  {
    re: /(^|\s)(>>?(?![&>])|tee\s)/,
    reason: "output redirection to a file",
  },
];
|
|
30
|
+
|
|
31
|
+
/**
 * Extract the target path from a write tool's argument object.
 *
 * The first of `file_path`, `notebook_path`, `path` that is neither `null` nor
 * `undefined` is selected.
 *
 * @param args Raw tool arguments of unknown shape.
 * @returns The selected value when it is a string; `undefined` when `args` is
 *   not an object or the selected value is not a string.
 */
export function pathArg(args: unknown): string | undefined {
  if (typeof args !== "object" || args === null) return undefined;

  const record = args as Record<string, unknown>;
  const candidate = record.file_path ?? record.notebook_path ?? record.path;
  if (typeof candidate !== "string") return undefined;
  return candidate;
}
|
|
38
|
+
|
|
39
|
+
/**
 * True when `target` resolves to `dir` or a descendant of it.
 *
 * The comparison is lexical: both paths are normalized with `resolve` and then
 * compared as strings.
 *
 * @param target Path to test; a relative path is resolved against `repoRoot`.
 * @param dir Candidate ancestor directory, normalized with `resolve`.
 * @param repoRoot Base directory used to resolve a relative `target`.
 * @returns Whether the resolved `target` equals the resolved `dir` or lies
 *   beneath it.
 */
export function isUnder(
  target: string,
  dir: string,
  repoRoot: string,
): boolean {
  const base = resolve(dir);
  const abs = isAbsolute(target) ? resolve(target) : resolve(repoRoot, target);
  if (abs === base) return true;

  // A filesystem root ("/" or "C:\") already ends with the separator. Appending
  // another one would build "//", which no resolved path starts with, so
  // nothing would ever count as being under the root.
  const prefix = base.endsWith(sep) ? base : base + sep;
  return abs.startsWith(prefix);
}
|
|
49
|
+
|
|
50
|
+
/**
 * Report whether `target` lies under at least one directory in `dirs`.
 *
 * @param target Path to test; forwarded unchanged to `isUnder`.
 * @param dirs Candidate ancestor directories.
 * @param repoRoot Base directory forwarded to `isUnder`.
 * @returns `true` as soon as one directory contains `target`, otherwise
 *   `false` (including when `dirs` is empty).
 */
export function isUnderAny(
  target: string,
  dirs: string[],
  repoRoot: string,
): boolean {
  for (const candidate of dirs) {
    if (isUnder(target, candidate, repoRoot)) return true;
  }
  return false;
}
|
|
58
|
+
|
|
59
|
+
/**
 * If a Bash command matches a mutation pattern and is not scoped to one of
 * `allowedRoots`, return the human-readable reason; otherwise return null.
 *
 * A command is treated as scoped when it textually contains an allowed root.
 * Empty root strings are ignored: every string contains `""`, so an empty
 * root would otherwise mark every command as scoped and disable the check.
 *
 * @param command The Bash command line to classify.
 * @param allowedRoots Path strings whose presence in `command` marks it as
 *   scoped.
 * @returns The `reason` of the first matching entry in
 *   `BASH_MUTATION_PATTERNS`, or null when the command is empty, scoped, or
 *   matches no pattern.
 */
export function classifyBash(
  command: string,
  allowedRoots: string[],
): string | null {
  if (!command) return null;

  const scoped = allowedRoots.some(
    (root) => root.length > 0 && command.includes(root),
  );
  if (scoped) return null;

  for (const { re, reason } of BASH_MUTATION_PATTERNS) {
    if (re.test(command)) return reason;
  }
  return null;
}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
/** Assertion evaluated by a transcript check rather than by a judge model. */
export type AssertionTranscriptCheck = {
  id: string;
  type: "transcript_check";
  /** Name of the transcript-check kind the grader should apply. */
  check: string;
  /** Regex (or substring) the check uses to match tool invocations. */
  pattern?: string;
  /**
   * Where in the run the matched invocation must occur: "completion_claim"
   * means before the final message, "any" means anywhere in the run.
   */
  must_precede?: "completion_claim" | "any";
};

/** Assertion evaluated by a judge model against a rubric. */
export type AssertionLLMJudge = {
  id: string;
  type: "llm_judge";
  /** The question the judge model answers. */
  rubric: string;
  /** Optional override of the judge model. */
  model?: string;
};

/** Any assertion an eval can carry; discriminated by the `type` field. */
export type Assertion = AssertionTranscriptCheck | AssertionLLMJudge;
|
|
17
|
+
|
|
18
|
+
/** One test case in a skill's `evals.json`. */
export type Eval = {
  /** Stable kebab-case identifier; must be unique within a config. */
  id: string;
  /** The user-facing message the subagent receives. */
  prompt: string;
  /** Human-readable description of what a successful response looks like. */
  expected_output: string;
  /** Fixture file paths relative to the skill's `evals/` directory. */
  files?: string[];
  /** Pass/fail criteria for this eval. */
  assertions?: Assertion[];
  /**
   * Whether the skill-under-test is expected to fire on this eval. Defaults to
   * true. Set to false for negative evals where correct behavior is NOT
   * invoking the skill (e.g. an over-trigger guard). Negative evals are
   * excluded from the skill-invocation rate and its validity warning.
   */
  skill_should_trigger?: boolean;
};

/** Top-level shape of a skill's `evals.json`. */
export type EvalsConfig = {
  /** Name of the skill being evaluated. */
  skill_name: string;
  /** The test cases; the evals schema requires at least one. */
  evals: Eval[];
};
|
|
37
|
+
|
|
38
|
+
/** One condition (arm) of an evaluation run. */
export type ConditionEntry = {
  /**
   * Condition name. NOTE(review): grading fixtures in this package use names
   * such as `with_skill` / `without_skill` — confirm the full set against the
   * runner.
   */
  name: string;
  /** NOTE(review): presumably null when no skill is loaded for this arm — confirm. */
  skill_path: string | null;
  staged_skill_slug?: string | null;
};

/** The set of conditions used for a run, plus run-level metadata. */
export type ConditionsRecord = {
  mode: "new-skill" | "revision";
  baseline?: string;
  conditions: ConditionEntry[];
  /** NOTE(review): format is not established in this file — confirm (likely ISO 8601). */
  timestamp: string;
  harness?: string;
  /** Per-run nonce; namespaces dispatch descriptions so transcripts can't
   * collide across iterations sharing one parent session's subagents dir. */
  run_nonce?: string;
};
|
|
54
|
+
|
|
55
|
+
/** A single tool call recorded for a run. */
export type ToolInvocation = {
  /** Tool name, e.g. "Bash". */
  name: string;
  args?: unknown;
  result?: unknown;
  /**
   * NOTE(review): presumably the invocation's position within the run; the
   * run-record schema tests require an integer — confirm ordering semantics.
   */
  ordinal: number;
};

/** Record of running one eval under one condition. */
export type RunRecord = {
  eval_id: string;
  condition: string;
  /** Null on the without-skill arm. */
  skill_path: string | null;
  prompt: string;
  files: string[];
  final_message: string;
  /** May be empty; the record can be written before transcripts are filled in. */
  tool_invocations: ToolInvocation[];
  total_tokens: number | null;
  duration_ms: number | null;
};
|
|
73
|
+
|
|
74
|
+
/** Outcome of grading a single assertion. */
export type AssertionResult = {
  /** Matches the assertion id in `evals.json`. */
  id: string;
  passed: boolean;
  /** Direct quote or specific reference from the run record. */
  evidence: string;
  /** Judge confidence; the grading schema bounds it to the range 0..1. */
  confidence?: number;
  /** Which grader produced this result. */
  grader?: "transcript_check" | "llm_judge";
};

/** Output of grading one (eval, condition) pair. */
export type GradingResult = {
  assertion_results: AssertionResult[];
  /**
   * Framework-injected meta-assertions (e.g. the skill-invocation check),
   * tracked separately so they do not affect `summary.pass_rate`.
   */
  meta_results?: AssertionResult[];
  summary: {
    passed: number;
    failed: number;
    total: number;
    pass_rate: number;
  };
  meta_summary?: {
    passed: number;
    failed: number;
    total: number;
    /**
     * True when the skill-invocation meta-check passed; false when the judge
     * found no evidence the skill influenced behavior; null when no skill was
     * loaded for this run.
     */
    skill_invoked: boolean | null;
  };
};

/**
 * NOTE(review): presumably the id of the skill-invocation meta-assertion in
 * `meta_results`; the `__` prefix is reserved for framework-injected ids —
 * confirm usage in the grader.
 */
export const SKILL_INVOKED_META_ID = "__skill_invoked";

/** Optional token and duration figures; both fields may be absent or null. */
export type TimingRecord = {
  total_tokens?: number | null;
  duration_ms?: number | null;
};
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
|
|
3
|
+
import { join, resolve } from "node:path";
|
|
4
|
+
import { validateEvalsConfig } from "./validate";
|
|
5
|
+
|
|
6
|
+
/**
 * Read the value of a `--name` command-line flag.
 *
 * Both `--name value` and `--name=value` are accepted; the first occurrence in
 * `argv` wins.
 *
 * @param argv Argument list to scan (without the runtime and script entries).
 * @param name Flag name without the leading dashes.
 * @returns The flag's value, or `undefined` when the flag is absent, is the
 *   last argument, or is directly followed by another `--flag` (which is not
 *   consumed as the value).
 */
function flag(argv: string[], name: string): string | undefined {
  const key = `--${name}`;
  const inlinePrefix = `${key}=`;

  for (const [i, arg] of argv.entries()) {
    if (arg === key) {
      const next = argv[i + 1];
      // A following flag means this one was given without a value.
      return next === undefined || next.startsWith("--") ? undefined : next;
    }
    if (arg.startsWith(inlinePrefix)) return arg.slice(inlinePrefix.length);
  }
  return undefined;
}
|
|
11
|
+
|
|
12
|
+
// Entry point: validate every `<skill>/evals/evals.json` found one level below
// the directory given by `--skill-dir`, print a per-file result, and exit with
// status 1 when the flag is missing, the directory does not exist, or any file
// fails validation.
const skillDirRaw = flag(Bun.argv.slice(2), "skill-dir");
if (!skillDirRaw) {
  console.error("missing required flag --skill-dir <path>");
  process.exit(1);
}
const SKILLS_DIR = resolve(skillDirRaw);

if (!existsSync(SKILLS_DIR)) {
  console.error(`skills dir not found: ${SKILLS_DIR}`);
  process.exit(1);
}

// Only directories can hold a skill. `statSync` throws for entries it cannot
// stat (e.g. a dangling symlink or an entry removed mid-scan); such entries are
// skipped instead of aborting the whole run.
const skills = readdirSync(SKILLS_DIR).filter((d) => {
  try {
    return statSync(join(SKILLS_DIR, d)).isDirectory();
  } catch {
    return false;
  }
});

let validated = 0;
let failed = 0;
const errors: string[] = [];

for (const skill of skills) {
  const evalsPath = join(SKILLS_DIR, skill, "evals", "evals.json");
  // Skills without an evals file are not an error; they are simply skipped.
  if (!existsSync(evalsPath)) continue;

  try {
    const raw = JSON.parse(readFileSync(evalsPath, "utf8"));
    validateEvalsConfig(raw, evalsPath);
    console.log(`✓ ${skill}/evals/evals.json`);
    validated++;
  } catch (err) {
    // Anything can be thrown; only Error instances carry a `message`.
    const message = err instanceof Error ? err.message : String(err);
    console.error(`✗ ${skill}/evals/evals.json: ${message}`);
    errors.push(`${skill}: ${message}`);
    failed++;
  }
}

console.log(`\nValidated ${validated} evals.json file(s); ${failed} failed.`);
if (failed > 0) {
  console.error("\nFailures:");
  for (const e of errors) console.error(` - ${e}`);
  process.exit(1);
}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import { describe, expect, test } from "bun:test";
|
|
2
|
+
import { validateAgainstSchema } from "./validate-schema";
|
|
3
|
+
|
|
4
|
+
// Baseline run record that every test below treats as schema-valid; individual
// tests spread it and override or remove a single field.
const validRunRecord = {
  eval_id: "e1",
  condition: "with_skill",
  skill_path: null,
  prompt: "do the thing",
  files: [],
  final_message: "done",
  tool_invocations: [],
  total_tokens: 100,
  duration_ms: 1000,
};
|
|
15
|
+
|
|
16
|
+
// Exercises `validateAgainstSchema` against the run-record and grading schemas.
describe("validateAgainstSchema", () => {
  test("returns the data when it matches the run-record schema", () => {
    const result = validateAgainstSchema(
      "run-record",
      validRunRecord,
      "run.json",
    );
    expect(result).toEqual(validRunRecord);
  });

  test("accepts an empty tool_invocations array (written pre-fill)", () => {
    expect(() =>
      validateAgainstSchema(
        "run-record",
        { ...validRunRecord, tool_invocations: [] },
        "run.json",
      ),
    ).not.toThrow();
  });

  test("accepts skill_path: null on the without_skill arm", () => {
    expect(() =>
      validateAgainstSchema(
        "run-record",
        { ...validRunRecord, skill_path: null },
        "run.json",
      ),
    ).not.toThrow();
  });

  test("throws a source-prefixed error when a required field is missing", () => {
    // Destructure only to drop `eval_id`; the binding itself is unused.
    const { eval_id, ...missing } = validRunRecord;
    expect(() =>
      validateAgainstSchema("run-record", missing, "/tmp/run.json"),
    ).toThrow(/\/tmp\/run\.json/);
  });

  test("requires skill_path and files (type is the contract)", () => {
    // Both fields are nullable or empty-able, but must still be present.
    const { skill_path, ...noSkillPath } = validRunRecord;
    expect(() =>
      validateAgainstSchema("run-record", noSkillPath, "run.json"),
    ).toThrow(/skill_path/);

    const { files, ...noFiles } = validRunRecord;
    expect(() =>
      validateAgainstSchema("run-record", noFiles, "run.json"),
    ).toThrow(/files/);
  });

  test("rejects a run record with an unknown extra property", () => {
    expect(() =>
      validateAgainstSchema(
        "run-record",
        { ...validRunRecord, surprise: true },
        "run.json",
      ),
    ).toThrow();
  });

  test("validates a tool_invocation's ordinal must be an integer", () => {
    expect(() =>
      validateAgainstSchema(
        "run-record",
        {
          ...validRunRecord,
          tool_invocations: [{ name: "Bash", ordinal: "zero" }],
        },
        "run.json",
      ),
    ).toThrow();
  });

  test("compiles and validates the grading schema too", () => {
    // Smallest grading result: one assertion result plus the required summary.
    const validGrading = {
      assertion_results: [
        { id: "a1", passed: true, evidence: "quote", grader: "llm_judge" },
      ],
      summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
    };
    expect(() =>
      validateAgainstSchema("grading", validGrading, "grading.json"),
    ).not.toThrow();
  });
});
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
import { readFileSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { Ajv, type ValidateFunction } from "ajv";
|
|
4
|
+
|
|
5
|
+
/**
 * The four portable artifact schemas live in `../schema/<name>.schema.json` and
 * are the single source of truth for each artifact's shape. This helper compiles
 * them with ajv and enforces them at runtime, so the schema files are an enforced
 * contract rather than documentation a hand-rolled validator can drift from.
 */
export type SchemaName = "run-record" | "evals" | "grading" | "stray-writes";

// Directory holding the `<name>.schema.json` files, located relative to this
// module via `import.meta.dir`.
const SCHEMA_DIR = join(import.meta.dir, "..", "schema");

// strict: false — the schemas are plain draft-07; we don't want ajv's strict
// metaschema checks to reject otherwise-valid schemas over stylistic keywords.
const ajv = new Ajv({ allErrors: true, strict: false });
// Cache of compiled validators, keyed by schema name and filled on first use.
const validators = new Map<SchemaName, ValidateFunction>();
|
|
19
|
+
|
|
20
|
+
/**
 * Return the compiled ajv validator for `name`, compiling it on first request.
 *
 * On a cache miss the schema file `<name>.schema.json` is read from
 * `SCHEMA_DIR`, parsed, compiled, and stored in `validators` so later calls
 * reuse it.
 */
function getValidator(name: SchemaName): ValidateFunction {
  const cached = validators.get(name);
  if (cached) return cached;

  const schemaPath = join(SCHEMA_DIR, `${name}.schema.json`);
  const compiled = ajv.compile(JSON.parse(readFileSync(schemaPath, "utf8")));
  validators.set(name, compiled);
  return compiled;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Validate `data` against the named schema.
 *
 * @param name Which schema to validate against.
 * @param data The value to check.
 * @param source Label (typically a file path) prefixed to the error message.
 * @returns `data`, typed as `T`, when it matches the schema.
 * @throws Error prefixed with `source` and listing every validation failure,
 *   one per line, as `<instance path> <message>`. Failures caused by an
 *   unexpected property also name that property.
 */
export function validateAgainstSchema<T>(
  name: SchemaName,
  data: unknown,
  source: string,
): T {
  const validate = getValidator(name);
  if (!validate(data)) {
    const details = (validate.errors ?? [])
      .map((e) => {
        // ajv's message for `additionalProperties` does not say which key is
        // unexpected; the key is only available in `params`.
        const extra =
          e.keyword === "additionalProperties" &&
          typeof e.params?.additionalProperty === "string"
            ? ` ('${e.params.additionalProperty}')`
            : "";
        // `message` is optional on ajv error objects; fall back to the keyword
        // so the line never reads "undefined".
        return ` ${e.instancePath || "/"} ${e.message ?? e.keyword}${extra}`;
      })
      .join("\n");
    throw new Error(
      `${source}: does not match the ${name} schema:\n${details}`,
    );
  }
  return data as T;
}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { describe, expect, test } from "bun:test";
|
|
2
|
+
import { validateEvalsConfig } from "./validate";
|
|
3
|
+
|
|
4
|
+
// Minimal config the tests below treat as valid: one eval carrying only the
// required fields. Tests spread it and override a single field.
const base = {
  skill_name: "demo",
  evals: [
    {
      id: "e1",
      prompt: "do the thing",
      expected_output: "the thing is done",
    },
  ],
};
|
|
14
|
+
|
|
15
|
+
// `skill_should_trigger` is optional but, when present, must be a boolean.
describe("validateEvalsConfig skill_should_trigger", () => {
  test("accepts a boolean skill_should_trigger", () => {
    const cfg = {
      ...base,
      evals: [{ ...base.evals[0], skill_should_trigger: false }],
    };
    expect(() => validateEvalsConfig(cfg, "test")).not.toThrow();
  });

  test("accepts evals with no skill_should_trigger (defaults to true)", () => {
    expect(() => validateEvalsConfig(base, "test")).not.toThrow();
  });

  test("rejects a non-boolean skill_should_trigger", () => {
    // The string "false" must not be accepted in place of the boolean.
    const cfg = {
      ...base,
      evals: [{ ...base.evals[0], skill_should_trigger: "false" }],
    };
    expect(() => validateEvalsConfig(cfg, "test")).toThrow(
      /skill_should_trigger/,
    );
  });
});
|
|
38
|
+
|
|
39
|
+
// Structural rules plus the duplicate-id rule.
describe("validateEvalsConfig structural + duplicate-id", () => {
  test("rejects a non-kebab-case id", () => {
    const cfg = { ...base, evals: [{ ...base.evals[0], id: "Not Kebab" }] };
    expect(() => validateEvalsConfig(cfg, "test")).toThrow();
  });

  test("rejects duplicate eval ids (not expressible in JSON Schema)", () => {
    // Two evals sharing the id "e1".
    const cfg = {
      ...base,
      evals: [base.evals[0], { ...base.evals[0] }],
    };
    expect(() => validateEvalsConfig(cfg, "test")).toThrow(/duplicate/);
  });

  test("rejects an empty evals array", () => {
    expect(() => validateEvalsConfig({ ...base, evals: [] }, "test")).toThrow();
  });
});
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import type { EvalsConfig } from "./types";
|
|
2
|
+
import { validateAgainstSchema } from "./validate-schema";
|
|
3
|
+
|
|
4
|
+
/**
 * Validate a parsed `evals.json` value and return it typed as `EvalsConfig`.
 *
 * Structure is checked via `validateAgainstSchema` with the "evals" schema.
 * Eval ids are then checked for uniqueness by hand, a rule the schema does not
 * express.
 *
 * @param config The parsed, untrusted config value.
 * @param source Label (typically the file path) prefixed to error messages.
 * @returns The validated config.
 * @throws Error when the schema check fails, or when an eval id repeats; the
 *   message names the index of the repeated entry.
 */
export function validateEvalsConfig(
  config: unknown,
  source: string,
): EvalsConfig {
  const parsed = validateAgainstSchema<EvalsConfig>("evals", config, source);

  const ids = new Set<string>();
  parsed.evals.forEach((entry, index) => {
    if (ids.has(entry.id)) {
      throw new Error(`${source}: evals[${index}].id duplicate: ${entry.id}`);
    }
    ids.add(entry.id);
  });

  return parsed;
}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"$id": "https://slow-powers.dev/schemas/evals.schema.json",
|
|
4
|
+
"title": "Skill Evaluation Definition",
|
|
5
|
+
"description": "Defines a set of test cases for evaluating a skill. Lives at <skill>/evals/evals.json.",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"required": ["skill_name", "evals"],
|
|
8
|
+
"additionalProperties": false,
|
|
9
|
+
"properties": {
|
|
10
|
+
"skill_name": {
|
|
11
|
+
"type": "string",
|
|
12
|
+
"description": "Name of the skill being evaluated. Should match the skill directory name."
|
|
13
|
+
},
|
|
14
|
+
"evals": {
|
|
15
|
+
"type": "array",
|
|
16
|
+
"minItems": 1,
|
|
17
|
+
"items": { "$ref": "#/definitions/eval" }
|
|
18
|
+
}
|
|
19
|
+
},
|
|
20
|
+
"definitions": {
|
|
21
|
+
"eval": {
|
|
22
|
+
"type": "object",
|
|
23
|
+
"required": ["id", "prompt", "expected_output"],
|
|
24
|
+
"additionalProperties": false,
|
|
25
|
+
"properties": {
|
|
26
|
+
"id": {
|
|
27
|
+
"type": "string",
|
|
28
|
+
"pattern": "^[a-z0-9][a-z0-9-]*$",
|
|
29
|
+
"description": "Stable kebab-case identifier. Used as directory name in the workspace tree."
|
|
30
|
+
},
|
|
31
|
+
"prompt": {
|
|
32
|
+
"type": "string",
|
|
33
|
+
"minLength": 1,
|
|
34
|
+
"description": "The user-facing message the subagent receives. Should read like a realistic user request."
|
|
35
|
+
},
|
|
36
|
+
"expected_output": {
|
|
37
|
+
"type": "string",
|
|
38
|
+
"minLength": 1,
|
|
39
|
+
"description": "Human-readable description of what a successful response looks like."
|
|
40
|
+
},
|
|
41
|
+
"files": {
|
|
42
|
+
"type": "array",
|
|
43
|
+
"items": { "type": "string" },
|
|
44
|
+
"description": "Fixture file paths relative to the skill's evals/ directory. Copied into the subagent's input directory before dispatch."
|
|
45
|
+
},
|
|
46
|
+
"skill_should_trigger": {
|
|
47
|
+
"type": "boolean",
|
|
48
|
+
"default": true,
|
|
49
|
+
"description": "Whether the skill-under-test is expected to fire on this eval. Defaults to true. Set false for negative evals where correct behavior is NOT invoking the skill (e.g. an over-trigger guard); such evals are excluded from the skill-invocation rate and its validity warning."
|
|
50
|
+
},
|
|
51
|
+
"assertions": {
|
|
52
|
+
"type": "array",
|
|
53
|
+
"items": { "$ref": "#/definitions/assertion" },
|
|
54
|
+
"description": "Pass/fail criteria, added after iteration 1 when you know what outputs look like."
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
},
|
|
58
|
+
"assertion": {
|
|
59
|
+
"oneOf": [
|
|
60
|
+
{ "$ref": "#/definitions/transcriptCheck" },
|
|
61
|
+
{ "$ref": "#/definitions/llmJudge" }
|
|
62
|
+
]
|
|
63
|
+
},
|
|
64
|
+
"transcriptCheck": {
|
|
65
|
+
"type": "object",
|
|
66
|
+
"required": ["id", "type", "check"],
|
|
67
|
+
"additionalProperties": false,
|
|
68
|
+
"properties": {
|
|
69
|
+
"id": { "type": "string" },
|
|
70
|
+
"type": { "const": "transcript_check" },
|
|
71
|
+
"check": {
|
|
72
|
+
"type": "string",
|
|
73
|
+
"description": "Name of a transcript-check kind handled by the runner's grader (runner/grade.ts), e.g. tool_invocation_matches."
|
|
74
|
+
},
|
|
75
|
+
"pattern": {
|
|
76
|
+
"type": "string",
|
|
77
|
+
"description": "Regex (or substring) the check uses to match tool invocations."
|
|
78
|
+
},
|
|
79
|
+
"must_precede": {
|
|
80
|
+
"type": "string",
|
|
81
|
+
"enum": ["completion_claim", "any"],
|
|
82
|
+
"description": "Where in the run the matched invocation must occur. 'completion_claim' = before the final message. 'any' = anywhere in the run."
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
},
|
|
86
|
+
"llmJudge": {
|
|
87
|
+
"type": "object",
|
|
88
|
+
"required": ["id", "type", "rubric"],
|
|
89
|
+
"additionalProperties": false,
|
|
90
|
+
"properties": {
|
|
91
|
+
"id": { "type": "string" },
|
|
92
|
+
"type": { "const": "llm_judge" },
|
|
93
|
+
"rubric": {
|
|
94
|
+
"type": "string",
|
|
95
|
+
"minLength": 1,
|
|
96
|
+
"description": "The question the judge model answers. Should be answerable with PASS/FAIL + evidence."
|
|
97
|
+
},
|
|
98
|
+
"model": {
|
|
99
|
+
"type": "string",
|
|
100
|
+
"description": "Optional override. Defaults to whatever the harness operator configures for judge dispatches."
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
}
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"$id": "https://slow-powers.dev/schemas/grading.schema.json",
|
|
4
|
+
"title": "Grading Result",
|
|
5
|
+
"description": "Output of grading one (eval, condition) pair. Lives at <workspace>/iteration-N/eval-<id>/<condition>/grading.json.",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"required": ["assertion_results", "summary"],
|
|
8
|
+
"additionalProperties": false,
|
|
9
|
+
"properties": {
|
|
10
|
+
"assertion_results": {
|
|
11
|
+
"type": "array",
|
|
12
|
+
"items": {
|
|
13
|
+
"type": "object",
|
|
14
|
+
"required": ["id", "passed", "evidence"],
|
|
15
|
+
"additionalProperties": false,
|
|
16
|
+
"properties": {
|
|
17
|
+
"id": {
|
|
18
|
+
"type": "string",
|
|
19
|
+
"description": "Matches the assertion id in evals.json."
|
|
20
|
+
},
|
|
21
|
+
"passed": { "type": "boolean" },
|
|
22
|
+
"evidence": {
|
|
23
|
+
"type": "string",
|
|
24
|
+
"description": "Direct quote or specific reference from the run record. Vague summaries are not evidence."
|
|
25
|
+
},
|
|
26
|
+
"confidence": {
|
|
27
|
+
"type": "number",
|
|
28
|
+
"minimum": 0,
|
|
29
|
+
"maximum": 1,
|
|
30
|
+
"description": "Judge confidence. Low confidence (< 0.7) flags this result for human review. Always 1.0 for transcript_check results."
|
|
31
|
+
},
|
|
32
|
+
"grader": {
|
|
33
|
+
"type": "string",
|
|
34
|
+
"enum": ["transcript_check", "llm_judge"],
|
|
35
|
+
"description": "Which grader produced this result."
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
},
|
|
40
|
+
"summary": {
|
|
41
|
+
"type": "object",
|
|
42
|
+
"required": ["passed", "failed", "total", "pass_rate"],
|
|
43
|
+
"additionalProperties": false,
|
|
44
|
+
"properties": {
|
|
45
|
+
"passed": { "type": "integer", "minimum": 0 },
|
|
46
|
+
"failed": { "type": "integer", "minimum": 0 },
|
|
47
|
+
"total": { "type": "integer", "minimum": 0 },
|
|
48
|
+
"pass_rate": { "type": "number", "minimum": 0, "maximum": 1 }
|
|
49
|
+
}
|
|
50
|
+
},
|
|
51
|
+
"meta_results": {
|
|
52
|
+
"type": "array",
|
|
53
|
+
"description": "Framework-injected meta-assertions (e.g. skill-invocation check). Reserved id prefix: __ (double underscore). Tracked separately from substantive assertion_results so they do not pollute the skill effectiveness pass_rate.",
|
|
54
|
+
"items": {
|
|
55
|
+
"type": "object",
|
|
56
|
+
"required": ["id", "passed", "evidence"],
|
|
57
|
+
"additionalProperties": false,
|
|
58
|
+
"properties": {
|
|
59
|
+
"id": { "type": "string" },
|
|
60
|
+
"passed": { "type": "boolean" },
|
|
61
|
+
"evidence": { "type": "string" },
|
|
62
|
+
"confidence": { "type": "number", "minimum": 0, "maximum": 1 },
|
|
63
|
+
"grader": {
|
|
64
|
+
"type": "string",
|
|
65
|
+
"enum": ["transcript_check", "llm_judge"]
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
},
|
|
70
|
+
"meta_summary": {
|
|
71
|
+
"type": "object",
|
|
72
|
+
"additionalProperties": false,
|
|
73
|
+
"properties": {
|
|
74
|
+
"passed": { "type": "integer", "minimum": 0 },
|
|
75
|
+
"failed": { "type": "integer", "minimum": 0 },
|
|
76
|
+
"total": { "type": "integer", "minimum": 0 },
|
|
77
|
+
"skill_invoked": {
|
|
78
|
+
"description": "True when the skill-invocation meta-check passed; false when the judge found no evidence the skill influenced behavior; null when no skill was loaded for this run.",
|
|
79
|
+
"type": ["boolean", "null"]
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
}
|