@slowdini/slow-powers-opencode 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/README.md +34 -72
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +1 -1
  4. package/package.json +14 -17
  5. package/skills/evaluating-skills/SKILL.md +90 -338
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  16. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  17. package/skills/evaluating-skills/harness-details/claude.md +0 -194
  18. package/skills/evaluating-skills/harness-parity.md +0 -155
  19. package/skills/evaluating-skills/runner/README.md +0 -163
  20. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  21. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  22. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
  23. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
  24. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
  25. package/skills/evaluating-skills/runner/aggregate.ts +0 -269
  26. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  27. package/skills/evaluating-skills/runner/context.ts +0 -90
  28. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
  29. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
  30. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  31. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  32. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  33. package/skills/evaluating-skills/runner/grade.ts +0 -603
  34. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  35. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  36. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  37. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
  38. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  39. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  40. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  41. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  42. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
  43. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
  44. package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
  45. package/skills/evaluating-skills/runner/record-runs.ts +0 -209
  46. package/skills/evaluating-skills/runner/run.test.ts +0 -1703
  47. package/skills/evaluating-skills/runner/run.ts +0 -1388
  48. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
  49. package/skills/evaluating-skills/runner/types.ts +0 -121
  50. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  51. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  52. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  53. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  54. package/skills/evaluating-skills/runner/validate.ts +0 -21
  55. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
  56. package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
  57. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  58. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  59. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  60. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
  61. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
  62. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  63. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  64. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
@@ -1,94 +0,0 @@
1
- import { isAbsolute, resolve, sep } from "node:path";
2
-
3
- /** Tools that mutate the filesystem and carry a target path argument. */
4
- export const WRITE_TOOLS = new Set([
5
- "Write",
6
- "Edit",
7
- "MultiEdit",
8
- "NotebookEdit",
9
- ]);
10
-
11
- /**
12
- * Bash command patterns that mutate state outside an eval's sandbox. Heuristics
13
- * — Bash is too flexible to parse exactly. `detect-stray-writes` surfaces these
14
- * as warnings; the opt-in guard denies them. Each is meaningful only when the
15
- * command does not reference an allowed root (see `classifyBash`).
16
- */
17
- export const BASH_MUTATION_PATTERNS: Array<{ re: RegExp; reason: string }> = [
18
- {
19
- re: /\b(npm|pnpm|yarn|bun)\s+(install|add|ci|i)\b/,
20
- reason: "package install/add",
21
- },
22
- { re: /\bpip3?\s+install\b/, reason: "pip install" },
23
- { re: /\bsed\s+-i\b/, reason: "in-place file edit (sed -i)" },
24
- {
25
- re: /\bgit\s+(commit|add|push|checkout|reset|restore|merge|rebase)\b/,
26
- reason: "git mutation",
27
- },
28
- {
29
- re: /\bgit\s+worktree\s+add\b/,
30
- reason: "git worktree add (working tree outside the sandbox)",
31
- },
32
- // A create/copy/move/link verb whose operand is a path under `.claude` —
33
- // catches stray writes to the harness config dir that aren't a `>` redirect
34
- // (those are caught below). Read-only verbs (`cat`, `ls`) aren't listed, so
35
- // inspecting `.claude` stays allowed.
36
- {
37
- re: /\b(cp|mv|mkdir|touch|ln|rsync|install)\b[^|;&\n]*\.claude(\/|\b)/,
38
- reason: "path under .claude",
39
- },
40
- // The same create verbs whose operand is a top-level `skills/` directory —
41
- // catches a bare `skills/` left in the cwd. `skills-workspace` and other
42
- // `skills`-prefixed names are excluded by the trailing `/`, whitespace, or
43
- // end-of-string boundary.
44
- {
45
- re: /\b(cp|mv|mkdir|touch|ln|rsync)\b[^|;&\n]*[\s'"=/]\.{0,2}\/?skills(\/|\s|$)/,
46
- reason: "creates a bare skills/ dir",
47
- },
48
- { re: /(^|\s)(>>?|tee)\s/, reason: "output redirection to a file" },
49
- ];
50
-
51
- /** Pull the target path from a write tool's arguments. */
52
- export function pathArg(args: unknown): string | undefined {
53
- if (!args || typeof args !== "object") return undefined;
54
- const a = args as Record<string, unknown>;
55
- const p = a.file_path ?? a.notebook_path ?? a.path;
56
- return typeof p === "string" ? p : undefined;
57
- }
58
-
59
- /** True when `target` resolves to `dir` or a descendant of it. */
60
- export function isUnder(
61
- target: string,
62
- dir: string,
63
- repoRoot: string,
64
- ): boolean {
65
- const base = resolve(dir);
66
- const abs = isAbsolute(target) ? resolve(target) : resolve(repoRoot, target);
67
- return abs === base || abs.startsWith(base + sep);
68
- }
69
-
70
- /** True when `target` is under any of `dirs`. */
71
- export function isUnderAny(
72
- target: string,
73
- dirs: string[],
74
- repoRoot: string,
75
- ): boolean {
76
- return dirs.some((d) => isUnder(target, d, repoRoot));
77
- }
78
-
79
- /**
80
- * If a Bash command matches a mutation pattern and is not scoped to one of
81
- * `allowedRoots`, return the human reason; otherwise null. A command is treated
82
- * as scoped when it textually references an allowed root.
83
- */
84
- export function classifyBash(
85
- command: string,
86
- allowedRoots: string[],
87
- ): string | null {
88
- if (!command) return null;
89
- if (allowedRoots.some((r) => command.includes(r))) return null;
90
- for (const { re, reason } of BASH_MUTATION_PATTERNS) {
91
- if (re.test(command)) return reason;
92
- }
93
- return null;
94
- }
@@ -1,121 +0,0 @@
1
- export type AssertionTranscriptCheck = {
2
- id: string;
3
- type: "transcript_check";
4
- check: string;
5
- pattern?: string;
6
- must_precede?: "completion_claim" | "any";
7
- };
8
-
9
- export type AssertionLLMJudge = {
10
- id: string;
11
- type: "llm_judge";
12
- rubric: string;
13
- model?: string;
14
- };
15
-
16
- export type Assertion = AssertionTranscriptCheck | AssertionLLMJudge;
17
-
18
- export type Eval = {
19
- id: string;
20
- prompt: string;
21
- expected_output: string;
22
- files?: string[];
23
- assertions?: Assertion[];
24
- /**
25
- * Whether the skill-under-test is expected to fire on this eval. Defaults to
26
- * true. Set to false for negative evals where correct behavior is NOT
27
- * invoking the skill (e.g. an over-trigger guard). Negative evals are
28
- * excluded from the skill-invocation rate and its validity warning.
29
- */
30
- skill_should_trigger?: boolean;
31
- };
32
-
33
- export type EvalsConfig = {
34
- skill_name: string;
35
- evals: Eval[];
36
- };
37
-
38
- /** A skill staged and discoverable for an eval — its natural name, on-disk
39
- * SKILL.md path, and frontmatter description. */
40
- export type AvailableSkill = {
41
- name: string;
42
- path: string;
43
- description: string;
44
- };
45
-
46
- export type ConditionEntry = {
47
- name: string;
48
- skill_path: string | null;
49
- staged_skill_slug?: string | null;
50
- };
51
-
52
- export type ConditionsRecord = {
53
- mode: "new-skill" | "revision";
54
- baseline?: string;
55
- conditions: ConditionEntry[];
56
- timestamp: string;
57
- harness?: string;
58
- /** Per-run nonce; namespaces dispatch descriptions so transcripts can't
59
- * collide across iterations sharing one parent session's subagents dir. */
60
- run_nonce?: string;
61
- };
62
-
63
- export type ToolInvocation = {
64
- name: string;
65
- args?: unknown;
66
- result?: unknown;
67
- ordinal: number;
68
- };
69
-
70
- export type RunRecord = {
71
- eval_id: string;
72
- condition: string;
73
- skill_path: string | null;
74
- prompt: string;
75
- files: string[];
76
- final_message: string;
77
- tool_invocations: ToolInvocation[];
78
- total_tokens: number | null;
79
- duration_ms: number | null;
80
- };
81
-
82
- export type AssertionResult = {
83
- id: string;
84
- passed: boolean;
85
- evidence: string;
86
- confidence?: number;
87
- grader?: "transcript_check" | "llm_judge";
88
- };
89
-
90
- export type GradingResult = {
91
- assertion_results: AssertionResult[];
92
- meta_results?: AssertionResult[];
93
- summary: {
94
- passed: number;
95
- failed: number;
96
- total: number;
97
- pass_rate: number;
98
- };
99
- meta_summary?: {
100
- passed: number;
101
- failed: number;
102
- total: number;
103
- skill_invoked: boolean | null;
104
- };
105
- };
106
-
107
- export const SKILL_INVOKED_META_ID = "__skill_invoked";
108
-
109
- export type TimingRecord = {
110
- total_tokens?: number | null;
111
- duration_ms?: number | null;
112
- /**
113
- * Where the numbers came from. "completion-event" = captured by the
114
- * dispatching agent from the harness's task completion event;
115
- * "transcript" = derived by record-runs from the persisted transcript
116
- * (includes cache accounting — a different metric, not comparable 1:1).
117
- * Absent on records written before provenance was tracked
118
- * (completion-event in practice).
119
- */
120
- source?: "completion-event" | "transcript";
121
- };
@@ -1,54 +0,0 @@
1
- #!/usr/bin/env bun
2
- import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
3
- import { join, resolve } from "node:path";
4
- import { validateEvalsConfig } from "./validate";
5
-
6
- function flag(argv: string[], name: string): string | undefined {
7
- const i = argv.indexOf(`--${name}`);
8
- if (i === -1) return undefined;
9
- return argv[i + 1];
10
- }
11
-
12
- const skillDirRaw = flag(Bun.argv.slice(2), "skill-dir");
13
- if (!skillDirRaw) {
14
- console.error("missing required flag --skill-dir <path>");
15
- process.exit(1);
16
- }
17
- const SKILLS_DIR = resolve(skillDirRaw);
18
-
19
- if (!existsSync(SKILLS_DIR)) {
20
- console.error(`skills dir not found: ${SKILLS_DIR}`);
21
- process.exit(1);
22
- }
23
-
24
- const skills = readdirSync(SKILLS_DIR).filter((d) => {
25
- const path = join(SKILLS_DIR, d);
26
- return statSync(path).isDirectory();
27
- });
28
-
29
- let validated = 0;
30
- let failed = 0;
31
- const errors: string[] = [];
32
-
33
- for (const skill of skills) {
34
- const evalsPath = join(SKILLS_DIR, skill, "evals", "evals.json");
35
- if (!existsSync(evalsPath)) continue;
36
-
37
- try {
38
- const raw = JSON.parse(readFileSync(evalsPath, "utf8"));
39
- validateEvalsConfig(raw, evalsPath);
40
- console.log(`✓ ${skill}/evals/evals.json`);
41
- validated++;
42
- } catch (err) {
43
- console.error(`✗ ${skill}/evals/evals.json: ${(err as Error).message}`);
44
- errors.push(`${skill}: ${(err as Error).message}`);
45
- failed++;
46
- }
47
- }
48
-
49
- console.log(`\nValidated ${validated} evals.json file(s); ${failed} failed.`);
50
- if (failed > 0) {
51
- console.error("\nFailures:");
52
- for (const e of errors) console.error(` - ${e}`);
53
- process.exit(1);
54
- }
@@ -1,99 +0,0 @@
1
- import { describe, expect, test } from "bun:test";
2
- import { validateAgainstSchema } from "./validate-schema";
3
-
4
- const validRunRecord = {
5
- eval_id: "e1",
6
- condition: "with_skill",
7
- skill_path: null,
8
- prompt: "do the thing",
9
- files: [],
10
- final_message: "done",
11
- tool_invocations: [],
12
- total_tokens: 100,
13
- duration_ms: 1000,
14
- };
15
-
16
- describe("validateAgainstSchema", () => {
17
- test("returns the data when it matches the run-record schema", () => {
18
- const result = validateAgainstSchema(
19
- "run-record",
20
- validRunRecord,
21
- "run.json",
22
- );
23
- expect(result).toEqual(validRunRecord);
24
- });
25
-
26
- test("accepts an empty tool_invocations array (written pre-fill)", () => {
27
- expect(() =>
28
- validateAgainstSchema(
29
- "run-record",
30
- { ...validRunRecord, tool_invocations: [] },
31
- "run.json",
32
- ),
33
- ).not.toThrow();
34
- });
35
-
36
- test("accepts skill_path: null on the without_skill arm", () => {
37
- expect(() =>
38
- validateAgainstSchema(
39
- "run-record",
40
- { ...validRunRecord, skill_path: null },
41
- "run.json",
42
- ),
43
- ).not.toThrow();
44
- });
45
-
46
- test("throws a source-prefixed error when a required field is missing", () => {
47
- const { eval_id, ...missing } = validRunRecord;
48
- expect(() =>
49
- validateAgainstSchema("run-record", missing, "/tmp/run.json"),
50
- ).toThrow(/\/tmp\/run\.json/);
51
- });
52
-
53
- test("requires skill_path and files (type is the contract)", () => {
54
- const { skill_path, ...noSkillPath } = validRunRecord;
55
- expect(() =>
56
- validateAgainstSchema("run-record", noSkillPath, "run.json"),
57
- ).toThrow(/skill_path/);
58
-
59
- const { files, ...noFiles } = validRunRecord;
60
- expect(() =>
61
- validateAgainstSchema("run-record", noFiles, "run.json"),
62
- ).toThrow(/files/);
63
- });
64
-
65
- test("rejects a run record with an unknown extra property", () => {
66
- expect(() =>
67
- validateAgainstSchema(
68
- "run-record",
69
- { ...validRunRecord, surprise: true },
70
- "run.json",
71
- ),
72
- ).toThrow();
73
- });
74
-
75
- test("validates a tool_invocation's ordinal must be an integer", () => {
76
- expect(() =>
77
- validateAgainstSchema(
78
- "run-record",
79
- {
80
- ...validRunRecord,
81
- tool_invocations: [{ name: "Bash", ordinal: "zero" }],
82
- },
83
- "run.json",
84
- ),
85
- ).toThrow();
86
- });
87
-
88
- test("compiles and validates the grading schema too", () => {
89
- const validGrading = {
90
- assertion_results: [
91
- { id: "a1", passed: true, evidence: "quote", grader: "llm_judge" },
92
- ],
93
- summary: { passed: 1, failed: 0, total: 1, pass_rate: 1 },
94
- };
95
- expect(() =>
96
- validateAgainstSchema("grading", validGrading, "grading.json"),
97
- ).not.toThrow();
98
- });
99
- });
@@ -1,51 +0,0 @@
1
- import { readFileSync } from "node:fs";
2
- import { join } from "node:path";
3
- import { Ajv, type ValidateFunction } from "ajv";
4
-
5
- /**
6
- * The four portable artifact schemas live in `../schema/<name>.schema.json` and
7
- * are the single source of truth for each artifact's shape. This helper compiles
8
- * them with ajv and enforces them at runtime, so the schema files are an enforced
9
- * contract rather than documentation a hand-rolled validator can drift from.
10
- */
11
- export type SchemaName = "run-record" | "evals" | "grading" | "stray-writes";
12
-
13
- const SCHEMA_DIR = join(import.meta.dir, "..", "schema");
14
-
15
- // strict: false — the schemas are plain draft-07; we don't want ajv's strict
16
- // metaschema checks to reject otherwise-valid schemas over stylistic keywords.
17
- const ajv = new Ajv({ allErrors: true, strict: false });
18
- const validators = new Map<SchemaName, ValidateFunction>();
19
-
20
- function getValidator(name: SchemaName): ValidateFunction {
21
- let validate = validators.get(name);
22
- if (!validate) {
23
- const schema = JSON.parse(
24
- readFileSync(join(SCHEMA_DIR, `${name}.schema.json`), "utf8"),
25
- );
26
- validate = ajv.compile(schema);
27
- validators.set(name, validate);
28
- }
29
- return validate;
30
- }
31
-
32
- /**
33
- * Validate `data` against the named schema. Returns the data typed as `T` on
34
- * success; throws a `source`-prefixed Error listing every failure on mismatch.
35
- */
36
- export function validateAgainstSchema<T>(
37
- name: SchemaName,
38
- data: unknown,
39
- source: string,
40
- ): T {
41
- const validate = getValidator(name);
42
- if (!validate(data)) {
43
- const details = (validate.errors ?? [])
44
- .map((e) => ` ${e.instancePath || "/"} ${e.message}`)
45
- .join("\n");
46
- throw new Error(
47
- `${source}: does not match the ${name} schema:\n${details}`,
48
- );
49
- }
50
- return data as T;
51
- }
@@ -1,56 +0,0 @@
1
- import { describe, expect, test } from "bun:test";
2
- import { validateEvalsConfig } from "./validate";
3
-
4
- const base = {
5
- skill_name: "demo",
6
- evals: [
7
- {
8
- id: "e1",
9
- prompt: "do the thing",
10
- expected_output: "the thing is done",
11
- },
12
- ],
13
- };
14
-
15
- describe("validateEvalsConfig skill_should_trigger", () => {
16
- test("accepts a boolean skill_should_trigger", () => {
17
- const cfg = {
18
- ...base,
19
- evals: [{ ...base.evals[0], skill_should_trigger: false }],
20
- };
21
- expect(() => validateEvalsConfig(cfg, "test")).not.toThrow();
22
- });
23
-
24
- test("accepts evals with no skill_should_trigger (defaults to true)", () => {
25
- expect(() => validateEvalsConfig(base, "test")).not.toThrow();
26
- });
27
-
28
- test("rejects a non-boolean skill_should_trigger", () => {
29
- const cfg = {
30
- ...base,
31
- evals: [{ ...base.evals[0], skill_should_trigger: "false" }],
32
- };
33
- expect(() => validateEvalsConfig(cfg, "test")).toThrow(
34
- /skill_should_trigger/,
35
- );
36
- });
37
- });
38
-
39
- describe("validateEvalsConfig structural + duplicate-id", () => {
40
- test("rejects a non-kebab-case id", () => {
41
- const cfg = { ...base, evals: [{ ...base.evals[0], id: "Not Kebab" }] };
42
- expect(() => validateEvalsConfig(cfg, "test")).toThrow();
43
- });
44
-
45
- test("rejects duplicate eval ids (not expressible in JSON Schema)", () => {
46
- const cfg = {
47
- ...base,
48
- evals: [base.evals[0], { ...base.evals[0] }],
49
- };
50
- expect(() => validateEvalsConfig(cfg, "test")).toThrow(/duplicate/);
51
- });
52
-
53
- test("rejects an empty evals array", () => {
54
- expect(() => validateEvalsConfig({ ...base, evals: [] }, "test")).toThrow();
55
- });
56
- });
@@ -1,21 +0,0 @@
1
- import type { EvalsConfig } from "./types";
2
- import { validateAgainstSchema } from "./validate-schema";
3
-
4
- export function validateEvalsConfig(
5
- config: unknown,
6
- source: string,
7
- ): EvalsConfig {
8
- // Structural validation against the single source of truth.
9
- const validated = validateAgainstSchema<EvalsConfig>("evals", config, source);
10
-
11
- // Supplemental check: JSON Schema (draft-07) can't enforce uniqueness by a
12
- // sub-field, so the duplicate-id guard stays hand-rolled.
13
- const seenIds = new Set<string>();
14
- for (const [i, ev] of validated.evals.entries()) {
15
- if (seenIds.has(ev.id))
16
- throw new Error(`${source}: evals[${i}].id duplicate: ${ev.id}`);
17
- seenIds.add(ev.id);
18
- }
19
-
20
- return validated;
21
- }