@slowdini/slow-powers-opencode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131):
  1. package/LICENSE +22 -0
  2. package/README.md +174 -0
  3. package/bootstrap.md +16 -0
  4. package/opencode/plugins/slow-powers.js +86 -0
  5. package/package.json +66 -0
  6. package/skills/auditing-slow-powers-usage/SKILL.md +157 -0
  7. package/skills/auditing-slow-powers-usage/evals/baseline/BASELINE.md +22 -0
  8. package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +72 -0
  9. package/skills/auditing-slow-powers-usage/evals/baseline/benchmark.json +53 -0
  10. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__with_skill.json +53 -0
  11. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__without_skill.json +38 -0
  12. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__with_skill.json +53 -0
  13. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__without_skill.json +38 -0
  14. package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__with_skill.json +17 -0
  15. package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__without_skill.json +17 -0
  16. package/skills/auditing-slow-powers-usage/evals/evals.json +74 -0
  17. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +39 -0
  18. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-completed-session/session-summary.md +33 -0
  19. package/skills/evaluating-skills/SKILL.md +448 -0
  20. package/skills/evaluating-skills/evals/evals.json +52 -0
  21. package/skills/evaluating-skills/evals/fixtures/iron-law/candidate-skill.md +13 -0
  22. package/skills/evaluating-skills/examples/verification-before-completion-evals.json +30 -0
  23. package/skills/evaluating-skills/harness-details/claude.md +135 -0
  24. package/skills/evaluating-skills/pressure-scenarios.md +163 -0
  25. package/skills/evaluating-skills/runner/README.md +140 -0
  26. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +263 -0
  27. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +146 -0
  28. package/skills/evaluating-skills/runner/aggregate.test.ts +188 -0
  29. package/skills/evaluating-skills/runner/aggregate.ts +228 -0
  30. package/skills/evaluating-skills/runner/context.test.ts +181 -0
  31. package/skills/evaluating-skills/runner/context.ts +90 -0
  32. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +103 -0
  33. package/skills/evaluating-skills/runner/detect-stray-writes.ts +192 -0
  34. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +73 -0
  35. package/skills/evaluating-skills/runner/fill-transcripts.ts +154 -0
  36. package/skills/evaluating-skills/runner/grade.test.ts +347 -0
  37. package/skills/evaluating-skills/runner/grade.ts +603 -0
  38. package/skills/evaluating-skills/runner/guard/guard.ts +49 -0
  39. package/skills/evaluating-skills/runner/guard/install.test.ts +92 -0
  40. package/skills/evaluating-skills/runner/guard/install.ts +147 -0
  41. package/skills/evaluating-skills/runner/guard/policy.test.ts +71 -0
  42. package/skills/evaluating-skills/runner/guard/policy.ts +74 -0
  43. package/skills/evaluating-skills/runner/promote-baseline.test.ts +230 -0
  44. package/skills/evaluating-skills/runner/promote-baseline.ts +186 -0
  45. package/skills/evaluating-skills/runner/run.test.ts +716 -0
  46. package/skills/evaluating-skills/runner/run.ts +814 -0
  47. package/skills/evaluating-skills/runner/sandbox-policy.ts +74 -0
  48. package/skills/evaluating-skills/runner/types.ts +104 -0
  49. package/skills/evaluating-skills/runner/validate-all.ts +54 -0
  50. package/skills/evaluating-skills/runner/validate-schema.test.ts +99 -0
  51. package/skills/evaluating-skills/runner/validate-schema.ts +51 -0
  52. package/skills/evaluating-skills/runner/validate.test.ts +56 -0
  53. package/skills/evaluating-skills/runner/validate.ts +21 -0
  54. package/skills/evaluating-skills/schema/evals.schema.json +105 -0
  55. package/skills/evaluating-skills/schema/grading.schema.json +84 -0
  56. package/skills/evaluating-skills/schema/run-record.schema.json +80 -0
  57. package/skills/evaluating-skills/schema/stray-writes.schema.json +68 -0
  58. package/skills/evaluating-skills/templates/eval-task-prompt.md +71 -0
  59. package/skills/evaluating-skills/templates/evals.json.example +17 -0
  60. package/skills/evaluating-skills/templates/judge-prompt.md +56 -0
  61. package/skills/evaluating-skills/templates/revise-skill-prompt.md +56 -0
  62. package/skills/finishing-a-development-branch/SKILL.md +96 -0
  63. package/skills/finishing-a-development-branch/evals/evals.json +41 -0
  64. package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +4 -0
  65. package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +5 -0
  66. package/skills/hardening-plans/SKILL.md +72 -0
  67. package/skills/hardening-plans/evals/baseline/BASELINE.md +22 -0
  68. package/skills/hardening-plans/evals/baseline/NOTES.md +58 -0
  69. package/skills/hardening-plans/evals/baseline/benchmark.json +54 -0
  70. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +39 -0
  71. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +39 -0
  72. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +24 -0
  73. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +24 -0
  74. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +46 -0
  75. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +46 -0
  76. package/skills/hardening-plans/evals/evals.json +114 -0
  77. package/skills/systematic-debugging/CREATION-LOG.md +119 -0
  78. package/skills/systematic-debugging/SKILL.md +84 -0
  79. package/skills/systematic-debugging/condition-based-waiting-example.ts +164 -0
  80. package/skills/systematic-debugging/condition-based-waiting.md +115 -0
  81. package/skills/systematic-debugging/defense-in-depth.md +122 -0
  82. package/skills/systematic-debugging/evals/baseline/BASELINE.md +22 -0
  83. package/skills/systematic-debugging/evals/baseline/benchmark.json +51 -0
  84. package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__with_skill.json +17 -0
  85. package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__without_skill.json +17 -0
  86. package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +46 -0
  87. package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +31 -0
  88. package/skills/systematic-debugging/evals/evals.json +45 -0
  89. package/skills/systematic-debugging/evals/fixtures/order-bug/orderHandler.ts +9 -0
  90. package/skills/systematic-debugging/evals/fixtures/order-bug/repro.ts +10 -0
  91. package/skills/systematic-debugging/find-polluter.sh +63 -0
  92. package/skills/systematic-debugging/root-cause-tracing.md +169 -0
  93. package/skills/systematic-debugging/test-academic.md +14 -0
  94. package/skills/systematic-debugging/test-pressure-1.md +58 -0
  95. package/skills/systematic-debugging/test-pressure-2.md +68 -0
  96. package/skills/systematic-debugging/test-pressure-3.md +69 -0
  97. package/skills/test-driven-development/SKILL.md +93 -0
  98. package/skills/test-driven-development/evals/baseline/BASELINE.md +22 -0
  99. package/skills/test-driven-development/evals/baseline/NOTES.md +74 -0
  100. package/skills/test-driven-development/evals/baseline/benchmark.json +51 -0
  101. package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__with_skill.json +53 -0
  102. package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__without_skill.json +38 -0
  103. package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__with_skill.json +32 -0
  104. package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__without_skill.json +17 -0
  105. package/skills/test-driven-development/evals/evals.json +77 -0
  106. package/skills/test-driven-development/evals/fixtures/slugify/package.json +4 -0
  107. package/skills/test-driven-development/evals/fixtures/slugify/utils.ts +7 -0
  108. package/skills/test-driven-development/testing-anti-patterns.md +299 -0
  109. package/skills/using-git-worktrees/SKILL.md +70 -0
  110. package/skills/using-git-worktrees/evals/evals.json +40 -0
  111. package/skills/verification-before-completion/SKILL.md +65 -0
  112. package/skills/verification-before-completion/evals/baseline/BASELINE.md +22 -0
  113. package/skills/verification-before-completion/evals/baseline/NOTES.md +75 -0
  114. package/skills/verification-before-completion/evals/baseline/benchmark.json +51 -0
  115. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
  116. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
  117. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +46 -0
  118. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
  119. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
  120. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
  121. package/skills/verification-before-completion/evals/evals.json +77 -0
  122. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/api.ts +1 -0
  123. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/consumer.ts +3 -0
  124. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/tsconfig.json +23 -0
  125. package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.test.ts +10 -0
  126. package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.ts +1 -0
  127. package/skills/writing-skills/SKILL.md +306 -0
  128. package/skills/writing-skills/evals/evals.json +40 -0
  129. package/skills/writing-skills/graphviz-conventions.dot +172 -0
  130. package/skills/writing-skills/persuasion-principles.md +187 -0
  131. package/skills/writing-skills/scripts/render-graphs.js +181 -0
@@ -0,0 +1,92 @@
1
+ import { afterEach, describe, expect, test } from "bun:test";
2
+ import {
3
+ existsSync,
4
+ mkdirSync,
5
+ readFileSync,
6
+ rmSync,
7
+ writeFileSync,
8
+ } from "node:fs";
9
+ import { tmpdir } from "node:os";
10
+ import { join } from "node:path";
11
+ import {
12
+ GUARD_MANIFEST,
13
+ GUARD_MARKER,
14
+ installGuard,
15
+ teardownGuard,
16
+ } from "./install";
17
+
// Scratch root for this test file; the process id is part of the name.
const ROOT = join(tmpdir(), `guard-install-test-${process.pid}`);

// Remove everything under ROOT after each case so no state carries over.
afterEach(() => {
  rmSync(ROOT, { recursive: true, force: true });
});

/**
 * Create a uniquely named stage directory under ROOT and return it together
 * with the workspace path nested inside it (the workspace itself is not created).
 */
function setup() {
  const suffix = Math.random().toString(36).slice(2);
  const stageRoot = join(ROOT, `case-${suffix}`);
  const workspaceRoot = join(stageRoot, "skills-workspace");
  mkdirSync(stageRoot, { recursive: true });
  return { stageRoot, workspaceRoot };
}

/** `<stage>/.claude/skills` for the given stage root. */
function skillsDir(s: string) {
  return join(s, ".claude", "skills");
}

/** `<stage>/.claude/settings.local.json` for the given stage root. */
function settingsPath(s: string) {
  return join(s, ".claude", "settings.local.json");
}
31
+
describe("installGuard / teardownGuard", () => {
  // Every case arms the guard with the same placeholder script path.
  const GUARD_SCRIPT = "/g/guard.ts";
  const readText = (file: string) => readFileSync(file, "utf8");
  const readJson = (file: string) => JSON.parse(readText(file));

  test("install writes an active marker, hook, and manifest", () => {
    const { stageRoot, workspaceRoot } = setup();
    installGuard({ stageRoot, workspaceRoot, guardScriptPath: GUARD_SCRIPT });

    const guardDir = skillsDir(stageRoot);

    const marker = readJson(join(guardDir, GUARD_MARKER));
    expect(marker.active).toBe(true);
    expect(Date.parse(marker.expiresAt)).toBeGreaterThan(Date.now());
    const listsWorkspace = marker.allowedRoots.some((r: string) =>
      r.includes("skills-workspace"),
    );
    expect(listsWorkspace).toBe(true);

    const [hookEntry] = readJson(settingsPath(stageRoot)).hooks.PreToolUse;
    expect(hookEntry.matcher).toContain("Write");
    expect(hookEntry.hooks[0].command).toContain("guard.ts");

    expect(existsSync(join(guardDir, GUARD_MANIFEST))).toBe(true);
  });

  test("teardown deletes settings.local.json it created", () => {
    const { stageRoot, workspaceRoot } = setup();
    const settingsFile = settingsPath(stageRoot);
    const guardDir = skillsDir(stageRoot);

    installGuard({ stageRoot, workspaceRoot, guardScriptPath: GUARD_SCRIPT });
    expect(existsSync(settingsFile)).toBe(true);

    expect(teardownGuard(stageRoot)).toBe(true);
    expect(existsSync(settingsFile)).toBe(false);
    expect(existsSync(join(guardDir, GUARD_MARKER))).toBe(false);
    expect(existsSync(join(guardDir, GUARD_MANIFEST))).toBe(false);
  });

  test("teardown restores a pre-existing settings.local.json verbatim", () => {
    const { stageRoot, workspaceRoot } = setup();
    const settingsFile = settingsPath(stageRoot);
    mkdirSync(join(stageRoot, ".claude"), { recursive: true });
    const userSettings = { permissions: { allow: ["Bash(ls)"] } };
    const original = `${JSON.stringify(userSettings, null, 2)}\n`;
    writeFileSync(settingsFile, original);

    installGuard({ stageRoot, workspaceRoot, guardScriptPath: GUARD_SCRIPT });
    // hook present while armed
    expect(readText(settingsFile)).toContain("PreToolUse");

    teardownGuard(stageRoot);
    expect(readText(settingsFile)).toBe(original);
  });

  test("teardown is a safe no-op when nothing is installed", () => {
    const { stageRoot } = setup();
    expect(teardownGuard(stageRoot)).toBe(false);
  });

  test("teardown sweeps a stray marker even without a manifest", () => {
    const { stageRoot } = setup();
    const strayMarker = join(skillsDir(stageRoot), GUARD_MARKER);
    mkdirSync(skillsDir(stageRoot), { recursive: true });
    writeFileSync(strayMarker, "{}");

    expect(teardownGuard(stageRoot)).toBe(true);
    expect(existsSync(strayMarker)).toBe(false);
  });
});
@@ -0,0 +1,147 @@
1
+ import {
2
+ existsSync,
3
+ mkdirSync,
4
+ readFileSync,
5
+ rmSync,
6
+ writeFileSync,
7
+ } from "node:fs";
8
+ import { tmpdir } from "node:os";
9
+ import { join, resolve } from "node:path";
10
+
/** File name of the marker that arms the guard, stored under `.claude/skills`. */
export const GUARD_MARKER = ".slow-powers-eval-guard.json";
/** File name of the manifest that records how to undo an install. */
export const GUARD_MANIFEST = ".slow-powers-eval-guard-manifest.json";
// 6h — bounds a crashed run's lingering hook
const GUARD_TTL_MS = 6 * 3_600_000;

// Tool names the PreToolUse hook is attached to, joined into one matcher string.
const HOOK_MATCHER = ["Write", "Edit", "MultiEdit", "NotebookEdit", "Bash"].join(
  "|",
);

/** What an install records so that teardown can undo it. */
interface GuardManifest {
  /** ISO timestamp of the install. */
  created_at: string;
  /** Path of the settings file the hook was merged into. */
  settings_path: string;
  /** Whether that settings file existed before the install. */
  settings_existed: boolean;
  /** Verbatim pre-install contents of the settings file, or null if there was none. */
  settings_backup: string | null;
  /** Path of the marker file written by the install. */
  marker_path: string;
}

/** The subset of `settings.local.json` this module reads and writes. */
interface Settings {
  hooks?: {
    PreToolUse?: Array<{ matcher?: string; hooks?: unknown[] }>;
    [k: string]: unknown;
  };
  [k: string]: unknown;
}
32
+
/**
 * Parse the pre-install settings text into a plain object.
 * Anything that is empty, unparsable, or not a JSON object yields `{}`.
 */
function parseSettingsBackup(raw: string | null): Settings {
  if (!raw) return {};
  try {
    const parsed: unknown = JSON.parse(raw);
    const isObject =
      parsed !== null && typeof parsed === "object" && !Array.isArray(parsed);
    return isObject ? (parsed as Settings) : {};
  } catch {
    // Deliberate best-effort: a malformed settings file must not stop the guard
    // from arming. The verbatim backup still restores the file on teardown.
    return {};
  }
}

/**
 * Arm the Claude Code write guard for an eval run. Writes a marker listing the
 * allowed roots and merges a `PreToolUse` hook into `.claude/settings.local.json`
 * that runs the guard script on every tool call matched by `HOOK_MATCHER`. The
 * original settings file is backed up verbatim in a manifest so
 * {@link teardownGuard} restores it exactly.
 *
 * A guard left over from an earlier install (crashed run, repeated call) is torn
 * down first, so the backup always captures the settings as they were without
 * our hook and hooks never accumulate.
 *
 * The guard is a no-op until this marker exists and is unexpired (see
 * guard/policy.ts), so the hook is inert outside an active run.
 *
 * @param opts.stageRoot directory containing the `.claude` folder to arm
 * @param opts.workspaceRoot directory added to the allowed roots
 * @param opts.guardScriptPath script the hook command runs via `bun run`
 * @param opts.ttlMs marker lifetime in milliseconds; defaults to `GUARD_TTL_MS`
 * @returns the path of the marker file
 */
export function installGuard(opts: {
  stageRoot: string;
  workspaceRoot: string;
  guardScriptPath: string;
  ttlMs?: number;
}): string {
  // Undo any previous install before reading settings; otherwise the hooked
  // file would be recorded as the "original" and the real one would be lost.
  teardownGuard(opts.stageRoot);

  const skillsDir = join(opts.stageRoot, ".claude", "skills");
  mkdirSync(skillsDir, { recursive: true });

  const markerPath = join(skillsDir, GUARD_MARKER);
  const allowedRoots = [
    resolve(opts.workspaceRoot),
    resolve(skillsDir),
    resolve(tmpdir()),
  ];
  const expiresAt = new Date(
    Date.now() + (opts.ttlMs ?? GUARD_TTL_MS),
  ).toISOString();
  writeFileSync(
    markerPath,
    `${JSON.stringify({ active: true, allowedRoots, expiresAt }, null, 2)}\n`,
  );

  const settingsPath = join(opts.stageRoot, ".claude", "settings.local.json");
  const settingsExisted = existsSync(settingsPath);
  const backup = settingsExisted ? readFileSync(settingsPath, "utf8") : null;

  // Persist the backup BEFORE mutating settings: if the process dies between
  // the two writes, teardown can still restore the original file.
  const manifest: GuardManifest = {
    created_at: new Date().toISOString(),
    settings_path: settingsPath,
    settings_existed: settingsExisted,
    settings_backup: backup,
    marker_path: markerPath,
  };
  writeFileSync(
    join(skillsDir, GUARD_MANIFEST),
    `${JSON.stringify(manifest, null, 2)}\n`,
  );

  const settings = parseSettingsBackup(backup);
  // Tolerate a malformed `hooks` / `PreToolUse` value instead of throwing on it;
  // the user's exact text comes back from the backup on teardown.
  const existingHooks = settings.hooks;
  const hooks: NonNullable<Settings["hooks"]> =
    existingHooks !== null &&
    typeof existingHooks === "object" &&
    !Array.isArray(existingHooks)
      ? existingHooks
      : {};
  const preToolUse = Array.isArray(hooks.PreToolUse) ? hooks.PreToolUse : [];
  preToolUse.push({
    matcher: HOOK_MATCHER,
    hooks: [
      {
        type: "command",
        command: `bun run "${opts.guardScriptPath}" "${markerPath}"`,
      },
    ],
  });
  hooks.PreToolUse = preToolUse;
  settings.hooks = hooks;
  writeFileSync(settingsPath, `${JSON.stringify(settings, null, 2)}\n`);

  return markerPath;
}
110
+
/**
 * Disarm the guard: restore the original `settings.local.json` (or delete it if
 * we created it) and remove the marker + manifest. Safe to call when no guard is
 * installed.
 *
 * A manifest that cannot be parsed, or that parses but lacks the expected
 * fields, never blocks disarming: the marker and manifest are still removed and
 * the settings file is left untouched when there is nothing trustworthy to
 * restore it from.
 *
 * @param stageRoot directory containing the `.claude` folder that was armed
 * @returns true if a guard (or a stray marker) was found and torn down
 */
export function teardownGuard(stageRoot: string): boolean {
  const skillsDir = join(stageRoot, ".claude", "skills");
  const manifestPath = join(skillsDir, GUARD_MANIFEST);
  const markerPath = join(skillsDir, GUARD_MARKER);

  if (!existsSync(manifestPath)) {
    // No manifest — still sweep a stray marker so the guard can't stay armed.
    if (!existsSync(markerPath)) return false;
    rmSync(markerPath, { force: true });
    return true;
  }

  let manifest: Partial<GuardManifest> | null = null;
  try {
    const parsed: unknown = JSON.parse(readFileSync(manifestPath, "utf8"));
    if (
      parsed !== null &&
      typeof parsed === "object" &&
      !Array.isArray(parsed)
    ) {
      manifest = parsed as Partial<GuardManifest>;
    }
  } catch {
    // Unreadable or corrupt manifest: nothing to restore from, so fall through
    // and only remove the marker and manifest.
  }

  const settingsFile = manifest?.settings_path;
  if (manifest && typeof settingsFile === "string") {
    if (
      manifest.settings_existed === true &&
      typeof manifest.settings_backup === "string"
    ) {
      writeFileSync(settingsFile, manifest.settings_backup);
    } else if (
      manifest.settings_existed === false &&
      existsSync(settingsFile)
    ) {
      // We created the file, so removing it returns to the original state.
      rmSync(settingsFile, { force: true });
    }
    // Otherwise the file pre-existed but no backup was recorded: leave it alone
    // rather than delete a file we did not create.
  }

  // Remove the recorded marker and the default one, so a manifest with a
  // missing or unexpected marker path cannot leave the guard armed.
  const recordedMarker = manifest?.marker_path;
  if (typeof recordedMarker === "string") {
    rmSync(recordedMarker, { force: true });
  }
  rmSync(markerPath, { force: true });
  rmSync(manifestPath, { force: true });
  return true;
}
@@ -0,0 +1,71 @@
1
+ import { describe, expect, test } from "bun:test";
2
+ import { decide, type GuardMarker } from "./policy";
3
+
// Allowed roots shared by every marker built in this file.
const ROOTS = ["/work/skills-workspace", "/work/.claude/skills"];

const ONE_MINUTE_MS = 60_000;
// ISO timestamp offset from the current time by `offsetMs`.
const isoFromNow = (offsetMs: number) =>
  new Date(Date.now() + offsetMs).toISOString();
const future = () => isoFromNow(ONE_MINUTE_MS);
const past = () => isoFromNow(-ONE_MINUTE_MS);

/**
 * Build an active, unexpired marker over ROOTS; any field in `over` replaces
 * the corresponding default.
 */
function marker(over: Partial<GuardMarker> = {}): GuardMarker {
  const defaults: GuardMarker = {
    active: true,
    allowedRoots: ROOTS,
    expiresAt: future(),
  };
  return { ...defaults, ...over };
}
11
+
describe("guard decide", () => {
  // A write aimed well outside any allowed root.
  const OUTSIDE_WRITE = { file_path: "/etc/passwd" };

  test("allows everything when marker is null (guard inactive)", () => {
    const decision = decide("Write", OUTSIDE_WRITE, null);
    expect(decision.allow).toBe(true);
  });

  test("allows everything when marker is inactive or expired", () => {
    const inactive = decide("Write", OUTSIDE_WRITE, marker({ active: false }));
    expect(inactive.allow).toBe(true);

    const expired = decide("Write", OUTSIDE_WRITE, marker({ expiresAt: past() }));
    expect(expired.allow).toBe(true);
  });

  test("allows a write under an allowed root", () => {
    const input = { file_path: "/work/skills-workspace/x/outputs/a.md" };
    expect(decide("Write", input, marker()).allow).toBe(true);
  });

  test("denies a write outside all allowed roots", () => {
    const d = decide("Edit", { file_path: "/work/runner/run.ts" }, marker());
    expect(d.allow).toBe(false);
    expect(d.reason).toMatch(/outside/i);
  });

  test("denies an install command", () => {
    const d = decide("Bash", { command: "npm install left-pad" }, marker());
    expect(d.allow).toBe(false);
    expect(d.reason).toMatch(/install/i);
  });

  test("allows a Bash command scoped to an allowed root", () => {
    const input = { command: "echo hi > /work/skills-workspace/x/outputs/log" };
    expect(decide("Bash", input, marker()).allow).toBe(true);
  });

  test("allows non-mutating Bash and read tools", () => {
    const listing = decide("Bash", { command: "ls -la /" }, marker());
    expect(listing.allow).toBe(true);

    const read = decide("Read", OUTSIDE_WRITE, marker());
    expect(read.allow).toBe(true);
  });
});
@@ -0,0 +1,74 @@
1
+ import {
2
+ classifyBash,
3
+ isUnderAny,
4
+ pathArg,
5
+ WRITE_TOOLS,
6
+ } from "../sandbox-policy";
7
+
/**
 * Shape of the marker file
 * (`<stageRoot>/.claude/skills/.slow-powers-eval-guard.json`) that arms the
 * guard. Without an existing, active, unexpired marker the guard does nothing —
 * so a crashed run that never tore the hook down can't silently block writes in
 * the user's next interactive session.
 */
export interface GuardMarker {
  active?: boolean;
  allowedRoots?: string[];
  expiresAt?: string;
}

/** Outcome of a guard check; `reason` is set on the denials built in this file. */
export interface GuardDecision {
  allow: boolean;
  reason?: string;
}

// Shared "allowed" result returned by every permitting path.
const ALLOW: GuardDecision = { allow: true };
23
+
/**
 * Report whether the guard is currently armed.
 *
 * The marker must exist and have `active === true`. If it carries an
 * `expiresAt`, that timestamp must parse and lie strictly after `now`; an
 * unparsable value counts as expired, because `Date.parse` returns NaN for it
 * and a NaN comparison would otherwise keep the guard armed forever. A marker
 * with no `expiresAt` is armed with no time limit.
 *
 * @param marker parsed marker contents, or null when there is no marker
 * @param now current time in epoch milliseconds
 */
function armed(marker: GuardMarker | null, now: number): boolean {
  if (marker?.active !== true) return false;
  if (!marker.expiresAt) return true;
  const expiry = Date.parse(marker.expiresAt);
  return !Number.isNaN(expiry) && expiry > now;
}
29
+
/**
 * Decide whether a tool call should be allowed while the eval guard is armed.
 *
 * - Guard not armed: every call is allowed.
 * - Write tools (members of `WRITE_TOOLS`): denied when the call names a path
 *   that `isUnderAny` does not place under an allowed root; allowed otherwise,
 *   including when no path can be extracted from the input.
 * - Bash: denied when `classifyBash` returns a reason for the command.
 * - Anything else is allowed.
 *
 * @param toolName name of the tool being invoked
 * @param toolInput raw tool input; only its path / `command` field is read
 * @param marker parsed marker contents, or null when there is no marker
 * @param now current time in epoch milliseconds (defaults to `Date.now()`)
 */
export function decide(
  toolName: string,
  toolInput: unknown,
  marker: GuardMarker | null,
  now: number = Date.now(),
): GuardDecision {
  if (!armed(marker, now)) return ALLOW;

  const roots = marker?.allowedRoots ?? [];
  const repoRoot = process.cwd();

  if (WRITE_TOOLS.has(toolName)) {
    const target = pathArg(toolInput);
    if (!target) return ALLOW;
    if (isUnderAny(target, roots, repoRoot)) return ALLOW;
    const allowedList = roots.join(", ");
    return {
      allow: false,
      reason: `eval guard: ${toolName} to ${target} is outside the eval sandbox (allowed: ${allowedList})`,
    };
  }

  if (toolName !== "Bash") return ALLOW;

  // Missing or non-object input is classified as an empty command.
  let command = "";
  if (toolInput && typeof toolInput === "object") {
    command = String((toolInput as Record<string, unknown>).command ?? "");
  }

  const reason = classifyBash(command, roots);
  if (!reason) return ALLOW;
  return {
    allow: false,
    reason: `eval guard: blocked Bash (${reason}) — runs outside the eval sandbox`,
  };
}
@@ -0,0 +1,230 @@
1
+ import { afterAll, beforeAll, describe, expect, test } from "bun:test";
2
+ import {
3
+ existsSync,
4
+ mkdirSync,
5
+ readFileSync,
6
+ rmSync,
7
+ writeFileSync,
8
+ } from "node:fs";
9
+ import { tmpdir } from "node:os";
10
+ import { join } from "node:path";
11
+
// Scratch root for this test file; the process id is part of the name.
const FIXTURE_ROOT = join(tmpdir(), `slow-powers-promote-test-${process.pid}`);
// The script under test, resolved next to this test file.
const PROMOTE_TS = join(import.meta.dir, "promote-baseline.ts");

beforeAll(() => {
  mkdirSync(FIXTURE_ROOT, { recursive: true });
});

afterAll(() => {
  rmSync(FIXTURE_ROOT, { recursive: true, force: true });
});

/** Write `value` to `path` as 2-space-indented JSON followed by a newline. */
function writeJson(path: string, value: unknown) {
  const serialized = JSON.stringify(value, null, 2);
  writeFileSync(path, `${serialized}\n`);
}
26
+
describe("promote-baseline.ts (--skill-dir, isolated CWD)", () => {
  const SKILL = "mr-review";

  /**
   * Create `<root>/skill-dir/mr-review/SKILL.md`. The original fixture notes
   * that detectRunContext validates SKILL.md exists.
   */
  function makeSkill(root: string) {
    const skillDir = join(root, "skill-dir");
    const skillSub = join(skillDir, SKILL);
    mkdirSync(skillSub, { recursive: true });
    writeFileSync(
      join(skillSub, "SKILL.md"),
      "---\nname: mr-review\ndescription: review MRs\n---\n\nbody\n",
    );
    return { skillDir, skillSub };
  }

  /**
   * Create `<cwd>/skills-workspace/mr-review/iteration-<n>` holding
   * conditions.json and benchmark.json (mirrors workspaceRoot =
   * <cwd>/skills-workspace). Returns the iteration directory.
   */
  function makeIteration(
    cwd: string,
    skillSub: string,
    iteration: number,
    timestamp: string,
    benchmark: unknown,
  ) {
    const iterationDir = join(
      cwd,
      "skills-workspace",
      SKILL,
      `iteration-${iteration}`,
    );
    mkdirSync(iterationDir, { recursive: true });
    writeJson(join(iterationDir, "conditions.json"), {
      mode: "new-skill",
      conditions: [
        { name: "with_skill", skill_path: join(skillSub, "SKILL.md") },
        { name: "without_skill", skill_path: null },
      ],
      timestamp,
      harness: "claude-code",
    });
    writeJson(join(iterationDir, "benchmark.json"), benchmark);
    return iterationDir;
  }

  /** Run promote-baseline.ts from `cwd` for the given iteration. */
  function promote(
    cwd: string,
    skillDir: string,
    iteration: string,
    extraArgs: string[] = [],
  ) {
    return Bun.spawnSync(
      [
        "bun",
        "run",
        PROMOTE_TS,
        "--skill-dir",
        skillDir,
        "--skill",
        SKILL,
        "--iteration",
        iteration,
        ...extraArgs,
      ],
      { cwd, stdout: "pipe", stderr: "pipe" },
    );
  }

  test("copies benchmark + per-run gradings into the skill's committed baseline/", () => {
    const root = join(FIXTURE_ROOT, "promote-basic");
    const { skillDir, skillSub } = makeSkill(root);
    const cwd = join(root, "work");
    const timestamp = "2026-05-27T00:00:00.000Z";
    const iterationDir = makeIteration(cwd, skillSub, 2, timestamp, {
      run_summary: {
        with_skill: { pass_rate: { mean: 0.83 } },
        without_skill: { pass_rate: { mean: 0.33 } },
      },
      delta: { pass_rate: 0.5 },
    });

    const mkGrading = (evalId: string, cond: string, passRate: number) => {
      const condDir = join(iterationDir, `eval-${evalId}`, cond);
      mkdirSync(condDir, { recursive: true });
      const passed = passRate > 0;
      writeJson(join(condDir, "grading.json"), {
        assertion_results: [
          {
            id: "a1",
            passed,
            evidence: `${cond} evidence`,
            confidence: 1,
          },
        ],
        // Keep the counts consistent with the single assertion above.
        summary: {
          passed: passed ? 1 : 0,
          failed: passed ? 0 : 1,
          total: 1,
          pass_rate: passRate,
        },
      });
    };
    mkGrading("e1", "with_skill", 1);
    mkGrading("e1", "without_skill", 0);

    const res = promote(cwd, skillDir, "2");
    // Check stderr first so a failure shows the script's own error text.
    expect(res.stderr.toString()).toBe("");
    expect(res.exitCode).toBe(0);

    const baselineDir = join(skillSub, "evals", "baseline");

    // benchmark.json copied verbatim.
    const benchmarkPath = join(baselineDir, "benchmark.json");
    expect(existsSync(benchmarkPath)).toBe(true);
    const benchmark = JSON.parse(readFileSync(benchmarkPath, "utf8")) as {
      delta: { pass_rate: number };
    };
    expect(benchmark.delta.pass_rate).toBe(0.5);

    // Per-run gradings copied under grading/<eval-id>__<condition>.json.
    const withGrading = join(baselineDir, "grading", "e1__with_skill.json");
    const withoutGrading = join(
      baselineDir,
      "grading",
      "e1__without_skill.json",
    );
    expect(existsSync(withGrading)).toBe(true);
    expect(existsSync(withoutGrading)).toBe(true);
    const withParsed = JSON.parse(readFileSync(withGrading, "utf8")) as {
      summary: { pass_rate: number };
    };
    expect(withParsed.summary.pass_rate).toBe(1);

    // Provenance file records mode, iteration, harness, timestamp.
    const provenancePath = join(baselineDir, "BASELINE.md");
    expect(existsSync(provenancePath)).toBe(true);
    const provenance = readFileSync(provenancePath, "utf8");
    expect(provenance).toContain("new-skill");
    expect(provenance).toContain("iteration-2");
    expect(provenance).toContain("claude-code");
    expect(provenance).toContain(timestamp);
    // Model rows default to "unspecified" when no flags are passed.
    expect(provenance).toContain("Agent model | unspecified");
    expect(provenance).toContain("Judge model | unspecified");
  });

  test("records agent and judge models in provenance when flags are passed", () => {
    const root = join(FIXTURE_ROOT, "promote-models");
    const { skillDir, skillSub } = makeSkill(root);
    const cwd = join(root, "work");
    makeIteration(cwd, skillSub, 1, "2026-05-27T00:00:00.000Z", {
      delta: { pass_rate: 0 },
    });

    const res = promote(cwd, skillDir, "1", [
      "--agent-model",
      "claude-haiku-4-5-20251001",
      "--judge-model",
      "claude-opus-4-7",
    ]);
    expect(res.stderr.toString()).toBe("");
    expect(res.exitCode).toBe(0);

    const provenance = readFileSync(
      join(skillSub, "evals", "baseline", "BASELINE.md"),
      "utf8",
    );
    expect(provenance).toContain("Agent model | claude-haiku-4-5-20251001");
    expect(provenance).toContain("Judge model | claude-opus-4-7");
  });

  test("fails clearly when the iteration directory is missing", () => {
    const root = join(FIXTURE_ROOT, "promote-missing");
    const { skillDir } = makeSkill(root);
    // The spawn needs an existing cwd even though no workspace is created.
    const cwd = join(root, "work");
    mkdirSync(cwd, { recursive: true });

    const res = promote(cwd, skillDir, "9");
    expect(res.exitCode).not.toBe(0);
    expect(res.stderr.toString()).toContain("iteration-9");
  });
});