@slowdini/slow-powers-opencode 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/LICENSE +22 -0
  2. package/README.md +174 -0
  3. package/bootstrap.md +16 -0
  4. package/opencode/plugins/slow-powers.js +86 -0
  5. package/package.json +66 -0
  6. package/skills/auditing-slow-powers-usage/SKILL.md +157 -0
  7. package/skills/auditing-slow-powers-usage/evals/baseline/BASELINE.md +22 -0
  8. package/skills/auditing-slow-powers-usage/evals/baseline/NOTES.md +72 -0
  9. package/skills/auditing-slow-powers-usage/evals/baseline/benchmark.json +53 -0
  10. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__with_skill.json +53 -0
  11. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-blindspot-session__without_skill.json +38 -0
  12. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__with_skill.json +53 -0
  13. package/skills/auditing-slow-powers-usage/evals/baseline/grading/audits-completed-session__without_skill.json +38 -0
  14. package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__with_skill.json +17 -0
  15. package/skills/auditing-slow-powers-usage/evals/baseline/grading/ordinary-dev-task-no-audit__without_skill.json +17 -0
  16. package/skills/auditing-slow-powers-usage/evals/evals.json +74 -0
  17. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-blindspot-session/session-summary.md +39 -0
  18. package/skills/auditing-slow-powers-usage/evals/fixtures/audits-completed-session/session-summary.md +33 -0
  19. package/skills/evaluating-skills/SKILL.md +448 -0
  20. package/skills/evaluating-skills/evals/evals.json +52 -0
  21. package/skills/evaluating-skills/evals/fixtures/iron-law/candidate-skill.md +13 -0
  22. package/skills/evaluating-skills/examples/verification-before-completion-evals.json +30 -0
  23. package/skills/evaluating-skills/harness-details/claude.md +135 -0
  24. package/skills/evaluating-skills/pressure-scenarios.md +163 -0
  25. package/skills/evaluating-skills/runner/README.md +140 -0
  26. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +263 -0
  27. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +146 -0
  28. package/skills/evaluating-skills/runner/aggregate.test.ts +188 -0
  29. package/skills/evaluating-skills/runner/aggregate.ts +228 -0
  30. package/skills/evaluating-skills/runner/context.test.ts +181 -0
  31. package/skills/evaluating-skills/runner/context.ts +90 -0
  32. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +103 -0
  33. package/skills/evaluating-skills/runner/detect-stray-writes.ts +192 -0
  34. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +73 -0
  35. package/skills/evaluating-skills/runner/fill-transcripts.ts +154 -0
  36. package/skills/evaluating-skills/runner/grade.test.ts +347 -0
  37. package/skills/evaluating-skills/runner/grade.ts +603 -0
  38. package/skills/evaluating-skills/runner/guard/guard.ts +49 -0
  39. package/skills/evaluating-skills/runner/guard/install.test.ts +92 -0
  40. package/skills/evaluating-skills/runner/guard/install.ts +147 -0
  41. package/skills/evaluating-skills/runner/guard/policy.test.ts +71 -0
  42. package/skills/evaluating-skills/runner/guard/policy.ts +74 -0
  43. package/skills/evaluating-skills/runner/promote-baseline.test.ts +230 -0
  44. package/skills/evaluating-skills/runner/promote-baseline.ts +186 -0
  45. package/skills/evaluating-skills/runner/run.test.ts +716 -0
  46. package/skills/evaluating-skills/runner/run.ts +814 -0
  47. package/skills/evaluating-skills/runner/sandbox-policy.ts +74 -0
  48. package/skills/evaluating-skills/runner/types.ts +104 -0
  49. package/skills/evaluating-skills/runner/validate-all.ts +54 -0
  50. package/skills/evaluating-skills/runner/validate-schema.test.ts +99 -0
  51. package/skills/evaluating-skills/runner/validate-schema.ts +51 -0
  52. package/skills/evaluating-skills/runner/validate.test.ts +56 -0
  53. package/skills/evaluating-skills/runner/validate.ts +21 -0
  54. package/skills/evaluating-skills/schema/evals.schema.json +105 -0
  55. package/skills/evaluating-skills/schema/grading.schema.json +84 -0
  56. package/skills/evaluating-skills/schema/run-record.schema.json +80 -0
  57. package/skills/evaluating-skills/schema/stray-writes.schema.json +68 -0
  58. package/skills/evaluating-skills/templates/eval-task-prompt.md +71 -0
  59. package/skills/evaluating-skills/templates/evals.json.example +17 -0
  60. package/skills/evaluating-skills/templates/judge-prompt.md +56 -0
  61. package/skills/evaluating-skills/templates/revise-skill-prompt.md +56 -0
  62. package/skills/finishing-a-development-branch/SKILL.md +96 -0
  63. package/skills/finishing-a-development-branch/evals/evals.json +41 -0
  64. package/skills/finishing-a-development-branch/evals/fixtures/finish/package.json +4 -0
  65. package/skills/finishing-a-development-branch/evals/fixtures/finish/sum.test.ts +5 -0
  66. package/skills/hardening-plans/SKILL.md +72 -0
  67. package/skills/hardening-plans/evals/baseline/BASELINE.md +22 -0
  68. package/skills/hardening-plans/evals/baseline/NOTES.md +58 -0
  69. package/skills/hardening-plans/evals/baseline/benchmark.json +54 -0
  70. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +39 -0
  71. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +39 -0
  72. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__new_skill.json +24 -0
  73. package/skills/hardening-plans/evals/baseline/grading/csv-parser-bug-no-plan__old_skill.json +24 -0
  74. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +46 -0
  75. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +46 -0
  76. package/skills/hardening-plans/evals/evals.json +114 -0
  77. package/skills/systematic-debugging/CREATION-LOG.md +119 -0
  78. package/skills/systematic-debugging/SKILL.md +84 -0
  79. package/skills/systematic-debugging/condition-based-waiting-example.ts +164 -0
  80. package/skills/systematic-debugging/condition-based-waiting.md +115 -0
  81. package/skills/systematic-debugging/defense-in-depth.md +122 -0
  82. package/skills/systematic-debugging/evals/baseline/BASELINE.md +22 -0
  83. package/skills/systematic-debugging/evals/baseline/benchmark.json +51 -0
  84. package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__with_skill.json +17 -0
  85. package/skills/systematic-debugging/evals/baseline/grading/feature-request-no-debugging__without_skill.json +17 -0
  86. package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__with_skill.json +46 -0
  87. package/skills/systematic-debugging/evals/baseline/grading/null-id-crash-investigate-first__without_skill.json +31 -0
  88. package/skills/systematic-debugging/evals/evals.json +45 -0
  89. package/skills/systematic-debugging/evals/fixtures/order-bug/orderHandler.ts +9 -0
  90. package/skills/systematic-debugging/evals/fixtures/order-bug/repro.ts +10 -0
  91. package/skills/systematic-debugging/find-polluter.sh +63 -0
  92. package/skills/systematic-debugging/root-cause-tracing.md +169 -0
  93. package/skills/systematic-debugging/test-academic.md +14 -0
  94. package/skills/systematic-debugging/test-pressure-1.md +58 -0
  95. package/skills/systematic-debugging/test-pressure-2.md +68 -0
  96. package/skills/systematic-debugging/test-pressure-3.md +69 -0
  97. package/skills/test-driven-development/SKILL.md +93 -0
  98. package/skills/test-driven-development/evals/baseline/BASELINE.md +22 -0
  99. package/skills/test-driven-development/evals/baseline/NOTES.md +74 -0
  100. package/skills/test-driven-development/evals/baseline/benchmark.json +51 -0
  101. package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__with_skill.json +53 -0
  102. package/skills/test-driven-development/evals/baseline/grading/slugify-under-time-pressure__without_skill.json +38 -0
  103. package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__with_skill.json +32 -0
  104. package/skills/test-driven-development/evals/baseline/grading/tests-after-rubber-stamp__without_skill.json +17 -0
  105. package/skills/test-driven-development/evals/evals.json +77 -0
  106. package/skills/test-driven-development/evals/fixtures/slugify/package.json +4 -0
  107. package/skills/test-driven-development/evals/fixtures/slugify/utils.ts +7 -0
  108. package/skills/test-driven-development/testing-anti-patterns.md +299 -0
  109. package/skills/using-git-worktrees/SKILL.md +70 -0
  110. package/skills/using-git-worktrees/evals/evals.json +40 -0
  111. package/skills/verification-before-completion/SKILL.md +65 -0
  112. package/skills/verification-before-completion/evals/baseline/BASELINE.md +22 -0
  113. package/skills/verification-before-completion/evals/baseline/NOTES.md +75 -0
  114. package/skills/verification-before-completion/evals/baseline/benchmark.json +51 -0
  115. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__with_skill.json +39 -0
  116. package/skills/verification-before-completion/evals/baseline/grading/bug-fixed-without-reproducing__without_skill.json +24 -0
  117. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__with_skill.json +46 -0
  118. package/skills/verification-before-completion/evals/baseline/grading/build-implied-by-edit__without_skill.json +31 -0
  119. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__with_skill.json +46 -0
  120. package/skills/verification-before-completion/evals/baseline/grading/claim-without-running__without_skill.json +31 -0
  121. package/skills/verification-before-completion/evals/evals.json +77 -0
  122. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/api.ts +1 -0
  123. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/consumer.ts +3 -0
  124. package/skills/verification-before-completion/evals/fixtures/build-implied-by-edit/tsconfig.json +23 -0
  125. package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.test.ts +10 -0
  126. package/skills/verification-before-completion/evals/fixtures/claim-without-running/sum.ts +1 -0
  127. package/skills/writing-skills/SKILL.md +306 -0
  128. package/skills/writing-skills/evals/evals.json +40 -0
  129. package/skills/writing-skills/graphviz-conventions.dot +172 -0
  130. package/skills/writing-skills/persuasion-principles.md +187 -0
  131. package/skills/writing-skills/scripts/render-graphs.js +181 -0
@@ -0,0 +1,186 @@
1
+ #!/usr/bin/env bun
2
+ import {
3
+ copyFileSync,
4
+ existsSync,
5
+ mkdirSync,
6
+ readdirSync,
7
+ readFileSync,
8
+ writeFileSync,
9
+ } from "node:fs";
10
+ import { join } from "node:path";
11
+ import { detectRunContext } from "./context";
12
+ import type { ConditionsRecord } from "./types";
13
+
14
/**
 * Reports a fatal error and terminates the process.
 *
 * @param msg - Reason for the failure; written to stderr behind an `error: ` prefix.
 * @returns Never returns: the process exits with status code 1.
 */
function die(msg: string): never {
  const line = `error: ${msg}`;
  console.error(line);
  return process.exit(1);
}
18
+
19
/**
 * Makes sure `path` exists as a directory, creating any missing parents.
 *
 * `mkdirSync` with `recursive: true` already succeeds when the directory is
 * present, so no separate existence probe is made; that also removes the
 * window between "check" and "create". If `path` exists but is not a
 * directory, the underlying `mkdirSync` error is propagated instead of the
 * call returning as though the directory were usable.
 *
 * @param path - Directory to create.
 */
function ensureDir(path: string): void {
  mkdirSync(path, { recursive: true });
}
22
+
23
/**
 * Looks up the abbreviated hash of `HEAD` by running
 * `git rev-parse --short HEAD` in `cwd`.
 *
 * Best-effort: if spawning throws or git exits non-zero, the placeholder
 * `"unknown"` is returned rather than an error being raised.
 *
 * @param cwd - Working directory for the git invocation.
 * @returns The trimmed stdout of the git command, or `"unknown"`.
 */
function gitHead(cwd: string): string {
  let shortSha = "unknown";
  try {
    const proc = Bun.spawnSync(["git", "rev-parse", "--short", "HEAD"], {
      cwd,
      stdout: "pipe",
      stderr: "ignore",
    });
    const succeeded = proc.exitCode === 0;
    if (succeeded) {
      shortSha = proc.stdout.toString().trim();
    }
  } catch {
    // Spawning failed (e.g. git is not installed); keep the placeholder so
    // the caller can still record the rest of its provenance.
  }
  return shortSha;
}
36
+
37
/** Inputs accepted by `promoteBaseline`. */
export type PromoteOptions = {
  /**
   * Workspace root; the iteration is read from
   * `<workspaceRoot>/<skillName>/iteration-<iteration>`.
   */
  workspaceRoot: string;
  /** Skill name: workspace subdirectory and the title of `BASELINE.md`. */
  skillName: string;
  /** Skill directory; the baseline is written to `<skillSubdir>/evals/baseline`. */
  skillSubdir: string;
  /** Iteration identifier, i.e. the `<N>` in `iteration-<N>`. */
  iteration: string;
  /** Harness name recorded in the provenance table. */
  harness: string;
  /** Optional label recorded in the provenance table; `null` is shown as `(none)`. */
  label: string | null;
  /**
   * Model the operator declares was used for the agent. The runner never
   * dispatches the agent/judge itself, so it cannot observe this value; it is
   * recorded as given. `null` is shown as `unspecified`.
   */
  agentModel: string | null;
  /** Operator-declared judge model, recorded the same way as `agentModel`. */
  judgeModel: string | null;
  /** Directory in which git is run to resolve the `HEAD` commit for provenance. */
  gitCwd: string;
};
53
+
54
/**
 * Renders a value for use inside a single Markdown table cell. A literal `|`
 * would otherwise start a new cell and a line break would end the row, so
 * pipes are backslash-escaped and line breaks are collapsed to a space.
 */
function escapeTableCell(value: unknown): string {
  return String(value).replace(/\|/g, "\\|").replace(/\r?\n/g, " ");
}

/**
 * Copies the durable, reference-worthy subset of a workspace iteration into the
 * skill's version-controlled `evals/baseline/` directory: the aggregate
 * `benchmark.json`, every per-run `grading.json` (judge rationales), and a
 * generated `BASELINE.md` provenance file. Ephemeral scaffolding (dispatch
 * files, timing, full run records, produced outputs, transcripts) is
 * intentionally left behind in the gitignored workspace.
 *
 * The process is terminated via `die` when the iteration directory or its
 * `benchmark.json` is missing. `conditions.json` is optional; when absent (or
 * when it lacks a `conditions` array) the affected provenance fields are
 * recorded as `unknown`.
 *
 * @param opts - Locations and provenance metadata; see `PromoteOptions`.
 * @returns The baseline directory written to and the number of grading files copied.
 * @throws SyntaxError if `conditions.json` exists but is not valid JSON;
 *   file-system errors from the copy/write calls also propagate.
 */
export function promoteBaseline(opts: PromoteOptions): {
  baselineDir: string;
  gradingsCopied: number;
} {
  const iterationDir = join(
    opts.workspaceRoot,
    opts.skillName,
    `iteration-${opts.iteration}`,
  );
  if (!existsSync(iterationDir)) {
    die(
      `not found: ${iterationDir} (build/grade iteration-${opts.iteration} first)`,
    );
  }

  const benchmarkSrc = join(iterationDir, "benchmark.json");
  if (!existsSync(benchmarkSrc)) {
    die(
      `missing benchmark.json in iteration-${opts.iteration} — run 'evals:aggregate' before promoting`,
    );
  }

  // The parsed JSON is not validated against ConditionsRecord, so every read
  // from it below is defensive.
  const conditionsSrc = join(iterationDir, "conditions.json");
  const conditions: ConditionsRecord | null = existsSync(conditionsSrc)
    ? JSON.parse(readFileSync(conditionsSrc, "utf8"))
    : null;

  const baselineDir = join(opts.skillSubdir, "evals", "baseline");
  const gradingDir = join(baselineDir, "grading");
  ensureDir(gradingDir); // recursive, so this creates baselineDir as well

  copyFileSync(benchmarkSrc, join(baselineDir, "benchmark.json"));

  // Layout walked here: iteration-N/eval-<id>/<condition>/grading.json.
  // Each hit is flattened to grading/<id>__<condition>.json.
  let gradingsCopied = 0;
  for (const entry of readdirSync(iterationDir, { withFileTypes: true })) {
    if (!entry.isDirectory() || !entry.name.startsWith("eval-")) continue;
    const evalId = entry.name.slice("eval-".length);
    const evalDir = join(iterationDir, entry.name);
    for (const cond of readdirSync(evalDir, { withFileTypes: true })) {
      if (!cond.isDirectory()) continue;
      const gradingSrc = join(evalDir, cond.name, "grading.json");
      if (!existsSync(gradingSrc)) continue; // run not graded; nothing to keep
      copyFileSync(
        gradingSrc,
        join(gradingDir, `${evalId}__${cond.name}.json`),
      );
      gradingsCopied++;
    }
  }

  const head = gitHead(opts.gitCwd);
  const mode = conditions?.mode ?? "unknown";
  const timestamp = conditions?.timestamp ?? "unknown";
  // Tolerate a conditions.json that parsed but has no `conditions` array.
  const declared = conditions?.conditions;
  const conditionNames = Array.isArray(declared)
    ? declared.map((c) => c.name)
    : [];

  // Values come from CLI flags and an unvalidated JSON file; escape them so a
  // stray `|` or newline cannot break the table.
  const row = (field: string, value: unknown): string =>
    `| ${field} | ${escapeTableCell(value)} |`;

  const provenance = [
    `# Baseline — ${opts.skillName}`,
    "",
    "Committed reference output from a canonical eval run. Regenerate with",
    "`bun run evals:promote-baseline -- --skill " +
      `${opts.skillName} --iteration <N>` +
      "` after aggregating. The ephemeral workspace (run records, timing,",
    "dispatch files, produced outputs) stays gitignored under `skills-workspace/`.",
    "",
    "| Field | Value |",
    "|-------|-------|",
    row("Mode", mode),
    row("Iteration", `iteration-${opts.iteration}`),
    row("Harness", opts.harness),
    row("Agent model", opts.agentModel ?? "unspecified"),
    row("Judge model", opts.judgeModel ?? "unspecified"),
    row("Conditions", conditionNames.join(", ") || "unknown"),
    row("Run timestamp", timestamp),
    row("Label", opts.label ?? "(none)"),
    row("Promoted from commit", head),
    "",
    "Files:",
    "- `benchmark.json` — aggregate pass-rate / duration / token deltas.",
    "- `grading/<eval-id>__<condition>.json` — per-run assertion results and judge rationales.",
    "",
  ].join("\n");
  writeFileSync(join(baselineDir, "BASELINE.md"), `${provenance}\n`);

  return { baselineDir, gradingsCopied };
}
146
+
147
// CLI entry point, guarded by `import.meta.main` so that importing this module
// for `promoteBaseline` does not run the CLI.
if (import.meta.main) {
  const argv = Bun.argv.slice(2);

  /**
   * Returns the token that follows `flag`, or `null` when the flag is absent,
   * is the last argument, or is immediately followed by another `--option`
   * (so a forgotten value does not swallow the next flag's name).
   */
  const flagValue = (flag: string): string | null => {
    const at = argv.indexOf(flag);
    if (at === -1) return null;
    const value = argv[at + 1];
    return value === undefined || value.startsWith("--") ? null : value;
  };

  let ctx: ReturnType<typeof detectRunContext>;
  try {
    ctx = detectRunContext(argv);
  } catch (err) {
    die(err instanceof Error ? err.message : String(err));
  }

  // --iteration is the only required flag handled here.
  const iteration = flagValue("--iteration");
  if (!iteration) die("missing --iteration <N>");

  // Optional provenance flags; each is recorded as null when not supplied.
  const label = flagValue("--label");
  const agentModel = flagValue("--agent-model");
  const judgeModel = flagValue("--judge-model");

  const { baselineDir, gradingsCopied } = promoteBaseline({
    workspaceRoot: ctx.workspaceRoot,
    skillName: ctx.skillName,
    skillSubdir: ctx.skillSubdir,
    iteration,
    harness: ctx.harness,
    label,
    agentModel,
    judgeModel,
    // Resolve HEAD from the skill's directory.
    gitCwd: ctx.skillSubdir,
  });

  console.log(
    `Promoted baseline for ${ctx.skillName} → ${baselineDir} (benchmark.json + ${gradingsCopied} grading file${gradingsCopied === 1 ? "" : "s"} + BASELINE.md)`,
  );
}