@slowdini/slow-powers-opencode 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. package/README.md +34 -72
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +69 -5
  4. package/package.json +14 -17
  5. package/skills/evaluating-skills/SKILL.md +90 -338
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/hardening-plans/SKILL.md +29 -7
  16. package/skills/hardening-plans/evals/baseline/BASELINE.md +11 -6
  17. package/skills/hardening-plans/evals/baseline/NOTES.md +72 -58
  18. package/skills/hardening-plans/evals/baseline/benchmark.json +25 -25
  19. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__new_skill.json +2 -2
  20. package/skills/hardening-plans/evals/baseline/grading/concrete-todo-app-plan__old_skill.json +2 -2
  21. package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__new_skill.json +39 -0
  22. package/skills/hardening-plans/evals/baseline/grading/docs-refactor-plan-mode__old_skill.json +39 -0
  23. package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__new_skill.json +39 -0
  24. package/skills/hardening-plans/evals/baseline/grading/oauth-task-breakdown-cold__old_skill.json +39 -0
  25. package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__new_skill.json +32 -0
  26. package/skills/hardening-plans/evals/baseline/grading/research-plan-no-required-skill__old_skill.json +32 -0
  27. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__new_skill.json +39 -0
  28. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app-adversarial__old_skill.json +39 -0
  29. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__new_skill.json +39 -0
  30. package/skills/hardening-plans/evals/baseline/grading/seeded-plan-mode-todo-app__old_skill.json +39 -0
  31. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__new_skill.json +3 -3
  32. package/skills/hardening-plans/evals/baseline/grading/seeded-review-catches-defects__old_skill.json +8 -8
  33. package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__new_skill.json +39 -0
  34. package/skills/hardening-plans/evals/baseline/grading/structural-refactor-cold__old_skill.json +39 -0
  35. package/skills/hardening-plans/evals/evals.json +46 -0
  36. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  37. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  38. package/skills/evaluating-skills/harness-details/claude.md +0 -194
  39. package/skills/evaluating-skills/harness-parity.md +0 -155
  40. package/skills/evaluating-skills/runner/README.md +0 -163
  41. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  42. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  43. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
  44. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
  45. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
  46. package/skills/evaluating-skills/runner/aggregate.ts +0 -269
  47. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  48. package/skills/evaluating-skills/runner/context.ts +0 -90
  49. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
  50. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
  51. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  52. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  53. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  54. package/skills/evaluating-skills/runner/grade.ts +0 -603
  55. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  56. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  57. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  58. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
  59. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  60. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  61. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  62. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  63. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
  64. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
  65. package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
  66. package/skills/evaluating-skills/runner/record-runs.ts +0 -209
  67. package/skills/evaluating-skills/runner/run.test.ts +0 -1703
  68. package/skills/evaluating-skills/runner/run.ts +0 -1388
  69. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
  70. package/skills/evaluating-skills/runner/types.ts +0 -121
  71. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  72. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  73. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  74. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  75. package/skills/evaluating-skills/runner/validate.ts +0 -21
  76. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
  77. package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
  78. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  79. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  80. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  81. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
  82. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
  83. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  84. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  85. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
@@ -1,242 +0,0 @@
1
- import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
2
- import { join } from "node:path";
3
- import type { ToolInvocation } from "../types";
4
-
5
- type ToolUseBlock = {
6
- type: "tool_use";
7
- id: string;
8
- name: string;
9
- input: unknown;
10
- };
11
-
12
- type ToolResultBlock = {
13
- type: "tool_result";
14
- tool_use_id: string;
15
- content: string | unknown[];
16
- };
17
-
18
- type TextBlock = {
19
- type: "text";
20
- text: string;
21
- };
22
-
23
- type ContentBlock =
24
- | ToolUseBlock
25
- | ToolResultBlock
26
- | TextBlock
27
- | { type: string };
28
-
29
- type UsageRecord = {
30
- input_tokens?: number;
31
- output_tokens?: number;
32
- cache_creation_input_tokens?: number;
33
- cache_read_input_tokens?: number;
34
- };
35
-
36
- type TranscriptRecord = {
37
- type: "user" | "assistant" | string;
38
- timestamp?: string;
39
- message?: {
40
- id?: string;
41
- role?: string;
42
- usage?: UsageRecord;
43
- content?: string | ContentBlock[];
44
- };
45
- };
46
-
47
- function flattenContent(
48
- content: string | ContentBlock[] | undefined,
49
- ): ContentBlock[] {
50
- if (!content) return [];
51
- if (typeof content === "string") return [];
52
- return content;
53
- }
54
-
55
- function stringifyResult(content: ToolResultBlock["content"]): string {
56
- if (typeof content === "string") return content;
57
- if (Array.isArray(content))
58
- return content
59
- .map((c) => {
60
- if (typeof c === "string") return c;
61
- if (c && typeof c === "object" && "text" in c)
62
- return String((c as { text: unknown }).text);
63
- return JSON.stringify(c);
64
- })
65
- .join("\n");
66
- return JSON.stringify(content);
67
- }
68
-
69
- function readRecords(jsonlPath: string): TranscriptRecord[] {
70
- const raw = readFileSync(jsonlPath, "utf8");
71
- const records: TranscriptRecord[] = [];
72
- for (const line of raw.split("\n")) {
73
- if (line.length === 0) continue;
74
- try {
75
- records.push(JSON.parse(line) as TranscriptRecord);
76
- } catch {
77
- // skip malformed lines
78
- }
79
- }
80
- return records;
81
- }
82
-
83
- function extractInvocations(records: TranscriptRecord[]): ToolInvocation[] {
84
- const invocations: ToolInvocation[] = [];
85
- const indexById = new Map<string, number>();
86
-
87
- for (const record of records) {
88
- const blocks = flattenContent(record.message?.content);
89
-
90
- if (record.type === "assistant") {
91
- for (const block of blocks) {
92
- if (block.type !== "tool_use") continue;
93
- const tu = block as ToolUseBlock;
94
- const ordinal = invocations.length;
95
- indexById.set(tu.id, ordinal);
96
- invocations.push({
97
- name: tu.name,
98
- args: tu.input,
99
- ordinal,
100
- });
101
- }
102
- continue;
103
- }
104
-
105
- if (record.type === "user") {
106
- for (const block of blocks) {
107
- if (block.type !== "tool_result") continue;
108
- const tr = block as ToolResultBlock;
109
- const idx = indexById.get(tr.tool_use_id);
110
- if (idx === undefined) continue;
111
- invocations[idx].result = stringifyResult(tr.content);
112
- }
113
- }
114
- }
115
-
116
- return invocations;
117
- }
118
-
119
- export function parseTranscript(jsonlPath: string): ToolInvocation[] {
120
- return extractInvocations(readRecords(jsonlPath));
121
- }
122
-
123
- export type TranscriptSummary = {
124
- tool_invocations: ToolInvocation[];
125
- /**
126
- * Sum of usage across unique API responses. One response spans multiple
127
- * jsonl lines (one per content block) and repeats the same `message.id` +
128
- * `usage` on each, so totals are deduped by `message.id`. Includes cache
129
- * creation/read tokens — a different accounting than the harness's task
130
- * completion event.
131
- */
132
- total_tokens: number | null;
133
- /** Wall clock between the first and last line timestamps. */
134
- duration_ms: number | null;
135
- /** Concatenated text blocks of the last assistant message. */
136
- final_text: string | null;
137
- };
138
-
139
- export function parseTranscriptFull(jsonlPath: string): TranscriptSummary {
140
- const records = readRecords(jsonlPath);
141
-
142
- const usageById = new Map<string, UsageRecord>();
143
- let firstTs: number | null = null;
144
- let lastTs: number | null = null;
145
- let timestampCount = 0;
146
- let finalText: string | null = null;
147
-
148
- for (const record of records) {
149
- if (record.timestamp) {
150
- const ts = Date.parse(record.timestamp);
151
- if (!Number.isNaN(ts)) {
152
- if (firstTs === null) firstTs = ts;
153
- lastTs = ts;
154
- timestampCount++;
155
- }
156
- }
157
-
158
- if (record.type !== "assistant") continue;
159
-
160
- const { id, usage } = record.message ?? {};
161
- if (id && usage) usageById.set(id, usage);
162
-
163
- const texts = flattenContent(record.message?.content)
164
- .filter((b): b is TextBlock => b.type === "text")
165
- .map((b) => b.text);
166
- if (texts.length > 0) finalText = texts.join("\n");
167
- }
168
-
169
- let totalTokens: number | null = null;
170
- if (usageById.size > 0) {
171
- totalTokens = 0;
172
- for (const usage of usageById.values()) {
173
- totalTokens +=
174
- (usage.input_tokens ?? 0) +
175
- (usage.output_tokens ?? 0) +
176
- (usage.cache_creation_input_tokens ?? 0) +
177
- (usage.cache_read_input_tokens ?? 0);
178
- }
179
- }
180
-
181
- return {
182
- tool_invocations: extractInvocations(records),
183
- total_tokens: totalTokens,
184
- duration_ms:
185
- timestampCount >= 2 && firstTs !== null && lastTs !== null
186
- ? lastTs - firstTs
187
- : null,
188
- final_text: finalText,
189
- };
190
- }
191
-
192
- export type SubagentMeta = {
193
- agentType?: string;
194
- description?: string;
195
- toolUseId?: string;
196
- };
197
-
198
- export type SubagentEntry = {
199
- jsonlPath: string;
200
- metaPath: string;
201
- meta: SubagentMeta;
202
- };
203
-
204
- export function listSubagents(subagentsDir: string): SubagentEntry[] {
205
- if (!existsSync(subagentsDir)) return [];
206
- const files = readdirSync(subagentsDir);
207
- const out: SubagentEntry[] = [];
208
- for (const f of files) {
209
- if (!f.endsWith(".meta.json")) continue;
210
- const base = f.slice(0, -".meta.json".length);
211
- const metaPath = join(subagentsDir, f);
212
- const jsonlPath = join(subagentsDir, `${base}.jsonl`);
213
- if (!existsSync(jsonlPath)) continue;
214
- try {
215
- const meta = JSON.parse(readFileSync(metaPath, "utf8")) as SubagentMeta;
216
- out.push({ jsonlPath, metaPath, meta });
217
- } catch {}
218
- }
219
- return out;
220
- }
221
-
222
- export function findByDescription(
223
- subagentsDir: string,
224
- description: string,
225
- ): SubagentEntry | null {
226
- const entries = listSubagents(subagentsDir);
227
- const matches = entries.filter((e) => e.meta.description === description);
228
- if (matches.length === 0) return null;
229
- if (matches.length === 1) return matches[0];
230
-
231
- // Descriptions are namespaced per iteration+run (see run.ts), so duplicates
232
- // here mean a retry within the same run. Prefer the most-recently-written
233
- // transcript; readdir order is not chronological.
234
- matches.sort((a, b) => {
235
- try {
236
- return statSync(b.jsonlPath).mtimeMs - statSync(a.jsonlPath).mtimeMs;
237
- } catch {
238
- return 0;
239
- }
240
- });
241
- return matches[0];
242
- }