@slowdini/slow-powers-opencode 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/README.md +34 -72
  2. package/bootstrap.md +1 -7
  3. package/opencode/plugins/slow-powers.js +1 -1
  4. package/package.json +14 -17
  5. package/skills/evaluating-skills/SKILL.md +90 -338
  6. package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
  7. package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
  8. package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
  9. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
  10. package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
  11. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
  12. package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
  13. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
  14. package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
  15. package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
  16. package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
  17. package/skills/evaluating-skills/harness-details/claude.md +0 -194
  18. package/skills/evaluating-skills/harness-parity.md +0 -155
  19. package/skills/evaluating-skills/runner/README.md +0 -163
  20. package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
  21. package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
  22. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
  23. package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
  24. package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
  25. package/skills/evaluating-skills/runner/aggregate.ts +0 -269
  26. package/skills/evaluating-skills/runner/context.test.ts +0 -181
  27. package/skills/evaluating-skills/runner/context.ts +0 -90
  28. package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
  29. package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
  30. package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
  31. package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
  32. package/skills/evaluating-skills/runner/grade.test.ts +0 -347
  33. package/skills/evaluating-skills/runner/grade.ts +0 -603
  34. package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
  35. package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
  36. package/skills/evaluating-skills/runner/guard/install.ts +0 -147
  37. package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
  38. package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
  39. package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
  40. package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
  41. package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
  42. package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
  43. package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
  44. package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
  45. package/skills/evaluating-skills/runner/record-runs.ts +0 -209
  46. package/skills/evaluating-skills/runner/run.test.ts +0 -1703
  47. package/skills/evaluating-skills/runner/run.ts +0 -1388
  48. package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
  49. package/skills/evaluating-skills/runner/types.ts +0 -121
  50. package/skills/evaluating-skills/runner/validate-all.ts +0 -54
  51. package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
  52. package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
  53. package/skills/evaluating-skills/runner/validate.test.ts +0 -56
  54. package/skills/evaluating-skills/runner/validate.ts +0 -21
  55. package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
  56. package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
  57. package/skills/evaluating-skills/schema/evals.schema.json +0 -105
  58. package/skills/evaluating-skills/schema/grading.schema.json +0 -84
  59. package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
  60. package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
  61. package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
  62. package/skills/evaluating-skills/templates/evals.json.example +0 -17
  63. package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
  64. package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
@@ -1,242 +0,0 @@
1
- import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
2
- import { join } from "node:path";
3
- import type { ToolInvocation } from "../types";
4
-
5
/** A `tool_use` content block: one tool call, identified by `id`. */
interface ToolUseBlock {
  type: "tool_use";
  id: string;
  name: string;
  input: unknown;
}

/** A `tool_result` content block; `tool_use_id` names the call it answers. */
interface ToolResultBlock {
  type: "tool_result";
  tool_use_id: string;
  /** Either plain text or a list of parts of unknown shape. */
  content: string | unknown[];
}

/** A `text` content block. */
interface TextBlock {
  type: "text";
  text: string;
}

/**
 * Any block found in a message's `content` array. The open `{ type: string }`
 * arm admits block kinds that are not modelled above.
 */
type ContentBlock = ToolUseBlock | ToolResultBlock | TextBlock | { type: string };
28
-
29
/** Token counters attached to a message; every counter is optional. */
interface UsageRecord {
  input_tokens?: number;
  output_tokens?: number;
  cache_creation_input_tokens?: number;
  cache_read_input_tokens?: number;
}

/**
 * One parsed line of a transcript `.jsonl` file. Only the fields this module
 * reads are declared.
 */
interface TranscriptRecord {
  /** `"user"` and `"assistant"` are the kinds this module acts on. */
  type: "user" | "assistant" | string;
  /** Timestamp string; consumers parse it with `Date.parse`. */
  timestamp?: string;
  message?: {
    id?: string;
    role?: string;
    usage?: UsageRecord;
    /** Either a plain string or an array of structured content blocks. */
    content?: string | ContentBlock[];
  };
}
46
-
47
/**
 * Normalises a message's `content` to a list of content blocks.
 *
 * @param content - Raw `content` value of a transcript message.
 * @returns The block array unchanged when `content` is an array; an empty
 *   list when `content` is missing, empty, or a plain string (string content
 *   carries no structured blocks).
 */
function flattenContent(
  content: string | ContentBlock[] | undefined,
): ContentBlock[] {
  return content && typeof content !== "string" ? content : [];
}
54
-
55
/**
 * Renders the `content` of a tool result as a single string.
 *
 * @param content - Either a plain string or a list of parts.
 * @returns The string itself, or the parts rendered one per line (joined with
 *   `"\n"`). Any other runtime value is JSON-serialised; a value JSON cannot
 *   represent (e.g. `undefined`) yields `""` so the declared `string` return
 *   type always holds.
 */
function stringifyResult(content: ToolResultBlock["content"]): string {
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) {
    // Unreachable per the declared type, but transcript data is untyped at
    // runtime. JSON.stringify returns undefined for unrepresentable values.
    return JSON.stringify(content) ?? "";
  }
  return content.map(stringifyResultPart).join("\n");
}

/** Renders one element of an array-valued tool result. */
function stringifyResultPart(part: unknown): string {
  if (typeof part === "string") return part;
  if (part && typeof part === "object" && "text" in part) {
    const text = (part as { text: unknown }).text;
    if (typeof text === "string") return text;
    // String() on an object would produce "[object Object]"; keep the data.
    if (text !== null && typeof text === "object") {
      return JSON.stringify(text) ?? "";
    }
    return String(text);
  }
  return JSON.stringify(part) ?? "";
}
68
-
69
/**
 * Reads a `.jsonl` transcript and returns one record per parseable line.
 *
 * Blank lines and lines that are not valid JSON are skipped. Lines that parse
 * to something other than a JSON object (`null`, numbers, strings, arrays) are
 * skipped too: callers dereference fields on every record, so a `null` entry
 * would throw downstream.
 *
 * @param jsonlPath - Path of the transcript file.
 * @returns The parsed object records, in file order.
 * @throws Whatever `readFileSync` throws when the file cannot be read.
 */
function readRecords(jsonlPath: string): TranscriptRecord[] {
  const raw = readFileSync(jsonlPath, "utf8");
  const records: TranscriptRecord[] = [];
  for (const line of raw.split("\n")) {
    // trim() also drops the "\r" left behind by CRLF line endings.
    const text = line.trim();
    if (text.length === 0) continue;
    let parsed: unknown;
    try {
      parsed = JSON.parse(text);
    } catch {
      // Best-effort parsing: a malformed line must not abort the whole file.
      continue;
    }
    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
      continue;
    }
    records.push(parsed as TranscriptRecord);
  }
  return records;
}
82
-
83
/**
 * Builds the ordered list of tool calls found in a transcript.
 *
 * Each `tool_use` block in an assistant record becomes one invocation whose
 * `ordinal` is its position in the returned list. Each `tool_result` block in
 * a user record is attached, as a string, to the invocation whose id matches
 * its `tool_use_id`; results with no matching call are ignored.
 *
 * @param records - Transcript records in file order.
 * @returns Invocations in the order their `tool_use` blocks were seen.
 */
function extractInvocations(records: TranscriptRecord[]): ToolInvocation[] {
  const calls: ToolInvocation[] = [];
  // tool_use id -> position of that call in `calls`.
  const slotByToolUseId = new Map<string, number>();

  for (const entry of records) {
    const blocks = flattenContent(entry.message?.content);

    switch (entry.type) {
      case "assistant":
        for (const block of blocks) {
          if (block.type === "tool_use") {
            const { id, name, input } = block as ToolUseBlock;
            const ordinal = calls.length;
            slotByToolUseId.set(id, ordinal);
            calls.push({ name, args: input, ordinal });
          }
        }
        break;

      case "user":
        for (const block of blocks) {
          if (block.type !== "tool_result") continue;
          const { tool_use_id, content } = block as ToolResultBlock;
          const slot = slotByToolUseId.get(tool_use_id);
          if (slot !== undefined) {
            calls[slot].result = stringifyResult(content);
          }
        }
        break;

      default:
        // Other record kinds carry no tool calls or results.
        break;
    }
  }

  return calls;
}
118
-
119
/**
 * Reads a transcript `.jsonl` file and returns the tool calls it contains.
 *
 * @param jsonlPath - Path of the transcript file.
 * @returns The tool invocations, in the order they appear in the file.
 */
export function parseTranscript(jsonlPath: string): ToolInvocation[] {
  const records = readRecords(jsonlPath);
  return extractInvocations(records);
}
122
-
123
/** Everything `parseTranscriptFull` derives from one transcript file. */
export type TranscriptSummary = {
  /** Tool calls in the order they appear in the transcript. */
  tool_invocations: ToolInvocation[];
  /**
   * Token usage summed over unique API responses, or `null` when no assistant
   * record carries both a `message.id` and a `usage`. A single response is
   * written as several jsonl lines (one per content block), each repeating
   * the same `message.id` and `usage`, so usage is counted once per
   * `message.id`. The sum covers input, output, cache-creation and cache-read
   * tokens, which is a different accounting than the harness's task
   * completion event.
   */
  total_tokens: number | null;
  /**
   * Wall-clock milliseconds between the first and last parseable line
   * timestamps; `null` when fewer than two lines have one.
   */
  duration_ms: number | null;
  /**
   * Text blocks, joined with newlines, of the last assistant record that
   * contains any text; `null` when there is none.
   */
  final_text: string | null;
};
138
-
139
/**
 * Reads a transcript `.jsonl` file and summarises it: tool calls, total token
 * usage, elapsed time and the final assistant text.
 *
 * @param jsonlPath - Path of the transcript file.
 * @returns The summary; each scalar field is `null` when the transcript does
 *   not contain the data needed to compute it.
 */
export function parseTranscriptFull(jsonlPath: string): TranscriptSummary {
  const records = readRecords(jsonlPath);

  let earliest: number | null = null;
  let latest: number | null = null;
  let stampedLines = 0;
  let lastAssistantText: string | null = null;
  // Keyed by message.id so a response spread over several lines counts once.
  const usageByMessageId = new Map<string, UsageRecord>();

  for (const record of records) {
    const when = record.timestamp ? Date.parse(record.timestamp) : Number.NaN;
    if (!Number.isNaN(when)) {
      earliest ??= when;
      latest = when;
      stampedLines += 1;
    }

    if (record.type === "assistant") {
      const message = record.message;
      if (message?.id && message.usage) {
        usageByMessageId.set(message.id, message.usage);
      }

      const pieces: string[] = [];
      for (const block of flattenContent(message?.content)) {
        if (block.type === "text") pieces.push((block as TextBlock).text);
      }
      // Records without text (e.g. tool-use only) leave the previous text in place.
      if (pieces.length > 0) lastAssistantText = pieces.join("\n");
    }
  }

  const totalTokens =
    usageByMessageId.size === 0
      ? null
      : [...usageByMessageId.values()].reduce(
          (sum, usage) =>
            sum +
            ((usage.input_tokens ?? 0) +
              (usage.output_tokens ?? 0) +
              (usage.cache_creation_input_tokens ?? 0) +
              (usage.cache_read_input_tokens ?? 0)),
          0,
        );

  // A duration needs two distinct timestamped lines.
  const durationMs =
    stampedLines >= 2 && earliest !== null && latest !== null
      ? latest - earliest
      : null;

  return {
    tool_invocations: extractInvocations(records),
    total_tokens: totalTokens,
    duration_ms: durationMs,
    final_text: lastAssistantText,
  };
}
191
-
192
/**
 * Contents of a subagent's `<name>.meta.json` file. Every field is optional.
 */
export type SubagentMeta = {
  agentType?: string;
  /** Label that `findByDescription` matches against. */
  description?: string;
  // NOTE(review): presumably the id of the tool call that spawned the
  // subagent; nothing in this module reads it, so confirm against the writer.
  toolUseId?: string;
};

/** A subagent transcript paired with its parsed metadata. */
export type SubagentEntry = {
  /** Path of the `<name>.jsonl` transcript. */
  jsonlPath: string;
  /** Path of the `<name>.meta.json` file the metadata was read from. */
  metaPath: string;
  meta: SubagentMeta;
};
203
-
204
/**
 * Lists the subagent transcripts stored in a directory.
 *
 * A subagent is recognised by a `<name>.meta.json` file that has a sibling
 * `<name>.jsonl` transcript. Entries are skipped, not reported, when the
 * transcript is missing, the meta file is unreadable or not valid JSON, or the
 * meta file parses to something other than a JSON object (`null`, an array, a
 * primitive). The last case matters because callers read fields off `meta`.
 *
 * @param subagentsDir - Directory to scan (not recursive).
 * @returns One entry per usable subagent, in `readdirSync` order; an empty
 *   list when the directory does not exist.
 */
export function listSubagents(subagentsDir: string): SubagentEntry[] {
  if (!existsSync(subagentsDir)) return [];

  const metaSuffix = ".meta.json";
  const out: SubagentEntry[] = [];

  for (const fileName of readdirSync(subagentsDir)) {
    if (!fileName.endsWith(metaSuffix)) continue;

    const base = fileName.slice(0, -metaSuffix.length);
    const metaPath = join(subagentsDir, fileName);
    const jsonlPath = join(subagentsDir, `${base}.jsonl`);
    if (!existsSync(jsonlPath)) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(readFileSync(metaPath, "utf8"));
    } catch {
      // Best-effort listing: one bad meta file must not hide the others.
      continue;
    }
    if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
      continue;
    }

    out.push({ jsonlPath, metaPath, meta: parsed as SubagentMeta });
  }

  return out;
}
221
-
222
/**
 * Finds the subagent in `subagentsDir` whose metadata `description` equals
 * `description`.
 *
 * Descriptions are namespaced per iteration+run (see run.ts), so more than
 * one match means a retry within the same run. In that case the transcript
 * with the newest modification time wins, because readdir order is not
 * chronological. Among equally new transcripts the first one listed wins.
 *
 * @param subagentsDir - Directory holding the subagent files.
 * @param description - Exact description to look for.
 * @returns The matching entry, or `null` when there is none.
 */
export function findByDescription(
  subagentsDir: string,
  description: string,
): SubagentEntry | null {
  const matches = listSubagents(subagentsDir).filter(
    (e) => e.meta.description === description,
  );
  if (matches.length <= 1) return matches[0] ?? null;

  // Stat each candidate exactly once and keep the maximum. Stat-ing inside a
  // sort comparator repeats the I/O and, when a stat fails, makes the
  // comparator inconsistent, which leaves the sorted order undefined.
  let newest: SubagentEntry | null = null;
  let newestMtimeMs = Number.NEGATIVE_INFINITY;
  for (const candidate of matches) {
    const mtimeMs = transcriptMtimeMs(candidate.jsonlPath);
    if (newest === null || mtimeMs > newestMtimeMs) {
      newest = candidate;
      newestMtimeMs = mtimeMs;
    }
  }
  return newest;
}

/** Modification time of `path` in ms, or -Infinity when it cannot be stat-ed. */
function transcriptMtimeMs(path: string): number {
  try {
    return statSync(path).mtimeMs;
  } catch {
    // A transcript that vanished after listing ranks below every readable one.
    return Number.NEGATIVE_INFINITY;
  }
}