@slowdini/slow-powers-opencode 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -72
- package/bootstrap.md +1 -7
- package/opencode/plugins/slow-powers.js +1 -1
- package/package.json +14 -17
- package/skills/evaluating-skills/SKILL.md +90 -338
- package/skills/evaluating-skills/evals/baseline/BASELINE.md +23 -0
- package/skills/evaluating-skills/evals/baseline/NOTES.md +40 -0
- package/skills/evaluating-skills/evals/baseline/benchmark.json +54 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/deterministic-edit-skip__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__new_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/did-my-revision-help__old_skill.json +39 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__new_skill.json +32 -0
- package/skills/evaluating-skills/evals/baseline/grading/is-new-skill-ready-to-ship__old_skill.json +32 -0
- package/skills/test-driven-development/evals/baseline/NOTES.md +2 -2
- package/skills/evaluating-skills/examples/verifying-development-work-evals.json +0 -30
- package/skills/evaluating-skills/harness-details/claude.md +0 -194
- package/skills/evaluating-skills/harness-parity.md +0 -155
- package/skills/evaluating-skills/runner/README.md +0 -163
- package/skills/evaluating-skills/runner/adapters/claude-code-session.test.ts +0 -56
- package/skills/evaluating-skills/runner/adapters/claude-code-session.ts +0 -43
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.test.ts +0 -485
- package/skills/evaluating-skills/runner/adapters/claude-code-transcript.ts +0 -242
- package/skills/evaluating-skills/runner/aggregate.test.ts +0 -484
- package/skills/evaluating-skills/runner/aggregate.ts +0 -269
- package/skills/evaluating-skills/runner/context.test.ts +0 -181
- package/skills/evaluating-skills/runner/context.ts +0 -90
- package/skills/evaluating-skills/runner/detect-stray-writes.test.ts +0 -396
- package/skills/evaluating-skills/runner/detect-stray-writes.ts +0 -288
- package/skills/evaluating-skills/runner/fill-transcripts.test.ts +0 -73
- package/skills/evaluating-skills/runner/fill-transcripts.ts +0 -154
- package/skills/evaluating-skills/runner/grade.test.ts +0 -347
- package/skills/evaluating-skills/runner/grade.ts +0 -603
- package/skills/evaluating-skills/runner/guard/guard.ts +0 -49
- package/skills/evaluating-skills/runner/guard/install.test.ts +0 -92
- package/skills/evaluating-skills/runner/guard/install.ts +0 -147
- package/skills/evaluating-skills/runner/guard/policy.test.ts +0 -128
- package/skills/evaluating-skills/runner/guard/policy.ts +0 -74
- package/skills/evaluating-skills/runner/plugin-shadow.test.ts +0 -228
- package/skills/evaluating-skills/runner/plugin-shadow.ts +0 -201
- package/skills/evaluating-skills/runner/profiles/claude-code/plan-mode.md +0 -11
- package/skills/evaluating-skills/runner/promote-baseline.test.ts +0 -281
- package/skills/evaluating-skills/runner/promote-baseline.ts +0 -204
- package/skills/evaluating-skills/runner/record-runs.test.ts +0 -314
- package/skills/evaluating-skills/runner/record-runs.ts +0 -209
- package/skills/evaluating-skills/runner/run.test.ts +0 -1703
- package/skills/evaluating-skills/runner/run.ts +0 -1388
- package/skills/evaluating-skills/runner/sandbox-policy.ts +0 -94
- package/skills/evaluating-skills/runner/types.ts +0 -121
- package/skills/evaluating-skills/runner/validate-all.ts +0 -54
- package/skills/evaluating-skills/runner/validate-schema.test.ts +0 -99
- package/skills/evaluating-skills/runner/validate-schema.ts +0 -51
- package/skills/evaluating-skills/runner/validate.test.ts +0 -56
- package/skills/evaluating-skills/runner/validate.ts +0 -21
- package/skills/evaluating-skills/runner/workspace-teardown.test.ts +0 -227
- package/skills/evaluating-skills/runner/workspace-teardown.ts +0 -136
- package/skills/evaluating-skills/schema/evals.schema.json +0 -105
- package/skills/evaluating-skills/schema/grading.schema.json +0 -84
- package/skills/evaluating-skills/schema/run-record.schema.json +0 -80
- package/skills/evaluating-skills/schema/stray-writes.schema.json +0 -80
- package/skills/evaluating-skills/templates/eval-task-prompt.md +0 -69
- package/skills/evaluating-skills/templates/evals.json.example +0 -17
- package/skills/evaluating-skills/templates/judge-prompt.md +0 -56
- package/skills/evaluating-skills/templates/revise-skill-prompt.md +0 -56
|
@@ -1,242 +0,0 @@
|
|
|
1
|
-
import { existsSync, readdirSync, readFileSync, statSync } from "node:fs";
|
|
2
|
-
import { join } from "node:path";
|
|
3
|
-
import type { ToolInvocation } from "../types";
|
|
4
|
-
|
|
5
|
-
/** A `tool_use` content block: one tool call with its id, tool name and raw input. */
type ToolUseBlock = {
  type: "tool_use";
  id: string;
  name: string;
  input: unknown;
};

/** A `tool_result` content block; `tool_use_id` names the `tool_use` block it answers. */
type ToolResultBlock = {
  type: "tool_result";
  tool_use_id: string;
  content: string | unknown[];
};

/** A plain-text content block. */
type TextBlock = {
  type: "text";
  text: string;
};

/**
 * Any block that can appear in a message's `content` array. The trailing
 * `{ type: string }` member admits block kinds this module does not model.
 */
type ContentBlock =
  | ToolUseBlock
  | ToolResultBlock
  | TextBlock
  | { type: string };

/** Token counters attached to a message; every field is optional. */
type UsageRecord = {
  input_tokens?: number;
  output_tokens?: number;
  cache_creation_input_tokens?: number;
  cache_read_input_tokens?: number;
};

/** One parsed line of a transcript `.jsonl` file. */
type TranscriptRecord = {
  type: "user" | "assistant" | string;
  timestamp?: string;
  message?: {
    id?: string;
    role?: string;
    usage?: UsageRecord;
    content?: string | ContentBlock[];
  };
};
|
|
46
|
-
|
|
47
|
-
/**
 * Normalizes a message's `content` field to an array of content blocks.
 *
 * Missing and string content carry no blocks, so both yield an empty array.
 * Records come from unvalidated JSON, so any other non-array value is also
 * treated as empty instead of being handed to callers that iterate the result.
 *
 * @param content - The raw `message.content` value of a transcript record.
 * @returns The block array itself when `content` is an array, otherwise `[]`.
 */
function flattenContent(
  content: string | ContentBlock[] | undefined,
): ContentBlock[] {
  return Array.isArray(content) ? content : [];
}
|
|
54
|
-
|
|
55
|
-
/**
 * Renders the `content` of a `tool_result` block as a single string.
 *
 * - A string is returned unchanged.
 * - An array is rendered item by item and joined with newlines: string items
 *   as-is, objects with a `text` property as `String(text)`, anything else as
 *   its JSON encoding.
 *
 * @param content - The `content` field of a tool result block.
 * @returns The flattened textual form of the result.
 */
function stringifyResult(content: ToolResultBlock["content"]): string {
  if (typeof content === "string") return content;
  if (Array.isArray(content)) {
    return content
      .map((c) => {
        if (typeof c === "string") return c;
        if (c && typeof c === "object" && "text" in c) {
          return String((c as { text: unknown }).text);
        }
        return JSON.stringify(c);
      })
      .join("\n");
  }
  // Fallback for values outside the declared type (input is unvalidated JSON).
  return JSON.stringify(content);
}
|
|
68
|
-
|
|
69
|
-
/**
 * Reads a `.jsonl` transcript and returns its parsed records in file order.
 *
 * Empty lines and lines that are not valid JSON are skipped. Lines that parse
 * to something other than a plain object (`null`, numbers, strings, arrays)
 * are skipped too: a `null` record would otherwise make every consumer that
 * reads `record.message` throw.
 *
 * @param jsonlPath - Path of the transcript file, read synchronously as UTF-8.
 * @returns The object records found in the file.
 * @throws Whatever `readFileSync` throws when the file cannot be read.
 */
function readRecords(jsonlPath: string): TranscriptRecord[] {
  const records: TranscriptRecord[] = [];
  for (const line of readFileSync(jsonlPath, "utf8").split("\n")) {
    if (line.length === 0) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch {
      // skip malformed lines
      continue;
    }

    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
      continue;
    }
    records.push(parsed as TranscriptRecord);
  }
  return records;
}
|
|
82
|
-
|
|
83
|
-
/**
 * Builds the ordered list of tool invocations found in a transcript.
 *
 * Each `tool_use` block in an assistant record becomes one invocation whose
 * `ordinal` is its position in the returned list. Each `tool_result` block in
 * a user record is attached as `result` to the invocation with the matching
 * id; results whose id was never seen are ignored.
 *
 * @param records - Transcript records in file order.
 * @returns Invocations in the order their `tool_use` blocks were encountered.
 */
function extractInvocations(records: TranscriptRecord[]): ToolInvocation[] {
  const invocations: ToolInvocation[] = [];
  // tool_use id -> index into `invocations`, used to pair results with calls.
  const indexById = new Map<string, number>();

  for (const record of records) {
    const blocks = flattenContent(record.message?.content);

    if (record.type === "assistant") {
      for (const block of blocks) {
        if (block.type !== "tool_use") continue;
        const tu = block as ToolUseBlock;
        const ordinal = invocations.length;
        indexById.set(tu.id, ordinal);
        invocations.push({
          name: tu.name,
          args: tu.input,
          ordinal,
        });
      }
      continue;
    }

    if (record.type === "user") {
      for (const block of blocks) {
        if (block.type !== "tool_result") continue;
        const tr = block as ToolResultBlock;
        const idx = indexById.get(tr.tool_use_id);
        if (idx === undefined) continue;
        // Indices stored in the map always point at an existing entry; the
        // guard only satisfies strict indexed-access checking.
        const target = invocations[idx];
        if (target) target.result = stringifyResult(tr.content);
      }
    }
  }

  return invocations;
}
|
|
118
|
-
|
|
119
|
-
/**
 * Reads a transcript `.jsonl` file and returns the tool invocations it contains.
 *
 * @param jsonlPath - Path of the transcript file.
 * @returns Tool invocations in the order they appear in the transcript.
 */
export function parseTranscript(jsonlPath: string): ToolInvocation[] {
  return extractInvocations(readRecords(jsonlPath));
}
|
|
122
|
-
|
|
123
|
-
/** Aggregate view of one transcript, as produced by `parseTranscriptFull`. */
export type TranscriptSummary = {
  /** Tool invocations in the order they appear in the transcript. */
  tool_invocations: ToolInvocation[];
  /**
   * Sum of usage across unique API responses. One response spans multiple
   * jsonl lines (one per content block) and repeats the same `message.id` +
   * `usage` on each, so totals are deduped by `message.id`. Includes cache
   * creation/read tokens — a different accounting than the harness's task
   * completion event. `null` when no assistant record carries both a
   * `message.id` and a `usage` object.
   */
  total_tokens: number | null;
  /**
   * Wall clock between the first and last line timestamps, in milliseconds.
   * `null` unless at least two lines carry a parseable timestamp.
   */
  duration_ms: number | null;
  /**
   * Text blocks, joined with newlines, of the last assistant record that
   * contains at least one text block. `null` when there is none.
   */
  final_text: string | null;
};
|
|
138
|
-
|
|
139
|
-
/**
 * Reads a transcript `.jsonl` file and summarizes it: tool invocations, total
 * token usage, wall-clock duration and the final assistant text.
 *
 * @param jsonlPath - Path of the transcript file.
 * @returns The summary; fields that cannot be derived are `null`.
 */
export function parseTranscriptFull(jsonlPath: string): TranscriptSummary {
  const records = readRecords(jsonlPath);

  // Keyed by message id so a response repeated across lines is counted once;
  // a later line with the same id replaces the earlier usage.
  const usageById = new Map<string, UsageRecord>();
  let firstTs: number | null = null;
  let lastTs: number | null = null;
  let timestampCount = 0;
  let finalText: string | null = null;

  for (const record of records) {
    // Timestamps are collected from every record type, not just assistant ones.
    if (record.timestamp) {
      const ts = Date.parse(record.timestamp);
      if (!Number.isNaN(ts)) {
        if (firstTs === null) firstTs = ts;
        lastTs = ts;
        timestampCount++;
      }
    }

    if (record.type !== "assistant") continue;

    const { id, usage } = record.message ?? {};
    if (id && usage) usageById.set(id, usage);

    const texts = flattenContent(record.message?.content)
      .filter((b): b is TextBlock => b.type === "text")
      .map((b) => b.text);
    // Records without text (e.g. tool calls only) leave the previous text in place.
    if (texts.length > 0) finalText = texts.join("\n");
  }

  let totalTokens: number | null = null;
  if (usageById.size > 0) {
    totalTokens = 0;
    for (const usage of usageById.values()) {
      totalTokens +=
        (usage.input_tokens ?? 0) +
        (usage.output_tokens ?? 0) +
        (usage.cache_creation_input_tokens ?? 0) +
        (usage.cache_read_input_tokens ?? 0);
    }
  }

  return {
    tool_invocations: extractInvocations(records),
    total_tokens: totalTokens,
    // A single timestamp gives no interval, so require at least two.
    duration_ms:
      timestampCount >= 2 && firstTs !== null && lastTs !== null
        ? lastTs - firstTs
        : null,
    final_text: finalText,
  };
}
|
|
191
|
-
|
|
192
|
-
/** Parsed contents of a subagent's `.meta.json` file; every field is optional. */
export type SubagentMeta = {
  agentType?: string;
  description?: string;
  toolUseId?: string;
};

/** A subagent transcript paired with its metadata file. */
export type SubagentEntry = {
  /** Path of the subagent's `.jsonl` transcript. */
  jsonlPath: string;
  /** Path of the `.meta.json` file the metadata was read from. */
  metaPath: string;
  /** Parsed contents of `metaPath`. */
  meta: SubagentMeta;
};
|
|
203
|
-
|
|
204
|
-
/**
 * Lists the subagent transcripts found in a directory.
 *
 * A subagent is recognized by a `<base>.meta.json` file that has a sibling
 * `<base>.jsonl` transcript. Metadata files that cannot be read, are not valid
 * JSON, or do not contain a JSON object are skipped (best effort) so callers
 * can always read `entry.meta.<field>` safely.
 *
 * @param subagentsDir - Directory to scan; a missing directory yields `[]`.
 * @returns One entry per usable metadata/transcript pair, in `readdir` order.
 */
export function listSubagents(subagentsDir: string): SubagentEntry[] {
  if (!existsSync(subagentsDir)) return [];

  const metaSuffix = ".meta.json";
  const out: SubagentEntry[] = [];

  for (const f of readdirSync(subagentsDir)) {
    if (!f.endsWith(metaSuffix)) continue;

    const base = f.slice(0, -metaSuffix.length);
    const metaPath = join(subagentsDir, f);
    const jsonlPath = join(subagentsDir, `${base}.jsonl`);
    if (!existsSync(jsonlPath)) continue;

    let meta: unknown;
    try {
      meta = JSON.parse(readFileSync(metaPath, "utf8"));
    } catch {
      // Unreadable or malformed metadata: skip this subagent.
      continue;
    }

    // `null`, primitives and arrays are valid JSON but not usable metadata.
    if (typeof meta !== "object" || meta === null || Array.isArray(meta)) {
      continue;
    }
    out.push({ jsonlPath, metaPath, meta: meta as SubagentMeta });
  }

  return out;
}
|
|
221
|
-
|
|
222
|
-
/**
 * Finds the subagent whose metadata `description` equals `description`.
 *
 * @param subagentsDir - Directory containing subagent transcripts and metadata.
 * @param description - Exact description to match.
 * @returns The matching entry, the most recently written one when several
 *   match, or `null` when none does.
 */
export function findByDescription(
  subagentsDir: string,
  description: string,
): SubagentEntry | null {
  const matches = listSubagents(subagentsDir).filter(
    (e) => e.meta.description === description,
  );
  if (matches.length <= 1) return matches[0] ?? null;

  // Descriptions are namespaced per iteration+run (see run.ts), so duplicates
  // here mean a retry within the same run. Prefer the most-recently-written
  // transcript; readdir order is not chronological.
  //
  // Each candidate is stat'ed exactly once. A transcript that cannot be
  // stat'ed ranks as oldest, which keeps the choice well defined instead of
  // relying on a sort comparator that returns 0 on failure. On equal mtimes
  // the earliest entry in listing order wins.
  let newest: SubagentEntry | null = null;
  let newestMtime = Number.NEGATIVE_INFINITY;
  for (const entry of matches) {
    let mtime = Number.NEGATIVE_INFINITY;
    try {
      mtime = statSync(entry.jsonlPath).mtimeMs;
    } catch {
      // keep the "oldest" sentinel
    }
    if (newest === null || mtime > newestMtime) {
      newest = entry;
      newestMtime = mtime;
    }
  }
  return newest;
}
|