@kodax-ai/kodax-cli 0.7.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +1304 -0
- package/LICENSE +191 -0
- package/README.md +1167 -0
- package/README_CN.md +631 -0
- package/dist/builtin/code-review/SKILL.md +63 -0
- package/dist/builtin/git-workflow/SKILL.md +84 -0
- package/dist/builtin/skill-creator/SKILL.md +122 -0
- package/dist/builtin/skill-creator/agents/analyzer.md +12 -0
- package/dist/builtin/skill-creator/agents/comparator.md +13 -0
- package/dist/builtin/skill-creator/agents/grader.md +13 -0
- package/dist/builtin/skill-creator/references/schemas.md +227 -0
- package/dist/builtin/skill-creator/scripts/aggregate-benchmark.d.ts +46 -0
- package/dist/builtin/skill-creator/scripts/aggregate-benchmark.js +209 -0
- package/dist/builtin/skill-creator/scripts/analyze-benchmark.d.ts +46 -0
- package/dist/builtin/skill-creator/scripts/analyze-benchmark.js +289 -0
- package/dist/builtin/skill-creator/scripts/compare-runs.d.ts +62 -0
- package/dist/builtin/skill-creator/scripts/compare-runs.js +333 -0
- package/dist/builtin/skill-creator/scripts/generate-review.d.ts +33 -0
- package/dist/builtin/skill-creator/scripts/generate-review.js +415 -0
- package/dist/builtin/skill-creator/scripts/grade-evals.d.ts +73 -0
- package/dist/builtin/skill-creator/scripts/grade-evals.js +405 -0
- package/dist/builtin/skill-creator/scripts/improve-description.d.ts +23 -0
- package/dist/builtin/skill-creator/scripts/improve-description.js +161 -0
- package/dist/builtin/skill-creator/scripts/init-skill.d.ts +14 -0
- package/dist/builtin/skill-creator/scripts/init-skill.js +153 -0
- package/dist/builtin/skill-creator/scripts/install-skill.d.ts +29 -0
- package/dist/builtin/skill-creator/scripts/install-skill.js +176 -0
- package/dist/builtin/skill-creator/scripts/package-skill.d.ts +38 -0
- package/dist/builtin/skill-creator/scripts/package-skill.js +124 -0
- package/dist/builtin/skill-creator/scripts/quick-validate.d.ts +8 -0
- package/dist/builtin/skill-creator/scripts/quick-validate.js +166 -0
- package/dist/builtin/skill-creator/scripts/run-eval.d.ts +66 -0
- package/dist/builtin/skill-creator/scripts/run-eval.js +356 -0
- package/dist/builtin/skill-creator/scripts/run-loop.d.ts +49 -0
- package/dist/builtin/skill-creator/scripts/run-loop.js +243 -0
- package/dist/builtin/skill-creator/scripts/run-trigger-eval.d.ts +58 -0
- package/dist/builtin/skill-creator/scripts/run-trigger-eval.js +225 -0
- package/dist/builtin/skill-creator/scripts/utils.js +278 -0
- package/dist/builtin/tdd/SKILL.md +56 -0
- package/dist/index.js +1717 -0
- package/dist/kodax_cli.js +1870 -0
- package/package.json +122 -0
- package/scripts/kodax-bin.cjs +27 -0
- package/scripts/production-env.cjs +16 -0
|
@@ -0,0 +1,415 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { readFile, readdir, stat, writeFile } from 'node:fs/promises';
|
|
4
|
+
import { createServer } from 'node:http';
|
|
5
|
+
import path from 'node:path';
|
|
6
|
+
import { fileURLToPath } from 'node:url';
|
|
7
|
+
|
|
8
|
+
// Lower-cased file extensions whose contents are read as UTF-8 text.
const TEXT_EXTENSIONS = new Set([
  '.txt',
  '.md',
  '.json',
  '.csv',
  '.ts',
  '.tsx',
  '.js',
  '.jsx',
  '.yaml',
  '.yml',
  '.html',
  '.css',
  '.sql',
  '.toml',
  '.xml',
]);

// Lower-cased file extensions that are embedded as inline images.
const IMAGE_EXTENSIONS = new Set([
  '.png',
  '.jpg',
  '.jpeg',
  '.gif',
  '.svg',
  '.webp',
]);

// File names that are skipped when output files are listed.
const METADATA_FILES = new Set([
  'transcript.md',
  'metrics.json',
]);
|
|
14
|
+
|
|
15
|
+
/**
 * Reads a file as UTF-8 text and parses it as JSON.
 *
 * @param filePath Path of the JSON file to read.
 * @param fallback Value returned when the file cannot be read or parsed (defaults to null).
 * @returns The parsed JSON value, or `fallback` on any failure.
 */
async function readJson(filePath, fallback = null) {
  let parsed;
  try {
    const text = await readFile(filePath, 'utf8');
    parsed = JSON.parse(text);
  } catch {
    // Missing file, unreadable file and malformed JSON are all treated alike.
    return fallback;
  }
  return parsed;
}
|
|
22
|
+
|
|
23
|
+
/**
 * Reports whether `stat` succeeds for the given path.
 *
 * @param filePath Path to probe.
 * @returns true when `stat` resolves, false when it rejects for any reason.
 */
async function pathExists(filePath) {
  return stat(filePath).then(
    () => true,
    () => false,
  );
}
|
|
31
|
+
|
|
32
|
+
/**
 * Lists the regular files directly inside `outputsDir` and converts each one
 * into a descriptor that the review page can render.
 *
 * Files named in METADATA_FILES and non-file entries are skipped. Text files
 * (by extension) carry their UTF-8 `content`; everything else is embedded as a
 * base64 `dataUri` and tagged `image`, `pdf` or `binary`. A file that cannot
 * be read yields a placeholder instead of failing the whole listing.
 *
 * @param outputsDir Directory to scan (not recursive).
 * @returns Array of `{ name, kind, content | dataUri }` descriptors; empty when
 *          the directory cannot be read.
 */
async function collectOutputFiles(outputsDir) {
  // Extensions whose MIME subtype is not simply the extension without the dot.
  const imageMimeOverrides = {
    '.svg': 'image/svg+xml',
    '.jpg': 'image/jpeg',
  };

  const entries = await readdir(outputsDir, { withFileTypes: true }).catch(() => []);
  const files = [];

  for (const entry of entries) {
    if (!entry.isFile() || METADATA_FILES.has(entry.name)) {
      continue;
    }

    const filePath = path.join(outputsDir, entry.name);
    const extension = path.extname(entry.name).toLowerCase();

    if (TEXT_EXTENSIONS.has(extension)) {
      files.push({
        name: entry.name,
        kind: 'text',
        content: await readFile(filePath, 'utf8').catch(() => '(Error reading file)'),
      });
      continue;
    }

    const raw = await readFile(filePath).catch(() => null);
    if (!raw) {
      files.push({ name: entry.name, kind: 'error', content: '(Error reading file)' });
      continue;
    }

    const base64 = raw.toString('base64');
    if (IMAGE_EXTENSIONS.has(extension)) {
      const mime = imageMimeOverrides[extension] ?? `image/${extension.slice(1)}`;
      files.push({
        name: entry.name,
        kind: 'image',
        dataUri: `data:${mime};base64,${base64}`,
      });
    } else if (extension === '.pdf') {
      files.push({
        name: entry.name,
        kind: 'pdf',
        dataUri: `data:application/pdf;base64,${base64}`,
      });
    } else {
      files.push({
        name: entry.name,
        kind: 'binary',
        dataUri: `data:application/octet-stream;base64,${base64}`,
      });
    }
  }

  return files;
}
|
|
84
|
+
|
|
85
|
+
/**
 * Builds the review record for one run directory.
 *
 * `eval_metadata.json` is looked up in the run directory first, then in its
 * parent, then in its grandparent. When the metadata has no prompt, the prompt
 * is taken from the "## Eval Prompt" section of the run's transcript.md.
 *
 * @param workspaceRoot Root directory used to derive the run's relative id.
 * @param runDir Directory of the run; it must contain an `outputs` directory.
 * @returns The run record, or null when `runDir` has no `outputs` entry.
 */
async function buildRun(workspaceRoot, runDir) {
  const outputsDir = path.join(runDir, 'outputs');
  const hasOutputs = await pathExists(outputsDir);
  if (!hasOutputs) {
    return null;
  }

  const parentDir = path.dirname(runDir);
  const grandparentDir = path.dirname(parentDir);

  let evalMetadata = await readJson(path.join(runDir, 'eval_metadata.json'));
  if (evalMetadata === null || evalMetadata === undefined) {
    evalMetadata = await readJson(path.join(parentDir, 'eval_metadata.json'));
  }
  if (evalMetadata === null || evalMetadata === undefined) {
    evalMetadata = await readJson(path.join(grandparentDir, 'eval_metadata.json'), {});
  }

  let prompt = evalMetadata.prompt ?? '';
  if (!prompt) {
    const transcriptPath = path.join(runDir, 'transcript.md');
    const transcript = await readFile(transcriptPath, 'utf8').catch(() => '');
    const found = /## Eval Prompt\s+([\s\S]*?)(?:\n##|$)/.exec(transcript);
    prompt = found?.[1]?.trim() ?? '(No prompt found)';
  }

  // Use forward slashes in the id regardless of the platform separator.
  const id = path.relative(workspaceRoot, runDir).split('\\').join('/');
  const evalId = evalMetadata.eval_id ?? null;
  const grading = await readJson(path.join(runDir, 'grading.json'), null);
  const outputs = await collectOutputFiles(outputsDir);

  return { id, evalId, prompt, grading, outputs };
}
|
|
110
|
+
|
|
111
|
+
/**
 * Walks the workspace depth-first and collects a run record for every
 * directory that contains an `outputs` entry.
 *
 * A directory with `outputs` is treated as a run and is not descended into.
 * Otherwise its sub-directories are visited in name order, skipping
 * node_modules, .git, __pycache__ and inputs.
 *
 * @param workspaceRoot Root passed through to buildRun for relative ids.
 * @param currentDir Directory being examined (defaults to the root).
 * @param runs Accumulator array; mutated and returned.
 * @returns The `runs` array.
 */
export async function findRuns(workspaceRoot, currentDir = workspaceRoot, runs = []) {
  const isRunDir = await pathExists(path.join(currentDir, 'outputs'));
  if (isRunDir) {
    const run = await buildRun(workspaceRoot, currentDir);
    if (run) {
      runs.push(run);
    }
    return runs;
  }

  const ignoredNames = ['node_modules', '.git', '__pycache__', 'inputs'];
  const entries = await readdir(currentDir, { withFileTypes: true }).catch(() => []);
  entries.sort((left, right) => left.name.localeCompare(right.name));

  const childDirs = entries.filter(
    (entry) => entry.isDirectory() && !ignoredNames.includes(entry.name),
  );
  for (const child of childDirs) {
    await findRuns(workspaceRoot, path.join(currentDir, child.name), runs);
  }

  return runs;
}
|
|
134
|
+
|
|
135
|
+
/**
 * Escapes a value for safe inclusion in HTML text or a quoted attribute.
 *
 * The ampersand is replaced first so that the entities produced by the later
 * replacements are not escaped a second time.
 *
 * @param value Value to escape; non-strings are converted with String().
 * @returns The escaped string.
 */
function escapeHtml(value) {
  return String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
|
|
142
|
+
|
|
143
|
+
/**
 * Renders the complete review page as a single HTML document string.
 *
 * The payload is serialized into an inline script; the page then builds the
 * run list, the outputs panel and the benchmark table in the browser.
 * All payload-derived text that the page inserts through innerHTML is escaped
 * by the page's own escapeHtml helper.
 *
 * @param payload Object with skillName, workspace, benchmark, feedback and runs.
 * @param staticMode When true the page stores feedback in localStorage;
 *                   otherwise it POSTs feedback to `/feedback`.
 * @returns The HTML document.
 */
export function renderHtml(payload, staticMode) {
  // Escaping "<" keeps payload text from closing the inline <script> element.
  const serialized = JSON.stringify(payload).replace(/</g, '\\u003c');

  return `<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>${escapeHtml(payload.skillName)} Review</title>
  <style>
    :root { color-scheme: light; font-family: "Segoe UI", "IBM Plex Sans", sans-serif; }
    body { margin: 0; background: #f5f1e8; color: #1f2328; }
    header { padding: 24px 28px; background: linear-gradient(135deg, #113946, #bca37f); color: #fffdf7; }
    main { display: grid; grid-template-columns: 280px 1fr; min-height: calc(100vh - 100px); }
    nav { border-right: 1px solid #d7d1c7; background: #fffaf1; padding: 20px; overflow: auto; }
    .panel { padding: 24px 28px; overflow: auto; }
    .run-button { display: block; width: 100%; text-align: left; margin: 0 0 10px; padding: 12px; border: 1px solid #d7d1c7; border-radius: 12px; background: #fff; cursor: pointer; }
    .run-button.active { border-color: #113946; box-shadow: 0 0 0 2px rgba(17,57,70,.12); }
    .tabs { display: flex; gap: 12px; margin-bottom: 18px; }
    .tab { border: 0; border-radius: 999px; padding: 10px 16px; background: #e6dccd; cursor: pointer; }
    .tab.active { background: #113946; color: white; }
    .card { background: white; border: 1px solid #d7d1c7; border-radius: 18px; padding: 18px; margin-bottom: 18px; }
    pre { white-space: pre-wrap; word-break: break-word; background: #f7f7f8; padding: 14px; border-radius: 12px; }
    textarea { width: 100%; min-height: 180px; border-radius: 12px; border: 1px solid #c8c1b6; padding: 12px; font: inherit; }
    table { width: 100%; border-collapse: collapse; background: white; border-radius: 18px; overflow: hidden; }
    th, td { padding: 12px; border-bottom: 1px solid #ece7df; text-align: left; }
    .hidden { display: none; }
    .muted { color: #57606a; }
    img, iframe { max-width: 100%; border-radius: 12px; }
    @media (max-width: 920px) {
      main { grid-template-columns: 1fr; }
      nav { border-right: 0; border-bottom: 1px solid #d7d1c7; }
    }
  </style>
</head>
<body>
  <header>
    <div class="muted">KodaX Skill Review</div>
    <h1>${escapeHtml(payload.skillName)}</h1>
    <div>${escapeHtml(payload.workspace)}</div>
  </header>
  <main>
    <nav>
      <div class="tabs">
        <button class="tab active" data-tab="outputs">Outputs</button>
        <button class="tab" data-tab="benchmark">Benchmark</button>
      </div>
      <div id="run-list"></div>
    </nav>
    <section class="panel">
      <div id="outputs-panel"></div>
      <div id="benchmark-panel" class="hidden"></div>
    </section>
  </main>
  <script>
    const payload = ${serialized};
    const feedback = { ...(payload.feedback || {}) };
    let currentRunId = payload.runs[0]?.id || null;

    const runList = document.getElementById('run-list');
    const outputsPanel = document.getElementById('outputs-panel');
    const benchmarkPanel = document.getElementById('benchmark-panel');
    const tabs = Array.from(document.querySelectorAll('.tab'));

    // Ampersand goes first so later entities are not escaped twice.
    function escapeHtml(value) {
      return String(value)
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;')
        .replace(/"/g, '&quot;')
        .replace(/'/g, '&#39;');
    }

    // Formats one { mean, stddev } entry; tolerates a missing entry.
    function formatStat(stat) {
      if (!stat) {
        return 'n/a';
      }
      return escapeHtml(stat.mean) + ' ± ' + escapeHtml(stat.stddev);
    }

    function persistFeedback() {
      if (${staticMode ? 'true' : 'false'}) {
        localStorage.setItem('kodax-skill-review-feedback', JSON.stringify(feedback));
        return Promise.resolve();
      }
      return fetch('/feedback', {
        method: 'POST',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify(feedback),
      });
    }

    function renderRunList() {
      runList.innerHTML = '';
      for (const run of payload.runs) {
        const button = document.createElement('button');
        button.className = 'run-button' + (run.id === currentRunId ? ' active' : '');
        // Run ids come from directory names and prompts from files: escape both.
        button.innerHTML = '<strong>' + escapeHtml(run.id) + '</strong><div class="muted">' + escapeHtml(run.prompt || '') + '</div>';
        button.onclick = () => {
          currentRunId = run.id;
          renderRunList();
          renderOutputs();
        };
        runList.appendChild(button);
      }
    }

    function renderOutputs() {
      const run = payload.runs.find((item) => item.id === currentRunId);
      if (!run) {
        outputsPanel.innerHTML = '<div class="card">No run selected.</div>';
        return;
      }

      const cards = [];
      cards.push('<div class="card"><h2>Prompt</h2><pre>' + escapeHtml(run.prompt || '(No prompt found)') + '</pre></div>');
      if (run.grading) {
        cards.push('<div class="card"><h2>Grading</h2><pre>' + escapeHtml(JSON.stringify(run.grading, null, 2)) + '</pre></div>');
      }

      for (const output of run.outputs) {
        if (output.kind === 'text') {
          cards.push('<div class="card"><h2>' + escapeHtml(output.name) + '</h2><pre>' + escapeHtml(output.content) + '</pre></div>');
        } else if (output.kind === 'image') {
          cards.push('<div class="card"><h2>' + escapeHtml(output.name) + '</h2><img alt="" src="' + escapeHtml(output.dataUri) + '"></div>');
        } else if (output.kind === 'pdf') {
          cards.push('<div class="card"><h2>' + escapeHtml(output.name) + '</h2><iframe title="" src="' + escapeHtml(output.dataUri) + '" style="width:100%;min-height:500px;"></iframe></div>');
        } else if (output.kind === 'error') {
          // Unreadable files carry a message instead of a data URI.
          cards.push('<div class="card"><h2>' + escapeHtml(output.name) + '</h2><div class="muted">' + escapeHtml(output.content || '') + '</div></div>');
        } else {
          cards.push('<div class="card"><h2>' + escapeHtml(output.name) + '</h2><a download="' + escapeHtml(output.name) + '" href="' + escapeHtml(output.dataUri) + '">Download output</a></div>');
        }
      }

      cards.push('<div class="card"><h2>Feedback</h2><textarea id="feedback-box">' + escapeHtml(feedback[run.id] || '') + '</textarea><div class="muted">Feedback is ' + (${staticMode ? '"saved in localStorage for this browser."' : '"written to feedback.json in the workspace."'}) + '</div></div>');
      outputsPanel.innerHTML = cards.join('');

      const textarea = document.getElementById('feedback-box');
      textarea.addEventListener('input', () => {
        feedback[run.id] = textarea.value;
        persistFeedback();
      });
    }

    function renderBenchmark() {
      const benchmark = payload.benchmark;
      if (!benchmark) {
        benchmarkPanel.innerHTML = '<div class="card">No benchmark.json found.</div>';
        return;
      }

      const rows = Object.entries(benchmark.configs || {}).map(([name, stats]) => (
        '<tr><td>' + escapeHtml(name) + '</td><td>' + formatStat(stats?.pass_rate) + '</td><td>' + formatStat(stats?.time_seconds) + '</td><td>' + formatStat(stats?.tokens) + '</td></tr>'
      )).join('');

      // "??" rather than "||" so that a delta of 0 is still displayed.
      const delta = benchmark.delta || {};
      benchmarkPanel.innerHTML = '<div class="card"><h2>Summary</h2><table><thead><tr><th>Config</th><th>Pass rate</th><th>Time (s)</th><th>Tokens</th></tr></thead><tbody>' + rows + '</tbody></table><p class="muted">Delta: pass rate ' + escapeHtml(delta.pass_rate ?? 'n/a') + ', time ' + escapeHtml(delta.time_seconds ?? 'n/a') + ', tokens ' + escapeHtml(delta.tokens ?? 'n/a') + '</p></div>';
    }

    function setTab(tabName) {
      tabs.forEach((tab) => tab.classList.toggle('active', tab.dataset.tab === tabName));
      outputsPanel.classList.toggle('hidden', tabName !== 'outputs');
      benchmarkPanel.classList.toggle('hidden', tabName !== 'benchmark');
    }

    tabs.forEach((tab) => {
      tab.addEventListener('click', () => setTab(tab.dataset.tab));
    });

    if (${staticMode ? 'true' : 'false'}) {
      try {
        Object.assign(feedback, JSON.parse(localStorage.getItem('kodax-skill-review-feedback') || '{}'));
      } catch {}
    }

    renderRunList();
    renderOutputs();
    renderBenchmark();
  </script>
</body>
</html>`;
}
|
|
314
|
+
|
|
315
|
+
/**
 * Parses the command line into an options object.
 *
 * `argv[2]` is the workspace path. From `argv[3]` on, the flags
 * `--skill-name`, `--benchmark`, `--static` and `--port` each consume the
 * following token as their value; a flag without a value and any unknown
 * token are ignored.
 *
 * `--port` is accepted only when it is a decimal integer in 0..65535;
 * anything else leaves the default port in place instead of storing NaN or an
 * out-of-range number.
 *
 * @param argv Full argument vector in `process.argv` layout.
 * @returns `{ workspace, skillName, benchmark, staticOutput, port }`.
 */
function parseArgs(argv) {
  const args = {
    workspace: argv[2],
    skillName: 'unknown-skill',
    benchmark: null,
    staticOutput: null,
    port: 4173,
  };

  for (let index = 3; index < argv.length; index += 1) {
    const token = argv[index];
    const value = argv[index + 1];
    if (!value) {
      continue;
    }

    if (token === '--skill-name') {
      args.skillName = value;
      index += 1;
    } else if (token === '--benchmark') {
      args.benchmark = value;
      index += 1;
    } else if (token === '--static') {
      args.staticOutput = value;
      index += 1;
    } else if (token === '--port') {
      const port = /^\d+$/.test(value) ? Number(value) : Number.NaN;
      if (Number.isInteger(port) && port <= 65535) {
        args.port = port;
      }
      // The value is consumed even when rejected so it is not read as a flag.
      index += 1;
    }
  }

  return args;
}
|
|
343
|
+
|
|
344
|
+
/**
 * Assembles the data object that renderHtml embeds in the review page.
 *
 * @param workspace Workspace directory to scan.
 * @param args Parsed options; `skillName` and the optional `benchmark` path are read.
 * @returns `{ skillName, workspace, benchmark, feedback, runs }`, where
 *          `workspace` is the resolved absolute path.
 */
export async function buildPayload(workspace, args) {
  const absoluteWorkspace = path.resolve(workspace);
  const benchmarkPath = args.benchmark ?? path.join(workspace, 'benchmark.json');
  const feedbackPath = path.join(workspace, 'feedback.json');

  // Loaded one after another, in the same order as the fields below.
  const benchmark = await readJson(benchmarkPath, null);
  const feedback = await readJson(feedbackPath, {});
  const runs = await findRuns(absoluteWorkspace);

  return {
    skillName: args.skillName,
    workspace: absoluteWorkspace,
    benchmark,
    feedback,
    runs,
  };
}
|
|
355
|
+
|
|
356
|
+
/**
 * Command-line entry point.
 *
 * With `--static <file>` the review page is written to that file and the
 * function returns. Otherwise an HTTP server is started that serves the page
 * on `GET /` and stores feedback sent to `POST /feedback` in
 * `<workspace>/feedback.json`.
 *
 * The server listens on the loopback interface only, matching the address
 * printed at start-up: the feedback endpoint writes to disk and is not meant
 * to be reachable from other hosts.
 */
async function main() {
  const args = parseArgs(process.argv);
  if (!args.workspace) {
    console.error('Usage: node scripts/generate-review.js <workspace> [--skill-name name] [--benchmark file] [--static output.html] [--port 4173]');
    process.exit(1);
  }

  const payload = await buildPayload(args.workspace, args);

  if (args.staticOutput) {
    await writeFile(args.staticOutput, renderHtml(payload, true));
    console.log(`Wrote ${args.staticOutput}`);
    return;
  }

  // Upper bound on the buffered feedback body.
  const maxFeedbackBytes = 1024 * 1024;

  const server = createServer((request, response) => {
    if (!request.url || (request.method === 'GET' && request.url === '/')) {
      response.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' });
      response.end(renderHtml(payload, false));
      return;
    }

    if (request.method === 'POST' && request.url === '/feedback') {
      // Chunks are kept as Buffers and decoded once, so a multi-byte UTF-8
      // character split across two chunks is not corrupted.
      const chunks = [];
      let received = 0;
      let tooLarge = false;

      request.on('data', (chunk) => {
        received += chunk.length;
        if (received > maxFeedbackBytes) {
          // Keep draining the request but stop buffering it.
          tooLarge = true;
          chunks.length = 0;
          return;
        }
        chunks.push(chunk);
      });

      request.on('end', async () => {
        if (tooLarge) {
          response.writeHead(413, { 'Content-Type': 'text/plain; charset=utf-8' });
          response.end('Feedback payload too large');
          return;
        }

        let parsed;
        try {
          parsed = JSON.parse(Buffer.concat(chunks).toString('utf8'));
          // Feedback is a map of run id to text; reject any other JSON value.
          if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
            throw new TypeError('Feedback must be a JSON object');
          }
        } catch {
          response.writeHead(400, { 'Content-Type': 'text/plain; charset=utf-8' });
          response.end('Invalid feedback payload');
          return;
        }

        try {
          await writeFile(path.join(args.workspace, 'feedback.json'), JSON.stringify(parsed, null, 2));
          payload.feedback = parsed;
          response.writeHead(204);
          response.end();
        } catch {
          // A failed write is a server-side problem, not a bad request.
          response.writeHead(500, { 'Content-Type': 'text/plain; charset=utf-8' });
          response.end('Failed to write feedback');
        }
      });
      return;
    }

    response.writeHead(404, { 'Content-Type': 'text/plain; charset=utf-8' });
    response.end('Not found');
  });

  // Report start-up failures such as a port already in use with a short message.
  server.on('error', (error) => {
    console.error(error instanceof Error ? error.message : String(error));
    process.exit(1);
  });

  server.listen(args.port, '127.0.0.1', () => {
    console.log(`Review server running at http://127.0.0.1:${args.port}`);
  });
}
|
|
406
|
+
|
|
407
|
+
// True only when this module's own file path equals the resolved script path
// given to node, i.e. the file was started directly rather than imported.
const isDirectRun = process.argv[1]
  && path.resolve(process.argv[1]) === fileURLToPath(import.meta.url);

if (isDirectRun) {
  main().catch((error) => {
    const message = error instanceof Error ? error.message : String(error);
    console.error(message);
    process.exit(1);
  });
}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
 * Options accepted by `gradeRun` and `gradeWorkspace`.
 *
 * Only `workspaceDir` is required. The meaning of each optional field is
 * defined by the implementation, which is not part of this declaration file.
 */
export interface GradeWorkspaceOptions {
  /** Workspace directory. */
  workspaceDir: string;
  /** NOTE(review): presumably the model provider used for grading; confirm. */
  provider?: string;
  /** NOTE(review): presumably the model name used for grading; confirm. */
  model?: string;
  reasoningMode?: string;
  /** NOTE(review): presumably an iteration limit for the grader; confirm. */
  maxIter?: number;
  cwd?: string;
  /** NOTE(review): presumably allows replacing existing grading results; confirm. */
  overwrite?: boolean;
  configs?: string[];
}
|
|
11
|
+
|
|
12
|
+
/** One entry of `GradingDocument.expectations`. */
export interface GradedExpectation {
  /** Text of the expectation. */
  text: string;
  /** Whether the expectation was judged as met. */
  passed: boolean;
  /** NOTE(review): presumably the grader's justification for `passed`; confirm. */
  evidence: string;
}
|
|
17
|
+
|
|
18
|
+
/**
 * Shape of the grading result returned by `gradeRun` in its `grading` field.
 *
 * NOTE(review): presumably also the schema of each run's grading.json;
 * confirm against the implementation.
 */
export interface GradingDocument {
  /** Aggregate counts over `expectations`. */
  summary: {
    passed: number;
    failed: number;
    total: number;
    /** NOTE(review): presumably passed / total; confirm range and rounding. */
    pass_rate: number;
  };
  /** Per-expectation results. */
  expectations: GradedExpectation[];
  execution_metrics: {
    total_tool_calls: number;
    errors_encountered: number;
    output_chars: number;
  };
  user_notes_summary: {
    uncertainties: string[];
    needs_review: string[];
    workarounds: string[];
  };
  overall_summary: string;
  timing: {
    total_tokens: number;
    total_duration_seconds: number;
  };
  meta: {
    /** NOTE(review): presumably an ISO-8601 timestamp; confirm. */
    generated_at: string;
    eval_id: number | string | null;
    eval_name: string | null;
    config: string;
    run_id: string;
  };
}
|
|
49
|
+
|
|
50
|
+
/**
 * Builds the prompt text used for grading.
 *
 * @param input Key/value data the prompt is built from; the expected keys are
 *              defined by the implementation, not by this declaration.
 * @returns The prompt string.
 */
export function buildGradingPrompt(
  input: Record<string, unknown>,
): string;
|
|
51
|
+
|
|
52
|
+
/**
 * Grades a single run directory.
 *
 * @param runDir Directory of the run to grade.
 * @param options Grading options.
 * @param runner Optional function that receives a prompt plus options and
 *               resolves to the raw response text.
 *               NOTE(review): presumably replaces the default model call,
 *               e.g. for tests; confirm.
 * @returns The run directory, the grading document, the prompt and the raw response.
 */
export function gradeRun(
  runDir: string,
  options: GradeWorkspaceOptions,
  runner?: (prompt: string, options: Record<string, unknown>) => Promise<string>,
): Promise<{
  runDir: string;
  grading: GradingDocument;
  prompt: string;
  rawResponse: string;
}>;
|
|
62
|
+
|
|
63
|
+
/**
 * Grades the runs of a workspace and reports which were processed or skipped.
 *
 * @param options Grading options; `workspaceDir` names the workspace.
 * @param runner Optional function that receives a prompt plus options and
 *               resolves to the raw response text.
 * @returns A summary containing processed/skipped counts and the
 *          corresponding run lists.
 */
export function gradeWorkspace(
  options: GradeWorkspaceOptions,
  runner?: (prompt: string, options: Record<string, unknown>) => Promise<string>,
): Promise<{
  workspace: string;
  generated_at: string;
  processed: number;
  skipped: number;
  processed_runs: Record<string, unknown>[];
  skipped_runs: string[];
}>;
|