@kodax-ai/kodax-cli 0.7.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +1304 -0
- package/LICENSE +191 -0
- package/README.md +1167 -0
- package/README_CN.md +631 -0
- package/dist/builtin/code-review/SKILL.md +63 -0
- package/dist/builtin/git-workflow/SKILL.md +84 -0
- package/dist/builtin/skill-creator/SKILL.md +122 -0
- package/dist/builtin/skill-creator/agents/analyzer.md +12 -0
- package/dist/builtin/skill-creator/agents/comparator.md +13 -0
- package/dist/builtin/skill-creator/agents/grader.md +13 -0
- package/dist/builtin/skill-creator/references/schemas.md +227 -0
- package/dist/builtin/skill-creator/scripts/aggregate-benchmark.d.ts +46 -0
- package/dist/builtin/skill-creator/scripts/aggregate-benchmark.js +209 -0
- package/dist/builtin/skill-creator/scripts/analyze-benchmark.d.ts +46 -0
- package/dist/builtin/skill-creator/scripts/analyze-benchmark.js +289 -0
- package/dist/builtin/skill-creator/scripts/compare-runs.d.ts +62 -0
- package/dist/builtin/skill-creator/scripts/compare-runs.js +333 -0
- package/dist/builtin/skill-creator/scripts/generate-review.d.ts +33 -0
- package/dist/builtin/skill-creator/scripts/generate-review.js +415 -0
- package/dist/builtin/skill-creator/scripts/grade-evals.d.ts +73 -0
- package/dist/builtin/skill-creator/scripts/grade-evals.js +405 -0
- package/dist/builtin/skill-creator/scripts/improve-description.d.ts +23 -0
- package/dist/builtin/skill-creator/scripts/improve-description.js +161 -0
- package/dist/builtin/skill-creator/scripts/init-skill.d.ts +14 -0
- package/dist/builtin/skill-creator/scripts/init-skill.js +153 -0
- package/dist/builtin/skill-creator/scripts/install-skill.d.ts +29 -0
- package/dist/builtin/skill-creator/scripts/install-skill.js +176 -0
- package/dist/builtin/skill-creator/scripts/package-skill.d.ts +38 -0
- package/dist/builtin/skill-creator/scripts/package-skill.js +124 -0
- package/dist/builtin/skill-creator/scripts/quick-validate.d.ts +8 -0
- package/dist/builtin/skill-creator/scripts/quick-validate.js +166 -0
- package/dist/builtin/skill-creator/scripts/run-eval.d.ts +66 -0
- package/dist/builtin/skill-creator/scripts/run-eval.js +356 -0
- package/dist/builtin/skill-creator/scripts/run-loop.d.ts +49 -0
- package/dist/builtin/skill-creator/scripts/run-loop.js +243 -0
- package/dist/builtin/skill-creator/scripts/run-trigger-eval.d.ts +58 -0
- package/dist/builtin/skill-creator/scripts/run-trigger-eval.js +225 -0
- package/dist/builtin/skill-creator/scripts/utils.js +278 -0
- package/dist/builtin/tdd/SKILL.md +56 -0
- package/dist/index.js +1717 -0
- package/dist/kodax_cli.js +1870 -0
- package/package.json +122 -0
- package/scripts/kodax-bin.cjs +27 -0
- package/scripts/production-env.cjs +16 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
 * One grading assertion attached to an eval case. Recorded into the run's
 * eval metadata; grading semantics are defined by the grading scripts
 * (see grade-evals), not here.
 */
export interface SkillEvalAssertion {
  /** Assertion text in natural language. */
  text: string;
}
|
|
4
|
+
|
|
5
|
+
/**
 * One eval case from the `evals` array of an evals JSON document.
 * Either `prompt` or `query` supplies the task text (`prompt` wins when
 * both are present); every field is optional and missing values fall back
 * to defaults downstream (index-based ids/names, empty strings).
 */
export interface SkillEvalItem {
  /** Stable identifier; defaults to the item's array index when absent. */
  id?: string | number;
  /** Human-readable name; defaults to `eval-<index>` when absent. */
  name?: string;
  /** Task prompt text (preferred over `query`). */
  prompt?: string;
  /** Legacy alias for the task prompt, used only when `prompt` is absent. */
  query?: string;
  /** Reference answer recorded alongside the run for later grading. */
  expected_output?: string;
  /**
   * Input files inlined into the prompt. Relative paths are resolved
   * against the evals file's directory first, then against `cwd`.
   */
  files?: string[];
  /** Assertions as bare strings or `{ text }` objects; invalid entries are dropped. */
  assertions?: Array<string | SkillEvalAssertion>;
}
|
|
14
|
+
|
|
15
|
+
/**
 * Outcome of one agent run against one eval prompt, as produced by the
 * runner passed to (or defaulted by) `runEvalWorkspace`.
 */
export interface SkillEvalExecution {
  /** Raw agent result for the run. */
  result: {
    /** Whether the agent run completed successfully. */
    success: boolean;
    /** Final assistant text emitted by the run (may be empty). */
    lastText: string;
    /** Optional terminal signal reported by the agent. */
    signal?: 'COMPLETE' | 'BLOCKED' | 'DECIDE';
    /** Explanation accompanying `signal`, when present. */
    signalReason?: string;
    /** Full message transcript of the run. */
    messages: Array<Record<string, unknown>>;
    /** Session identifier used to correlate run artifacts. */
    sessionId: string;
    /** True when the run was interrupted before finishing. */
    interrupted?: boolean;
    /** True when an iteration or resource limit stopped the run. */
    limitReached?: boolean;
  };
  /** Estimated token usage across the whole run. */
  totalTokens: number;
  /** Wall-clock duration of the run in milliseconds. */
  durationMs: number;
}
|
|
29
|
+
|
|
30
|
+
/**
 * Options for `runEvalWorkspace`. Defaults noted below are applied by the
 * implementation in run-eval.js.
 */
export interface RunEvalWorkspaceOptions {
  /** Directory of the skill under evaluation. */
  skillPath: string;
  /** Path to the evals JSON document (its `evals` array is executed). */
  evalsPath: string;
  /** Directory where per-eval/per-config/per-run artifacts are written. */
  workspaceDir: string;
  /** LLM provider name; defaults to 'anthropic'. */
  provider?: string;
  /** Model identifier passed through to the provider. */
  model?: string;
  /** Runs per config per eval; defaults to 1 (non-finite/<=0 values are ignored). */
  runsPerConfig?: number;
  /** Maximum agent iterations per run; defaults to 30. */
  maxIter?: number;
  /** Reasoning mode; defaults to 'off' (any other truthy value enables thinking). */
  reasoningMode?: string;
  /** Working directory for the agent and input-file fallback resolution; defaults to process.cwd(). */
  cwd?: string;
  /** Config names to run; defaults to ['with_skill', 'without_skill']. */
  configs?: string[];
  /** Optional path for the JSON report (CLI only; stdout when omitted). */
  output?: string;
}
|
|
43
|
+
|
|
44
|
+
/**
 * Build the final task prompt for an eval item: inlines any referenced
 * input files (resolved relative to the evals file, then `cwd`) ahead of
 * the item's `prompt`/`query` text.
 *
 * @throws When a referenced input file cannot be found at either location.
 */
export function buildEvalPrompt(
  evalItem: SkillEvalItem,
  options: Pick<RunEvalWorkspaceOptions, 'evalsPath' | 'cwd'>
): Promise<string>;
|
|
48
|
+
|
|
49
|
+
/**
 * Execute every eval in the evals document against every config, writing
 * per-run artifacts under `workspaceDir/eval-<i>/<config>/run-<n>/` and
 * returning a summary report.
 *
 * @param options Run configuration; see RunEvalWorkspaceOptions defaults.
 * @param runner  Optional agent runner (injected for testing); when omitted
 *                the built-in KodaX SDK runner is used.
 * @throws When the skill at `options.skillPath` cannot be loaded.
 */
export function runEvalWorkspace(
  options: RunEvalWorkspaceOptions,
  runner?: (
    prompt: string,
    options: RunEvalWorkspaceOptions & {
      configName: string;
      evalItem: SkillEvalItem;
      runIndex: number;
    }
  ) => Promise<SkillEvalExecution>
): Promise<{
  /** Absolute workspace directory used for artifacts. */
  workspace: string;
  /** Name of the loaded skill. */
  skill_name: string;
  /** Number of eval cases executed. */
  eval_count: number;
  /** Config names that were run. */
  configs: string[];
  /** Effective runs per config. */
  runs_per_config: number;
  /** Per-eval summaries (per-config, per-run metrics). */
  reports: Array<Record<string, unknown>>;
}>;
|
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { readFile, writeFile } from 'node:fs/promises';
|
|
4
|
+
import path from 'node:path';
|
|
5
|
+
import { fileURLToPath } from 'node:url';
|
|
6
|
+
import { expandSkillForLLM } from '../../../skill-expander.js';
|
|
7
|
+
import { loadFullSkill } from '../../../skill-loader.js';
|
|
8
|
+
import { ensureDirectory, loadKodaXSDK } from './utils.js';
|
|
9
|
+
|
|
10
|
+
/**
 * Coerce one eval assertion entry into canonical `{ text }` form.
 * Accepts a bare string or an object carrying a string `text` property;
 * anything else yields null so callers can filter it out.
 */
function normalizeAssertion(assertion) {
  if (typeof assertion === 'string') {
    return { text: assertion };
  }
  const text = assertion && typeof assertion === 'object' ? assertion.text : undefined;
  return typeof text === 'string' ? { text } : null;
}
|
|
19
|
+
|
|
20
|
+
/**
 * Count `tool_use` content blocks across a message transcript.
 * Messages whose `content` is not an array contribute nothing.
 */
function countToolCalls(messages) {
  let total = 0;
  for (const message of messages) {
    if (!Array.isArray(message.content)) {
      continue;
    }
    for (const block of message.content) {
      if (block?.type === 'tool_use') {
        total += 1;
      }
    }
  }
  return total;
}
|
|
28
|
+
|
|
29
|
+
/**
 * Count `tool_result` content blocks flagged with `is_error === true`
 * across a message transcript. Non-array `content` contributes nothing.
 */
function countToolErrors(messages) {
  let total = 0;
  for (const message of messages) {
    if (!Array.isArray(message.content)) {
      continue;
    }
    for (const block of message.content) {
      if (block?.type === 'tool_result' && block.is_error === true) {
        total += 1;
      }
    }
  }
  return total;
}
|
|
37
|
+
|
|
38
|
+
/**
 * Render a markdown transcript for one eval run: the prompt, the agent's
 * final text (with a placeholder when empty), and the run's terminal flags.
 */
function renderTranscript(prompt, result) {
  const promptSection = ['# Skill Eval Transcript', '', '## Eval Prompt', '', prompt, ''];
  const responseSection = ['## Final Response', '', result.lastText || '(No final text)', ''];
  const flagSection = [
    '## Result Flags',
    '',
    `- success: ${result.success}`,
    `- signal: ${result.signal ?? 'none'}`,
    `- interrupted: ${result.interrupted === true}`,
    `- limit_reached: ${result.limitReached === true}`,
  ];

  return `${[...promptSection, ...responseSection, ...flagSection].join('\n')}\n`;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Read the eval's input files and render them as a markdown section.
 * Each path is tried relative to the evals file's directory first, then
 * relative to the working directory; absolute paths are used as-is.
 * Returns '' when there are no files.
 *
 * @throws When a file cannot be read at either location.
 */
async function readInputFiles(files, options) {
  const evalsDir = path.dirname(path.resolve(options.evalsPath));
  const workingDir = path.resolve(options.cwd ?? process.cwd());
  const rendered = [];

  for (const filePath of files ?? []) {
    const primary = path.isAbsolute(filePath) ? filePath : path.resolve(evalsDir, filePath);
    const secondary = path.resolve(workingDir, filePath);

    let usedPath = primary;
    let content = await readFile(primary, 'utf8').catch(() => null);
    if (content == null) {
      // Fall back to resolving against the working directory.
      usedPath = secondary;
      content = await readFile(secondary, 'utf8').catch(() => null);
    }
    if (content == null) {
      throw new Error(`Input file not found for eval: ${filePath}`);
    }

    rendered.push(`### ${filePath}\n\`\`\`\n${content}\n\`\`\`\n(Resolved from ${usedPath})`);
  }

  return rendered.length === 0 ? '' : `## Input Files\n\n${rendered.join('\n\n')}\n\n`;
}
|
|
94
|
+
|
|
95
|
+
/**
 * Assemble the full eval prompt: optional inlined input files followed by
 * the eval's own `prompt` (or legacy `query`) text, trimmed.
 */
export async function buildEvalPrompt(evalItem, options) {
  const task = String(evalItem.prompt ?? evalItem.query ?? '').trim();
  const inputSection = await readInputFiles(evalItem.files, options);
  return (inputSection + task).trim();
}
|
|
100
|
+
|
|
101
|
+
/**
 * Default runner: execute the prompt via the KodaX SDK and time the run.
 * Applies the same option defaults documented on RunEvalWorkspaceOptions
 * (provider 'anthropic', maxIter 30, reasoningMode 'off').
 */
async function defaultRunAgent(prompt, options) {
  const { runKodaX, estimateTokens } = await loadKodaXSDK();
  const startedAt = Date.now();

  const runConfig = {
    provider: options.provider ?? 'anthropic',
    model: options.model,
    maxIter: options.maxIter ?? 30,
    reasoningMode: options.reasoningMode ?? 'off',
    // Thinking is enabled only for an explicit mode other than 'off'.
    thinking: Boolean(options.reasoningMode && options.reasoningMode !== 'off'),
    context: {
      gitRoot: path.resolve(options.cwd ?? process.cwd()),
    },
  };

  const result = await runKodaX(runConfig, prompt);

  return {
    result,
    totalTokens: estimateTokens(result.messages),
    durationMs: Date.now() - startedAt,
  };
}
|
|
124
|
+
|
|
125
|
+
/**
 * Produce the prompt actually sent for a given config: for 'with_skill'
 * the task prompt is expanded with the skill's content; every other
 * config gets the bare task prompt.
 */
async function prepareConfigPrompt(configName, skill, taskPrompt, options) {
  if (configName !== 'with_skill') {
    return taskPrompt;
  }

  const root = path.resolve(options.cwd ?? process.cwd());
  const expanded = await expandSkillForLLM(skill, taskPrompt, {
    workingDirectory: root,
    projectRoot: root,
    sessionId: 'skill-eval',
    environment: {},
  });
  return expanded.content;
}
|
|
142
|
+
|
|
143
|
+
/**
 * Persist all artifacts for a single eval run under `runDir`:
 * outputs/result.md, outputs/prompt.txt, outputs/metrics.json,
 * outputs/messages.json, transcript.md, timing.json, and result.json.
 *
 * The seven writes target distinct files and do not depend on each other,
 * so they are issued concurrently instead of awaited one-by-one.
 *
 * @param runDir     Run directory (outputs/ is created inside it).
 * @param configName Config this run belongs to (recorded in metrics).
 * @param evalItem   Eval case (only its id is recorded).
 * @param prompt     Prompt that was sent to the agent.
 * @param execution  SkillEvalExecution produced by the runner.
 */
async function writeRunArtifacts(runDir, configName, evalItem, prompt, execution) {
  const outputsDir = path.join(runDir, 'outputs');
  await ensureDirectory(outputsDir);

  const executionMetrics = {
    total_tool_calls: countToolCalls(execution.result.messages),
    errors_encountered: countToolErrors(execution.result.messages),
    output_chars: execution.result.lastText.length,
  };

  // Shared serializer: pretty-printed JSON with a trailing newline.
  const toJson = (value) => `${JSON.stringify(value, null, 2)}\n`;

  await Promise.all([
    writeFile(path.join(outputsDir, 'result.md'), `${execution.result.lastText}\n`, 'utf8'),
    writeFile(path.join(outputsDir, 'prompt.txt'), `${prompt}\n`, 'utf8'),
    writeFile(path.join(runDir, 'transcript.md'), renderTranscript(prompt, execution.result), 'utf8'),
    writeFile(
      path.join(outputsDir, 'metrics.json'),
      toJson({
        config: configName,
        eval_id: evalItem.id ?? null,
        session_id: execution.result.sessionId,
        ...executionMetrics,
      }),
      'utf8'
    ),
    writeFile(
      path.join(outputsDir, 'messages.json'),
      toJson(execution.result.messages),
      'utf8'
    ),
    writeFile(
      path.join(runDir, 'timing.json'),
      toJson({
        total_tokens: execution.totalTokens,
        duration_ms: execution.durationMs,
        total_duration_seconds: Number((execution.durationMs / 1000).toFixed(4)),
      }),
      'utf8'
    ),
    writeFile(
      path.join(runDir, 'result.json'),
      toJson({
        success: execution.result.success,
        signal: execution.result.signal ?? null,
        signal_reason: execution.result.signalReason ?? null,
        interrupted: execution.result.interrupted === true,
        limit_reached: execution.result.limitReached === true,
        session_id: execution.result.sessionId,
        execution_metrics: executionMetrics,
      }),
      'utf8'
    ),
  ]);
}
|
|
194
|
+
|
|
195
|
+
/**
 * Execute every eval in the evals document against every config, writing
 * per-run artifacts under `workspaceDir/eval-<i>/<config>/run-<n>/` and
 * returning a summary report.
 *
 * @param options RunEvalWorkspaceOptions (defaults applied below).
 * @param runner  Agent runner; defaults to the KodaX SDK runner. Receives
 *                the prompt plus options extended with configName,
 *                evalItem, and runIndex.
 * @returns Summary: workspace path, skill name, eval count, configs,
 *          runs per config, and per-eval reports.
 * @throws When the skill cannot be loaded, the evals file is unreadable
 *         or not valid JSON, or an input file is missing.
 */
export async function runEvalWorkspace(
  options,
  runner = defaultRunAgent
) {
  const skill = await loadFullSkill(path.resolve(options.skillPath), 'user');
  if (!skill) {
    throw new Error(`Failed to load skill from ${options.skillPath}`);
  }

  const workspaceDir = path.resolve(options.workspaceDir);
  await ensureDirectory(workspaceDir);

  // Only the document's `evals` array is executed; anything else is ignored.
  const evalDocument = JSON.parse(await readFile(options.evalsPath, 'utf8'));
  const evals = Array.isArray(evalDocument.evals) ? evalDocument.evals : [];
  const configs = options.configs?.length ? options.configs : ['with_skill', 'without_skill'];
  // Guard against NaN/negative/fractional run counts; default to 1.
  const runsPerConfig = Number.isFinite(options.runsPerConfig) && options.runsPerConfig > 0
    ? Math.floor(options.runsPerConfig)
    : 1;
  const reports = [];

  for (let evalIndex = 0; evalIndex < evals.length; evalIndex += 1) {
    const evalItem = evals[evalIndex];
    const evalDir = path.join(workspaceDir, `eval-${evalIndex}`);
    await ensureDirectory(evalDir);
    // Record the case definition next to its runs for later grading.
    await writeFile(
      path.join(evalDir, 'eval_metadata.json'),
      `${JSON.stringify({
        eval_id: evalItem.id ?? evalIndex,
        eval_name: evalItem.name ?? `eval-${evalIndex}`,
        prompt: evalItem.prompt ?? evalItem.query ?? '',
        expected_output: evalItem.expected_output ?? '',
        assertions: (Array.isArray(evalItem.assertions) ? evalItem.assertions : [])
          .map(normalizeAssertion)
          .filter(Boolean),
      }, null, 2)}\n`,
      'utf8'
    );

    // Base prompt (input files + task text) is shared across configs/runs.
    const taskPrompt = await buildEvalPrompt(evalItem, options);
    const configReports = {};

    for (const configName of configs) {
      const configDir = path.join(evalDir, configName);
      await ensureDirectory(configDir);
      configReports[configName] = [];

      // Runs are 1-based so directory names read run-1, run-2, ...
      for (let runIndex = 1; runIndex <= runsPerConfig; runIndex += 1) {
        const runDir = path.join(configDir, `run-${runIndex}`);
        await ensureDirectory(runDir);

        // 'with_skill' expands the prompt with the skill; others use it bare.
        const prompt = await prepareConfigPrompt(configName, skill, taskPrompt, options);
        const execution = await runner(prompt, {
          ...options,
          configName,
          evalItem,
          runIndex,
        });
        await writeRunArtifacts(runDir, configName, evalItem, prompt, execution);

        configReports[configName].push({
          run_id: `run-${runIndex}`,
          session_id: execution.result.sessionId,
          success: execution.result.success,
          total_tokens: execution.totalTokens,
          duration_ms: execution.durationMs,
          output_chars: execution.result.lastText.length,
        });
      }
    }

    reports.push({
      eval_id: evalItem.id ?? evalIndex,
      prompt: evalItem.prompt ?? evalItem.query ?? '',
      configs: configReports,
    });
  }

  return {
    workspace: workspaceDir,
    skill_name: skill.name,
    eval_count: evals.length,
    configs,
    runs_per_config: runsPerConfig,
    reports,
  };
}
|
|
281
|
+
|
|
282
|
+
/**
 * Parse CLI flags (starting at argv[2]) into run options.
 * Unknown tokens and flags missing a value are silently ignored; defaults
 * mirror those applied by runEvalWorkspace.
 */
function parseArgs(argv) {
  const args = {
    skillPath: '',
    evalsPath: '',
    workspaceDir: '',
    provider: 'anthropic',
    model: undefined,
    runsPerConfig: 1,
    maxIter: 30,
    reasoningMode: 'off',
    cwd: process.cwd(),
    configs: ['with_skill', 'without_skill'],
    output: undefined,
  };

  // Flag name -> assignment, so the parser is one table lookup per token.
  const handlers = {
    '--skill-path': (value) => { args.skillPath = value; },
    '--evals': (value) => { args.evalsPath = value; },
    '--workspace': (value) => { args.workspaceDir = value; },
    '--provider': (value) => { args.provider = value; },
    '--model': (value) => { args.model = value; },
    '--runs': (value) => { args.runsPerConfig = Number(value); },
    '--max-iter': (value) => { args.maxIter = Number(value); },
    '--reasoning': (value) => { args.reasoningMode = value; },
    '--cwd': (value) => { args.cwd = value; },
    '--configs': (value) => {
      args.configs = value
        .split(',')
        .map((item) => item.trim())
        .filter(Boolean);
    },
    '--output': (value) => { args.output = value; },
  };

  for (let index = 2; index < argv.length; index += 1) {
    const handler = handlers[argv[index]];
    const value = argv[index + 1];
    if (handler && value) {
      handler(value);
      index += 1; // consume the flag's value
    }
  }

  return args;
}
|
|
329
|
+
|
|
330
|
+
/**
 * CLI entry point: parse argv, run the eval workspace, and emit the JSON
 * report to --output (when given) or stdout. Exits 1 when the three
 * required flags are missing.
 */
async function main() {
  const args = parseArgs(process.argv);
  const hasRequired = args.skillPath && args.evalsPath && args.workspaceDir;
  if (!hasRequired) {
    console.error('Usage: node scripts/run-eval.js --skill-path <dir> --evals <evals.json> --workspace <dir> [--provider anthropic] [--runs 1]');
    process.exit(1);
  }

  const report = await runEvalWorkspace(args);
  const serialized = `${JSON.stringify(report, null, 2)}\n`;

  if (args.output) {
    await writeFile(args.output, serialized, 'utf8');
    console.log(`Wrote ${path.resolve(args.output)}`);
    return;
  }
  process.stdout.write(serialized);
}
|
|
347
|
+
|
|
348
|
+
// Run main() only when this script is executed directly (node run-eval.js),
// not when it is imported as a module.
const invokedPath = process.argv[1];
const isDirectRun = Boolean(invokedPath)
  && fileURLToPath(import.meta.url) === path.resolve(invokedPath);

if (isDirectRun) {
  main().catch((error) => {
    console.error(error instanceof Error ? error.message : String(error));
    process.exit(1);
  });
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import type { ImproveDescriptionOptions, ImproveDescriptionResult } from './improve-description.js';
|
|
2
|
+
import type { TriggerEvalOptions, TriggerEvalReport } from './run-trigger-eval.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Snapshot of one iteration of the description-improvement loop.
 */
export interface DescriptionLoopRecord {
  /** Loop iteration this record belongs to. */
  iteration: number;
  /** Skill description evaluated in this iteration. */
  description: string;
  /** Score for this description — format defined by run-loop.js (NOTE(review): a string, not a number; confirm formatting there). */
  score: string;
  /** Trigger-eval summary on the training split. */
  train: TriggerEvalReport['summary'];
  /** Trigger-eval summary on the held-out split; null when no holdout was used. */
  test: TriggerEvalReport['summary'] | null;
  /** Per-case results on the training split. */
  train_results: TriggerEvalReport['results'];
  /** Per-case results on the held-out split. */
  test_results: TriggerEvalReport['results'];
}
|
|
13
|
+
|
|
14
|
+
/**
 * Final report of a description-improvement loop run.
 */
export interface DescriptionLoopReport {
  /** Name of the skill whose description was tuned. */
  skill_name: string;
  /** Description the loop started from. */
  original_description: string;
  /** Description after the last iteration. */
  final_description: string;
  /** Best-scoring description seen across all iterations. */
  best_description: string;
  /** One record per iteration, in order. */
  history: DescriptionLoopRecord[];
  /** Number of eval cases in the training split. */
  train_size: number;
  /** Number of eval cases in the held-out split. */
  test_size: number;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Options for the description-improvement loop, extending the underlying
 * trigger-eval options.
 */
export interface DescriptionLoopOptions extends TriggerEvalOptions {
  /** Directory where loop artifacts are written. */
  workspaceDir: string;
  /** Maximum number of improvement iterations to run. */
  maxIterations: number;
  /** Optional holdout for the test split — NOTE(review): units (count vs fraction) not visible here; see splitEvalSet in run-loop.js. */
  holdout?: number;
  /** Optional seed for a deterministic train/test split. */
  seed?: number;
  /** When true, presumably persists the best description back to the skill — confirm in run-loop.js. */
  writeBest?: boolean;
}
|
|
31
|
+
|
|
32
|
+
/**
 * Split trigger-eval cases into train/test sets. `holdout` controls the
 * test-set size and `seed` makes the split reproducible — exact semantics
 * live in run-loop.js (NOTE(review): whether `holdout` is a count or a
 * fraction is not visible from this declaration).
 */
export function splitEvalSet(
  evals: Array<Record<string, unknown> & { should_trigger?: boolean }>,
  holdout?: number,
  seed?: number
): {
  /** Cases used for iterative improvement. */
  train: Array<Record<string, unknown> & { should_trigger?: boolean }>;
  /** Held-out cases used for final evaluation. */
  test: Array<Record<string, unknown> & { should_trigger?: boolean }>;
};
|
|
40
|
+
|
|
41
|
+
/**
 * Run the description-improvement loop: repeatedly evaluate the skill's
 * trigger behavior and refine its description, up to `maxIterations`.
 *
 * @param options      Loop configuration (extends TriggerEvalOptions).
 * @param dependencies Optional injected implementations (for testing) of
 *                     the trigger-eval and description-improvement steps;
 *                     defaults are used when omitted.
 */
export function runDescriptionLoop(
  options: DescriptionLoopOptions,
  dependencies?: {
    runTriggerEvalFn?: (options: TriggerEvalOptions) => Promise<TriggerEvalReport>;
    improveDescriptionFn?: (
      options: ImproveDescriptionOptions
    ) => Promise<ImproveDescriptionResult>;
  }
): Promise<DescriptionLoopReport>;
|