@kodax-ai/kodax 0.7.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +1350 -0
- package/LICENSE +191 -0
- package/README.md +1170 -0
- package/README_CN.md +659 -0
- package/dist/acp_events.d.ts +109 -0
- package/dist/acp_logger.d.ts +20 -0
- package/dist/acp_server.d.ts +92 -0
- package/dist/builtin/code-review/SKILL.md +63 -0
- package/dist/builtin/git-workflow/SKILL.md +84 -0
- package/dist/builtin/skill-creator/SKILL.md +122 -0
- package/dist/builtin/skill-creator/agents/analyzer.md +12 -0
- package/dist/builtin/skill-creator/agents/comparator.md +13 -0
- package/dist/builtin/skill-creator/agents/grader.md +13 -0
- package/dist/builtin/skill-creator/references/schemas.md +227 -0
- package/dist/builtin/skill-creator/scripts/aggregate-benchmark.d.ts +46 -0
- package/dist/builtin/skill-creator/scripts/aggregate-benchmark.js +209 -0
- package/dist/builtin/skill-creator/scripts/analyze-benchmark.d.ts +46 -0
- package/dist/builtin/skill-creator/scripts/analyze-benchmark.js +289 -0
- package/dist/builtin/skill-creator/scripts/compare-runs.d.ts +62 -0
- package/dist/builtin/skill-creator/scripts/compare-runs.js +333 -0
- package/dist/builtin/skill-creator/scripts/generate-review.d.ts +33 -0
- package/dist/builtin/skill-creator/scripts/generate-review.js +415 -0
- package/dist/builtin/skill-creator/scripts/grade-evals.d.ts +73 -0
- package/dist/builtin/skill-creator/scripts/grade-evals.js +405 -0
- package/dist/builtin/skill-creator/scripts/improve-description.d.ts +23 -0
- package/dist/builtin/skill-creator/scripts/improve-description.js +161 -0
- package/dist/builtin/skill-creator/scripts/init-skill.d.ts +14 -0
- package/dist/builtin/skill-creator/scripts/init-skill.js +153 -0
- package/dist/builtin/skill-creator/scripts/install-skill.d.ts +29 -0
- package/dist/builtin/skill-creator/scripts/install-skill.js +176 -0
- package/dist/builtin/skill-creator/scripts/package-skill.d.ts +38 -0
- package/dist/builtin/skill-creator/scripts/package-skill.js +124 -0
- package/dist/builtin/skill-creator/scripts/quick-validate.d.ts +8 -0
- package/dist/builtin/skill-creator/scripts/quick-validate.js +166 -0
- package/dist/builtin/skill-creator/scripts/run-eval.d.ts +66 -0
- package/dist/builtin/skill-creator/scripts/run-eval.js +356 -0
- package/dist/builtin/skill-creator/scripts/run-loop.d.ts +49 -0
- package/dist/builtin/skill-creator/scripts/run-loop.js +243 -0
- package/dist/builtin/skill-creator/scripts/run-trigger-eval.d.ts +58 -0
- package/dist/builtin/skill-creator/scripts/run-trigger-eval.js +225 -0
- package/dist/builtin/skill-creator/scripts/utils.js +273 -0
- package/dist/builtin/tdd/SKILL.md +56 -0
- package/dist/chunks/chunk-4E76FLZ3.js +2 -0
- package/dist/chunks/chunk-7LQ2NCHF.js +1221 -0
- package/dist/chunks/chunk-HUAU4KB3.js +2 -0
- package/dist/chunks/chunk-N2VZ2MJF.js +11 -0
- package/dist/chunks/chunk-SF7WD7E5.js +2 -0
- package/dist/chunks/chunk-SONW6AC7.js +14 -0
- package/dist/chunks/chunk-WEEQZYZS.js +460 -0
- package/dist/chunks/chunk-XI75LZIO.js +30 -0
- package/dist/chunks/compaction-config-YL4SWWII.js +2 -0
- package/dist/chunks/construction-bootstrap-XSE7ZABG.js +5 -0
- package/dist/chunks/devtools-MOFU7YQF.js +2 -0
- package/dist/chunks/dist-AMUYI7R5.js +2 -0
- package/dist/chunks/dist-WKW4CBG6.js +2 -0
- package/dist/chunks/utils-3HW4KOGE.js +2 -0
- package/dist/cli_commands.d.ts +17 -0
- package/dist/cli_option_helpers.d.ts +49 -0
- package/dist/cli_option_helpers.test.d.ts +1 -0
- package/dist/constructed_cli.d.ts +82 -0
- package/dist/constructed_cli.test.d.ts +1 -0
- package/dist/index.d.ts +10 -0
- package/dist/index.js +9 -0
- package/dist/kodax_cli.d.ts +7 -0
- package/dist/kodax_cli.js +1882 -0
- package/dist/sdk-agent.d.ts +15 -0
- package/dist/sdk-agent.js +2 -0
- package/dist/sdk-coding.d.ts +20 -0
- package/dist/sdk-coding.js +2 -0
- package/dist/sdk-llm.d.ts +15 -0
- package/dist/sdk-llm.js +2 -0
- package/dist/sdk-repl.d.ts +21 -0
- package/dist/sdk-repl.js +2 -0
- package/dist/sdk-skills.d.ts +16 -0
- package/dist/sdk-skills.js +2 -0
- package/dist/self_modify_cli.d.ts +81 -0
- package/dist/self_modify_cli.test.d.ts +9 -0
- package/dist/skill_cli.d.ts +15 -0
- package/dist/skill_cli.test.d.ts +1 -0
- package/package.json +143 -0
- package/scripts/kodax-bin.cjs +27 -0
- package/scripts/production-env.cjs +16 -0
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { readFile, writeFile } from 'node:fs/promises';
|
|
4
|
+
import path from 'node:path';
|
|
5
|
+
import { fileURLToPath } from 'node:url';
|
|
6
|
+
import {
|
|
7
|
+
buildBenchmarkDocument,
|
|
8
|
+
loadRunResults,
|
|
9
|
+
} from './aggregate-benchmark.js';
|
|
10
|
+
import {
|
|
11
|
+
extractJsonObject,
|
|
12
|
+
loadKodaXSDK,
|
|
13
|
+
loadRelativeText,
|
|
14
|
+
readJsonFile,
|
|
15
|
+
truncateText,
|
|
16
|
+
} from './utils.js';
|
|
17
|
+
|
|
18
|
+
/**
 * Coerce an arbitrary value into a list of non-empty, trimmed strings.
 *
 * @param {unknown} value - Candidate list; anything that is not an array yields `[]`.
 * @returns {string[]} Every element stringified (null/undefined become ''),
 *   trimmed, and kept only when the trimmed text is non-empty.
 */
function normalizeStringArray(value) {
  const cleaned = [];
  if (Array.isArray(value)) {
    for (const item of value) {
      const text = String(item ?? '').trim();
      if (text) {
        cleaned.push(text);
      }
    }
  }
  return cleaned;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Summarise, per configuration, which expectations failed most often and
 * which free-form notes the runs carried.
 *
 * An expectation counts as a failure unless its `passed` field is exactly
 * `true`; expectations whose trimmed `text` is empty are ignored.
 *
 * @param {Record<string, Iterable<object>>} configRuns - Runs grouped by config name.
 * @returns {Record<string, {repeated_failures: {text: string, count: number}[], notes: string[]}>}
 *   For each config: the ten most frequent failure texts (highest count first)
 *   and the first ten non-empty notes in encounter order.
 */
function summarizeFailureClusters(configRuns) {
  const summary = {};

  Object.entries(configRuns).forEach(([configName, runs]) => {
    const tally = new Map();
    const collectedNotes = [];

    for (const run of runs) {
      const expectations = run.expectations ?? [];
      for (const expectation of expectations) {
        const label = expectation?.passed === true
          ? ''
          : String(expectation?.text ?? '').trim();
        if (label) {
          tally.set(label, (tally.get(label) ?? 0) + 1);
        }
      }

      const runNotes = run.notes ?? [];
      for (const rawNote of runNotes) {
        const noteText = String(rawNote ?? '').trim();
        if (noteText) {
          collectedNotes.push(noteText);
        }
      }
    }

    // Array sort is stable, so equally frequent failures keep first-seen order.
    const ranked = [...tally].sort(([, countA], [, countB]) => countB - countA);

    summary[configName] = {
      repeated_failures: ranked.slice(0, 10).map(([text, count]) => ({ text, count })),
      notes: collectedNotes.slice(0, 10),
    };
  });

  return summary;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Turn the analyst's raw text reply into the analysis document.
 *
 * Values outside the allowed enumerations fall back to the most cautious
 * option ('inconclusive' / 'needs_manual_review'); list fields are cleaned
 * with `normalizeStringArray`; metric deltas missing from the benchmark are
 * reported as 'n/a'.
 *
 * @param {string} rawText - Analyst reply, passed to `extractJsonObject`.
 * @param {object} benchmark - Benchmark document; `skill_name`, `workspace` and `delta` are read.
 * @param {object} failureClusters - Stored unchanged under `failure_clusters`.
 * @returns {object} The normalised analysis document.
 */
function normalizeAnalysisResult(rawText, benchmark, failureClusters) {
  const payload = extractJsonObject(rawText) ?? {};
  const oneOf = (allowed, candidate, fallback) => (
    allowed.includes(candidate) ? candidate : fallback
  );
  const deltaOrNa = (metric) => benchmark.delta?.[metric] ?? 'n/a';

  return {
    skill_name: benchmark.skill_name,
    generated_at: new Date().toISOString(),
    workspace: benchmark.workspace,
    verdict: oneOf(
      ['improves', 'regresses', 'mixed', 'inconclusive'],
      payload.verdict,
      'inconclusive'
    ),
    release_readiness: oneOf(
      ['ready', 'needs_iteration', 'needs_manual_review'],
      payload.release_readiness,
      'needs_manual_review'
    ),
    recommendation: String(payload.recommendation ?? '').trim(),
    key_findings: normalizeStringArray(payload.key_findings),
    variance_hotspots: normalizeStringArray(payload.variance_hotspots),
    suggested_actions: normalizeStringArray(payload.suggested_actions),
    watchouts: normalizeStringArray(payload.watchouts),
    supporting_metrics: {
      pass_rate_delta: deltaOrNa('pass_rate'),
      time_seconds_delta: deltaOrNa('time_seconds'),
      tokens_delta: deltaOrNa('tokens'),
    },
    failure_clusters: failureClusters,
  };
}
|
|
92
|
+
|
|
93
|
+
/**
 * Assemble the prompt sent to the benchmark analyst.
 *
 * The prompt consists of the trimmed agent instructions, the JSON shape the
 * analyst must return, a benchmark summary (skill name, configs, delta) capped
 * via `truncateText(..., 12000)` and the failure clusters capped via
 * `truncateText(..., 8000)`.
 *
 * @param {{agentInstructions: string, benchmark: object, failureClusters: object}} input
 * @returns {string} The complete prompt text.
 */
export function buildAnalysisPrompt(input) {
  const instructions = input.agentInstructions.trim();
  const { benchmark } = input;
  const benchmarkSummary = truncateText(
    JSON.stringify(
      {
        skill_name: benchmark.skill_name,
        configs: benchmark.configs,
        delta: benchmark.delta,
      },
      null,
      2
    ),
    12000
  );
  const failureSummary = truncateText(JSON.stringify(input.failureClusters, null, 2), 8000);

  // NOTE(review): the reviewed rendering strips leading whitespace, so any
  // original indentation of the JSON keys below could not be confirmed.
  return `${instructions}

Return JSON with this shape:
{
"verdict": "improves | regresses | mixed | inconclusive",
"release_readiness": "ready | needs_iteration | needs_manual_review",
"recommendation": "short recommendation",
"key_findings": [],
"variance_hotspots": [],
"suggested_actions": [],
"watchouts": []
}

## Benchmark Summary
${benchmarkSummary}

## Failure Clusters
${failureSummary}
`;
}
|
|
118
|
+
|
|
119
|
+
/**
 * Default analyst runner: sends the prompt through the KodaX SDK's `runKodaX`.
 *
 * @param {string} prompt - Prompt text to submit.
 * @param {object} options - Reads `provider` (default 'anthropic'), `model`,
 *   `maxIter` (default 20), `reasoningMode` (default 'balanced') and
 *   `cwd` / `workspaceDir` for the git root (falls back to `process.cwd()`).
 * @returns {Promise<string>} The `lastText` field of the SDK result.
 */
async function defaultRunAnalyst(prompt, options) {
  const { runKodaX } = await loadKodaXSDK();
  const { reasoningMode } = options;
  const gitRoot = path.resolve(options.cwd ?? options.workspaceDir ?? process.cwd());

  const settings = {
    provider: options.provider ?? 'anthropic',
    model: options.model,
    maxIter: options.maxIter ?? 20,
    reasoningMode: reasoningMode ?? 'balanced',
    // Thinking stays enabled unless the reasoning mode is explicitly 'off'.
    thinking: reasoningMode !== 'off',
    context: { gitRoot },
  };

  const { lastText } = await runKodaX(settings, prompt);
  return lastText;
}
|
|
136
|
+
|
|
137
|
+
/**
 * Render an analysis document as a Markdown report.
 *
 * List sections that are missing, empty, or not arrays render as "- None".
 * Supporting metrics that are missing (including a missing
 * `supporting_metrics` object) render as "n/a" instead of throwing, so a
 * partially populated document can still be rendered.
 *
 * @param {object} analysis - Reads `skill_name`, `generated_at`, `verdict`,
 *   `release_readiness`, `recommendation`, the four list fields and
 *   `supporting_metrics`.
 * @returns {string} Markdown text terminated by a newline.
 */
export function renderAnalysisMarkdown(analysis) {
  const lines = [
    `# Benchmark Analysis: ${analysis.skill_name}`,
    '',
    `Generated: ${analysis.generated_at}`,
    '',
    `- Verdict: ${analysis.verdict}`,
    `- Release readiness: ${analysis.release_readiness}`,
    `- Recommendation: ${analysis.recommendation || 'n/a'}`,
    '',
  ];

  const sections = [
    ['key_findings', 'Key Findings'],
    ['variance_hotspots', 'Variance Hotspots'],
    ['suggested_actions', 'Suggested Actions'],
    ['watchouts', 'Watchouts'],
  ];

  for (const [field, title] of sections) {
    const items = Array.isArray(analysis[field]) ? analysis[field] : [];
    lines.push(`## ${title}`, '');
    if (items.length === 0) {
      lines.push('- None');
    } else {
      lines.push(...items.map((item) => `- ${item}`));
    }
    lines.push('');
  }

  // Documents loaded from disk may lack the metrics object entirely.
  const metrics = analysis.supporting_metrics ?? {};
  lines.push(
    '## Supporting Metrics',
    '',
    `- Pass rate delta: ${metrics.pass_rate_delta ?? 'n/a'}`,
    `- Time delta: ${metrics.time_seconds_delta ?? 'n/a'}`,
    `- Tokens delta: ${metrics.tokens_delta ?? 'n/a'}`
  );

  return `${lines.join('\n')}\n`;
}
|
|
178
|
+
|
|
179
|
+
/**
 * Analyse a benchmark workspace and write `analysis.json` / `analysis.md`.
 *
 * If no benchmark document can be read from `benchmarkPath`, one is built from
 * the run results found in the workspace and written to `benchmarkPath`.
 * Run results are loaded exactly once and shared between the benchmark
 * fallback and the failure-cluster summary.
 *
 * @param {object} options - Reads `workspaceDir` (required), `benchmarkPath`,
 *   `skillName`, `outputPath` and `markdownPath`; the whole object is also
 *   forwarded to the runner.
 * @param {(prompt: string, options: object) => Promise<string>} [runner] -
 *   Produces the analyst's raw reply; defaults to the KodaX-backed runner.
 * @returns {Promise<{analysis: object, prompt: string, rawResponse: string,
 *   analysisJsonPath: string, analysisMdPath: string}>}
 * @throws {Error} When there is neither a benchmark document nor any run result.
 */
export async function analyzeBenchmark(
  options,
  runner = defaultRunAnalyst
) {
  const workspaceDir = path.resolve(options.workspaceDir);
  const benchmarkPath = path.resolve(options.benchmarkPath ?? path.join(workspaceDir, 'benchmark.json'));
  let benchmark = await readJsonFile(benchmarkPath, null);

  // Needed for the failure clusters in every case, and for rebuilding the
  // benchmark when it is missing, so load it a single time up front.
  const configRuns = await loadRunResults(workspaceDir);

  if (!benchmark) {
    if (Object.keys(configRuns).length === 0) {
      throw new Error(`No benchmark data found in ${workspaceDir}`);
    }
    benchmark = buildBenchmarkDocument(workspaceDir, options.skillName ?? path.basename(workspaceDir), configRuns);
    await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');
  }

  const failureClusters = summarizeFailureClusters(configRuns);
  const agentInstructions = await loadRelativeText(import.meta.url, '../agents/analyzer.md');
  const prompt = buildAnalysisPrompt({
    agentInstructions,
    benchmark,
    failureClusters,
  });
  const rawResponse = await runner(prompt, {
    ...options,
    workspaceDir,
    benchmarkPath,
    benchmark,
  });
  const analysis = normalizeAnalysisResult(rawResponse, benchmark, failureClusters);
  const analysisJsonPath = path.resolve(options.outputPath ?? path.join(workspaceDir, 'analysis.json'));
  const analysisMdPath = path.resolve(options.markdownPath ?? path.join(workspaceDir, 'analysis.md'));

  await writeFile(analysisJsonPath, `${JSON.stringify(analysis, null, 2)}\n`, 'utf8');
  await writeFile(analysisMdPath, renderAnalysisMarkdown(analysis), 'utf8');

  return {
    analysis,
    prompt,
    rawResponse,
    analysisJsonPath,
    analysisMdPath,
  };
}
|
|
225
|
+
|
|
226
|
+
/**
 * Parse the command line of the analyze-benchmark script.
 *
 * `argv[2]` is the workspace directory; the remaining tokens are
 * `--flag value` pairs. Unknown flags and flags without a following value are
 * ignored. `--max-iter` is accepted only when it is a positive integer;
 * otherwise the default of 20 is kept, so a typo can no longer inject `NaN`
 * into the run configuration.
 *
 * @param {string[]} argv - Typically `process.argv`.
 * @returns {{workspaceDir: string, benchmarkPath: (string|undefined),
 *   outputPath: (string|undefined), markdownPath: (string|undefined),
 *   skillName: (string|undefined), provider: string, model: (string|undefined),
 *   reasoningMode: string, maxIter: number, cwd: string}}
 */
function parseArgs(argv) {
  const args = {
    workspaceDir: argv[2] ?? '',
    benchmarkPath: undefined,
    outputPath: undefined,
    markdownPath: undefined,
    skillName: undefined,
    provider: 'anthropic',
    model: undefined,
    reasoningMode: 'balanced',
    maxIter: 20,
    cwd: process.cwd(),
  };

  // Flags whose value is stored verbatim under the mapped key.
  const stringFlags = new Map([
    ['--benchmark', 'benchmarkPath'],
    ['--output', 'outputPath'],
    ['--markdown', 'markdownPath'],
    ['--skill-name', 'skillName'],
    ['--provider', 'provider'],
    ['--model', 'model'],
    ['--reasoning', 'reasoningMode'],
    ['--cwd', 'cwd'],
  ]);

  for (let index = 3; index < argv.length; index += 1) {
    const token = argv[index];
    const value = argv[index + 1];
    if (!value) {
      continue;
    }

    if (token === '--max-iter') {
      index += 1; // the value is consumed even when it is rejected
      const parsed = Number(value);
      if (Number.isInteger(parsed) && parsed > 0) {
        args.maxIter = parsed;
      }
    } else if (stringFlags.has(token)) {
      index += 1;
      args[stringFlags.get(token)] = value;
    }
  }

  return args;
}
|
|
265
|
+
|
|
266
|
+
/**
 * CLI entry point: parse arguments, run the analysis and print a JSON summary
 * (the analysis plus the two output paths) to stdout.
 *
 * Prints the usage line and exits with status 1 when no workspace is given.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseArgs(process.argv);
  if (!args.workspaceDir) {
    console.error('Usage: node scripts/analyze-benchmark.js <workspace> [--benchmark benchmark.json] [--output analysis.json] [--markdown analysis.md]');
    process.exit(1);
  }

  const { analysis, analysisJsonPath, analysisMdPath } = await analyzeBenchmark(args);
  const report = {
    analysis,
    analysis_json: analysisJsonPath,
    analysis_md: analysisMdPath,
  };
  process.stdout.write(`${JSON.stringify(report, null, 2)}\n`);
}
|
|
280
|
+
|
|
281
|
+
// Run the CLI only when this module is the script Node was started with,
// not when it is imported by another module.
const isDirectRun = Boolean(process.argv[1])
  && fileURLToPath(import.meta.url) === path.resolve(process.argv[1]);

if (isDirectRun) {
  main().catch((error) => {
    const message = error instanceof Error ? error.message : String(error);
    console.error(message);
    process.exit(1);
  });
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/** Options accepted by `compareWorkspace`. */
export interface CompareWorkspaceOptions {
  /** Workspace directory that contains the `eval-*` folders. */
  workspaceDir: string;
  /** Name of the primary configuration folder; defaults to `with_skill`. */
  configA?: string;
  /** Name of the secondary configuration folder; defaults to `without_skill`. */
  configB?: string;
  /** Where the JSON document is written; defaults to `<workspace>/comparison.json`. */
  outputPath?: string;
  /** Where the Markdown report is written; defaults to `<workspace>/comparison.md`. */
  markdownPath?: string;
  /** Provider handed to the default runner; defaults to `anthropic`. */
  provider?: string;
  /** Model handed to the default runner. */
  model?: string;
  /** Reasoning mode handed to the default runner; defaults to `balanced`. */
  reasoningMode?: string;
  /** Iteration limit handed to the default runner; defaults to 20. */
  maxIter?: number;
  /** Upper bound on run pairs compared per eval; unlimited unless a positive finite number. */
  maxPairs?: number;
  /** Directory the default runner uses as git root; falls back to the workspace. */
  cwd?: string;
}
|
|
14
|
+
|
|
15
|
+
/**
 * Build the blind-comparison prompt for one pair of runs.
 *
 * The implementation reads `agentInstructions`, `evalMetadata`, `runA`,
 * `runB` and `presentPrimaryFirst` from `input`.
 */
export function buildComparisonPrompt(input: Record<string, unknown>): string;
|
|
16
|
+
|
|
17
|
+
/** Aggregate outcome counts over all compared run pairs. */
export interface ComparisonSummary {
  /** Number of run pairs that were compared. */
  total_pairs: number;
  /** Pairs whose winner resolved to configuration A. */
  config_a_wins: number;
  /** Pairs whose winner resolved to configuration B. */
  config_b_wins: number;
  /** Pairs judged a tie. */
  ties: number;
  /** Pairs without a usable verdict. */
  inconclusive: number;
}
|
|
24
|
+
|
|
25
|
+
/** Result of one blind comparison between a run of each configuration. */
export interface ComparisonEntry {
  /** 1-based position of this entry in the comparison list. */
  index: number;
  /** `eval_id` from the eval metadata, or `null` when absent. */
  eval_id: string | number | null;
  /** `eval_name` from the eval metadata, or `null` when absent. */
  eval_name: string | null;
  /** Configuration A's run directory, relative to the workspace, with `/` separators. */
  run_a: string;
  /** Configuration B's run directory, relative to the workspace, with `/` separators. */
  run_b: string;
  /** Which configuration was shown to the judge under each blind label. */
  presented_as: { A: string; B: string };
  /** Verdict exactly as the judge labelled it. */
  winner_label: 'A' | 'B' | 'tie' | 'inconclusive';
  /** Winning configuration name, or `tie` / `inconclusive` passed through. */
  winner_config: string;
  /** Judge confidence, clamped to the range 0..1. */
  confidence: number;
  /** Judge's explanation; empty string when none was given. */
  rationale: string;
  /** Strengths the judge listed for candidate A. */
  strengths_a: Array<string>;
  /** Strengths the judge listed for candidate B. */
  strengths_b: Array<string>;
  /** Risks the judge listed. */
  risks: Array<string>;
}
|
|
43
|
+
|
|
44
|
+
/** Full comparison report as written to the JSON output file. */
export interface ComparisonDocument {
  /** Absolute path of the compared workspace. */
  workspace: string;
  /** ISO-8601 timestamp of when the document was produced. */
  generated_at: string;
  /** Name of configuration A. */
  config_a: string;
  /** Name of configuration B. */
  config_b: string;
  /** Outcome counts across all entries. */
  summary: ComparisonSummary;
  /** One entry per compared run pair, in comparison order. */
  comparisons: Array<ComparisonEntry>;
}
|
|
52
|
+
|
|
53
|
+
/** Render a comparison document as a Markdown report (summary counts followed by one section per entry). */
export function renderComparisonMarkdown(document: ComparisonDocument): string;
|
|
54
|
+
|
|
55
|
+
/**
 * Blind-compare the runs of two configurations across a workspace and write
 * the JSON document and Markdown report.
 *
 * @param options - Workspace location, configuration names and runner settings.
 * @param runner - Optional judge; receives the prompt plus per-pair options
 *   and resolves to the judge's raw text reply.
 * @returns The comparison document and the two paths that were written.
 */
export function compareWorkspace(
  options: CompareWorkspaceOptions,
  runner?: (
    prompt: string,
    options: Record<string, unknown>
  ) => Promise<string>
): Promise<{
  document: ComparisonDocument;
  outputPath: string;
  markdownPath: string;
}>;
|
|
@@ -0,0 +1,333 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { readdir, readFile, writeFile } from 'node:fs/promises';
|
|
4
|
+
import path from 'node:path';
|
|
5
|
+
import { fileURLToPath } from 'node:url';
|
|
6
|
+
import {
|
|
7
|
+
extractJsonObject,
|
|
8
|
+
loadKodaXSDK,
|
|
9
|
+
loadRelativeText,
|
|
10
|
+
readJsonFile,
|
|
11
|
+
truncateText,
|
|
12
|
+
} from './utils.js';
|
|
13
|
+
|
|
14
|
+
/**
 * List the immediate sub-directories of a directory.
 *
 * A directory that cannot be read is treated as empty (best effort).
 *
 * @param {string} dirPath - Directory to scan.
 * @returns {Promise<string[]>} Sub-directory paths (each joined onto `dirPath`),
 *   sorted with `localeCompare`.
 */
async function listDirectories(dirPath) {
  let entries;
  try {
    entries = await readdir(dirPath, { withFileTypes: true });
  } catch {
    entries = [];
  }

  const directories = [];
  for (const entry of entries) {
    if (entry.isDirectory()) {
      directories.push(path.join(dirPath, entry.name));
    }
  }
  directories.sort((first, second) => first.localeCompare(second));
  return directories;
}
|
|
21
|
+
|
|
22
|
+
/**
 * Coerce an arbitrary value into a list of non-empty, trimmed strings.
 *
 * @param {unknown} value - Candidate list; non-arrays produce `[]`.
 * @returns {string[]} Elements stringified (null/undefined become ''), trimmed,
 *   with empty results removed.
 */
function normalizeStringArray(value) {
  return Array.isArray(value)
    ? value.flatMap((item) => {
      const text = String(item ?? '').trim();
      return text ? [text] : [];
    })
    : [];
}
|
|
30
|
+
|
|
31
|
+
/**
 * Turn the judge's raw text reply into a well-formed verdict.
 *
 * An unrecognised winner becomes 'inconclusive'. Confidence is rounded to four
 * decimals and clamped to 0..1; a non-finite value becomes 0.
 *
 * @param {string} rawText - Judge reply, passed to `extractJsonObject`.
 * @returns {{winner: string, confidence: number, rationale: string,
 *   strengths_a: string[], strengths_b: string[], risks: string[]}}
 */
function normalizeComparisonResult(rawText) {
  const payload = extractJsonObject(rawText) ?? {};

  let winner = 'inconclusive';
  if (['A', 'B', 'tie', 'inconclusive'].includes(payload.winner)) {
    winner = payload.winner;
  }

  const numericConfidence = Number(payload.confidence);
  let confidence = 0;
  if (Number.isFinite(numericConfidence)) {
    const rounded = Number(numericConfidence.toFixed(4));
    confidence = Math.max(0, Math.min(1, rounded));
  }

  return {
    winner,
    confidence,
    rationale: String(payload.rationale ?? '').trim(),
    strengths_a: normalizeStringArray(payload.strengths_a),
    strengths_b: normalizeStringArray(payload.strengths_b),
    risks: normalizeStringArray(payload.risks),
  };
}
|
|
49
|
+
|
|
50
|
+
/**
 * Load the `pairIndex`-th run of each configuration for one eval.
 *
 * Runs are the `run-*` sub-directories of `<evalDir>/<config>`, in the order
 * `listDirectories` returns them. A missing `outputs/result.md` is read as an
 * empty string.
 *
 * @param {string} evalDir - Eval directory.
 * @param {string} configA - Primary configuration folder name.
 * @param {string} configB - Secondary configuration folder name.
 * @param {number} pairIndex - Zero-based index into both run lists.
 * @returns {Promise<object|null>} The pair (eval metadata plus both runs), or
 *   `null` when either configuration has no run at that index.
 */
async function loadComparisonPair(evalDir, configA, configB, pairIndex) {
  const isRunDir = (dirPath) => path.basename(dirPath).startsWith('run-');
  const candidatesA = (await listDirectories(path.join(evalDir, configA))).filter(isRunDir);
  const candidatesB = (await listDirectories(path.join(evalDir, configB))).filter(isRunDir);
  const runDirA = candidatesA[pairIndex];
  const runDirB = candidatesB[pairIndex];

  if (!(runDirA && runDirB)) {
    return null;
  }

  const readOutput = (runDir) => (
    readFile(path.join(runDir, 'outputs', 'result.md'), 'utf8').catch(() => '')
  );
  const describeRun = (runDir, configName, output) => ({
    runDir,
    runId: path.basename(runDir),
    configName,
    output,
  });

  const evalMetadata = await readJsonFile(path.join(evalDir, 'eval_metadata.json'), {});
  const outputA = await readOutput(runDirA);
  const outputB = await readOutput(runDirB);

  return {
    evalDir,
    evalMetadata,
    runA: describeRun(runDirA, configA, outputA),
    runB: describeRun(runDirB, configB, outputB),
  };
}
|
|
83
|
+
|
|
84
|
+
/**
 * Assemble the blind-comparison prompt for one pair of runs.
 *
 * When `input.presentPrimaryFirst` is truthy, run A is shown as "Candidate A"
 * and run B as "Candidate B"; otherwise the two are swapped. Each section is
 * capped with `truncateText`.
 *
 * @param {{agentInstructions: string, evalMetadata: object, runA: object,
 *   runB: object, presentPrimaryFirst: boolean}} input
 * @returns {string} The complete prompt text.
 */
export function buildComparisonPrompt(input) {
  const [candidateA, candidateB] = input.presentPrimaryFirst
    ? [input.runA, input.runB]
    : [input.runB, input.runA];

  const instructions = input.agentInstructions.trim();
  const { evalMetadata } = input;
  const evalPrompt = truncateText(evalMetadata.prompt ?? '', 4000);
  const expectedOutcome = truncateText(evalMetadata.expected_output ?? '', 2000);
  const assertions = truncateText(JSON.stringify(evalMetadata.assertions ?? [], null, 2), 4000);
  const outputA = truncateText(candidateA.output, 12000);
  const outputB = truncateText(candidateB.output, 12000);

  // NOTE(review): the reviewed rendering strips leading whitespace, so any
  // original indentation of the JSON keys below could not be confirmed.
  return `${instructions}

Return JSON with this shape:
{
"winner": "A | B | tie | inconclusive",
"confidence": 0.0,
"rationale": "short explanation",
"strengths_a": [],
"strengths_b": [],
"risks": []
}

Judge only the visible outputs. Do not mention hidden config names in the rationale.

## Eval Prompt
${evalPrompt}

## Expected Outcome
${expectedOutcome}

## Assertions
${assertions}

## Candidate A
${outputA}

## Candidate B
${outputB}
`;
}
|
|
119
|
+
|
|
120
|
+
/**
 * Default judge runner: sends the prompt through the KodaX SDK's `runKodaX`.
 *
 * @param {string} prompt - Prompt text to submit.
 * @param {object} options - Reads `provider` (default 'anthropic'), `model`,
 *   `maxIter` (default 20), `reasoningMode` (default 'balanced') and
 *   `cwd` / `workspaceDir` for the git root (falls back to `process.cwd()`).
 * @returns {Promise<string>} The `lastText` field of the SDK result.
 */
async function defaultRunComparator(prompt, options) {
  const { runKodaX } = await loadKodaXSDK();
  const mode = options.reasoningMode;
  const runConfig = {
    provider: options.provider ?? 'anthropic',
    model: options.model,
    maxIter: options.maxIter ?? 20,
    reasoningMode: mode ?? 'balanced',
    // Only an explicit 'off' disables thinking; an unset mode keeps it on.
    thinking: mode !== 'off',
    context: {
      gitRoot: path.resolve(options.cwd ?? options.workspaceDir ?? process.cwd()),
    },
  };

  const outcome = await runKodaX(runConfig, prompt);
  return outcome.lastText;
}
|
|
137
|
+
|
|
138
|
+
/**
 * Translate the judge's blind label back into a configuration name.
 *
 * @param {string} winner - 'A', 'B', or any other verdict (returned unchanged).
 * @param {boolean} presentPrimaryFirst - Whether config A was shown as candidate A.
 * @param {string} configA - Primary configuration name.
 * @param {string} configB - Secondary configuration name.
 * @returns {string} The winning configuration name, or `winner` itself for
 *   verdicts other than 'A' and 'B'.
 */
function mapWinnerToConfig(winner, presentPrimaryFirst, configA, configB) {
  switch (winner) {
    case 'A':
      return presentPrimaryFirst ? configA : configB;
    case 'B':
      return presentPrimaryFirst ? configB : configA;
    default:
      return winner;
  }
}
|
|
147
|
+
|
|
148
|
+
/**
 * Render a comparison document as a Markdown report: a summary of win counts
 * followed by one section per comparison.
 *
 * Each section is titled with the first available of `eval_id`, `eval_name`
 * and `index`; an empty rationale is shown as "n/a".
 *
 * @param {object} document - Comparison document.
 * @returns {string} Markdown text terminated by a newline.
 */
export function renderComparisonMarkdown(document) {
  const { config_a: nameA, config_b: nameB, summary } = document;
  const output = [
    `# Blind Comparison: ${nameA} vs ${nameB}`,
    '',
    `Generated: ${document.generated_at}`,
    '',
    `- ${nameA} wins: ${summary.config_a_wins}`,
    `- ${nameB} wins: ${summary.config_b_wins}`,
    `- ties: ${summary.ties}`,
    `- inconclusive: ${summary.inconclusive}`,
    '',
  ];

  for (const entry of document.comparisons) {
    const heading = entry.eval_id ?? entry.eval_name ?? entry.index;
    output.push(
      `## Eval ${heading}`,
      '',
      `- Winner: ${entry.winner_config}`,
      `- Confidence: ${entry.confidence}`,
      `- Rationale: ${entry.rationale || 'n/a'}`,
      ''
    );
  }

  return `${output.join('\n')}\n`;
}
|
|
172
|
+
|
|
173
|
+
/**
 * Blind-compare the runs of two configurations for every `eval-*` directory in
 * a workspace, then write the JSON document and the Markdown report.
 *
 * Presentation order alternates from one comparison to the next, so each
 * configuration is shown as "Candidate A" about half of the time.
 *
 * @param {object} options - Reads `workspaceDir` (required), `configA`
 *   (default 'with_skill'), `configB` (default 'without_skill'), `maxPairs`,
 *   `outputPath` and `markdownPath`; the whole object is forwarded to the runner.
 * @param {(prompt: string, options: object) => Promise<string>} [runner] -
 *   Produces the judge's raw reply; defaults to the KodaX-backed runner.
 * @returns {Promise<{document: object, outputPath: string, markdownPath: string}>}
 * @throws {Error} When no run pair could be compared.
 */
export async function compareWorkspace(
  options,
  runner = defaultRunComparator
) {
  const workspaceDir = path.resolve(options.workspaceDir);
  const configA = options.configA ?? 'with_skill';
  const configB = options.configB ?? 'without_skill';

  // Anything other than a positive finite number means "no limit".
  let maxPairs = Number.POSITIVE_INFINITY;
  if (Number.isFinite(options.maxPairs) && options.maxPairs > 0) {
    maxPairs = Math.floor(options.maxPairs);
  }

  const agentInstructions = await loadRelativeText(import.meta.url, '../agents/comparator.md');
  const toWorkspaceRelative = (target) => path.relative(workspaceDir, target).replace(/\\/g, '/');
  const comparisons = [];

  const evalDirs = (await listDirectories(workspaceDir))
    .filter((dirPath) => path.basename(dirPath).startsWith('eval-'));

  for (const evalDir of evalDirs) {
    let pairIndex = 0;
    while (pairIndex < maxPairs) {
      const pair = await loadComparisonPair(evalDir, configA, configB, pairIndex);
      if (!pair) {
        break;
      }

      const presentPrimaryFirst = comparisons.length % 2 === 0;
      const prompt = buildComparisonPrompt({
        agentInstructions,
        evalMetadata: pair.evalMetadata,
        runA: pair.runA,
        runB: pair.runB,
        presentPrimaryFirst,
      });
      const rawResponse = await runner(prompt, {
        ...options,
        workspaceDir,
        evalDir,
        pairIndex,
        configA,
        configB,
      });
      const verdict = normalizeComparisonResult(rawResponse);
      const presentedAs = presentPrimaryFirst
        ? { A: configA, B: configB }
        : { A: configB, B: configA };

      comparisons.push({
        index: comparisons.length + 1,
        eval_id: pair.evalMetadata.eval_id ?? null,
        eval_name: pair.evalMetadata.eval_name ?? null,
        run_a: toWorkspaceRelative(pair.runA.runDir),
        run_b: toWorkspaceRelative(pair.runB.runDir),
        presented_as: presentedAs,
        winner_label: verdict.winner,
        winner_config: mapWinnerToConfig(verdict.winner, presentPrimaryFirst, configA, configB),
        confidence: verdict.confidence,
        rationale: verdict.rationale,
        strengths_a: verdict.strengths_a,
        strengths_b: verdict.strengths_b,
        risks: verdict.risks,
      });

      pairIndex += 1;
    }
  }

  if (comparisons.length === 0) {
    throw new Error(`No comparable run pairs found for ${configA} vs ${configB} in ${workspaceDir}`);
  }

  const countWinner = (label) => comparisons
    .filter((item) => item.winner_config === label)
    .length;

  const document = {
    workspace: workspaceDir,
    generated_at: new Date().toISOString(),
    config_a: configA,
    config_b: configB,
    summary: {
      total_pairs: comparisons.length,
      config_a_wins: countWinner(configA),
      config_b_wins: countWinner(configB),
      ties: countWinner('tie'),
      inconclusive: countWinner('inconclusive'),
    },
    comparisons,
  };

  const outputPath = path.resolve(options.outputPath ?? path.join(workspaceDir, 'comparison.json'));
  const markdownPath = path.resolve(options.markdownPath ?? path.join(workspaceDir, 'comparison.md'));
  await writeFile(outputPath, `${JSON.stringify(document, null, 2)}\n`, 'utf8');
  await writeFile(markdownPath, renderComparisonMarkdown(document), 'utf8');

  return { document, outputPath, markdownPath };
}
|
|
266
|
+
|
|
267
|
+
/**
 * Parse the command line of the compare-runs script.
 *
 * `argv[2]` is the workspace directory; the remaining tokens are
 * `--flag value` pairs. Unknown flags and flags without a following value are
 * ignored. `--max-iter` must be a positive integer and `--max-pairs` a
 * positive finite number; any other value keeps the default (20 and
 * `undefined` respectively), so a typo can no longer inject `NaN`.
 *
 * @param {string[]} argv - Typically `process.argv`.
 * @returns {{workspaceDir: string, configA: string, configB: string,
 *   outputPath: (string|undefined), markdownPath: (string|undefined),
 *   provider: string, model: (string|undefined), reasoningMode: string,
 *   maxIter: number, maxPairs: (number|undefined), cwd: string}}
 */
function parseArgs(argv) {
  const args = {
    workspaceDir: argv[2] ?? '',
    configA: 'with_skill',
    configB: 'without_skill',
    outputPath: undefined,
    markdownPath: undefined,
    provider: 'anthropic',
    model: undefined,
    reasoningMode: 'balanced',
    maxIter: 20,
    maxPairs: undefined,
    cwd: process.cwd(),
  };

  // Flags whose value is stored verbatim under the mapped key.
  const stringFlags = new Map([
    ['--config-a', 'configA'],
    ['--config-b', 'configB'],
    ['--output', 'outputPath'],
    ['--markdown', 'markdownPath'],
    ['--provider', 'provider'],
    ['--model', 'model'],
    ['--reasoning', 'reasoningMode'],
    ['--cwd', 'cwd'],
  ]);

  // Numeric flags: target key plus the check a parsed value must pass.
  const numericFlags = new Map([
    ['--max-iter', { key: 'maxIter', isValid: (n) => Number.isInteger(n) && n > 0 }],
    ['--max-pairs', { key: 'maxPairs', isValid: (n) => Number.isFinite(n) && n > 0 }],
  ]);

  for (let index = 3; index < argv.length; index += 1) {
    const token = argv[index];
    const value = argv[index + 1];
    if (!value) {
      continue;
    }

    const numericSpec = numericFlags.get(token);
    if (numericSpec) {
      index += 1; // the value is consumed even when it is rejected
      const parsed = Number(value);
      if (numericSpec.isValid(parsed)) {
        args[numericSpec.key] = parsed;
      }
    } else if (stringFlags.has(token)) {
      index += 1;
      args[stringFlags.get(token)] = value;
    }
  }

  return args;
}
|
|
309
|
+
|
|
310
|
+
/**
 * CLI entry point: parse arguments, run the comparison and print a JSON
 * summary (win counts plus the two output paths) to stdout.
 *
 * Prints the usage line and exits with status 1 when no workspace is given.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const args = parseArgs(process.argv);
  if (!args.workspaceDir) {
    console.error('Usage: node scripts/compare-runs.js <workspace> [--config-a with_skill] [--config-b without_skill] [--output comparison.json]');
    process.exit(1);
  }

  const {
    document: comparisonDocument,
    outputPath,
    markdownPath,
  } = await compareWorkspace(args);
  const report = {
    comparison: comparisonDocument.summary,
    output: outputPath,
    markdown: markdownPath,
  };
  process.stdout.write(`${JSON.stringify(report, null, 2)}\n`);
}
|
|
324
|
+
|
|
325
|
+
// Run the CLI only when this module is the script Node was started with,
// not when it is imported by another module.
const isDirectRun = Boolean(process.argv[1])
  && fileURLToPath(import.meta.url) === path.resolve(process.argv[1]);

if (isDirectRun) {
  main().catch((error) => {
    const message = error instanceof Error ? error.message : String(error);
    console.error(message);
    process.exit(1);
  });
}
|