@kodax-ai/kodax-cli 0.7.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. package/CHANGELOG.md +1304 -0
  2. package/LICENSE +191 -0
  3. package/README.md +1167 -0
  4. package/README_CN.md +631 -0
  5. package/dist/builtin/code-review/SKILL.md +63 -0
  6. package/dist/builtin/git-workflow/SKILL.md +84 -0
  7. package/dist/builtin/skill-creator/SKILL.md +122 -0
  8. package/dist/builtin/skill-creator/agents/analyzer.md +12 -0
  9. package/dist/builtin/skill-creator/agents/comparator.md +13 -0
  10. package/dist/builtin/skill-creator/agents/grader.md +13 -0
  11. package/dist/builtin/skill-creator/references/schemas.md +227 -0
  12. package/dist/builtin/skill-creator/scripts/aggregate-benchmark.d.ts +46 -0
  13. package/dist/builtin/skill-creator/scripts/aggregate-benchmark.js +209 -0
  14. package/dist/builtin/skill-creator/scripts/analyze-benchmark.d.ts +46 -0
  15. package/dist/builtin/skill-creator/scripts/analyze-benchmark.js +289 -0
  16. package/dist/builtin/skill-creator/scripts/compare-runs.d.ts +62 -0
  17. package/dist/builtin/skill-creator/scripts/compare-runs.js +333 -0
  18. package/dist/builtin/skill-creator/scripts/generate-review.d.ts +33 -0
  19. package/dist/builtin/skill-creator/scripts/generate-review.js +415 -0
  20. package/dist/builtin/skill-creator/scripts/grade-evals.d.ts +73 -0
  21. package/dist/builtin/skill-creator/scripts/grade-evals.js +405 -0
  22. package/dist/builtin/skill-creator/scripts/improve-description.d.ts +23 -0
  23. package/dist/builtin/skill-creator/scripts/improve-description.js +161 -0
  24. package/dist/builtin/skill-creator/scripts/init-skill.d.ts +14 -0
  25. package/dist/builtin/skill-creator/scripts/init-skill.js +153 -0
  26. package/dist/builtin/skill-creator/scripts/install-skill.d.ts +29 -0
  27. package/dist/builtin/skill-creator/scripts/install-skill.js +176 -0
  28. package/dist/builtin/skill-creator/scripts/package-skill.d.ts +38 -0
  29. package/dist/builtin/skill-creator/scripts/package-skill.js +124 -0
  30. package/dist/builtin/skill-creator/scripts/quick-validate.d.ts +8 -0
  31. package/dist/builtin/skill-creator/scripts/quick-validate.js +166 -0
  32. package/dist/builtin/skill-creator/scripts/run-eval.d.ts +66 -0
  33. package/dist/builtin/skill-creator/scripts/run-eval.js +356 -0
  34. package/dist/builtin/skill-creator/scripts/run-loop.d.ts +49 -0
  35. package/dist/builtin/skill-creator/scripts/run-loop.js +243 -0
  36. package/dist/builtin/skill-creator/scripts/run-trigger-eval.d.ts +58 -0
  37. package/dist/builtin/skill-creator/scripts/run-trigger-eval.js +225 -0
  38. package/dist/builtin/skill-creator/scripts/utils.js +278 -0
  39. package/dist/builtin/tdd/SKILL.md +56 -0
  40. package/dist/index.js +1717 -0
  41. package/dist/kodax_cli.js +1870 -0
  42. package/package.json +122 -0
  43. package/scripts/kodax-bin.cjs +27 -0
  44. package/scripts/production-env.cjs +16 -0
package/dist/builtin/skill-creator/scripts/aggregate-benchmark.js
@@ -0,0 +1,209 @@
+ #!/usr/bin/env node
+
+ import { readFile, readdir, stat, writeFile } from 'node:fs/promises';
+ import path from 'node:path';
+ import { fileURLToPath } from 'node:url';
+ import { calculateStats, formatDelta } from './utils.js';
+
+ async function readJson(filePath) {
+   return JSON.parse(await readFile(filePath, 'utf8'));
+ }
+
+ async function pathExists(filePath) {
+   try {
+     await stat(filePath);
+     return true;
+   } catch {
+     return false;
+   }
+ }
+
+ async function listDirectories(dirPath) {
+   const entries = await readdir(dirPath, { withFileTypes: true }).catch(() => []);
+   return entries
+     .filter((entry) => entry.isDirectory())
+     .map((entry) => path.join(dirPath, entry.name))
+     .sort((left, right) => left.localeCompare(right));
+ }
+
+ export async function loadRunResults(iterationDir) {
+   const runsRoot = await pathExists(path.join(iterationDir, 'runs'))
+     ? path.join(iterationDir, 'runs')
+     : iterationDir;
+
+   const evalDirs = (await listDirectories(runsRoot))
+     .filter((dirPath) => path.basename(dirPath).startsWith('eval-'));
+
+   const configs = {};
+
+   for (const evalDir of evalDirs) {
+     const metadataPath = path.join(evalDir, 'eval_metadata.json');
+     const metadata = await readJson(metadataPath).catch(() => ({}));
+     const evalId = metadata.eval_id ?? path.basename(evalDir);
+
+     for (const configDir of await listDirectories(evalDir)) {
+       const configName = path.basename(configDir);
+       const runDirs = (await listDirectories(configDir))
+         .filter((dirPath) => path.basename(dirPath).startsWith('run-'));
+
+       if (runDirs.length === 0) {
+         continue;
+       }
+
+       configs[configName] ??= [];
+
+       for (const runDir of runDirs) {
+         const grading = await readJson(path.join(runDir, 'grading.json')).catch(() => null);
+         if (!grading) {
+           continue;
+         }
+
+         const timing = await readJson(path.join(runDir, 'timing.json')).catch(() => ({}));
+
+         configs[configName].push({
+           eval_id: evalId,
+           run_id: path.basename(runDir),
+           pass_rate: grading.summary?.pass_rate ?? 0,
+           passed: grading.summary?.passed ?? 0,
+           failed: grading.summary?.failed ?? 0,
+           total: grading.summary?.total ?? 0,
+           time_seconds: timing.total_duration_seconds ?? grading.timing?.total_duration_seconds ?? 0,
+           tokens: timing.total_tokens ?? grading.execution_metrics?.output_chars ?? 0,
+           tool_calls: grading.execution_metrics?.total_tool_calls ?? 0,
+           errors: grading.execution_metrics?.errors_encountered ?? 0,
+           expectations: Array.isArray(grading.expectations) ? grading.expectations : [],
+           notes: [
+             ...(grading.user_notes_summary?.uncertainties ?? []),
+             ...(grading.user_notes_summary?.needs_review ?? []),
+             ...(grading.user_notes_summary?.workarounds ?? []),
+           ],
+         });
+       }
+     }
+   }
+
+   return configs;
+ }
+
+ export function summarizeConfigs(configRuns) {
+   const summary = {};
+
+   for (const [configName, runs] of Object.entries(configRuns)) {
+     summary[configName] = {
+       pass_rate: calculateStats(runs.map((run) => Number(run.pass_rate ?? 0))),
+       time_seconds: calculateStats(runs.map((run) => Number(run.time_seconds ?? 0))),
+       tokens: calculateStats(runs.map((run) => Number(run.tokens ?? 0))),
+     };
+   }
+
+   const orderedConfigs = Object.keys(summary);
+   const primary = summary[orderedConfigs[0]] ?? {
+     pass_rate: { mean: 0 },
+     time_seconds: { mean: 0 },
+     tokens: { mean: 0 },
+   };
+   const baseline = summary[orderedConfigs[1]] ?? {
+     pass_rate: { mean: 0 },
+     time_seconds: { mean: 0 },
+     tokens: { mean: 0 },
+   };
+
+   return {
+     configs: summary,
+     delta: {
+       pass_rate: formatDelta(primary.pass_rate.mean - baseline.pass_rate.mean),
+       time_seconds: formatDelta(primary.time_seconds.mean - baseline.time_seconds.mean),
+       tokens: formatDelta(primary.tokens.mean - baseline.tokens.mean),
+     },
+   };
+ }
+
+ export function buildBenchmarkDocument(iterationDir, skillName, configRuns) {
+   const summary = summarizeConfigs(configRuns);
+
+   return {
+     skill_name: skillName,
+     generated_at: new Date().toISOString(),
+     workspace: path.resolve(iterationDir),
+     configs: summary.configs,
+     delta: summary.delta,
+     runs: configRuns,
+   };
+ }
+
+ export function renderBenchmarkMarkdown(benchmark) {
+   const lines = [
+     `# Benchmark: ${benchmark.skill_name}`,
+     '',
+     `Generated: ${benchmark.generated_at}`,
+     '',
+     '| Config | Pass Rate | Time (s) | Tokens |',
+     '| --- | --- | --- | --- |',
+   ];
+
+   for (const [configName, metrics] of Object.entries(benchmark.configs)) {
+     lines.push(
+       `| ${configName} | ${metrics.pass_rate.mean} ± ${metrics.pass_rate.stddev} | ${metrics.time_seconds.mean} ± ${metrics.time_seconds.stddev} | ${metrics.tokens.mean} ± ${metrics.tokens.stddev} |`
+     );
+   }
+
+   lines.push('');
+   lines.push('## Delta');
+   lines.push('');
+   lines.push(`- Pass rate: ${benchmark.delta.pass_rate}`);
+   lines.push(`- Time (s): ${benchmark.delta.time_seconds}`);
+   lines.push(`- Tokens: ${benchmark.delta.tokens}`);
+
+   return lines.join('\n');
+ }
+
+ function parseArgs(argv) {
+   const args = {
+     iterationDir: argv[2],
+     skillName: 'unknown-skill',
+   };
+
+   for (let index = 3; index < argv.length; index += 1) {
+     const token = argv[index];
+     if (token === '--skill-name' && argv[index + 1]) {
+       args.skillName = argv[index + 1];
+       index += 1;
+     }
+   }
+
+   return args;
+ }
+
+ async function main() {
+   const { iterationDir, skillName } = parseArgs(process.argv);
+   if (!iterationDir) {
+     console.error('Usage: node scripts/aggregate-benchmark.js <iteration-dir> --skill-name <name>');
+     process.exit(1);
+   }
+
+   const configRuns = await loadRunResults(iterationDir);
+   if (Object.keys(configRuns).length === 0) {
+     console.error(`No benchmark runs found in ${iterationDir}`);
+     process.exit(1);
+   }
+
+   const benchmark = buildBenchmarkDocument(iterationDir, skillName, configRuns);
+   const benchmarkJsonPath = path.join(iterationDir, 'benchmark.json');
+   const benchmarkMdPath = path.join(iterationDir, 'benchmark.md');
+
+   await writeFile(benchmarkJsonPath, JSON.stringify(benchmark, null, 2));
+   await writeFile(benchmarkMdPath, `${renderBenchmarkMarkdown(benchmark)}\n`);
+
+   console.log(`Wrote ${benchmarkJsonPath}`);
+   console.log(`Wrote ${benchmarkMdPath}`);
+ }
+
+ const isDirectRun = process.argv[1]
+   && fileURLToPath(import.meta.url) === path.resolve(process.argv[1]);
+
+ if (isDirectRun) {
+   main().catch((error) => {
+     console.error(error instanceof Error ? error.message : String(error));
+     process.exit(1);
+   });
+ }
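
Note: besides the CLI entry point (node scripts/aggregate-benchmark.js <iteration-dir> --skill-name <name>), the module above exports loadRunResults, summarizeConfigs, buildBenchmarkDocument, and renderBenchmarkMarkdown. A minimal TypeScript sketch of driving those exports directly, assuming an iteration directory that follows the runs/eval-*/<config>/run-*/grading.json layout the loader expects; the path and skill name below are illustrative, not part of the package:

// benchmark-example.ts — hypothetical consumer of the exports above; not part of the package.
import path from 'node:path';
import { writeFile } from 'node:fs/promises';
import {
  loadRunResults,
  buildBenchmarkDocument,
  renderBenchmarkMarkdown,
} from './aggregate-benchmark.js';

const iterationDir = './iterations/001';               // assumed workspace path
const configRuns = await loadRunResults(iterationDir); // { [configName]: per-run metrics[] }
const benchmark = buildBenchmarkDocument(iterationDir, 'my-skill', configRuns);
await writeFile(path.join(iterationDir, 'benchmark.md'), `${renderBenchmarkMarkdown(benchmark)}\n`);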
package/dist/builtin/skill-creator/scripts/analyze-benchmark.d.ts
@@ -0,0 +1,46 @@
+ export interface AnalyzeBenchmarkOptions {
+   workspaceDir: string;
+   benchmarkPath?: string;
+   outputPath?: string;
+   markdownPath?: string;
+   skillName?: string;
+   provider?: string;
+   model?: string;
+   reasoningMode?: string;
+   maxIter?: number;
+   cwd?: string;
+ }
+
+ export interface BenchmarkAnalysis {
+   skill_name: string;
+   generated_at: string;
+   workspace: string;
+   verdict: 'improves' | 'regresses' | 'mixed' | 'inconclusive';
+   release_readiness: 'ready' | 'needs_iteration' | 'needs_manual_review';
+   recommendation: string;
+   key_findings: string[];
+   variance_hotspots: string[];
+   suggested_actions: string[];
+   watchouts: string[];
+   supporting_metrics: {
+     pass_rate_delta: string;
+     time_seconds_delta: string;
+     tokens_delta: string;
+   };
+   failure_clusters: Record<string, unknown>;
+ }
+
+ export function buildAnalysisPrompt(input: Record<string, unknown>): string;
+
+ export function renderAnalysisMarkdown(analysis: Record<string, any>): string;
+
+ export function analyzeBenchmark(
+   options: AnalyzeBenchmarkOptions,
+   runner?: (prompt: string, options: Record<string, unknown>) => Promise<string>
+ ): Promise<{
+   analysis: BenchmarkAnalysis;
+   prompt: string;
+   rawResponse: string;
+   analysisJsonPath: string;
+   analysisMdPath: string;
+ }>;
package/dist/builtin/skill-creator/scripts/analyze-benchmark.js
@@ -0,0 +1,289 @@
+ #!/usr/bin/env node
+
+ import { readFile, writeFile } from 'node:fs/promises';
+ import path from 'node:path';
+ import { fileURLToPath } from 'node:url';
+ import {
+   buildBenchmarkDocument,
+   loadRunResults,
+ } from './aggregate-benchmark.js';
+ import {
+   extractJsonObject,
+   loadKodaXSDK,
+   loadRelativeText,
+   readJsonFile,
+   truncateText,
+ } from './utils.js';
+
+ function normalizeStringArray(value) {
+   if (!Array.isArray(value)) {
+     return [];
+   }
+   return value
+     .map((item) => String(item ?? '').trim())
+     .filter(Boolean);
+ }
+
+ function summarizeFailureClusters(configRuns) {
+   const clusters = {};
+
+   for (const [configName, runs] of Object.entries(configRuns)) {
+     const failureCounts = new Map();
+     const notes = [];
+
+     for (const run of runs) {
+       for (const expectation of run.expectations ?? []) {
+         if (expectation?.passed === true) {
+           continue;
+         }
+         const text = String(expectation?.text ?? '').trim();
+         if (!text) {
+           continue;
+         }
+         failureCounts.set(text, (failureCounts.get(text) ?? 0) + 1);
+       }
+
+       for (const note of run.notes ?? []) {
+         const normalized = String(note ?? '').trim();
+         if (normalized) {
+           notes.push(normalized);
+         }
+       }
+     }
+
+     clusters[configName] = {
+       repeated_failures: Array.from(failureCounts.entries())
+         .sort((left, right) => right[1] - left[1])
+         .slice(0, 10)
+         .map(([text, count]) => ({ text, count })),
+       notes: notes.slice(0, 10),
+     };
+   }
+
+   return clusters;
+ }
+
+ function normalizeAnalysisResult(rawText, benchmark, failureClusters) {
+   const parsed = extractJsonObject(rawText) ?? {};
+
+   return {
+     skill_name: benchmark.skill_name,
+     generated_at: new Date().toISOString(),
+     workspace: benchmark.workspace,
+     verdict: ['improves', 'regresses', 'mixed', 'inconclusive'].includes(parsed.verdict)
+       ? parsed.verdict
+       : 'inconclusive',
+     release_readiness: ['ready', 'needs_iteration', 'needs_manual_review'].includes(parsed.release_readiness)
+       ? parsed.release_readiness
+       : 'needs_manual_review',
+     recommendation: String(parsed.recommendation ?? '').trim(),
+     key_findings: normalizeStringArray(parsed.key_findings),
+     variance_hotspots: normalizeStringArray(parsed.variance_hotspots),
+     suggested_actions: normalizeStringArray(parsed.suggested_actions),
+     watchouts: normalizeStringArray(parsed.watchouts),
+     supporting_metrics: {
+       pass_rate_delta: benchmark.delta?.pass_rate ?? 'n/a',
+       time_seconds_delta: benchmark.delta?.time_seconds ?? 'n/a',
+       tokens_delta: benchmark.delta?.tokens ?? 'n/a',
+     },
+     failure_clusters: failureClusters,
+   };
+ }
+
+ export function buildAnalysisPrompt(input) {
+   return `${input.agentInstructions.trim()}
+
+ Return JSON with this shape:
+ {
+ "verdict": "improves | regresses | mixed | inconclusive",
+ "release_readiness": "ready | needs_iteration | needs_manual_review",
+ "recommendation": "short recommendation",
+ "key_findings": [],
+ "variance_hotspots": [],
+ "suggested_actions": [],
+ "watchouts": []
+ }
+
+ ## Benchmark Summary
+ ${truncateText(JSON.stringify({
+ skill_name: input.benchmark.skill_name,
+ configs: input.benchmark.configs,
+ delta: input.benchmark.delta,
+ }, null, 2), 12000)}
+
+ ## Failure Clusters
+ ${truncateText(JSON.stringify(input.failureClusters, null, 2), 8000)}
+ `;
+ }
+
+ async function defaultRunAnalyst(prompt, options) {
+   const { runKodaX } = await loadKodaXSDK();
+   const result = await runKodaX(
+     {
+       provider: options.provider ?? 'anthropic',
+       model: options.model,
+       maxIter: options.maxIter ?? 20,
+       reasoningMode: options.reasoningMode ?? 'balanced',
+       thinking: options.reasoningMode ? options.reasoningMode !== 'off' : true,
+       context: {
+         gitRoot: path.resolve(options.cwd ?? options.workspaceDir ?? process.cwd()),
+       },
+     },
+     prompt
+   );
+   return result.lastText;
+ }
+
+ export function renderAnalysisMarkdown(analysis) {
+   const lines = [
+     `# Benchmark Analysis: ${analysis.skill_name}`,
+     '',
+     `Generated: ${analysis.generated_at}`,
+     '',
+     `- Verdict: ${analysis.verdict}`,
+     `- Release readiness: ${analysis.release_readiness}`,
+     `- Recommendation: ${analysis.recommendation || 'n/a'}`,
+     '',
+   ];
+
+   const sections = [
+     ['key_findings', 'Key Findings'],
+     ['variance_hotspots', 'Variance Hotspots'],
+     ['suggested_actions', 'Suggested Actions'],
+     ['watchouts', 'Watchouts'],
+   ];
+
+   for (const [field, title] of sections) {
+     lines.push(`## ${title}`);
+     lines.push('');
+     const items = Array.isArray(analysis[field]) ? analysis[field] : [];
+     if (items.length === 0) {
+       lines.push('- None');
+     } else {
+       for (const item of items) {
+         lines.push(`- ${item}`);
+       }
+     }
+     lines.push('');
+   }
+
+   lines.push('## Supporting Metrics');
+   lines.push('');
+   lines.push(`- Pass rate delta: ${analysis.supporting_metrics.pass_rate_delta}`);
+   lines.push(`- Time delta: ${analysis.supporting_metrics.time_seconds_delta}`);
+   lines.push(`- Tokens delta: ${analysis.supporting_metrics.tokens_delta}`);
+
+   return `${lines.join('\n')}\n`;
+ }
+
+ export async function analyzeBenchmark(
+   options,
+   runner = defaultRunAnalyst
+ ) {
+   const workspaceDir = path.resolve(options.workspaceDir);
+   const benchmarkPath = path.resolve(options.benchmarkPath ?? path.join(workspaceDir, 'benchmark.json'));
+   let benchmark = await readJsonFile(benchmarkPath, null);
+
+   if (!benchmark) {
+     const configRuns = await loadRunResults(workspaceDir);
+     if (Object.keys(configRuns).length === 0) {
+       throw new Error(`No benchmark data found in ${workspaceDir}`);
+     }
+     benchmark = buildBenchmarkDocument(workspaceDir, options.skillName ?? path.basename(workspaceDir), configRuns);
+     await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8');
+   }
+
+   const configRuns = await loadRunResults(workspaceDir);
+   const failureClusters = summarizeFailureClusters(configRuns);
+   const agentInstructions = await loadRelativeText(import.meta.url, '../agents/analyzer.md');
+   const prompt = buildAnalysisPrompt({
+     agentInstructions,
+     benchmark,
+     failureClusters,
+   });
+   const rawResponse = await runner(prompt, {
+     ...options,
+     workspaceDir,
+     benchmarkPath,
+     benchmark,
+   });
+   const analysis = normalizeAnalysisResult(rawResponse, benchmark, failureClusters);
+   const analysisJsonPath = path.resolve(options.outputPath ?? path.join(workspaceDir, 'analysis.json'));
+   const analysisMdPath = path.resolve(options.markdownPath ?? path.join(workspaceDir, 'analysis.md'));
+
+   await writeFile(analysisJsonPath, `${JSON.stringify(analysis, null, 2)}\n`, 'utf8');
+   await writeFile(analysisMdPath, renderAnalysisMarkdown(analysis), 'utf8');
+
+   return {
+     analysis,
+     prompt,
+     rawResponse,
+     analysisJsonPath,
+     analysisMdPath,
+   };
+ }
+
+ function parseArgs(argv) {
+   const args = {
+     workspaceDir: argv[2] ?? '',
+     benchmarkPath: undefined,
+     outputPath: undefined,
+     markdownPath: undefined,
+     skillName: undefined,
+     provider: 'anthropic',
+     model: undefined,
+     reasoningMode: 'balanced',
+     maxIter: 20,
+     cwd: process.cwd(),
+   };
+
+   for (let index = 3; index < argv.length; index += 1) {
+     const token = argv[index];
+     if (token === '--benchmark' && argv[index + 1]) {
+       args.benchmarkPath = argv[++index];
+     } else if (token === '--output' && argv[index + 1]) {
+       args.outputPath = argv[++index];
+     } else if (token === '--markdown' && argv[index + 1]) {
+       args.markdownPath = argv[++index];
+     } else if (token === '--skill-name' && argv[index + 1]) {
+       args.skillName = argv[++index];
+     } else if (token === '--provider' && argv[index + 1]) {
+       args.provider = argv[++index];
+     } else if (token === '--model' && argv[index + 1]) {
+       args.model = argv[++index];
+     } else if (token === '--reasoning' && argv[index + 1]) {
+       args.reasoningMode = argv[++index];
+     } else if (token === '--max-iter' && argv[index + 1]) {
+       args.maxIter = Number(argv[++index]);
+     } else if (token === '--cwd' && argv[index + 1]) {
+       args.cwd = argv[++index];
+     }
+   }
+
+   return args;
+ }
+
+ async function main() {
+   const args = parseArgs(process.argv);
+   if (!args.workspaceDir) {
+     console.error('Usage: node scripts/analyze-benchmark.js <workspace> [--benchmark benchmark.json] [--output analysis.json] [--markdown analysis.md]');
+     process.exit(1);
+   }
+
+   const result = await analyzeBenchmark(args);
+   process.stdout.write(`${JSON.stringify({
+     analysis: result.analysis,
+     analysis_json: result.analysisJsonPath,
+     analysis_md: result.analysisMdPath,
+   }, null, 2)}\n`);
+ }
+
+ const isDirectRun = process.argv[1]
+   && fileURLToPath(import.meta.url) === path.resolve(process.argv[1]);
+
+ if (isDirectRun) {
+   main().catch((error) => {
+     console.error(error instanceof Error ? error.message : String(error));
+     process.exit(1);
+   });
+ }
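
Note: by default analyzeBenchmark routes its prompt through the KodaX SDK (loadKodaXSDK / runKodaX), but the optional second argument accepts a custom runner, typed in analyze-benchmark.d.ts above as (prompt, options) => Promise<string>. A hedged sketch using a stub runner; the workspace path and the canned JSON response are assumptions for illustration, and the workspace is assumed to already contain a benchmark.json or eval run directories:

// analyze-example.ts — hypothetical; demonstrates the runner hook, not the package's own usage.
import { analyzeBenchmark } from './analyze-benchmark.js';

// Stub runner: returns a canned response in the JSON shape buildAnalysisPrompt asks for.
const stubRunner = async (_prompt: string): Promise<string> =>
  JSON.stringify({
    verdict: 'improves',
    release_readiness: 'needs_iteration',
    recommendation: 'Tighten the flakiest expectation wording before release.',
    key_findings: [],
    variance_hotspots: [],
    suggested_actions: [],
    watchouts: [],
  });

const { analysis, analysisJsonPath, analysisMdPath } = await analyzeBenchmark(
  { workspaceDir: './iterations/001', skillName: 'my-skill' }, // assumed workspace
  stubRunner, // omit to use the default KodaX-backed analyst
);
console.log(analysis.verdict, analysisJsonPath, analysisMdPath);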
package/dist/builtin/skill-creator/scripts/compare-runs.d.ts
@@ -0,0 +1,62 @@
+ export interface CompareWorkspaceOptions {
+   workspaceDir: string;
+   configA?: string;
+   configB?: string;
+   outputPath?: string;
+   markdownPath?: string;
+   provider?: string;
+   model?: string;
+   reasoningMode?: string;
+   maxIter?: number;
+   maxPairs?: number;
+   cwd?: string;
+ }
+
+ export function buildComparisonPrompt(input: Record<string, unknown>): string;
+
+ export interface ComparisonSummary {
+   total_pairs: number;
+   config_a_wins: number;
+   config_b_wins: number;
+   ties: number;
+   inconclusive: number;
+ }
+
+ export interface ComparisonEntry {
+   index: number;
+   eval_id: string | number | null;
+   eval_name: string | null;
+   run_a: string;
+   run_b: string;
+   presented_as: {
+     A: string;
+     B: string;
+   };
+   winner_label: 'A' | 'B' | 'tie' | 'inconclusive';
+   winner_config: string;
+   confidence: number;
+   rationale: string;
+   strengths_a: string[];
+   strengths_b: string[];
+   risks: string[];
+ }
+
+ export interface ComparisonDocument {
+   workspace: string;
+   generated_at: string;
+   config_a: string;
+   config_b: string;
+   summary: ComparisonSummary;
+   comparisons: ComparisonEntry[];
+ }
+
+ export function renderComparisonMarkdown(document: ComparisonDocument): string;
+
+ export function compareWorkspace(
+   options: CompareWorkspaceOptions,
+   runner?: (prompt: string, options: Record<string, unknown>) => Promise<string>
+ ): Promise<{
+   document: ComparisonDocument;
+   outputPath: string;
+   markdownPath: string;
+ }>;
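
Note: judging from these declarations, compare-runs.js performs pairwise A/B comparison of runs from two configs and reports per-pair winners plus an aggregate summary. A sketch of calling compareWorkspace per the declared signature; the workspace path and config names are assumptions, and omitting the runner argument falls back to the module's default:

// compare-example.ts — hypothetical; only the API shape comes from the .d.ts above.
import { compareWorkspace } from './compare-runs.js';

const { document: comparison, outputPath, markdownPath } = await compareWorkspace({
  workspaceDir: './iterations/001', // assumed workspace containing runs for both configs
  configA: 'with-skill',            // assumed config directory names
  configB: 'baseline',
  maxPairs: 5,                      // cap on run pairs to compare
});
console.log(comparison.summary, outputPath, markdownPath);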