@kodax-ai/kodax 0.7.39
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +1350 -0
- package/LICENSE +191 -0
- package/README.md +1170 -0
- package/README_CN.md +659 -0
- package/dist/acp_events.d.ts +109 -0
- package/dist/acp_logger.d.ts +20 -0
- package/dist/acp_server.d.ts +92 -0
- package/dist/builtin/code-review/SKILL.md +63 -0
- package/dist/builtin/git-workflow/SKILL.md +84 -0
- package/dist/builtin/skill-creator/SKILL.md +122 -0
- package/dist/builtin/skill-creator/agents/analyzer.md +12 -0
- package/dist/builtin/skill-creator/agents/comparator.md +13 -0
- package/dist/builtin/skill-creator/agents/grader.md +13 -0
- package/dist/builtin/skill-creator/references/schemas.md +227 -0
- package/dist/builtin/skill-creator/scripts/aggregate-benchmark.d.ts +46 -0
- package/dist/builtin/skill-creator/scripts/aggregate-benchmark.js +209 -0
- package/dist/builtin/skill-creator/scripts/analyze-benchmark.d.ts +46 -0
- package/dist/builtin/skill-creator/scripts/analyze-benchmark.js +289 -0
- package/dist/builtin/skill-creator/scripts/compare-runs.d.ts +62 -0
- package/dist/builtin/skill-creator/scripts/compare-runs.js +333 -0
- package/dist/builtin/skill-creator/scripts/generate-review.d.ts +33 -0
- package/dist/builtin/skill-creator/scripts/generate-review.js +415 -0
- package/dist/builtin/skill-creator/scripts/grade-evals.d.ts +73 -0
- package/dist/builtin/skill-creator/scripts/grade-evals.js +405 -0
- package/dist/builtin/skill-creator/scripts/improve-description.d.ts +23 -0
- package/dist/builtin/skill-creator/scripts/improve-description.js +161 -0
- package/dist/builtin/skill-creator/scripts/init-skill.d.ts +14 -0
- package/dist/builtin/skill-creator/scripts/init-skill.js +153 -0
- package/dist/builtin/skill-creator/scripts/install-skill.d.ts +29 -0
- package/dist/builtin/skill-creator/scripts/install-skill.js +176 -0
- package/dist/builtin/skill-creator/scripts/package-skill.d.ts +38 -0
- package/dist/builtin/skill-creator/scripts/package-skill.js +124 -0
- package/dist/builtin/skill-creator/scripts/quick-validate.d.ts +8 -0
- package/dist/builtin/skill-creator/scripts/quick-validate.js +166 -0
- package/dist/builtin/skill-creator/scripts/run-eval.d.ts +66 -0
- package/dist/builtin/skill-creator/scripts/run-eval.js +356 -0
- package/dist/builtin/skill-creator/scripts/run-loop.d.ts +49 -0
- package/dist/builtin/skill-creator/scripts/run-loop.js +243 -0
- package/dist/builtin/skill-creator/scripts/run-trigger-eval.d.ts +58 -0
- package/dist/builtin/skill-creator/scripts/run-trigger-eval.js +225 -0
- package/dist/builtin/skill-creator/scripts/utils.js +273 -0
- package/dist/builtin/tdd/SKILL.md +56 -0
- package/dist/chunks/chunk-4E76FLZ3.js +2 -0
- package/dist/chunks/chunk-7LQ2NCHF.js +1221 -0
- package/dist/chunks/chunk-HUAU4KB3.js +2 -0
- package/dist/chunks/chunk-N2VZ2MJF.js +11 -0
- package/dist/chunks/chunk-SF7WD7E5.js +2 -0
- package/dist/chunks/chunk-SONW6AC7.js +14 -0
- package/dist/chunks/chunk-WEEQZYZS.js +460 -0
- package/dist/chunks/chunk-XI75LZIO.js +30 -0
- package/dist/chunks/compaction-config-YL4SWWII.js +2 -0
- package/dist/chunks/construction-bootstrap-XSE7ZABG.js +5 -0
- package/dist/chunks/devtools-MOFU7YQF.js +2 -0
- package/dist/chunks/dist-AMUYI7R5.js +2 -0
- package/dist/chunks/dist-WKW4CBG6.js +2 -0
- package/dist/chunks/utils-3HW4KOGE.js +2 -0
- package/dist/cli_commands.d.ts +17 -0
- package/dist/cli_option_helpers.d.ts +49 -0
- package/dist/cli_option_helpers.test.d.ts +1 -0
- package/dist/constructed_cli.d.ts +82 -0
- package/dist/constructed_cli.test.d.ts +1 -0
- package/dist/index.d.ts +10 -0
- package/dist/index.js +9 -0
- package/dist/kodax_cli.d.ts +7 -0
- package/dist/kodax_cli.js +1882 -0
- package/dist/sdk-agent.d.ts +15 -0
- package/dist/sdk-agent.js +2 -0
- package/dist/sdk-coding.d.ts +20 -0
- package/dist/sdk-coding.js +2 -0
- package/dist/sdk-llm.d.ts +15 -0
- package/dist/sdk-llm.js +2 -0
- package/dist/sdk-repl.d.ts +21 -0
- package/dist/sdk-repl.js +2 -0
- package/dist/sdk-skills.d.ts +16 -0
- package/dist/sdk-skills.js +2 -0
- package/dist/self_modify_cli.d.ts +81 -0
- package/dist/self_modify_cli.test.d.ts +9 -0
- package/dist/skill_cli.d.ts +15 -0
- package/dist/skill_cli.test.d.ts +1 -0
- package/package.json +143 -0
- package/scripts/kodax-bin.cjs +27 -0
- package/scripts/production-env.cjs +16 -0
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
# Skill Creator Schemas
|
|
2
|
+
|
|
3
|
+
这份参考文档定义 KodaX 版 `skill-creator` 默认使用的评测文件格式。它不是强制协议,但建议优先沿用,方便后续聚合、review 和自动分析。
|
|
4
|
+
|
|
5
|
+
## `evals/evals.json`
|
|
6
|
+
|
|
7
|
+
用于保存测试提示集合。
|
|
8
|
+
|
|
9
|
+
```json
|
|
10
|
+
{
|
|
11
|
+
"skill_name": "example-skill",
|
|
12
|
+
"evals": [
|
|
13
|
+
{
|
|
14
|
+
"id": 1,
|
|
15
|
+
"prompt": "User task prompt",
|
|
16
|
+
"expected_output": "What a good result should achieve",
|
|
17
|
+
"files": [],
|
|
18
|
+
"assertions": []
|
|
19
|
+
}
|
|
20
|
+
]
|
|
21
|
+
}
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
字段说明:
|
|
25
|
+
- `skill_name`: skill 名称。
|
|
26
|
+
- `evals`: 测试用例数组。
|
|
27
|
+
- `id`: 用例唯一标识。
|
|
28
|
+
- `prompt`: 给 agent 的任务文本。
|
|
29
|
+
- `expected_output`: 对预期结果的简短说明。
|
|
30
|
+
- `files`: 需要作为输入提供的文件列表。
|
|
31
|
+
- `assertions`: 可选,后续 grading 用的断言定义。
|
|
32
|
+
|
|
33
|
+
## `eval_metadata.json`
|
|
34
|
+
|
|
35
|
+
用于单个 eval 目录,帮助 review 工具识别 prompt 和断言。
|
|
36
|
+
|
|
37
|
+
```json
|
|
38
|
+
{
|
|
39
|
+
"eval_id": 1,
|
|
40
|
+
"eval_name": "handles-empty-input",
|
|
41
|
+
"prompt": "Implement validation for empty input",
|
|
42
|
+
"expected_output": "Reject empty input with a clear message",
|
|
43
|
+
"assertions": [
|
|
44
|
+
{
|
|
45
|
+
"text": "rejects empty input with a clear message"
|
|
46
|
+
}
|
|
47
|
+
]
|
|
48
|
+
}
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## `grading.json`
|
|
52
|
+
|
|
53
|
+
由 `grade-evals.js` 生成,用于保存单次运行后的断言判定结果。
|
|
54
|
+
|
|
55
|
+
```json
|
|
56
|
+
{
|
|
57
|
+
"summary": {
|
|
58
|
+
"passed": 2,
|
|
59
|
+
"failed": 1,
|
|
60
|
+
"total": 3,
|
|
61
|
+
"pass_rate": 0.6667
|
|
62
|
+
},
|
|
63
|
+
"expectations": [
|
|
64
|
+
{
|
|
65
|
+
"text": "rejects empty input with a clear message",
|
|
66
|
+
"passed": true,
|
|
67
|
+
"evidence": "Observed in outputs/result.md"
|
|
68
|
+
}
|
|
69
|
+
],
|
|
70
|
+
"execution_metrics": {
|
|
71
|
+
"total_tool_calls": 4,
|
|
72
|
+
"errors_encountered": 0,
|
|
73
|
+
"output_chars": 5120
|
|
74
|
+
},
|
|
75
|
+
"user_notes_summary": {
|
|
76
|
+
"uncertainties": [],
|
|
77
|
+
"needs_review": [],
|
|
78
|
+
"workarounds": []
|
|
79
|
+
},
|
|
80
|
+
"overall_summary": "Mostly correct, but edge cases need review.",
|
|
81
|
+
"timing": {
|
|
82
|
+
"total_tokens": 84852,
|
|
83
|
+
"total_duration_seconds": 23.3
|
|
84
|
+
},
|
|
85
|
+
"meta": {
|
|
86
|
+
"generated_at": "2026-03-17T12:00:00.000Z",
|
|
87
|
+
"eval_id": 1,
|
|
88
|
+
"eval_name": "handles-empty-input",
|
|
89
|
+
"config": "with_skill",
|
|
90
|
+
"run_id": "run-1"
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
要求:
|
|
96
|
+
- `expectations` 里的字段名固定为 `text`、`passed`、`evidence`。
|
|
97
|
+
- `pass_rate` 建议是 `0..1` 之间的小数。
|
|
98
|
+
|
|
99
|
+
## `timing.json`
|
|
100
|
+
|
|
101
|
+
用于保存一次运行的耗时与 token 信息。`duration_ms` 与 `total_duration_seconds` 表示同一段耗时,单位分别为毫秒和秒。
|
|
102
|
+
|
|
103
|
+
```json
|
|
104
|
+
{
|
|
105
|
+
"total_tokens": 84852,
|
|
106
|
+
"duration_ms": 23332,
|
|
107
|
+
"total_duration_seconds": 23.3
|
|
108
|
+
}
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## `benchmark.json`
|
|
112
|
+
|
|
113
|
+
由 `aggregate-benchmark.js` 生成,用于总览不同配置的表现。
|
|
114
|
+
|
|
115
|
+
```json
|
|
116
|
+
{
|
|
117
|
+
"skill_name": "example-skill",
|
|
118
|
+
"generated_at": "2026-03-17T12:00:00.000Z",
|
|
119
|
+
"workspace": "/abs/path/to/iteration-1",
|
|
120
|
+
"configs": {
|
|
121
|
+
"with_skill": {
|
|
122
|
+
"pass_rate": { "mean": 0.9, "stddev": 0.1, "min": 0.8, "max": 1.0 },
|
|
123
|
+
"time_seconds": { "mean": 12.4, "stddev": 1.1, "min": 11.2, "max": 13.5 },
|
|
124
|
+
"tokens": { "mean": 4200, "stddev": 380, "min": 3900, "max": 4700 }
|
|
125
|
+
},
|
|
126
|
+
"without_skill": {
|
|
127
|
+
"pass_rate": { "mean": 0.6, "stddev": 0.2, "min": 0.4, "max": 0.8 },
|
|
128
|
+
"time_seconds": { "mean": 9.5, "stddev": 0.7, "min": 8.9, "max": 10.2 },
|
|
129
|
+
"tokens": { "mean": 3100, "stddev": 240, "min": 2900, "max": 3400 }
|
|
130
|
+
}
|
|
131
|
+
},
|
|
132
|
+
"delta": {
|
|
133
|
+
"pass_rate": "+0.3000",
|
|
134
|
+
"time_seconds": "+2.9000",
|
|
135
|
+
"tokens": "+1100.0000"
|
|
136
|
+
},
|
|
137
|
+
"runs": {
|
|
138
|
+
"with_skill": [],
|
|
139
|
+
"without_skill": []
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## `analysis.json`
|
|
145
|
+
|
|
146
|
+
由 `analyze-benchmark.js` 生成,用于总结 benchmark 的稳定收益、方差热点和下一步建议。
|
|
147
|
+
|
|
148
|
+
```json
|
|
149
|
+
{
|
|
150
|
+
"skill_name": "example-skill",
|
|
151
|
+
"generated_at": "2026-03-17T12:15:00.000Z",
|
|
152
|
+
"workspace": "/abs/path/to/iteration-1",
|
|
153
|
+
"verdict": "improves",
|
|
154
|
+
"release_readiness": "needs_iteration",
|
|
155
|
+
"recommendation": "Keep the skill, but reduce variance before release.",
|
|
156
|
+
"key_findings": [
|
|
157
|
+
"with_skill materially improves pass rate"
|
|
158
|
+
],
|
|
159
|
+
"variance_hotspots": [
|
|
160
|
+
"baseline repeatedly misses billing details"
|
|
161
|
+
],
|
|
162
|
+
"suggested_actions": [
|
|
163
|
+
"tighten assertions around billing coverage"
|
|
164
|
+
],
|
|
165
|
+
"watchouts": [
|
|
166
|
+
"token cost increased"
|
|
167
|
+
],
|
|
168
|
+
"supporting_metrics": {
|
|
169
|
+
"pass_rate_delta": "+0.3000",
|
|
170
|
+
"time_seconds_delta": "+2.9000",
|
|
171
|
+
"tokens_delta": "+1100.0000"
|
|
172
|
+
},
|
|
173
|
+
"failure_clusters": {}
|
|
174
|
+
}
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## `comparison.json`
|
|
178
|
+
|
|
179
|
+
由 `compare-runs.js` 生成,用于 blind comparison 两个 config 的输出质量。
|
|
180
|
+
|
|
181
|
+
```json
|
|
182
|
+
{
|
|
183
|
+
"workspace": "/abs/path/to/iteration-1",
|
|
184
|
+
"generated_at": "2026-03-17T12:20:00.000Z",
|
|
185
|
+
"config_a": "with_skill",
|
|
186
|
+
"config_b": "without_skill",
|
|
187
|
+
"summary": {
|
|
188
|
+
"total_pairs": 3,
|
|
189
|
+
"config_a_wins": 2,
|
|
190
|
+
"config_b_wins": 0,
|
|
191
|
+
"ties": 1,
|
|
192
|
+
"inconclusive": 0
|
|
193
|
+
},
|
|
194
|
+
"comparisons": [
|
|
195
|
+
{
|
|
196
|
+
"eval_id": 1,
|
|
197
|
+
"winner_label": "A",
|
|
198
|
+
"winner_config": "with_skill",
|
|
199
|
+
"confidence": 0.9,
|
|
200
|
+
"rationale": "Candidate A is more complete and specific."
|
|
201
|
+
}
|
|
202
|
+
]
|
|
203
|
+
}
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
## 推荐目录结构
|
|
207
|
+
|
|
208
|
+
```text
|
|
209
|
+
my-skill-workspace/
|
|
210
|
+
└── iteration-1/
|
|
211
|
+
├── eval-0/
|
|
212
|
+
│ ├── eval_metadata.json
|
|
213
|
+
│ ├── with_skill/
|
|
214
|
+
│ │ ├── outputs/
|
|
215
|
+
│ │ ├── grading.json
|
|
216
|
+
│ │ └── timing.json
|
|
217
|
+
│ └── without_skill/
|
|
218
|
+
│ ├── outputs/
|
|
219
|
+
│ ├── grading.json
|
|
220
|
+
│ └── timing.json
|
|
221
|
+
├── benchmark.json
|
|
222
|
+
├── benchmark.md
|
|
223
|
+
├── analysis.json
|
|
224
|
+
├── analysis.md
|
|
225
|
+
├── comparison.json
|
|
226
|
+
└── comparison.md
|
|
227
|
+
```
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
 * One graded run of a single eval under a single config, as collected by
 * `loadRunResults` from a run directory's `grading.json` / `timing.json`.
 */
export interface BenchmarkRun {
  /** `eval_id` from `eval_metadata.json`, or the eval directory name when absent. */
  eval_id: number | string;
  /** Name of the run directory (for example `run-1`). */
  run_id: string;
  /** `summary.pass_rate` from `grading.json`; 0 when missing. */
  pass_rate: number;
  /** Number of expectations that passed. */
  passed: number;
  /** Number of expectations that failed. */
  failed: number;
  /** Total number of expectations graded. */
  total: number;
  /** Run duration in seconds; 0 when no timing information was found. */
  time_seconds: number;
  /** Token usage recorded for the run; 0 when no usage information was found. */
  tokens: number;
  /** `execution_metrics.total_tool_calls` from `grading.json`. */
  tool_calls: number;
  /** `execution_metrics.errors_encountered` from `grading.json`. */
  errors: number;
  /** Raw expectation entries copied from `grading.json`. */
  expectations: Record<string, unknown>[];
  /** Concatenated uncertainties, needs-review items and workarounds. */
  notes: Array<string>;
}
|
|
15
|
+
|
|
16
|
+
/** Descriptive statistics for one metric across the runs of a config. */
export interface StatsSummary {
  /** Arithmetic mean of the samples. */
  mean: number;
  /** Standard deviation of the samples. */
  stddev: number;
  /** Smallest sample. */
  min: number;
  /** Largest sample. */
  max: number;
}
|
|
22
|
+
|
|
23
|
+
/**
 * Shape of `benchmark.json`: per-config statistics, the delta between the
 * primary and baseline config, and the raw runs the statistics were built from.
 */
export interface BenchmarkDocument {
  /** Name of the skill being benchmarked. */
  skill_name: string;
  /** ISO-8601 timestamp of when the document was built. */
  generated_at: string;
  /** Absolute path of the iteration directory. */
  workspace: string;
  /** Statistics per config name (for example `with_skill`, `without_skill`). */
  configs: {
    [configName: string]: {
      pass_rate: StatsSummary;
      time_seconds: StatsSummary;
      tokens: StatsSummary;
    };
  };
  /**
   * Pre-formatted differences between the primary and baseline config means
   * (the schema reference shows values such as `"+0.3000"`).
   */
  delta: {
    pass_rate: string;
    time_seconds: string;
    tokens: string;
  };
  /** Raw runs keyed by config name. */
  runs: { [configName: string]: BenchmarkRun[] };
}
|
|
39
|
+
|
|
40
|
+
/**
 * Scan an iteration directory (or its `runs/` sub-directory when present) for
 * `eval-*` / `<config>` / `run-*` directories and collect every run that has a
 * readable `grading.json`, grouped by config name.
 */
export function loadRunResults(iterationDir: string): Promise<Record<string, BenchmarkRun[]>>;

/**
 * Compute per-config statistics and the primary-vs-baseline delta for a set of
 * runs. Exported by `aggregate-benchmark.js`; this is the part of
 * `buildBenchmarkDocument` that does not depend on the clock or the file system.
 */
export function summarizeConfigs(
  configRuns: Record<string, BenchmarkRun[]>
): Pick<BenchmarkDocument, 'configs' | 'delta'>;

/**
 * Assemble the `benchmark.json` document for an iteration from already-loaded
 * runs. `workspace` is the resolved absolute form of `iterationDir`.
 */
export function buildBenchmarkDocument(
  iterationDir: string,
  skillName: string,
  configRuns: Record<string, BenchmarkRun[]>
): BenchmarkDocument;

/** Render a benchmark document as the Markdown written to `benchmark.md`. */
export function renderBenchmarkMarkdown(benchmark: BenchmarkDocument): string;
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { readFile, readdir, stat, writeFile } from 'node:fs/promises';
|
|
4
|
+
import path from 'node:path';
|
|
5
|
+
import { fileURLToPath } from 'node:url';
|
|
6
|
+
import { calculateStats, formatDelta } from './utils.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Read a UTF-8 file and parse its contents as JSON.
 *
 * Rejects when the file cannot be read or does not contain valid JSON;
 * callers decide how to recover.
 */
async function readJson(filePath) {
  const rawText = await readFile(filePath, 'utf8');
  return JSON.parse(rawText);
}
|
|
11
|
+
|
|
12
|
+
/**
 * Report whether `filePath` can be stat'ed.
 *
 * Any failure of `stat` (missing path, permission error, ...) is reported as
 * `false` rather than thrown.
 */
async function pathExists(filePath) {
  let reachable = true;
  try {
    await stat(filePath);
  } catch {
    reachable = false;
  }
  return reachable;
}
|
|
20
|
+
|
|
21
|
+
/**
 * List the immediate sub-directories of `dirPath` as joined paths, sorted
 * with `localeCompare`.
 *
 * An unreadable or missing `dirPath` yields an empty list instead of an error.
 */
async function listDirectories(dirPath) {
  const dirents = await readdir(dirPath, { withFileTypes: true }).catch(() => []);

  const directories = [];
  for (const dirent of dirents) {
    if (dirent.isDirectory()) {
      directories.push(path.join(dirPath, dirent.name));
    }
  }

  directories.sort((a, b) => a.localeCompare(b));
  return directories;
}
|
|
28
|
+
|
|
29
|
+
/**
 * Collect every graded run under an iteration directory, grouped by config name.
 *
 * Directories scanned: `<iterationDir>/runs` when it exists, otherwise
 * `<iterationDir>` itself; inside it every `eval-*` directory, every config
 * directory below that, and every `run-*` directory below the config. Config
 * directories without any `run-*` child are skipped, and runs without a
 * readable `grading.json` are ignored.
 *
 * @param iterationDir Directory of one benchmark iteration.
 * @returns Runs keyed by config directory name, in directory sort order.
 */
export async function loadRunResults(iterationDir) {
  // Note fields are expected to be string arrays; tolerate a bare string and
  // ignore anything else instead of spreading a non-iterable value.
  const toNotes = (value) => {
    if (Array.isArray(value)) {
      return value;
    }
    return typeof value === 'string' && value ? [value] : [];
  };

  const runsRoot = await pathExists(path.join(iterationDir, 'runs'))
    ? path.join(iterationDir, 'runs')
    : iterationDir;

  const evalDirs = (await listDirectories(runsRoot))
    .filter((dirPath) => path.basename(dirPath).startsWith('eval-'));

  const configs = {};

  for (const evalDir of evalDirs) {
    // Metadata is optional; a missing, unreadable or literal `null` file
    // falls back to the eval directory name as the id.
    const metadata = await readJson(path.join(evalDir, 'eval_metadata.json')).catch(() => null);
    const evalId = metadata?.eval_id ?? path.basename(evalDir);

    for (const configDir of await listDirectories(evalDir)) {
      const configName = path.basename(configDir);
      const runDirs = (await listDirectories(configDir))
        .filter((dirPath) => path.basename(dirPath).startsWith('run-'));

      if (runDirs.length === 0) {
        continue;
      }

      configs[configName] ??= [];

      for (const runDir of runDirs) {
        const grading = await readJson(path.join(runDir, 'grading.json')).catch(() => null);
        if (!grading) {
          continue;
        }

        // timing.json is optional; grading.json may embed the same figures.
        const timing = await readJson(path.join(runDir, 'timing.json')).catch(() => null);
        const durationFromMs = typeof timing?.duration_ms === 'number'
          ? timing.duration_ms / 1000
          : undefined;

        configs[configName].push({
          eval_id: evalId,
          run_id: path.basename(runDir),
          pass_rate: grading.summary?.pass_rate ?? 0,
          passed: grading.summary?.passed ?? 0,
          failed: grading.summary?.failed ?? 0,
          total: grading.summary?.total ?? 0,
          time_seconds: timing?.total_duration_seconds
            ?? grading.timing?.total_duration_seconds
            ?? durationFromMs
            ?? 0,
          // Prefer real token counts (timing.json, then grading.timing) and
          // only fall back to the output character count as a last resort.
          tokens: timing?.total_tokens
            ?? grading.timing?.total_tokens
            ?? grading.execution_metrics?.output_chars
            ?? 0,
          tool_calls: grading.execution_metrics?.total_tool_calls ?? 0,
          errors: grading.execution_metrics?.errors_encountered ?? 0,
          expectations: Array.isArray(grading.expectations) ? grading.expectations : [],
          notes: [
            ...toNotes(grading.user_notes_summary?.uncertainties),
            ...toNotes(grading.user_notes_summary?.needs_review),
            ...toNotes(grading.user_notes_summary?.workarounds),
          ],
        });
      }
    }
  }

  return configs;
}
|
|
87
|
+
|
|
88
|
+
/**
 * Compute per-config statistics and the delta between the primary and the
 * baseline config.
 *
 * When both `with_skill` and `without_skill` are present they are used as
 * primary and baseline respectively, so the delta sign does not depend on the
 * order in which configs were discovered. Otherwise the first two configs in
 * key order are used; a missing primary or baseline counts as all-zero means.
 *
 * @param configRuns Runs keyed by config name.
 * @returns `configs` (statistics per config) and `delta` (primary minus
 *   baseline means, formatted by `formatDelta`).
 */
export function summarizeConfigs(configRuns) {
  const summary = {};

  for (const [configName, runs] of Object.entries(configRuns)) {
    summary[configName] = {
      pass_rate: calculateStats(runs.map((run) => Number(run.pass_rate ?? 0))),
      time_seconds: calculateStats(runs.map((run) => Number(run.time_seconds ?? 0))),
      tokens: calculateStats(runs.map((run) => Number(run.tokens ?? 0))),
    };
  }

  const configNames = Object.keys(summary);
  const hasCanonicalPair = configNames.includes('with_skill')
    && configNames.includes('without_skill');
  const [primaryName, baselineName] = hasCanonicalPair
    ? ['with_skill', 'without_skill']
    : configNames;

  // Stand-in for a config that does not exist (zero or one config loaded).
  const zeroMetrics = {
    pass_rate: { mean: 0 },
    time_seconds: { mean: 0 },
    tokens: { mean: 0 },
  };
  const primary = summary[primaryName] ?? zeroMetrics;
  const baseline = summary[baselineName] ?? zeroMetrics;

  return {
    configs: summary,
    delta: {
      pass_rate: formatDelta(primary.pass_rate.mean - baseline.pass_rate.mean),
      time_seconds: formatDelta(primary.time_seconds.mean - baseline.time_seconds.mean),
      tokens: formatDelta(primary.tokens.mean - baseline.tokens.mean),
    },
  };
}
|
|
120
|
+
|
|
121
|
+
/**
 * Assemble the `benchmark.json` document for one iteration.
 *
 * @param iterationDir Iteration directory; stored resolved to an absolute path.
 * @param skillName Name recorded as `skill_name`.
 * @param configRuns Runs keyed by config name; stored as-is under `runs`.
 * @returns The benchmark document, stamped with the current time.
 */
export function buildBenchmarkDocument(iterationDir, skillName, configRuns) {
  const { configs, delta } = summarizeConfigs(configRuns);
  const generatedAt = new Date().toISOString();
  const workspace = path.resolve(iterationDir);

  return {
    skill_name: skillName,
    generated_at: generatedAt,
    workspace,
    configs,
    delta,
    runs: configRuns,
  };
}
|
|
133
|
+
|
|
134
|
+
/**
 * Render a benchmark document as Markdown: a title, the generation time, one
 * table row per config (`mean ± stddev` for each metric) and a delta section.
 *
 * @param benchmark Document produced by `buildBenchmarkDocument`.
 * @returns The Markdown text without a trailing newline.
 */
export function renderBenchmarkMarkdown(benchmark) {
  const spread = (stats) => `${stats.mean} ± ${stats.stddev}`;

  const header = [
    `# Benchmark: ${benchmark.skill_name}`,
    '',
    `Generated: ${benchmark.generated_at}`,
    '',
    '| Config | Pass Rate | Time (s) | Tokens |',
    '| --- | --- | --- | --- |',
  ];

  const rows = Object.entries(benchmark.configs).map(
    ([configName, metrics]) =>
      `| ${configName} | ${spread(metrics.pass_rate)} | ${spread(metrics.time_seconds)} | ${spread(metrics.tokens)} |`
  );

  const deltaSection = [
    '',
    '## Delta',
    '',
    `- Pass rate: ${benchmark.delta.pass_rate}`,
    `- Time (s): ${benchmark.delta.time_seconds}`,
    `- Tokens: ${benchmark.delta.tokens}`,
  ];

  return header.concat(rows, deltaSection).join('\n');
}
|
|
159
|
+
|
|
160
|
+
/**
 * Parse the command line of the aggregate-benchmark script.
 *
 * The iteration directory is the first argument that is not a `--flag`, so
 * `--skill-name <name>` may appear before or after it. A `--skill-name`
 * without a following value is ignored and the default name is kept.
 *
 * @param argv Full `process.argv`-style array (entries 0 and 1 are skipped).
 * @returns `iterationDir` (undefined when no directory was given) and
 *   `skillName` (defaults to `'unknown-skill'`).
 */
function parseArgs(argv) {
  let iterationDir;
  let skillName = 'unknown-skill';

  for (let index = 2; index < argv.length; index += 1) {
    const token = argv[index];

    if (token === '--skill-name') {
      if (argv[index + 1]) {
        skillName = argv[index + 1];
        index += 1; // consume the value so it is not mistaken for the directory
      }
      continue;
    }

    // Only the first positional argument names the iteration directory.
    if (iterationDir === undefined && !String(token).startsWith('--')) {
      iterationDir = token;
    }
  }

  return { iterationDir, skillName };
}
|
|
176
|
+
|
|
177
|
+
/**
 * CLI entry point: load the runs of an iteration, then write `benchmark.json`
 * and `benchmark.md` into the iteration directory.
 *
 * Exits with status 1 when no iteration directory was given or when no
 * benchmark runs were found.
 */
async function main() {
  const cli = parseArgs(process.argv);
  if (!cli.iterationDir) {
    console.error('Usage: node scripts/aggregate-benchmark.js <iteration-dir> --skill-name <name>');
    process.exit(1);
  }

  const runsByConfig = await loadRunResults(cli.iterationDir);
  const configCount = Object.keys(runsByConfig).length;
  if (configCount === 0) {
    console.error(`No benchmark runs found in ${cli.iterationDir}`);
    process.exit(1);
  }

  const document = buildBenchmarkDocument(cli.iterationDir, cli.skillName, runsByConfig);
  const jsonTarget = path.join(cli.iterationDir, 'benchmark.json');
  const markdownTarget = path.join(cli.iterationDir, 'benchmark.md');

  // JSON is written first; the Markdown is rendered only afterwards.
  const jsonText = JSON.stringify(document, null, 2);
  await writeFile(jsonTarget, jsonText);

  const markdownText = `${renderBenchmarkMarkdown(document)}\n`;
  await writeFile(markdownTarget, markdownText);

  for (const written of [jsonTarget, markdownTarget]) {
    console.log(`Wrote ${written}`);
  }
}
|
|
200
|
+
|
|
201
|
+
// Run `main` only when this file is the script Node was started with
// (its own path equals the resolved `process.argv[1]`), not when imported.
const entryScript = process.argv[1];
const isDirectRun = entryScript
  && fileURLToPath(import.meta.url) === path.resolve(entryScript);

if (isDirectRun) {
  main().catch((failure) => {
    // Print only the message for Error instances; stringify anything else.
    const message = failure instanceof Error ? failure.message : String(failure);
    console.error(message);
    process.exit(1);
  });
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
 * Options accepted by `analyzeBenchmark`. Only `workspaceDir` is required;
 * every other field is optional.
 *
 * NOTE(review): the implementation is not part of this declaration file, so
 * the defaults applied to omitted fields should be confirmed against
 * `analyze-benchmark.js`.
 */
export interface AnalyzeBenchmarkOptions {
  /** Workspace (iteration) directory to analyze. */
  workspaceDir: string;
  /** Presumably the `benchmark.json` to read - confirm the default location. */
  benchmarkPath?: string;
  /** Presumably where the JSON analysis is written - confirm the default. */
  outputPath?: string;
  /** Presumably where the Markdown analysis is written - confirm the default. */
  markdownPath?: string;
  skillName?: string;
  provider?: string;
  model?: string;
  reasoningMode?: string;
  maxIter?: number;
  cwd?: string;
}
|
|
13
|
+
|
|
14
|
+
/**
 * Shape of `analysis.json`: a summary of a benchmark's gains, variance
 * hotspots and suggested next steps.
 */
export interface BenchmarkAnalysis {
  /** Name of the analyzed skill. */
  skill_name: string;
  /** Timestamp string of when the analysis was produced. */
  generated_at: string;
  /** Path of the analyzed workspace. */
  workspace: string;
  /** Overall judgement of the skill's effect. */
  verdict: 'improves' | 'regresses' | 'mixed' | 'inconclusive';
  /** Whether the skill is considered ready to ship. */
  release_readiness: 'ready' | 'needs_iteration' | 'needs_manual_review';
  /** Recommendation text. */
  recommendation: string;
  key_findings: Array<string>;
  variance_hotspots: Array<string>;
  suggested_actions: Array<string>;
  watchouts: Array<string>;
  /** Delta strings backing the verdict (the schema reference shows values such as `"+0.3000"`). */
  supporting_metrics: {
    pass_rate_delta: string;
    time_seconds_delta: string;
    tokens_delta: string;
  };
  /** Free-form grouping of failures; the value shape is not fixed by this type. */
  failure_clusters: { [cluster: string]: unknown };
}
|
|
32
|
+
|
|
33
|
+
/**
 * Build the prompt text for a benchmark analysis from an input record.
 * NOTE(review): the expected keys of `input` are not visible from this
 * declaration - confirm against `analyze-benchmark.js`.
 */
export function buildAnalysisPrompt(input: Record<string, unknown>): string;

/** Render an analysis object as Markdown text. */
export function renderAnalysisMarkdown(analysis: Record<string, any>): string;

/**
 * Analyze a benchmark for the given workspace.
 *
 * @param options Analysis options; only `workspaceDir` is required.
 * @param runner Optional function that receives a prompt plus an options
 *   record and resolves to a string response. Presumably used in place of the
 *   default model call - confirm against the implementation.
 * @returns The parsed analysis, the prompt, the raw response, and the paths
 *   of the JSON and Markdown analysis files.
 */
export function analyzeBenchmark(
  options: AnalyzeBenchmarkOptions,
  runner?: (
    prompt: string,
    options: Record<string, unknown>
  ) => Promise<string>
): Promise<{
  analysis: BenchmarkAnalysis;
  prompt: string;
  rawResponse: string;
  analysisJsonPath: string;
  analysisMdPath: string;
}>;
|