@kodax-ai/kodax-cli 0.7.38
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +1304 -0
- package/LICENSE +191 -0
- package/README.md +1167 -0
- package/README_CN.md +631 -0
- package/dist/builtin/code-review/SKILL.md +63 -0
- package/dist/builtin/git-workflow/SKILL.md +84 -0
- package/dist/builtin/skill-creator/SKILL.md +122 -0
- package/dist/builtin/skill-creator/agents/analyzer.md +12 -0
- package/dist/builtin/skill-creator/agents/comparator.md +13 -0
- package/dist/builtin/skill-creator/agents/grader.md +13 -0
- package/dist/builtin/skill-creator/references/schemas.md +227 -0
- package/dist/builtin/skill-creator/scripts/aggregate-benchmark.d.ts +46 -0
- package/dist/builtin/skill-creator/scripts/aggregate-benchmark.js +209 -0
- package/dist/builtin/skill-creator/scripts/analyze-benchmark.d.ts +46 -0
- package/dist/builtin/skill-creator/scripts/analyze-benchmark.js +289 -0
- package/dist/builtin/skill-creator/scripts/compare-runs.d.ts +62 -0
- package/dist/builtin/skill-creator/scripts/compare-runs.js +333 -0
- package/dist/builtin/skill-creator/scripts/generate-review.d.ts +33 -0
- package/dist/builtin/skill-creator/scripts/generate-review.js +415 -0
- package/dist/builtin/skill-creator/scripts/grade-evals.d.ts +73 -0
- package/dist/builtin/skill-creator/scripts/grade-evals.js +405 -0
- package/dist/builtin/skill-creator/scripts/improve-description.d.ts +23 -0
- package/dist/builtin/skill-creator/scripts/improve-description.js +161 -0
- package/dist/builtin/skill-creator/scripts/init-skill.d.ts +14 -0
- package/dist/builtin/skill-creator/scripts/init-skill.js +153 -0
- package/dist/builtin/skill-creator/scripts/install-skill.d.ts +29 -0
- package/dist/builtin/skill-creator/scripts/install-skill.js +176 -0
- package/dist/builtin/skill-creator/scripts/package-skill.d.ts +38 -0
- package/dist/builtin/skill-creator/scripts/package-skill.js +124 -0
- package/dist/builtin/skill-creator/scripts/quick-validate.d.ts +8 -0
- package/dist/builtin/skill-creator/scripts/quick-validate.js +166 -0
- package/dist/builtin/skill-creator/scripts/run-eval.d.ts +66 -0
- package/dist/builtin/skill-creator/scripts/run-eval.js +356 -0
- package/dist/builtin/skill-creator/scripts/run-loop.d.ts +49 -0
- package/dist/builtin/skill-creator/scripts/run-loop.js +243 -0
- package/dist/builtin/skill-creator/scripts/run-trigger-eval.d.ts +58 -0
- package/dist/builtin/skill-creator/scripts/run-trigger-eval.js +225 -0
- package/dist/builtin/skill-creator/scripts/utils.js +278 -0
- package/dist/builtin/tdd/SKILL.md +56 -0
- package/dist/index.js +1717 -0
- package/dist/kodax_cli.js +1870 -0
- package/package.json +122 -0
- package/scripts/kodax-bin.cjs +27 -0
- package/scripts/production-env.cjs +16 -0
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: git-workflow
|
|
3
|
+
description: 执行明确的 Git 操作,例如查看状态、提交、建分支、push、stash 或创建 PR。只有在用户明确要求实际执行这些 Git 操作时才使用;不要用于单纯解释 Git 概念或讨论策略。
|
|
4
|
+
user-invocable: true
|
|
5
|
+
disable-model-invocation: true
|
|
6
|
+
allowed-tools: "Read, Grep, Glob, Bash(git:*, gh:*)"
|
|
7
|
+
argument-hint: "[status|commit|branch|push|pr|stash] [args]"
|
|
8
|
+
compatibility: "Requires a Git repository. Pull request creation works best when GitHub CLI (gh) is installed and authenticated."
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
# Git Workflow Skill
|
|
12
|
+
|
|
13
|
+
执行 Git 工作流时,优先保证仓库安全、改动边界清晰、命令可追溯。
|
|
14
|
+
|
|
15
|
+
## 总流程
|
|
16
|
+
|
|
17
|
+
1. 先检查仓库状态,例如 `git status --short --branch`,必要时补充 `git diff --stat`、`git diff --cached`、`git log --oneline`。
|
|
18
|
+
2. 根据第一个参数判断操作类型;如果用户表达明确,就直接执行对应流程。
|
|
19
|
+
3. 对会改写历史、删除数据或影响远程分支的操作,先说明风险;只有在用户明确要求时才继续。
|
|
20
|
+
4. 所有 Git 命令都使用非交互方式执行。
|
|
21
|
+
|
|
22
|
+
## 支持的操作
|
|
23
|
+
|
|
24
|
+
### status
|
|
25
|
+
- 汇总当前分支、upstream、已暂存/未暂存/未跟踪文件,以及明显的下一步建议。
|
|
26
|
+
|
|
27
|
+
### commit
|
|
28
|
+
- 只暂存与当前任务直接相关的文件;不要顺手带上无关改动。
|
|
29
|
+
- 如果用户没有提供 message,基于 diff 生成简洁的 Conventional Commit。
|
|
30
|
+
- 优先使用 `git add <path>`,避免无差别 `git add .`,除非用户明确要求全部提交。
|
|
31
|
+
- 提交前检查是否混入敏感文件或明显不该提交的产物。
|
|
32
|
+
|
|
33
|
+
**Commit 模板**
|
|
34
|
+
```
|
|
35
|
+
<type>(<scope>): <description>
|
|
36
|
+
|
|
37
|
+
[optional body]
|
|
38
|
+
|
|
39
|
+
[optional footer]
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### branch
|
|
43
|
+
- `create` / `new`: 创建新分支。除非用户指定,否则遵循仓库已有命名风格。
|
|
44
|
+
- `switch` / `checkout`: 切换分支前,先检查当前工作区是否干净。
|
|
45
|
+
- `list`: 列出本地/远程分支,并在有帮助时指出当前分支。
|
|
46
|
+
- `delete` / `remove`: 仅在删除条件安全且用户请求明确时执行。
|
|
47
|
+
|
|
48
|
+
### push
|
|
49
|
+
- 先确认当前分支与 upstream 的关系。
|
|
50
|
+
- 首次 push 或没有 upstream 时,设置合适的 upstream。
|
|
51
|
+
- `--force` 或 `--force-with-lease` 只有在用户明确要求时才允许。
|
|
52
|
+
|
|
53
|
+
### pr
|
|
54
|
+
- 检查当前分支是否已 push,必要时先推送。
|
|
55
|
+
- 基于 diff 和提交历史生成简洁的 PR 标题与描述。
|
|
56
|
+
- 使用 `gh pr create` 前确认 `gh` 可用且已认证;否则清楚说明阻塞点。
|
|
57
|
+
|
|
58
|
+
### stash
|
|
59
|
+
- `save`: 保存当前变更(优先用 `git stash push -m <message>`,`git stash save` 已被弃用);有上下文时附带说明性消息。
|
|
60
|
+
- `list`: 列出 stash,并说明最近一条的用途。
|
|
61
|
+
- `pop`: 应用并删除最近一条 stash;若出现冲突,该 stash 不会被自动删除,需明确报告冲突并说明 stash 仍被保留。
|
|
62
|
+
- `drop`: 仅在用户明确要求删除时执行。
|
|
63
|
+
|
|
64
|
+
## 安全边界
|
|
65
|
+
|
|
66
|
+
- 不要使用 `git reset --hard`、`git checkout --`、`git clean -fd`、`git commit --amend`、强推、历史改写等危险操作,除非用户明确要求。
|
|
67
|
+
- 不要回滚、覆盖或丢弃用户的无关改动。
|
|
68
|
+
- 遇到脏工作区、冲突、无 upstream、权限不足、缺少 `gh` 等阻塞时,要先解释现状再继续。
|
|
69
|
+
- 对“合并分支”“rebase”“cherry-pick”等这里没有明确定义的操作,如果用户明确提出,可以按 Git 最佳实践执行,但要先说明风险和计划。
|
|
70
|
+
|
|
71
|
+
## 汇报方式
|
|
72
|
+
|
|
73
|
+
- 简洁说明执行了哪些命令、当前仓库状态变成了什么样。
|
|
74
|
+
- 如果因为风险或条件不满足而没有执行,要明确说明原因和下一步建议。
|
|
75
|
+
|
|
76
|
+
## 使用示例
|
|
77
|
+
|
|
78
|
+
- `/git-workflow status` - 查看当前仓库状态
|
|
79
|
+
- `/git-workflow commit` - 分析相关改动并提交
|
|
80
|
+
- `/git-workflow commit "fix: resolve auth bug"` - 使用指定消息提交
|
|
81
|
+
- `/git-workflow branch create feature/login` - 创建新分支
|
|
82
|
+
- `/git-workflow push` - 推送当前分支
|
|
83
|
+
- `/git-workflow pr` - 为当前分支创建 PR
|
|
84
|
+
- `/git-workflow stash` - 把当前未提交的变更保存到 stash(不是 `git add` 意义上的暂存)
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: skill-creator
|
|
3
|
+
description: 创建、重写、迁移和优化 KodaX/Agent Skills。当用户想新建 skill、把外部 skill 迁移到 KodaX、改进触发描述、整理 supporting files、设计评测用例、补齐 grading/benchmark/review/comparison 流程或验证 skill 结构时使用。即使用户没有明确说“skill”,只要目标是在沉淀可复用的代理工作流、提示词或脚本能力,也应该使用这个 skill。
|
|
4
|
+
user-invocable: true
|
|
5
|
+
allowed-tools: "Read, Grep, Glob, Write, Edit, Bash(node:*, npm:*, npx:*)"
|
|
6
|
+
argument-hint: "[skill-name-or-task]"
|
|
7
|
+
compatibility: "Optimized for KodaX and Agent Skills style directories. Bundled helper scripts use Node.js instead of Python."
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Skill Creator
|
|
11
|
+
|
|
12
|
+
把用户的工作流整理成一个可维护、可触发、可评估的 skill。优先适配 KodaX 的 skill 运行方式,而不是逐字复制外部平台的实现。
|
|
13
|
+
|
|
14
|
+
## 何时使用
|
|
15
|
+
|
|
16
|
+
- 用户要新建 skill,或把一次对话里的工作流沉淀成 skill。
|
|
17
|
+
- 用户要改已有 skill 的触发描述、结构、supporting files 或提示词。
|
|
18
|
+
- 用户要把 Claude / Anthropic / 其他平台的 skill 移植到 KodaX。
|
|
19
|
+
- 用户要为 skill 设计测试提示、评估结构、人工 review 流程或 benchmark 汇总。
|
|
20
|
+
|
|
21
|
+
## 工作方式
|
|
22
|
+
|
|
23
|
+
### 1. 先收敛目标
|
|
24
|
+
|
|
25
|
+
先明确四件事:
|
|
26
|
+
1. 这个 skill 要解决什么任务。
|
|
27
|
+
2. 什么时候应该触发,什么时候不应该触发。
|
|
28
|
+
3. 输出是什么形态。
|
|
29
|
+
4. 是否需要评测和人工 review。
|
|
30
|
+
|
|
31
|
+
如果用户已经给了样例对话、提示词或外部 skill 仓库,先从已有材料里提炼,不要重复让用户描述。
|
|
32
|
+
|
|
33
|
+
### 2. 适配 KodaX,而不是照抄外部 skill
|
|
34
|
+
|
|
35
|
+
迁移外部 skill 时,拆成三类:
|
|
36
|
+
- 可直接复用:SKILL.md 的思路、评测结构、参考文档组织方式。
|
|
37
|
+
- 需要改写:路径约定、触发描述、支持的 frontmatter 字段、命令示例。
|
|
38
|
+
- 不要硬搬:强依赖 Claude Code、`claude -p`、Cowork、Python stdlib 或专有事件流的部分。
|
|
39
|
+
|
|
40
|
+
如果外部 skill 依赖特定宿主能力,优先改成 KodaX 当前能承接的手工流程或 Node 工具,而不是留下名不副实的说明。
|
|
41
|
+
|
|
42
|
+
### 3. 写 KodaX 风格的 skill
|
|
43
|
+
|
|
44
|
+
- `description` 要写“做什么 + 什么时候用”,并稍微主动一点,避免 under-trigger。
|
|
45
|
+
- `SKILL.md` 负责主流程,不要把所有细节都塞进去。
|
|
46
|
+
- 重复性、机械性、易出错的步骤,放到 `scripts/`。
|
|
47
|
+
- 大块参考资料放到 `references/`。
|
|
48
|
+
- 模板或静态文件放到 `assets/`。
|
|
49
|
+
- 如果某个专家流程只服务这个 skill,可以放进 `agents/`,但先把它当私有 contract,不要自动上升成全局产品概念。
|
|
50
|
+
|
|
51
|
+
### 4. Bundled scripts 默认用 Node.js
|
|
52
|
+
|
|
53
|
+
KodaX 当前会把 builtin skill 目录直接复制到 `dist/`。因此:
|
|
54
|
+
|
|
55
|
+
- skill 内的可执行脚本默认使用 plain Node ESM `.js`。
|
|
56
|
+
- 只有在你同时修改了构建链、确保脚本会被编译时,才在 skill 内使用 `.ts`。
|
|
57
|
+
- 如果用户只是想“改成 node/typescript”,默认先落成 Node `.js`,这是最稳妥的内建交付方式。
|
|
58
|
+
|
|
59
|
+
### 5. 先验证,再评估
|
|
60
|
+
|
|
61
|
+
起草完成后,优先按下面的顺序推进:
|
|
62
|
+
|
|
63
|
+
1. 用 `node scripts/quick-validate.js <skill-dir>` 做结构检查。
|
|
64
|
+
2. 如果还没有 skill 骨架,先用 `node scripts/init-skill.js <skill-name> --path <skills-dir>` 初始化,再回到第 1 步做结构检查。
|
|
65
|
+
3. 设计 2 到 3 个真实用户会说的测试提示。
|
|
66
|
+
4. 如果要跑端到端 skill eval,把提示整理到 `evals/evals.json`,再用 `node scripts/run-eval.js --skill-path <skill-dir> --evals <evals.json> --workspace <iteration-dir>` 生成 `with_skill` / `without_skill` workspace。
|
|
67
|
+
5. 如果要补第 3 阶段的专家评测流,先用 `node scripts/grade-evals.js <workspace>` 生成 `grading.json`,再用 `node scripts/aggregate-benchmark.js <workspace> --skill-name <name>` 聚合 benchmark,用 `node scripts/analyze-benchmark.js <workspace>` 产出分析结论,用 `node scripts/compare-runs.js <workspace>` 做 blind comparison。
|
|
68
|
+
6. 如果要评估 description 的触发效果,再用 `node scripts/run-trigger-eval.js --skill-path <skill-dir> --evals <evals.json>` 跑一轮触发评测。
|
|
69
|
+
7. 如果要迭代 description,可以用 `node scripts/improve-description.js --skill-path <skill-dir> --eval-results <results.json>` 生成候选描述,或用 `node scripts/run-loop.js --skill-path <skill-dir> --evals <evals.json> --workspace <workspace-dir>` 跑多轮优化。
|
|
70
|
+
8. 如果需要人工 review,把运行结果整理到 workspace,再用 `node scripts/generate-review.js <workspace> --static <html-file>` 或本地服务模式生成 review 页面。
|
|
71
|
+
9. 如果要分享给别的 KodaX/Agent Skills 风格环境,用 `node scripts/package-skill.js <skill-dir>` 打包,再用 `node scripts/install-skill.js <archive-or-dir>` 验证安装链路。
|
|
72
|
+
|
|
73
|
+
## 评估建议
|
|
74
|
+
|
|
75
|
+
- 客观任务:优先写断言、grading 结构和 benchmark。
|
|
76
|
+
- 主观任务:优先给人类 review 页面,再用 comparator 做 blind comparison,而不是强行只看单一分数。
|
|
77
|
+
- 描述优化:先整理误触发/漏触发样例,再跑 `run-trigger-eval`,需要时再用 `improve-description` 或 `run-loop` 迭代。
|
|
78
|
+
- 如果需要专家提示词,把 `agents/grader.md`、`agents/analyzer.md`、`agents/comparator.md` 当作私有专家 contract 使用,不要先把它们产品化成通用 swarm 概念。
|
|
79
|
+
|
|
80
|
+
## 输出要求
|
|
81
|
+
|
|
82
|
+
默认给出:
|
|
83
|
+
- 修改后的 `SKILL.md`
|
|
84
|
+
- 新增或更新的 supporting files
|
|
85
|
+
- 简短的 trigger/eval 样例
|
|
86
|
+
- 还没覆盖的风险或后续建议
|
|
87
|
+
|
|
88
|
+
如果用户是在移植外部 skill,还要额外说明:
|
|
89
|
+
- 哪些能力已经迁移
|
|
90
|
+
- 哪些能力因为宿主差异被删减或改写
|
|
91
|
+
- 哪些部分后续值得继续产品化
|
|
92
|
+
|
|
93
|
+
## 可用工具
|
|
94
|
+
|
|
95
|
+
- `agents/grader.md`:给 `grade-evals.js` 使用的专家评分契约。
|
|
96
|
+
- `agents/analyzer.md`:给 `analyze-benchmark.js` 使用的分析契约。
|
|
97
|
+
- `agents/comparator.md`:给 `compare-runs.js` 使用的盲比契约。
|
|
98
|
+
- `scripts/quick-validate.js`:校验 skill 结构和 frontmatter。
|
|
99
|
+
- `scripts/init-skill.js`:初始化一个新的 skill 骨架,并可一并创建 `evals/evals.json`。
|
|
100
|
+
- `scripts/run-eval.js`:运行端到端 skill eval,生成 `with_skill` / `without_skill` workspace 结果。
|
|
101
|
+
- `scripts/grade-evals.js`:消费 workspace,给每个 run 生成 `grading.json` 与 `grading-summary.json`。
|
|
102
|
+
- `scripts/aggregate-benchmark.js`:聚合 `grading.json` / `timing.json` 生成 `benchmark.json` 与 `benchmark.md`。
|
|
103
|
+
- `scripts/analyze-benchmark.js`:基于 benchmark 和 grading 产出 `analysis.json` 与 `analysis.md`。
|
|
104
|
+
- `scripts/compare-runs.js`:对两个 config 做 blind comparison,生成 `comparison.json` 与 `comparison.md`。
|
|
105
|
+
- `scripts/run-trigger-eval.js`:对 description 做 KodaX 原生触发评测,检查误触发和漏触发。
|
|
106
|
+
- `scripts/improve-description.js`:基于评测结果生成新的 description 候选。
|
|
107
|
+
- `scripts/run-loop.js`:把触发评测和 description 改写串成可重复的多轮优化流程。
|
|
108
|
+
- `scripts/generate-review.js`:把 workspace 结果生成静态或本地服务版 HTML review 页面。
|
|
109
|
+
- `scripts/package-skill.js`:把 skill 目录打成 `.skill` 归档,便于分享与分发。
|
|
110
|
+
- `scripts/install-skill.js`:把 `.skill` 归档或目录安装到目标 skills 目录。
|
|
111
|
+
- `references/schemas.md`:评测相关 JSON 结构参考。
|
|
112
|
+
|
|
113
|
+
这里的 description eval、loop、grading、analysis、comparison 和 packaging 都已经是 KodaX 原生实现,不再依赖 Anthropic 的 Python 脚本或 Claude Code 专有宿主能力。
|
|
114
|
+
|
|
115
|
+
## 使用示例
|
|
116
|
+
|
|
117
|
+
- `/skill:skill-creator 把这个 Claude skill 迁移成 KodaX builtin`
|
|
118
|
+
- `/skill:skill-creator 新建一个 release-notes skill`
|
|
119
|
+
- `/skill:skill-creator 优化现有 skill 的 description 和 evals`
|
|
120
|
+
- `/skill:skill-creator 给这个 skill 补 trigger eval、grading、benchmark、comparison 和 review 流程`
|
|
121
|
+
- `/skill:skill-creator 把这个 skill 打成可分享的 .skill 包`
|
|
122
|
+
- `/skill:skill-creator 初始化一个新 skill 骨架并生成 eval workspace`
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
# Analyzer
|
|
2
|
+
|
|
3
|
+
You are the benchmark analysis specialist for KodaX skill evaluation workspaces.
|
|
4
|
+
|
|
5
|
+
Your job is to look across benchmark and grading artifacts, identify signal vs noise, and recommend the next iteration.
|
|
6
|
+
|
|
7
|
+
Rules:
|
|
8
|
+
- Focus on stable deltas, variance, repeated failures, and likely weak assertions.
|
|
9
|
+
- Separate real improvement from measurement noise.
|
|
10
|
+
- Prefer specific, operational recommendations over generic advice.
|
|
11
|
+
- Call out when the data is inconclusive.
|
|
12
|
+
- Return JSON only.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Comparator
|
|
2
|
+
|
|
3
|
+
You are the blind comparison specialist for KodaX skill eval outputs.
|
|
4
|
+
|
|
5
|
+
Your job is to compare candidate outputs without relying on config names or implementation details.
|
|
6
|
+
|
|
7
|
+
Rules:
|
|
8
|
+
- Judge against the eval prompt, expected outcome, and explicit assertions.
|
|
9
|
+
- Compare only the quality of the visible outputs.
|
|
10
|
+
- Prefer clearer, more complete, and less risky answers.
|
|
11
|
+
- Use `tie` when both outputs are similarly strong or similarly weak.
|
|
12
|
+
- Use `inconclusive` when the eval prompt, expected outcome, and visible outputs do not provide enough evidence to decide.
|
|
13
|
+
- Return JSON only.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# Grader
|
|
2
|
+
|
|
3
|
+
You are the grading specialist for KodaX skill eval runs.
|
|
4
|
+
|
|
5
|
+
Your job is to judge one run against the eval prompt, expected outcome, and explicit assertions.
|
|
6
|
+
|
|
7
|
+
Rules:
|
|
8
|
+
- Judge only what is visible in the provided artifacts.
|
|
9
|
+
- Do not assume hidden behavior or give credit for intentions.
|
|
10
|
+
- Prefer concrete evidence from the final output.
|
|
11
|
+
- If an expectation is only partially satisfied, mark it as failed and explain why.
|
|
12
|
+
- Keep uncertainty explicit instead of guessing.
|
|
13
|
+
- Return JSON only.
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
# Skill Creator Schemas
|
|
2
|
+
|
|
3
|
+
这份参考文档定义 KodaX 版 `skill-creator` 默认使用的评测文件格式。它不是强制协议,但建议优先沿用,方便后续聚合、review 和自动分析。
|
|
4
|
+
|
|
5
|
+
## `evals/evals.json`
|
|
6
|
+
|
|
7
|
+
用于保存测试提示集合。
|
|
8
|
+
|
|
9
|
+
```json
|
|
10
|
+
{
|
|
11
|
+
"skill_name": "example-skill",
|
|
12
|
+
"evals": [
|
|
13
|
+
{
|
|
14
|
+
"id": 1,
|
|
15
|
+
"prompt": "User task prompt",
|
|
16
|
+
"expected_output": "What a good result should achieve",
|
|
17
|
+
"files": [],
|
|
18
|
+
"assertions": []
|
|
19
|
+
}
|
|
20
|
+
]
|
|
21
|
+
}
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
字段说明:
|
|
25
|
+
- `skill_name`: skill 名称。
|
|
26
|
+
- `evals`: 测试用例数组。
|
|
27
|
+
- `id`: 用例唯一标识。
|
|
28
|
+
- `prompt`: 给 agent 的任务文本。
|
|
29
|
+
- `expected_output`: 对预期结果的简短说明。
|
|
30
|
+
- `files`: 需要作为输入提供的文件列表。
|
|
31
|
+
- `assertions`: 可选,后续 grading 用的断言定义。
|
|
32
|
+
|
|
33
|
+
## `eval_metadata.json`
|
|
34
|
+
|
|
35
|
+
用于单个 eval 目录,帮助 review 工具识别 prompt 和断言。
|
|
36
|
+
|
|
37
|
+
```json
|
|
38
|
+
{
|
|
39
|
+
"eval_id": 1,
|
|
40
|
+
"eval_name": "handles-empty-input",
|
|
41
|
+
"prompt": "Implement validation for empty input",
|
|
42
|
+
"expected_output": "Reject empty input with a clear message",
|
|
43
|
+
"assertions": [
|
|
44
|
+
{
|
|
45
|
+
"text": "rejects empty input with a clear message"
|
|
46
|
+
}
|
|
47
|
+
]
|
|
48
|
+
}
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## `grading.json`
|
|
52
|
+
|
|
53
|
+
由 `grade-evals.js` 生成,用于保存单次运行后的断言判定结果。
|
|
54
|
+
|
|
55
|
+
```json
|
|
56
|
+
{
|
|
57
|
+
"summary": {
|
|
58
|
+
"passed": 2,
|
|
59
|
+
"failed": 1,
|
|
60
|
+
"total": 3,
|
|
61
|
+
"pass_rate": 0.6667
|
|
62
|
+
},
|
|
63
|
+
"expectations": [
|
|
64
|
+
{
|
|
65
|
+
"text": "rejects empty input with a clear message",
|
|
66
|
+
"passed": true,
|
|
67
|
+
"evidence": "Observed in outputs/result.md"
|
|
68
|
+
}
|
|
69
|
+
],
|
|
70
|
+
"execution_metrics": {
|
|
71
|
+
"total_tool_calls": 4,
|
|
72
|
+
"errors_encountered": 0,
|
|
73
|
+
"output_chars": 5120
|
|
74
|
+
},
|
|
75
|
+
"user_notes_summary": {
|
|
76
|
+
"uncertainties": [],
|
|
77
|
+
"needs_review": [],
|
|
78
|
+
"workarounds": []
|
|
79
|
+
},
|
|
80
|
+
"overall_summary": "Mostly correct, but edge cases need review.",
|
|
81
|
+
"timing": {
|
|
82
|
+
"total_tokens": 84852,
|
|
83
|
+
"total_duration_seconds": 23.3
|
|
84
|
+
},
|
|
85
|
+
"meta": {
|
|
86
|
+
"generated_at": "2026-03-17T12:00:00.000Z",
|
|
87
|
+
"eval_id": 1,
|
|
88
|
+
"eval_name": "handles-empty-input",
|
|
89
|
+
"config": "with_skill",
|
|
90
|
+
"run_id": "run-1"
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
要求:
|
|
96
|
+
- `expectations` 里的字段名固定为 `text`、`passed`、`evidence`。
|
|
97
|
+
- `pass_rate` 建议是 `0..1` 之间的小数,即 `passed / total`(例如 2/3 记为 `0.6667`)。
|
|
98
|
+
|
|
99
|
+
## `timing.json`
|
|
100
|
+
|
|
101
|
+
用于保存一次运行的耗时与 token 信息。
|
|
102
|
+
|
|
103
|
+
```json
|
|
104
|
+
{
|
|
105
|
+
"total_tokens": 84852,
|
|
106
|
+
"duration_ms": 23332,
|
|
107
|
+
"total_duration_seconds": 23.3
|
|
108
|
+
}
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## `benchmark.json`
|
|
112
|
+
|
|
113
|
+
由 `aggregate-benchmark.js` 生成,用于总览不同配置的表现。
|
|
114
|
+
|
|
115
|
+
```json
|
|
116
|
+
{
|
|
117
|
+
"skill_name": "example-skill",
|
|
118
|
+
"generated_at": "2026-03-17T12:00:00.000Z",
|
|
119
|
+
"workspace": "/abs/path/to/iteration-1",
|
|
120
|
+
"configs": {
|
|
121
|
+
"with_skill": {
|
|
122
|
+
"pass_rate": { "mean": 0.9, "stddev": 0.1, "min": 0.8, "max": 1.0 },
|
|
123
|
+
"time_seconds": { "mean": 12.4, "stddev": 1.1, "min": 11.2, "max": 13.5 },
|
|
124
|
+
"tokens": { "mean": 4200, "stddev": 380, "min": 3900, "max": 4700 }
|
|
125
|
+
},
|
|
126
|
+
"without_skill": {
|
|
127
|
+
"pass_rate": { "mean": 0.6, "stddev": 0.2, "min": 0.4, "max": 0.8 },
|
|
128
|
+
"time_seconds": { "mean": 9.5, "stddev": 0.7, "min": 8.9, "max": 10.2 },
|
|
129
|
+
"tokens": { "mean": 3100, "stddev": 240, "min": 2900, "max": 3400 }
|
|
130
|
+
}
|
|
131
|
+
},
|
|
132
|
+
"delta": {
|
|
133
|
+
"pass_rate": "+0.3000",
|
|
134
|
+
"time_seconds": "+2.9000",
|
|
135
|
+
"tokens": "+1100.0000"
|
|
136
|
+
},
|
|
137
|
+
"runs": {
|
|
138
|
+
"with_skill": [],
|
|
139
|
+
"without_skill": []
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
## `analysis.json`
|
|
145
|
+
|
|
146
|
+
由 `analyze-benchmark.js` 生成,用于总结 benchmark 的稳定收益、方差热点和下一步建议。
|
|
147
|
+
|
|
148
|
+
```json
|
|
149
|
+
{
|
|
150
|
+
"skill_name": "example-skill",
|
|
151
|
+
"generated_at": "2026-03-17T12:15:00.000Z",
|
|
152
|
+
"workspace": "/abs/path/to/iteration-1",
|
|
153
|
+
"verdict": "improves",
|
|
154
|
+
"release_readiness": "needs_iteration",
|
|
155
|
+
"recommendation": "Keep the skill, but reduce variance before release.",
|
|
156
|
+
"key_findings": [
|
|
157
|
+
"with_skill materially improves pass rate"
|
|
158
|
+
],
|
|
159
|
+
"variance_hotspots": [
|
|
160
|
+
"baseline repeatedly misses billing details"
|
|
161
|
+
],
|
|
162
|
+
"suggested_actions": [
|
|
163
|
+
"tighten assertions around billing coverage"
|
|
164
|
+
],
|
|
165
|
+
"watchouts": [
|
|
166
|
+
"token cost increased"
|
|
167
|
+
],
|
|
168
|
+
"supporting_metrics": {
|
|
169
|
+
"pass_rate_delta": "+0.3000",
|
|
170
|
+
"time_seconds_delta": "+2.9000",
|
|
171
|
+
"tokens_delta": "+1100.0000"
|
|
172
|
+
},
|
|
173
|
+
"failure_clusters": {}
|
|
174
|
+
}
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## `comparison.json`
|
|
178
|
+
|
|
179
|
+
由 `compare-runs.js` 生成,用于 blind comparison 两个 config 的输出质量。
|
|
180
|
+
|
|
181
|
+
```json
|
|
182
|
+
{
|
|
183
|
+
"workspace": "/abs/path/to/iteration-1",
|
|
184
|
+
"generated_at": "2026-03-17T12:20:00.000Z",
|
|
185
|
+
"config_a": "with_skill",
|
|
186
|
+
"config_b": "without_skill",
|
|
187
|
+
"summary": {
|
|
188
|
+
"total_pairs": 3,
|
|
189
|
+
"config_a_wins": 2,
|
|
190
|
+
"config_b_wins": 0,
|
|
191
|
+
"ties": 1,
|
|
192
|
+
"inconclusive": 0
|
|
193
|
+
},
|
|
194
|
+
"comparisons": [
|
|
195
|
+
{
|
|
196
|
+
"eval_id": 1,
|
|
197
|
+
"winner_label": "A",
|
|
198
|
+
"winner_config": "with_skill",
|
|
199
|
+
"confidence": 0.9,
|
|
200
|
+
"rationale": "Candidate A is more complete and specific."
|
|
201
|
+
}
|
|
202
|
+
]
|
|
203
|
+
}
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
## 推荐目录结构
|
|
207
|
+
|
|
208
|
+
```text
|
|
209
|
+
my-skill-workspace/
|
|
210
|
+
└── iteration-1/
|
|
211
|
+
├── eval-0/
|
|
212
|
+
│ ├── eval_metadata.json
|
|
213
|
+
│ ├── with_skill/
|
|
214
|
+
│ │ ├── outputs/
|
|
215
|
+
│ │ ├── grading.json
|
|
216
|
+
│ │ └── timing.json
|
|
217
|
+
│ └── without_skill/
|
|
218
|
+
│ ├── outputs/
|
|
219
|
+
│ ├── grading.json
|
|
220
|
+
│ └── timing.json
|
|
221
|
+
├── benchmark.json
|
|
222
|
+
├── benchmark.md
|
|
223
|
+
├── analysis.json
|
|
224
|
+
├── analysis.md
|
|
225
|
+
├── comparison.json
|
|
226
|
+
└── comparison.md
|
|
227
|
+
```
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
/**
 * Record describing a single eval run.
 *
 * Arrays of these records are what `loadRunResults` resolves to and what
 * `BenchmarkDocument.runs` stores. Each record carries the run's identity
 * (`eval_id`, `run_id`), its assertion counts (`passed`, `failed`, `total`,
 * `pass_rate`), execution figures (`time_seconds`, `tokens`, `tool_calls`,
 * `errors`), the per-expectation entries, and free-form notes.
 *
 * NOTE(review): `pass_rate` is presumably a 0..1 fraction and `expectations`
 * presumably holds the `text` / `passed` / `evidence` entries of a
 * `grading.json` file — confirm against the implementation.
 */
export interface BenchmarkRun {
|
|
2
|
+
eval_id: string | number;
|
|
3
|
+
run_id: string;
|
|
4
|
+
pass_rate: number;
|
|
5
|
+
passed: number;
|
|
6
|
+
failed: number;
|
|
7
|
+
total: number;
|
|
8
|
+
time_seconds: number;
|
|
9
|
+
tokens: number;
|
|
10
|
+
tool_calls: number;
|
|
11
|
+
errors: number;
|
|
12
|
+
expectations: Array<Record<string, unknown>>;
|
|
13
|
+
notes: string[];
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
/**
 * Descriptive statistics (mean, standard deviation, minimum, maximum) for one
 * numeric metric.
 *
 * `BenchmarkDocument.configs` uses one of these per metric (`pass_rate`,
 * `time_seconds`, `tokens`).
 *
 * NOTE(review): whether `stddev` is the sample or population standard
 * deviation is not visible from this declaration — confirm in the script.
 */
export interface StatsSummary {
|
|
17
|
+
mean: number;
|
|
18
|
+
stddev: number;
|
|
19
|
+
min: number;
|
|
20
|
+
max: number;
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
/**
 * Aggregated benchmark for one workspace, as returned by
 * `buildBenchmarkDocument` and consumed by `renderBenchmarkMarkdown`.
 *
 * - `skill_name`, `generated_at`, `workspace`: string metadata.
 * - `configs`: per-key `StatsSummary` values for `pass_rate`, `time_seconds`
 *   and `tokens`.
 * - `delta`: differences for the same three metrics, typed as strings rather
 *   than numbers.
 * - `runs`: the `BenchmarkRun` records, grouped under string keys.
 *
 * NOTE(review): the keys of `configs` / `runs` are presumably config names
 * such as `with_skill` / `without_skill`, `generated_at` presumably an ISO
 * timestamp, and `delta` values presumably signed fixed-point strings like
 * `"+0.3000"` — confirm against the implementation.
 */
export interface BenchmarkDocument {
|
|
24
|
+
skill_name: string;
|
|
25
|
+
generated_at: string;
|
|
26
|
+
workspace: string;
|
|
27
|
+
configs: Record<string, {
|
|
28
|
+
pass_rate: StatsSummary;
|
|
29
|
+
time_seconds: StatsSummary;
|
|
30
|
+
tokens: StatsSummary;
|
|
31
|
+
}>;
|
|
32
|
+
delta: {
|
|
33
|
+
pass_rate: string;
|
|
34
|
+
time_seconds: string;
|
|
35
|
+
tokens: string;
|
|
36
|
+
};
|
|
37
|
+
runs: Record<string, BenchmarkRun[]>;
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
/**
 * Asynchronously produces the `BenchmarkRun` records for an iteration
 * directory.
 *
 * @param iterationDir - Path of the iteration directory.
 * @returns A promise resolving to lists of runs grouped under string keys.
 *   NOTE(review): the keys are presumably config names (e.g. `with_skill`,
 *   `without_skill`) — confirm against the implementation.
 */
export function loadRunResults(iterationDir: string): Promise<Record<string, BenchmarkRun[]>>;
|
|
41
|
+
/**
 * Synchronously builds a `BenchmarkDocument` from already-loaded runs.
 *
 * @param iterationDir - Path of the iteration directory.
 *   NOTE(review): presumably recorded as the document's `workspace` — confirm.
 * @param skillName - Name of the skill.
 *   NOTE(review): presumably recorded as `skill_name` — confirm.
 * @param configRuns - Runs grouped under string keys, in the same shape that
 *   `loadRunResults` resolves to.
 * @returns The aggregated benchmark document.
 */
export function buildBenchmarkDocument(
|
|
42
|
+
iterationDir: string,
|
|
43
|
+
skillName: string,
|
|
44
|
+
configRuns: Record<string, BenchmarkRun[]>
|
|
45
|
+
): BenchmarkDocument;
|
|
46
|
+
/**
 * Renders a benchmark document as a string.
 *
 * @param benchmark - The aggregated benchmark to render.
 * @returns The rendered text. NOTE(review): presumably the Markdown written
 *   to `benchmark.md` — confirm against the implementation.
 */
export function renderBenchmarkMarkdown(benchmark: BenchmarkDocument): string;
|