@einja/dev-cli 0.1.40 → 0.1.41
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +89 -1
- package/dist/cli.js +1 -0
- package/dist/cli.js.map +1 -1
- package/dist/commands/init.d.ts.map +1 -1
- package/dist/commands/init.js +71 -1
- package/dist/commands/init.js.map +1 -1
- package/dist/commands/list.js.map +1 -1
- package/dist/commands/sync.d.ts.map +1 -1
- package/dist/commands/sync.js +187 -13
- package/dist/commands/sync.js.map +1 -1
- package/dist/lib/dependency-checker.d.ts.map +1 -1
- package/dist/lib/merger.d.ts +12 -0
- package/dist/lib/merger.d.ts.map +1 -1
- package/dist/lib/merger.js +28 -0
- package/dist/lib/merger.js.map +1 -1
- package/dist/lib/preset-update/cli-repo-detector.d.ts.map +1 -1
- package/dist/lib/preset-update/file-copier.d.ts.map +1 -1
- package/dist/lib/preset-update/preset-finder.d.ts.map +1 -1
- package/dist/lib/preset.d.ts.map +1 -1
- package/dist/lib/sync/category-validator.d.ts +1 -1
- package/dist/lib/sync/category-validator.d.ts.map +1 -1
- package/dist/lib/sync/category-validator.js +2 -1
- package/dist/lib/sync/category-validator.js.map +1 -1
- package/dist/lib/sync/category-validator.test.js +3 -1
- package/dist/lib/sync/category-validator.test.js.map +1 -1
- package/dist/lib/sync/conflict-reporter.d.ts.map +1 -1
- package/dist/lib/sync/diff-engine.d.ts.map +1 -1
- package/dist/lib/sync/file-filter.d.ts.map +1 -1
- package/dist/lib/sync/file-filter.js +1 -0
- package/dist/lib/sync/file-filter.js.map +1 -1
- package/dist/lib/sync/integration.test.js +255 -69
- package/dist/lib/sync/integration.test.js.map +1 -1
- package/dist/lib/sync/json-processor.d.ts +4 -4
- package/dist/lib/sync/json-processor.d.ts.map +1 -1
- package/dist/lib/sync/json-processor.js +11 -11
- package/dist/lib/sync/json-processor.js.map +1 -1
- package/dist/lib/sync/marker-processor.d.ts +60 -8
- package/dist/lib/sync/marker-processor.d.ts.map +1 -1
- package/dist/lib/sync/marker-processor.js +117 -26
- package/dist/lib/sync/marker-processor.js.map +1 -1
- package/dist/lib/sync/marker-processor.test.js +261 -40
- package/dist/lib/sync/marker-processor.test.js.map +1 -1
- package/dist/lib/sync/metadata-manager.d.ts +4 -0
- package/dist/lib/sync/metadata-manager.d.ts.map +1 -1
- package/dist/lib/sync/metadata-manager.js +15 -0
- package/dist/lib/sync/metadata-manager.js.map +1 -1
- package/dist/lib/sync/metadata-manager.test.js +68 -0
- package/dist/lib/sync/metadata-manager.test.js.map +1 -1
- package/dist/lib/sync/orphan-cleaner.d.ts +29 -0
- package/dist/lib/sync/orphan-cleaner.d.ts.map +1 -0
- package/dist/lib/sync/orphan-cleaner.js +80 -0
- package/dist/lib/sync/orphan-cleaner.js.map +1 -0
- package/dist/lib/sync/orphan-cleaner.test.d.ts +2 -0
- package/dist/lib/sync/orphan-cleaner.test.d.ts.map +1 -0
- package/dist/lib/sync/orphan-cleaner.test.js +169 -0
- package/dist/lib/sync/orphan-cleaner.test.js.map +1 -0
- package/dist/lib/sync/project-private-synchronizer.d.ts +52 -0
- package/dist/lib/sync/project-private-synchronizer.d.ts.map +1 -0
- package/dist/lib/sync/project-private-synchronizer.js +106 -0
- package/dist/lib/sync/project-private-synchronizer.js.map +1 -0
- package/dist/lib/sync/project-private-synchronizer.test.d.ts +2 -0
- package/dist/lib/sync/project-private-synchronizer.test.d.ts.map +1 -0
- package/dist/lib/sync/project-private-synchronizer.test.js +348 -0
- package/dist/lib/sync/project-private-synchronizer.test.js.map +1 -0
- package/dist/types/index.d.ts +1 -0
- package/dist/types/index.d.ts.map +1 -1
- package/dist/types/sync.d.ts +36 -6
- package/dist/types/sync.d.ts.map +1 -1
- package/dist/types/sync.js +2 -2
- package/dist/types/sync.js.map +1 -1
- package/package.json +5 -4
- package/presets/default/.claude/agents/einja/Explore.md +140 -0
- package/presets/default/.claude/agents/einja/backend-architect.md +4 -0
- package/presets/default/.claude/agents/einja/codex-agent.md +4 -0
- package/presets/default/.claude/agents/einja/design-engineer.md +4 -0
- package/presets/default/.claude/agents/einja/docs/docs-updater.md +4 -0
- package/presets/default/.claude/agents/einja/frontend-architect.md +4 -0
- package/presets/default/.claude/agents/einja/frontend-coder.md +4 -0
- package/presets/default/.claude/agents/einja/git/conflict-resolver.md +4 -0
- package/presets/default/.claude/agents/einja/specs/spec-design-generator.md +4 -1
- package/presets/default/.claude/agents/einja/specs/spec-qa-generator.md +4 -0
- package/presets/default/.claude/agents/einja/specs/spec-requirements-generator.md +4 -1
- package/presets/default/.claude/agents/einja/specs/spec-tasks-generator.md +6 -2
- package/presets/default/.claude/agents/einja/specs/spec-tasks-validator.md +4 -0
- package/presets/default/.claude/agents/einja/task/task-executer.md +57 -115
- package/presets/default/.claude/agents/einja/task/task-modification-analyzer.md +4 -0
- package/presets/default/.claude/agents/einja/task/task-qa.md +4 -0
- package/presets/default/.claude/agents/einja/task/task-reviewer.md +4 -0
- package/presets/default/.claude/commands/einja/einja-sync.md +5 -1
- package/presets/default/.claude/commands/einja/frontend-implement.md +3 -1
- package/presets/default/.claude/commands/einja/issue-exec.md +403 -0
- package/presets/default/.claude/commands/einja/spec-create.md +15 -1
- package/presets/default/.claude/commands/einja/start-dev.md +4 -0
- package/presets/default/.claude/commands/einja/sync-cursor-commands.md +4 -0
- package/presets/default/.claude/commands/einja/task-exec.md +106 -14
- package/presets/default/.claude/commands/einja/update-docs-by-task-specs.md +4 -0
- package/presets/default/.claude/hooks/einja/plan-mode-skill-loader.sh +23 -0
- package/presets/default/.claude/settings.json +15 -1
- package/presets/default/.claude/skills/einja-conflict-resolver/SKILL.md +4 -0
- package/presets/default/.claude/skills/einja-general-context-loader/SKILL.md +4 -0
- package/presets/default/.claude/skills/einja-output-format/SKILL.md +4 -0
- package/presets/default/.claude/skills/einja-project-overview/SKILL.md +7 -3
- package/presets/default/.claude/skills/einja-skill-creator/SKILL.md +266 -274
- package/presets/default/.claude/skills/einja-skill-creator/agents/analyzer.md +274 -0
- package/presets/default/.claude/skills/einja-skill-creator/agents/comparator.md +202 -0
- package/presets/default/.claude/skills/einja-skill-creator/agents/grader.md +195 -0
- package/presets/default/.claude/skills/einja-skill-creator/assets/eval_review.html +146 -0
- package/presets/default/.claude/skills/einja-skill-creator/eval-viewer/generate_review.py +471 -0
- package/presets/default/.claude/skills/einja-skill-creator/eval-viewer/viewer.html +1325 -0
- package/presets/default/.claude/skills/einja-skill-creator/references/schemas.md +430 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/aggregate_benchmark.py +154 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/generate_report.py +265 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/improve_description.py +252 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/init_skill.py +13 -19
- package/presets/default/.claude/skills/einja-skill-creator/scripts/package_skill.py +36 -7
- package/presets/default/.claude/skills/einja-skill-creator/scripts/run_eval.py +310 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/run_loop.py +295 -0
- package/presets/default/.claude/skills/einja-skill-creator/scripts/utils.py +48 -0
- package/presets/default/.claude/skills/einja-spec-context-loader/SKILL.md +4 -0
- package/presets/default/.claude/skills/einja-task-commit/SKILL.md +4 -0
- package/presets/default/.claude/skills/einja-task-qa/SKILL.md +4 -0
- package/presets/default/.envrc +5 -0
- package/presets/default/.mcp.json +2 -12
- package/presets/default/CLAUDE.md.template +26 -4
- package/presets/default/docs/einja/example/specs/issues/issue999-example-task/tasks.md +1 -1
- package/presets/default/docs/einja/instructions/deployment-setup.md +3 -8
- package/presets/default/docs/einja/instructions/environment-setup.md +3 -8
- package/presets/default/docs/einja/instructions/issue-exec-workflow.md +276 -0
- package/presets/default/docs/einja/instructions/local-server-environment-and-worktree.md +70 -8
- package/presets/default/docs/einja/instructions/neon-cli-reference.md +3 -8
- package/presets/default/docs/einja/instructions/task-execute.md +23 -28
- package/presets/default/docs/einja/instructions/vercel-cli-reference.md +17 -10
- package/presets/default/docs/einja/steering/README.md +11 -11
- package/presets/default/docs/einja/steering/acceptance-criteria-and-qa-guide.md +3 -8
- package/presets/default/docs/einja/steering/architecture.md +3 -8
- package/presets/default/docs/einja/steering/branch-strategy.md +63 -70
- package/presets/default/docs/einja/steering/commit-rules.md +3 -8
- package/presets/default/docs/einja/steering/db-schema-design.md +3 -8
- package/presets/default/docs/einja/steering/development/api-development.md +3 -8
- package/presets/default/docs/einja/steering/development/backend-architecture.md +3 -8
- package/presets/default/docs/einja/steering/development/coding-standards.md +723 -0
- package/presets/default/docs/einja/steering/development/component-design.md +502 -0
- package/presets/default/docs/einja/steering/development/database-guidelines.md +2 -2
- package/presets/default/docs/einja/steering/development/frontend-development.md +3 -8
- package/presets/default/docs/einja/steering/development/playwright-guidelines.md +59 -0
- package/presets/default/docs/einja/steering/development/review-guidelines.md +3 -8
- package/presets/default/docs/einja/steering/development/testing-strategy.md +3 -8
- package/presets/default/docs/einja/steering/development-workflow.md +71 -124
- package/presets/default/docs/einja/steering/infrastructure/deployment.md +49 -55
- package/presets/default/docs/einja/steering/infrastructure/environment-variables.md +4 -8
- package/presets/default/docs/einja/steering/product.md +3 -8
- package/presets/default/docs/einja/steering/task-management.md +14 -98
- package/presets/default/scripts/ensure-serena.sh +75 -0
- package/presets/default/scripts/env-rotate-secrets.ts +336 -0
- package/presets/default/scripts/env-show.ts +130 -0
- package/presets/default/scripts/env.ts +479 -0
- package/presets/default/scripts/init.sh +92 -0
- package/presets/default/scripts/lib/env-common.ts +108 -0
- package/presets/default/scripts/lib/worktree-config.ts +64 -0
- package/presets/default/scripts/setup-dev.ts +640 -0
- package/presets/default/scripts/stop-serena.sh +25 -0
- package/presets/default/scripts/worktree/dev.ts +872 -0
- package/dist/lib/sync/seed-synchronizer.d.ts +0 -27
- package/dist/lib/sync/seed-synchronizer.d.ts.map +0 -1
- package/dist/lib/sync/seed-synchronizer.js +0 -72
- package/dist/lib/sync/seed-synchronizer.js.map +0 -1
- package/dist/lib/sync/seed-synchronizer.test.d.ts +0 -2
- package/dist/lib/sync/seed-synchronizer.test.d.ts.map +0 -1
- package/dist/lib/sync/seed-synchronizer.test.js +0 -147
- package/dist/lib/sync/seed-synchronizer.test.js.map +0 -1
- package/presets/default/.claude/skills/einja-api-development/SKILL.md +0 -14
- package/presets/default/.claude/skills/einja-backend-architecture/SKILL.md +0 -18
- package/presets/default/.claude/skills/einja-coding-standards/SKILL.md +0 -132
- package/presets/default/.claude/skills/einja-coding-standards/references/import-conventions.md +0 -69
- package/presets/default/.claude/skills/einja-coding-standards/references/naming-conventions.md +0 -107
- package/presets/default/.claude/skills/einja-coding-standards/references/prohibited-patterns.md +0 -169
- package/presets/default/.claude/skills/einja-coding-standards/references/typescript-rules.md +0 -247
- package/presets/default/.claude/skills/einja-component-design/SKILL.md +0 -109
- package/presets/default/.claude/skills/einja-component-design/references/directory-structure.md +0 -117
- package/presets/default/.claude/skills/einja-component-design/references/props-patterns.md +0 -159
- package/presets/default/.claude/skills/einja-component-design/references/styling-guide.md +0 -122
- package/presets/default/.claude/skills/einja-frontend-development/SKILL.md +0 -14
- package/presets/default/docs/einja/instructions/task-vibe-kanban-loop.md +0 -565
|
@@ -0,0 +1,430 @@
|
|
|
1
|
+
# JSONスキーマ
|
|
2
|
+
|
|
3
|
+
このドキュメントはskill-creatorで使用されるJSONスキーマを定義する。
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## evals.json
|
|
8
|
+
|
|
9
|
+
スキルの評価テストケースを定義する。スキルディレクトリ内の`evals/evals.json`に配置。
|
|
10
|
+
|
|
11
|
+
```json
|
|
12
|
+
{
|
|
13
|
+
"skill_name": "example-skill",
|
|
14
|
+
"evals": [
|
|
15
|
+
{
|
|
16
|
+
"id": 1,
|
|
17
|
+
"prompt": "ユーザーのサンプルプロンプト",
|
|
18
|
+
"expected_output": "期待される結果の説明",
|
|
19
|
+
"files": ["evals/files/sample1.pdf"],
|
|
20
|
+
"expectations": [
|
|
21
|
+
"出力にXが含まれている",
|
|
22
|
+
"スキルがスクリプトYを使用した"
|
|
23
|
+
]
|
|
24
|
+
}
|
|
25
|
+
]
|
|
26
|
+
}
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
**フィールド:**
|
|
30
|
+
- `skill_name`: スキルのフロントマターと一致する名前
|
|
31
|
+
- `evals[].id`: 一意の整数識別子
|
|
32
|
+
- `evals[].prompt`: 実行するタスク
|
|
33
|
+
- `evals[].expected_output`: 成功を表す人間が読める説明
|
|
34
|
+
- `evals[].files`: 入力ファイルパスのオプションリスト(スキルルートからの相対パス)
|
|
35
|
+
- `evals[].expectations`: 検証可能な記述のリスト
|
|
36
|
+
|
|
37
|
+
---
|
|
38
|
+
|
|
39
|
+
## history.json
|
|
40
|
+
|
|
41
|
+
改善モードでのバージョン進行を追跡する。ワークスペースルートに配置。
|
|
42
|
+
|
|
43
|
+
```json
|
|
44
|
+
{
|
|
45
|
+
"started_at": "2026-01-15T10:30:00Z",
|
|
46
|
+
"skill_name": "pdf",
|
|
47
|
+
"current_best": "v2",
|
|
48
|
+
"iterations": [
|
|
49
|
+
{
|
|
50
|
+
"version": "v0",
|
|
51
|
+
"parent": null,
|
|
52
|
+
"expectation_pass_rate": 0.65,
|
|
53
|
+
"grading_result": "baseline",
|
|
54
|
+
"is_current_best": false
|
|
55
|
+
},
|
|
56
|
+
{
|
|
57
|
+
"version": "v1",
|
|
58
|
+
"parent": "v0",
|
|
59
|
+
"expectation_pass_rate": 0.75,
|
|
60
|
+
"grading_result": "won",
|
|
61
|
+
"is_current_best": false
|
|
62
|
+
},
|
|
63
|
+
{
|
|
64
|
+
"version": "v2",
|
|
65
|
+
"parent": "v1",
|
|
66
|
+
"expectation_pass_rate": 0.85,
|
|
67
|
+
"grading_result": "won",
|
|
68
|
+
"is_current_best": true
|
|
69
|
+
}
|
|
70
|
+
]
|
|
71
|
+
}
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
**フィールド:**
|
|
75
|
+
- `started_at`: 改善開始のISOタイムスタンプ
|
|
76
|
+
- `skill_name`: 改善対象のスキル名
|
|
77
|
+
- `current_best`: 最高パフォーマンスのバージョン識別子
|
|
78
|
+
- `iterations[].version`: バージョン識別子(v0、v1、...)
|
|
79
|
+
- `iterations[].parent`: 派生元の親バージョン
|
|
80
|
+
- `iterations[].expectation_pass_rate`: 採点からのパス率
|
|
81
|
+
- `iterations[].grading_result`: "baseline"、"won"、"lost"、または"tie"
|
|
82
|
+
- `iterations[].is_current_best`: 現在の最良バージョンかどうか
|
|
83
|
+
|
|
84
|
+
---
|
|
85
|
+
|
|
86
|
+
## grading.json
|
|
87
|
+
|
|
88
|
+
採点エージェントの出力。`<run-dir>/grading.json`に配置。
|
|
89
|
+
|
|
90
|
+
```json
|
|
91
|
+
{
|
|
92
|
+
"expectations": [
|
|
93
|
+
{
|
|
94
|
+
"text": "出力に'John Smith'という名前が含まれている",
|
|
95
|
+
"passed": true,
|
|
96
|
+
"evidence": "トランスクリプトのステップ3で発見: 'Extracted names: John Smith, Sarah Johnson'"
|
|
97
|
+
},
|
|
98
|
+
{
|
|
99
|
+
"text": "スプレッドシートのセルB10にSUM数式がある",
|
|
100
|
+
"passed": false,
|
|
101
|
+
"evidence": "スプレッドシートが作成されなかった。出力はテキストファイルだった。"
|
|
102
|
+
}
|
|
103
|
+
],
|
|
104
|
+
"summary": {
|
|
105
|
+
"passed": 1,
|
|
106
|
+
"failed": 1,
|
|
107
|
+
"total": 2,
|
|
108
|
+
"pass_rate": 0.5
|
|
109
|
+
},
|
|
110
|
+
"execution_metrics": {
|
|
111
|
+
"tool_calls": {
|
|
112
|
+
"Read": 5,
|
|
113
|
+
"Write": 2,
|
|
114
|
+
"Bash": 8
|
|
115
|
+
},
|
|
116
|
+
"total_tool_calls": 15,
|
|
117
|
+
"total_steps": 6,
|
|
118
|
+
"errors_encountered": 0,
|
|
119
|
+
"output_chars": 12450,
|
|
120
|
+
"transcript_chars": 3200
|
|
121
|
+
},
|
|
122
|
+
"timing": {
|
|
123
|
+
"executor_duration_seconds": 165.0,
|
|
124
|
+
"grader_duration_seconds": 26.0,
|
|
125
|
+
"total_duration_seconds": 191.0
|
|
126
|
+
},
|
|
127
|
+
"claims": [
|
|
128
|
+
{
|
|
129
|
+
"claim": "フォームに12個の入力可能フィールドがある",
|
|
130
|
+
"type": "factual",
|
|
131
|
+
"verified": true,
|
|
132
|
+
"evidence": "field_info.jsonで12フィールドを確認"
|
|
133
|
+
}
|
|
134
|
+
],
|
|
135
|
+
"user_notes_summary": {
|
|
136
|
+
"uncertainties": ["2023年のデータを使用、古い可能性がある"],
|
|
137
|
+
"needs_review": [],
|
|
138
|
+
"workarounds": ["入力不可フィールドにテキストオーバーレイで代替"]
|
|
139
|
+
},
|
|
140
|
+
"eval_feedback": {
|
|
141
|
+
"suggestions": [
|
|
142
|
+
{
|
|
143
|
+
"assertion": "出力に'John Smith'という名前が含まれている",
|
|
144
|
+
"reason": "名前に言及する幻覚ドキュメントでもパスしてしまう"
|
|
145
|
+
}
|
|
146
|
+
],
|
|
147
|
+
"overall": "アサーションは存在のみをチェックし、正確性をチェックしていない。"
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
**フィールド:**
|
|
153
|
+
- `expectations[]`: 根拠付きの採点済み期待値
|
|
154
|
+
- `summary`: パス/フェイルの集計カウント
|
|
155
|
+
- `execution_metrics`: ツール使用量と出力サイズ(エグゼキューターのmetrics.jsonから)
|
|
156
|
+
- `timing`: 実行時間(timing.jsonから)
|
|
157
|
+
- `claims`: 出力から抽出・検証されたクレーム
|
|
158
|
+
- `user_notes_summary`: エグゼキューターがフラグした問題
|
|
159
|
+
- `eval_feedback`:(オプション)評価の改善提案。採点エージェントが指摘すべき問題を特定した場合のみ存在
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## metrics.json
|
|
164
|
+
|
|
165
|
+
エグゼキューターエージェントの出力。`<run-dir>/outputs/metrics.json`に配置。
|
|
166
|
+
|
|
167
|
+
```json
|
|
168
|
+
{
|
|
169
|
+
"tool_calls": {
|
|
170
|
+
"Read": 5,
|
|
171
|
+
"Write": 2,
|
|
172
|
+
"Bash": 8,
|
|
173
|
+
"Edit": 1,
|
|
174
|
+
"Glob": 2,
|
|
175
|
+
"Grep": 0
|
|
176
|
+
},
|
|
177
|
+
"total_tool_calls": 18,
|
|
178
|
+
"total_steps": 6,
|
|
179
|
+
"files_created": ["filled_form.pdf", "field_values.json"],
|
|
180
|
+
"errors_encountered": 0,
|
|
181
|
+
"output_chars": 12450,
|
|
182
|
+
"transcript_chars": 3200
|
|
183
|
+
}
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
**フィールド:**
|
|
187
|
+
- `tool_calls`: ツールタイプごとのカウント
|
|
188
|
+
- `total_tool_calls`: 全ツール呼び出しの合計
|
|
189
|
+
- `total_steps`: 主要な実行ステップの数
|
|
190
|
+
- `files_created`: 作成された出力ファイルのリスト
|
|
191
|
+
- `errors_encountered`: 実行中のエラー数
|
|
192
|
+
- `output_chars`: 出力ファイルの合計文字数
|
|
193
|
+
- `transcript_chars`: トランスクリプトの文字数
|
|
194
|
+
|
|
195
|
+
---
|
|
196
|
+
|
|
197
|
+
## timing.json
|
|
198
|
+
|
|
199
|
+
実行の経過時間。`<run-dir>/timing.json`に配置。
|
|
200
|
+
|
|
201
|
+
**キャプチャ方法:** サブエージェントタスクが完了すると、タスク通知に`total_tokens`と`duration_ms`が含まれる。これらは他の場所に永続化されず、事後に復元できないため、即座に保存すること。
|
|
202
|
+
|
|
203
|
+
```json
|
|
204
|
+
{
|
|
205
|
+
"total_tokens": 84852,
|
|
206
|
+
"duration_ms": 23332,
|
|
207
|
+
"total_duration_seconds": 23.3,
|
|
208
|
+
"executor_start": "2026-01-15T10:30:00Z",
|
|
209
|
+
"executor_end": "2026-01-15T10:32:45Z",
|
|
210
|
+
"executor_duration_seconds": 165.0,
|
|
211
|
+
"grader_start": "2026-01-15T10:32:46Z",
|
|
212
|
+
"grader_end": "2026-01-15T10:33:12Z",
|
|
213
|
+
"grader_duration_seconds": 26.0
|
|
214
|
+
}
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## benchmark.json
|
|
220
|
+
|
|
221
|
+
ベンチマークモードの出力。`benchmarks/<timestamp>/benchmark.json`に配置。
|
|
222
|
+
|
|
223
|
+
```json
|
|
224
|
+
{
|
|
225
|
+
"metadata": {
|
|
226
|
+
"skill_name": "pdf",
|
|
227
|
+
"skill_path": "/path/to/pdf",
|
|
228
|
+
"executor_model": "claude-sonnet-4-20250514",
|
|
229
|
+
"analyzer_model": "most-capable-model",
|
|
230
|
+
"timestamp": "2026-01-15T10:30:00Z",
|
|
231
|
+
"evals_run": [1, 2, 3],
|
|
232
|
+
"runs_per_configuration": 3
|
|
233
|
+
},
|
|
234
|
+
|
|
235
|
+
"runs": [
|
|
236
|
+
{
|
|
237
|
+
"eval_id": 1,
|
|
238
|
+
"eval_name": "Ocean",
|
|
239
|
+
"configuration": "with_skill",
|
|
240
|
+
"run_number": 1,
|
|
241
|
+
"result": {
|
|
242
|
+
"pass_rate": 0.85,
|
|
243
|
+
"passed": 6,
|
|
244
|
+
"failed": 1,
|
|
245
|
+
"total": 7,
|
|
246
|
+
"time_seconds": 42.5,
|
|
247
|
+
"tokens": 3800,
|
|
248
|
+
"tool_calls": 18,
|
|
249
|
+
"errors": 0
|
|
250
|
+
},
|
|
251
|
+
"expectations": [
|
|
252
|
+
{"text": "...", "passed": true, "evidence": "..."}
|
|
253
|
+
],
|
|
254
|
+
"notes": [
|
|
255
|
+
"2023年のデータを使用、古い可能性がある",
|
|
256
|
+
"入力不可フィールドにテキストオーバーレイで代替"
|
|
257
|
+
]
|
|
258
|
+
}
|
|
259
|
+
],
|
|
260
|
+
|
|
261
|
+
"run_summary": {
|
|
262
|
+
"with_skill": {
|
|
263
|
+
"pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90},
|
|
264
|
+
"time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0},
|
|
265
|
+
"tokens": {"mean": 3800, "stddev": 400, "min": 3200, "max": 4100}
|
|
266
|
+
},
|
|
267
|
+
"without_skill": {
|
|
268
|
+
"pass_rate": {"mean": 0.35, "stddev": 0.08, "min": 0.28, "max": 0.45},
|
|
269
|
+
"time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0},
|
|
270
|
+
"tokens": {"mean": 2100, "stddev": 300, "min": 1800, "max": 2500}
|
|
271
|
+
},
|
|
272
|
+
"delta": {
|
|
273
|
+
"pass_rate": "+0.50",
|
|
274
|
+
"time_seconds": "+13.0",
|
|
275
|
+
"tokens": "+1700"
|
|
276
|
+
}
|
|
277
|
+
},
|
|
278
|
+
|
|
279
|
+
"notes": [
|
|
280
|
+
"アサーション '出力はPDFファイルである' は両構成で100%パス - スキルの価値を区別しない可能性",
|
|
281
|
+
"評価3が高いばらつきを示す(50% ± 40%) - 不安定またはモデル依存の可能性",
|
|
282
|
+
"スキルなし実行はテーブル抽出の期待値で一貫して失敗",
|
|
283
|
+
"スキルは平均13秒の実行時間増加だが、パス率を50%改善"
|
|
284
|
+
]
|
|
285
|
+
}
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
**フィールド:**
|
|
289
|
+
- `metadata`: ベンチマーク実行に関する情報
|
|
290
|
+
- `skill_name`: スキル名
|
|
291
|
+
- `timestamp`: ベンチマーク実行日時
|
|
292
|
+
- `evals_run`: 評価名またはIDのリスト
|
|
293
|
+
- `runs_per_configuration`: 構成ごとの実行回数(例: 3)
|
|
294
|
+
- `runs[]`: 個別の実行結果
|
|
295
|
+
- `eval_id`: 数値の評価識別子
|
|
296
|
+
- `eval_name`: 人間が読める評価名(ビューアーのセクションヘッダーとして使用)
|
|
297
|
+
- `configuration`: `"with_skill"`または`"without_skill"`でなければならない(ビューアーはこの正確な文字列をグルーピングとカラーコーディングに使用)
|
|
298
|
+
- `run_number`: 整数の実行番号(1、2、3...)
|
|
299
|
+
- `result`: `pass_rate`、`passed`、`failed`、`total`、`time_seconds`、`tokens`、`tool_calls`、`errors`を含むネストされたオブジェクト
|
|
300
|
+
- `run_summary`: 構成ごとの統計集計
|
|
301
|
+
- `with_skill` / `without_skill`: それぞれ`pass_rate`、`time_seconds`、`tokens`オブジェクトを含み、各オブジェクトは`mean`と`stddev`フィールド(および上記の例のように`min`、`max`)を持つ
|
|
302
|
+
- `delta`: `"+0.50"`、`"+13.0"`、`"+1700"`のような差分文字列
|
|
303
|
+
- `notes`: 分析エージェントからのフリーフォーム観察
|
|
304
|
+
|
|
305
|
+
**重要:** ビューアーはこれらのフィールド名を正確に読み取る。`configuration`の代わりに`config`を使用したり、`pass_rate`を`result`内ではなく実行のトップレベルに配置したりすると、ビューアーは空/ゼロの値を表示する。benchmark.jsonを手動で生成する際は常にこのスキーマを参照すること。
|
|
306
|
+
|
|
307
|
+
---
|
|
308
|
+
|
|
309
|
+
## comparison.json
|
|
310
|
+
|
|
311
|
+
ブラインド比較エージェントの出力。`<grading-dir>/comparison-N.json`に配置。
|
|
312
|
+
|
|
313
|
+
```json
|
|
314
|
+
{
|
|
315
|
+
"winner": "A",
|
|
316
|
+
"reasoning": "出力Aは適切なフォーマットとすべての必須フィールドを備えた完全なソリューションを提供している。出力Bは日付フィールドが欠落しており、フォーマットに不一致がある。",
|
|
317
|
+
"rubric": {
|
|
318
|
+
"A": {
|
|
319
|
+
"content": {
|
|
320
|
+
"correctness": 5,
|
|
321
|
+
"completeness": 5,
|
|
322
|
+
"accuracy": 4
|
|
323
|
+
},
|
|
324
|
+
"structure": {
|
|
325
|
+
"organization": 4,
|
|
326
|
+
"formatting": 5,
|
|
327
|
+
"usability": 4
|
|
328
|
+
},
|
|
329
|
+
"content_score": 4.7,
|
|
330
|
+
"structure_score": 4.3,
|
|
331
|
+
"overall_score": 9.0
|
|
332
|
+
},
|
|
333
|
+
"B": {
|
|
334
|
+
"content": {
|
|
335
|
+
"correctness": 3,
|
|
336
|
+
"completeness": 2,
|
|
337
|
+
"accuracy": 3
|
|
338
|
+
},
|
|
339
|
+
"structure": {
|
|
340
|
+
"organization": 3,
|
|
341
|
+
"formatting": 2,
|
|
342
|
+
"usability": 3
|
|
343
|
+
},
|
|
344
|
+
"content_score": 2.7,
|
|
345
|
+
"structure_score": 2.7,
|
|
346
|
+
"overall_score": 5.4
|
|
347
|
+
}
|
|
348
|
+
},
|
|
349
|
+
"output_quality": {
|
|
350
|
+
"A": {
|
|
351
|
+
"score": 9,
|
|
352
|
+
"strengths": ["完全なソリューション", "適切なフォーマット", "すべてのフィールドが存在"],
|
|
353
|
+
"weaknesses": ["ヘッダーに軽微なスタイル不一致"]
|
|
354
|
+
},
|
|
355
|
+
"B": {
|
|
356
|
+
"score": 5,
|
|
357
|
+
"strengths": ["読みやすい出力", "基本構造が正しい"],
|
|
358
|
+
"weaknesses": ["日付フィールドの欠落", "フォーマットの不一致", "部分的なデータ抽出"]
|
|
359
|
+
}
|
|
360
|
+
},
|
|
361
|
+
"expectation_results": {
|
|
362
|
+
"A": {
|
|
363
|
+
"passed": 4,
|
|
364
|
+
"total": 5,
|
|
365
|
+
"pass_rate": 0.80,
|
|
366
|
+
"details": [
|
|
367
|
+
{"text": "出力に名前が含まれている", "passed": true}
|
|
368
|
+
]
|
|
369
|
+
},
|
|
370
|
+
"B": {
|
|
371
|
+
"passed": 3,
|
|
372
|
+
"total": 5,
|
|
373
|
+
"pass_rate": 0.60,
|
|
374
|
+
"details": [
|
|
375
|
+
{"text": "出力に名前が含まれている", "passed": true}
|
|
376
|
+
]
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
}
|
|
380
|
+
```
|
|
381
|
+
|
|
382
|
+
---
|
|
383
|
+
|
|
384
|
+
## analysis.json
|
|
385
|
+
|
|
386
|
+
事後分析エージェントの出力。`<grading-dir>/analysis.json`に配置。
|
|
387
|
+
|
|
388
|
+
```json
|
|
389
|
+
{
|
|
390
|
+
"comparison_summary": {
|
|
391
|
+
"winner": "A",
|
|
392
|
+
"winner_skill": "path/to/winner/skill",
|
|
393
|
+
"loser_skill": "path/to/loser/skill",
|
|
394
|
+
"comparator_reasoning": "比較エージェントが勝者を選んだ理由の要約"
|
|
395
|
+
},
|
|
396
|
+
"winner_strengths": [
|
|
397
|
+
"複数ページのドキュメント処理に対する明確なステップバイステップの指示",
|
|
398
|
+
"フォーマットエラーを検出する検証スクリプトを含む"
|
|
399
|
+
],
|
|
400
|
+
"loser_weaknesses": [
|
|
401
|
+
"曖昧な指示「ドキュメントを適切に処理」が一貫性のない動作につながった",
|
|
402
|
+
"検証スクリプトがなく、エージェントが即興で対応"
|
|
403
|
+
],
|
|
404
|
+
"instruction_following": {
|
|
405
|
+
"winner": {
|
|
406
|
+
"score": 9,
|
|
407
|
+
"issues": ["軽微: オプションのログ記録ステップをスキップ"]
|
|
408
|
+
},
|
|
409
|
+
"loser": {
|
|
410
|
+
"score": 6,
|
|
411
|
+
"issues": [
|
|
412
|
+
"スキルのフォーマットテンプレートを使用しなかった",
|
|
413
|
+
"ステップ3に従わず独自のアプローチを考案した"
|
|
414
|
+
]
|
|
415
|
+
}
|
|
416
|
+
},
|
|
417
|
+
"improvement_suggestions": [
|
|
418
|
+
{
|
|
419
|
+
"priority": "high",
|
|
420
|
+
"category": "instructions",
|
|
421
|
+
"suggestion": "「ドキュメントを適切に処理」を明示的なステップに置き換え",
|
|
422
|
+
"expected_impact": "一貫性のない動作を引き起こした曖昧さを排除"
|
|
423
|
+
}
|
|
424
|
+
],
|
|
425
|
+
"transcript_insights": {
|
|
426
|
+
"winner_execution_pattern": "スキルを読む -> 5ステッププロセスに従う -> 検証スクリプトを使用",
|
|
427
|
+
"loser_execution_pattern": "スキルを読む -> アプローチが不明確 -> 3つの異なる方法を試す"
|
|
428
|
+
}
|
|
429
|
+
}
|
|
430
|
+
```
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""複数スキルのベンチマーク結果を集約。
|
|
3
|
+
|
|
4
|
+
複数のrun_loop.py出力を受け取り、全スキルのスコアを
|
|
5
|
+
サマリーテーブルとして表示する。
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import argparse
|
|
9
|
+
import json
|
|
10
|
+
import sys
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def aggregate_results(result_files: list[str]) -> dict:
    """Aggregate several run_loop.py JSON output files into one report.

    For each usable file the best entry of its ``history`` list is chosen:
    the highest ``test_passed`` wins, ties are broken by the training pass
    count, and the earliest iteration wins a complete tie. A missing or
    null ``test_passed`` ranks below every real test score.

    Args:
        result_files: Paths of the JSON files to read. A file that cannot
            be read, is not valid JSON, is not a JSON object, or has an
            empty/missing ``history`` is reported on stderr and skipped.

    Returns:
        A dict with two keys:
        ``skills`` - one entry per usable file, sorted by test pass rate and
        then training pass rate (both descending);
        ``summary`` - totals over the best iteration of every skill.
    """

    def _train_counts(entry: dict) -> tuple:
        # Training counts fall back to the un-prefixed "passed"/"total" keys.
        return (
            entry.get("train_passed", entry.get("passed", 0)),
            entry.get("train_total", entry.get("total", 0)),
        )

    skills = []

    for filepath in result_files:
        try:
            data = json.loads(Path(filepath).read_text())
        except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
            # OSError also covers directories and permission problems, so one
            # bad path no longer aborts the whole aggregation.
            print(f"警告: {filepath} の読み込みに失敗しました: {e}", file=sys.stderr)
            continue

        if not isinstance(data, dict):
            print(f"警告: {filepath} はJSONオブジェクトではありません", file=sys.stderr)
            continue

        history = data.get("history", [])
        if not history:
            print(f"警告: {filepath} に履歴がありません", file=sys.stderr)
            continue

        # Find the best iteration (test score first, training score second).
        best_idx = 0
        best_key = (-1, -1)
        for i, h in enumerate(history):
            t_passed = h.get("test_passed")
            if t_passed is None:
                # The key may be present with a null value; rank it like a
                # missing score instead of comparing None with an int.
                t_passed = -1
            candidate = (t_passed, _train_counts(h)[0])
            if candidate > best_key:
                best_key = candidate
                best_idx = i

        best = history[best_idx]
        original = history[0]
        orig_train_passed, orig_train_total = _train_counts(original)
        best_train_passed, best_train_total = _train_counts(best)

        skill_entry = {
            "skill_name": data.get("skill_name", Path(filepath).stem),
            "file": filepath,
            "iterations": len(history),
            "best_iteration": best_idx,
            "original_description": data.get("original_description", ""),
            "best_description": best.get("description", ""),
            "original_train_score": f"{orig_train_passed}/{orig_train_total}",
            "best_train_score": f"{best_train_passed}/{best_train_total}",
            "original_train_passed": orig_train_passed,
            "original_train_total": orig_train_total,
            "best_train_passed": best_train_passed,
            "best_train_total": best_train_total,
        }

        # Test scores are only reported when the best iteration has one.
        if best.get("test_passed") is not None:
            skill_entry["original_test_score"] = f"{original.get('test_passed', '?')}/{original.get('test_total', '?')}"
            skill_entry["best_test_score"] = f"{best.get('test_passed', '?')}/{best.get('test_total', '?')}"
            skill_entry["best_test_passed"] = best.get("test_passed", 0)
            skill_entry["best_test_total"] = best.get("test_total", 0)

        skills.append(skill_entry)

    # Sort by test pass rate, then training pass rate (both descending).
    # max(..., 1) guards against division by zero for empty totals.
    skills.sort(
        key=lambda s: (
            s.get("best_test_passed", 0) / max(s.get("best_test_total", 1), 1),
            s["best_train_passed"] / max(s["best_train_total"], 1),
        ),
        reverse=True,
    )

    # Overall totals across the best iteration of every skill.
    total_train_passed = sum(s["best_train_passed"] for s in skills)
    total_train_total = sum(s["best_train_total"] for s in skills)
    total_test_passed = sum(s.get("best_test_passed", 0) for s in skills if "best_test_passed" in s)
    total_test_total = sum(s.get("best_test_total", 0) for s in skills if "best_test_total" in s)

    return {
        "skills": skills,
        "summary": {
            "total_skills": len(skills),
            "total_train_passed": total_train_passed,
            "total_train_total": total_train_total,
            "total_train_score": f"{total_train_passed}/{total_train_total}",
            "total_test_passed": total_test_passed,
            "total_test_total": total_test_total,
            "total_test_score": f"{total_test_passed}/{total_test_total}" if total_test_total > 0 else None,
        },
    }
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def print_table(aggregated: dict, verbose: bool = False) -> None:
    """Print the aggregated results as a fixed-width table on stderr.

    Args:
        aggregated: Dict with a ``skills`` list and a ``summary`` dict, as
            returned by ``aggregate_results``.
        verbose: When true, also print the original and best description
            of every skill, shortened to 80 characters.

    Test columns are printed only when at least one skill carries a
    ``best_test_score``; skills without one show ``-`` in those columns.
    """

    def _preview(text, limit: int = 80) -> str:
        # Append the ellipsis only when the text was actually cut, and
        # tolerate a null description.
        text = text or ""
        return text if len(text) <= limit else f"{text[:limit]}..."

    skills = aggregated["skills"]
    summary = aggregated["summary"]

    has_test = any("best_test_score" in s for s in skills)

    # Header row
    header = f"{'スキル名':<30} {'トレーニング(元)':<14} {'トレーニング(最良)':<14}"
    if has_test:
        header += f" {'テスト(元)':<12} {'テスト(最良)':<12}"
    header += f" {'回数':<6} {'最良回':<6}"
    print(header, file=sys.stderr)
    print("-" * len(header), file=sys.stderr)

    # One row per skill
    for s in skills:
        line = f"{s['skill_name']:<30} {s['original_train_score']:<14} {s['best_train_score']:<14}"
        if has_test:
            orig_test = s.get("original_test_score", "-")
            best_test = s.get("best_test_score", "-")
            line += f" {orig_test:<12} {best_test:<12}"
        line += f" {s['iterations']:<6} {s['best_iteration']:<6}"
        print(line, file=sys.stderr)

        if verbose:
            print(f" オリジナル: {_preview(s['original_description'])}", file=sys.stderr)
            print(f" 最良: {_preview(s['best_description'])}", file=sys.stderr)

    # Totals row
    print("-" * len(header), file=sys.stderr)
    total_line = f"{'合計':<30} {'':<14} {summary['total_train_score']:<14}"
    if has_test and summary.get("total_test_score"):
        total_line += f" {'':<12} {summary['total_test_score']:<12}"
    total_line += f" {summary['total_skills']} スキル"
    print(total_line, file=sys.stderr)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def main():
    """Command-line entry point.

    Reads the result files named on the command line, always prints the
    summary table (on stderr, via ``print_table``), and additionally dumps
    the aggregated data as JSON on stdout when ``--json`` is given.
    """
    cli = argparse.ArgumentParser(description="複数スキルのベンチマーク結果を集約")
    cli.add_argument("files", nargs="+", help="run_loop.pyのJSON出力ファイル(複数指定可)")
    cli.add_argument("--verbose", action="store_true", help="各スキルのdescriptionも表示")
    cli.add_argument("--json", action="store_true", help="JSON形式で標準出力に出力")
    options = cli.parse_args()

    report = aggregate_results(options.files)

    # The table is shown regardless of the output mode.
    print_table(report, verbose=options.verbose)

    # Machine-readable output is opt-in.
    if not options.json:
        return
    print(json.dumps(report, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()
|