@einja/dev-cli 0.1.40 → 0.1.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207) hide show
  1. package/README.md +89 -1
  2. package/dist/cli.js +1 -0
  3. package/dist/cli.js.map +1 -1
  4. package/dist/commands/init.d.ts.map +1 -1
  5. package/dist/commands/init.js +71 -1
  6. package/dist/commands/init.js.map +1 -1
  7. package/dist/commands/list.js.map +1 -1
  8. package/dist/commands/sync.d.ts.map +1 -1
  9. package/dist/commands/sync.js +187 -13
  10. package/dist/commands/sync.js.map +1 -1
  11. package/dist/commands/task-loop/lib/github-client.test.js.map +1 -1
  12. package/dist/commands/task-loop/lib/vibe-kanban-rest-client.js +2 -2
  13. package/dist/commands/task-loop/lib/vibe-kanban-rest-client.js.map +1 -1
  14. package/dist/lib/dependency-checker.d.ts.map +1 -1
  15. package/dist/lib/merger.d.ts +12 -0
  16. package/dist/lib/merger.d.ts.map +1 -1
  17. package/dist/lib/merger.js +28 -0
  18. package/dist/lib/merger.js.map +1 -1
  19. package/dist/lib/preset-update/cli-repo-detector.d.ts.map +1 -1
  20. package/dist/lib/preset-update/file-copier.d.ts.map +1 -1
  21. package/dist/lib/preset-update/file-copier.js +3 -3
  22. package/dist/lib/preset-update/file-copier.js.map +1 -1
  23. package/dist/lib/preset-update/preset-finder.d.ts.map +1 -1
  24. package/dist/lib/preset.d.ts.map +1 -1
  25. package/dist/lib/sync/category-validator.d.ts +1 -1
  26. package/dist/lib/sync/category-validator.d.ts.map +1 -1
  27. package/dist/lib/sync/category-validator.js +2 -1
  28. package/dist/lib/sync/category-validator.js.map +1 -1
  29. package/dist/lib/sync/category-validator.test.js +3 -1
  30. package/dist/lib/sync/category-validator.test.js.map +1 -1
  31. package/dist/lib/sync/conflict-reporter.d.ts.map +1 -1
  32. package/dist/lib/sync/diff-engine.d.ts.map +1 -1
  33. package/dist/lib/sync/file-filter.d.ts.map +1 -1
  34. package/dist/lib/sync/file-filter.js +1 -0
  35. package/dist/lib/sync/file-filter.js.map +1 -1
  36. package/dist/lib/sync/integration.test.js +255 -69
  37. package/dist/lib/sync/integration.test.js.map +1 -1
  38. package/dist/lib/sync/json-processor.d.ts +4 -4
  39. package/dist/lib/sync/json-processor.d.ts.map +1 -1
  40. package/dist/lib/sync/json-processor.js +11 -11
  41. package/dist/lib/sync/json-processor.js.map +1 -1
  42. package/dist/lib/sync/marker-processor.d.ts +60 -8
  43. package/dist/lib/sync/marker-processor.d.ts.map +1 -1
  44. package/dist/lib/sync/marker-processor.js +117 -26
  45. package/dist/lib/sync/marker-processor.js.map +1 -1
  46. package/dist/lib/sync/marker-processor.test.js +261 -40
  47. package/dist/lib/sync/marker-processor.test.js.map +1 -1
  48. package/dist/lib/sync/metadata-manager.d.ts +4 -0
  49. package/dist/lib/sync/metadata-manager.d.ts.map +1 -1
  50. package/dist/lib/sync/metadata-manager.js +15 -0
  51. package/dist/lib/sync/metadata-manager.js.map +1 -1
  52. package/dist/lib/sync/metadata-manager.test.js +69 -0
  53. package/dist/lib/sync/metadata-manager.test.js.map +1 -1
  54. package/dist/lib/sync/orphan-cleaner.d.ts +29 -0
  55. package/dist/lib/sync/orphan-cleaner.d.ts.map +1 -0
  56. package/dist/lib/sync/orphan-cleaner.js +80 -0
  57. package/dist/lib/sync/orphan-cleaner.js.map +1 -0
  58. package/dist/lib/sync/orphan-cleaner.test.d.ts +2 -0
  59. package/dist/lib/sync/orphan-cleaner.test.d.ts.map +1 -0
  60. package/dist/lib/sync/orphan-cleaner.test.js +169 -0
  61. package/dist/lib/sync/orphan-cleaner.test.js.map +1 -0
  62. package/dist/lib/sync/project-private-synchronizer.d.ts +52 -0
  63. package/dist/lib/sync/project-private-synchronizer.d.ts.map +1 -0
  64. package/dist/lib/sync/project-private-synchronizer.js +110 -0
  65. package/dist/lib/sync/project-private-synchronizer.js.map +1 -0
  66. package/dist/lib/sync/project-private-synchronizer.test.d.ts +2 -0
  67. package/dist/lib/sync/project-private-synchronizer.test.d.ts.map +1 -0
  68. package/dist/lib/sync/project-private-synchronizer.test.js +348 -0
  69. package/dist/lib/sync/project-private-synchronizer.test.js.map +1 -0
  70. package/dist/types/index.d.ts +1 -0
  71. package/dist/types/index.d.ts.map +1 -1
  72. package/dist/types/sync.d.ts +36 -6
  73. package/dist/types/sync.d.ts.map +1 -1
  74. package/dist/types/sync.js +2 -2
  75. package/dist/types/sync.js.map +1 -1
  76. package/package.json +5 -4
  77. package/presets/default/.claude/agents/einja/Explore.md +140 -0
  78. package/presets/default/.claude/agents/einja/backend-architect.md +21 -1
  79. package/presets/default/.claude/agents/einja/codex-agent.md +5 -1
  80. package/presets/default/.claude/agents/einja/design-engineer.md +5 -1
  81. package/presets/default/.claude/agents/einja/docs/docs-updater.md +7 -93
  82. package/presets/default/.claude/agents/einja/frontend-architect.md +21 -1
  83. package/presets/default/.claude/agents/einja/frontend-coder.md +5 -1
  84. package/presets/default/.claude/agents/einja/{specs/spec-design-generator.md → issue-specs/design-generator.md} +16 -8
  85. package/presets/default/.claude/agents/einja/{specs/spec-qa-generator.md → issue-specs/qa-generator.md} +10 -4
  86. package/presets/default/.claude/agents/einja/{specs/spec-requirements-generator.md → issue-specs/requirements-generator.md} +9 -6
  87. package/presets/default/.claude/agents/einja/{specs/spec-tasks-generator.md → issue-specs/tasks-generator.md} +19 -16
  88. package/presets/default/.claude/agents/einja/{specs/spec-tasks-validator.md → issue-specs/tasks-validator.md} +13 -9
  89. package/presets/default/.claude/agents/einja/issue-specs/ui-design-generator.md +114 -0
  90. package/presets/default/.claude/agents/einja/task/task-executer.md +64 -116
  91. package/presets/default/.claude/agents/einja/task/task-modification-analyzer.md +6 -2
  92. package/presets/default/.claude/agents/einja/task/task-qa.md +7 -3
  93. package/presets/default/.claude/agents/einja/task/task-reviewer.md +17 -1
  94. package/presets/default/.claude/commands/einja/einja-sync.md +124 -45
  95. package/presets/default/.claude/commands/einja/frontend-implement.md +3 -1
  96. package/presets/default/.claude/commands/einja/issue-exec.md +413 -0
  97. package/presets/default/.claude/commands/einja/start-dev.md +4 -0
  98. package/presets/default/.claude/commands/einja/sync-cursor-commands.md +10 -6
  99. package/presets/default/.claude/commands/einja/{update-docs-by-task-specs.md → update-docs-by-issue-specs.md} +61 -57
  100. package/presets/default/.claude/hooks/einja/plan-mode-skill-loader.sh +27 -0
  101. package/presets/default/.claude/settings.json +29 -5
  102. package/presets/default/.claude/skills/{einja-general-context-loader → _einja-general-context-loader}/SKILL.md +6 -2
  103. package/presets/default/.claude/skills/{einja-output-format → _einja-output-format}/SKILL.md +5 -1
  104. package/presets/default/.claude/skills/_einja-project-overview/SKILL.md +29 -0
  105. package/presets/default/.claude/skills/{einja-spec-context-loader → _einja-spec-context-loader}/SKILL.md +9 -5
  106. package/presets/default/.claude/skills/einja-coding-standards/references/testing-strategy.md +899 -0
  107. package/presets/default/.claude/skills/einja-conflict-resolver/SKILL.md +5 -1
  108. package/presets/default/.claude/skills/einja-create-pr/SKILL.md +138 -0
  109. package/presets/default/.claude/skills/einja-infra-maintenance/SKILL.md +779 -0
  110. package/presets/default/.claude/{commands/einja/spec-create.md → skills/einja-issue-spec-create/SKILL.md} +60 -23
  111. package/presets/default/.claude/skills/einja-issue-spec-generator/SKILL.md +105 -0
  112. package/presets/default/.claude/skills/einja-issue-spec-generator/references/format-rules.md +35 -0
  113. package/presets/default/.claude/skills/einja-issue-spec-validator/SKILL.md +130 -0
  114. package/presets/default/.claude/skills/einja-issue-spec-validator/references/validation-rules.md +52 -0
  115. package/presets/default/.claude/skills/einja-npm-release/SKILL.md +242 -0
  116. package/presets/default/.claude/skills/einja-skill-creator/SKILL.md +311 -263
  117. package/presets/default/.claude/skills/einja-skill-creator/agents/analyzer.md +274 -0
  118. package/presets/default/.claude/skills/einja-skill-creator/agents/comparator.md +202 -0
  119. package/presets/default/.claude/skills/einja-skill-creator/agents/grader.md +195 -0
  120. package/presets/default/.claude/skills/einja-skill-creator/assets/eval_review.html +146 -0
  121. package/presets/default/.claude/skills/einja-skill-creator/eval-viewer/generate_review.py +471 -0
  122. package/presets/default/.claude/skills/einja-skill-creator/eval-viewer/viewer.html +1325 -0
  123. package/presets/default/.claude/skills/einja-skill-creator/references/schemas.md +430 -0
  124. package/presets/default/.claude/skills/einja-skill-creator/scripts/aggregate_benchmark.py +401 -0
  125. package/presets/default/.claude/skills/einja-skill-creator/scripts/compare_runs.py +154 -0
  126. package/presets/default/.claude/skills/einja-skill-creator/scripts/generate_report.py +272 -0
  127. package/presets/default/.claude/skills/einja-skill-creator/scripts/improve_description.py +247 -0
  128. package/presets/default/.claude/skills/einja-skill-creator/scripts/init_skill.py +13 -19
  129. package/presets/default/.claude/skills/einja-skill-creator/scripts/package_skill.py +36 -7
  130. package/presets/default/.claude/skills/einja-skill-creator/scripts/run_eval.py +310 -0
  131. package/presets/default/.claude/skills/einja-skill-creator/scripts/run_loop.py +375 -0
  132. package/presets/default/.claude/skills/einja-skill-creator/scripts/utils.py +48 -0
  133. package/presets/default/.claude/skills/einja-skill-first/SKILL.md +265 -0
  134. package/presets/default/.claude/skills/einja-subagent-question-protocol/SKILL.md +98 -0
  135. package/presets/default/.claude/skills/einja-task-commit/SKILL.md +11 -7
  136. package/presets/default/.claude/{commands/einja/task-exec.md → skills/einja-task-exec/SKILL.md} +106 -89
  137. package/presets/default/.claude/skills/einja-task-qa/SKILL.md +8 -4
  138. package/presets/default/.claude/skills/einja-task-qa/references/troubleshooting.md +1 -1
  139. package/presets/default/.claude/skills/einja-task-qa/references/usage-patterns.md +2 -2
  140. package/presets/default/.claude/skills/einja-team-exec/SKILL.md +165 -0
  141. package/presets/default/.envrc +5 -0
  142. package/presets/default/.mcp.json +2 -12
  143. package/presets/default/CLAUDE.md.template +45 -8
  144. package/presets/default/docs/einja/example/specs/issues/issue999-example-task/tasks.md +1 -1
  145. package/presets/default/docs/einja/instructions/deployment-setup.md +4 -9
  146. package/presets/default/docs/einja/instructions/environment-setup.md +3 -8
  147. package/presets/default/docs/einja/instructions/issue-exec-workflow.md +276 -0
  148. package/presets/default/docs/einja/instructions/local-server-environment-and-worktree.md +71 -9
  149. package/presets/default/docs/einja/instructions/neon-cli-reference.md +3 -8
  150. package/presets/default/docs/einja/instructions/setup-flow.md +279 -0
  151. package/presets/default/docs/einja/instructions/task-execute.md +63 -68
  152. package/presets/default/docs/einja/instructions/vercel-cli-reference.md +17 -10
  153. package/presets/default/docs/einja/steering/README.md +11 -11
  154. package/presets/default/docs/einja/steering/acceptance-criteria-and-qa-guide.md +4 -9
  155. package/presets/default/docs/einja/steering/architecture.md +3 -8
  156. package/presets/default/docs/einja/steering/branch-strategy.md +63 -70
  157. package/presets/default/docs/einja/steering/commit-rules.md +3 -8
  158. package/presets/default/docs/einja/steering/db-schema-design.md +3 -8
  159. package/presets/default/docs/einja/steering/development/api-development.md +3 -8
  160. package/presets/default/docs/einja/steering/development/backend-architecture.md +3 -8
  161. package/presets/default/docs/einja/steering/development/coding-standards.md +723 -0
  162. package/presets/default/docs/einja/steering/development/component-design.md +502 -0
  163. package/presets/default/docs/einja/steering/development/database-guidelines.md +2 -2
  164. package/presets/default/docs/einja/steering/development/frontend-development.md +3 -8
  165. package/presets/default/docs/einja/steering/development/playwright-guidelines.md +59 -0
  166. package/presets/default/docs/einja/steering/development/review-guidelines.md +3 -8
  167. package/presets/default/docs/einja/steering/development/testing-strategy.md +3 -8
  168. package/presets/default/docs/einja/steering/development-workflow.md +155 -140
  169. package/presets/default/docs/einja/steering/infrastructure/deployment.md +156 -55
  170. package/presets/default/docs/einja/steering/infrastructure/environment-variables.md +4 -8
  171. package/presets/default/docs/einja/steering/product.md +3 -8
  172. package/presets/default/docs/einja/steering/task-management.md +22 -110
  173. package/presets/default/scripts/ensure-serena.sh +75 -0
  174. package/presets/default/scripts/env-rotate-secrets.ts +396 -0
  175. package/presets/default/scripts/env-show.ts +130 -0
  176. package/presets/default/scripts/env.ts +479 -0
  177. package/presets/default/scripts/init-github.ts +363 -0
  178. package/presets/default/scripts/init.sh +98 -0
  179. package/presets/default/scripts/lib/env-common.ts +108 -0
  180. package/presets/default/scripts/lib/worktree-config.ts +64 -0
  181. package/presets/default/scripts/setup-dev.ts +655 -0
  182. package/presets/default/scripts/stop-serena.sh +25 -0
  183. package/presets/default/scripts/worktree/dev.ts +872 -0
  184. package/dist/lib/sync/seed-synchronizer.d.ts +0 -27
  185. package/dist/lib/sync/seed-synchronizer.d.ts.map +0 -1
  186. package/dist/lib/sync/seed-synchronizer.js +0 -72
  187. package/dist/lib/sync/seed-synchronizer.js.map +0 -1
  188. package/dist/lib/sync/seed-synchronizer.test.d.ts +0 -2
  189. package/dist/lib/sync/seed-synchronizer.test.d.ts.map +0 -1
  190. package/dist/lib/sync/seed-synchronizer.test.js +0 -147
  191. package/dist/lib/sync/seed-synchronizer.test.js.map +0 -1
  192. package/presets/default/.claude/agents/einja/git/conflict-resolver.md +0 -148
  193. package/presets/default/.claude/hooks/einja/validate-git-commit.sh +0 -239
  194. package/presets/default/.claude/skills/einja-api-development/SKILL.md +0 -14
  195. package/presets/default/.claude/skills/einja-backend-architecture/SKILL.md +0 -18
  196. package/presets/default/.claude/skills/einja-coding-standards/SKILL.md +0 -132
  197. package/presets/default/.claude/skills/einja-coding-standards/references/import-conventions.md +0 -69
  198. package/presets/default/.claude/skills/einja-coding-standards/references/naming-conventions.md +0 -107
  199. package/presets/default/.claude/skills/einja-coding-standards/references/prohibited-patterns.md +0 -169
  200. package/presets/default/.claude/skills/einja-coding-standards/references/typescript-rules.md +0 -247
  201. package/presets/default/.claude/skills/einja-component-design/SKILL.md +0 -109
  202. package/presets/default/.claude/skills/einja-component-design/references/directory-structure.md +0 -117
  203. package/presets/default/.claude/skills/einja-component-design/references/props-patterns.md +0 -159
  204. package/presets/default/.claude/skills/einja-component-design/references/styling-guide.md +0 -122
  205. package/presets/default/.claude/skills/einja-frontend-development/SKILL.md +0 -14
  206. package/presets/default/.claude/skills/einja-project-overview/SKILL.md +0 -35
  207. package/presets/default/docs/einja/instructions/task-vibe-kanban-loop.md +0 -565
@@ -0,0 +1,430 @@
1
+ # JSONスキーマ
2
+
3
+ このドキュメントはskill-creatorで使用されるJSONスキーマを定義する。
4
+
5
+ ---
6
+
7
+ ## evals.json
8
+
9
+ スキルの評価テストケースを定義する。スキルディレクトリ内の`evals/evals.json`に配置。
10
+
11
+ ```json
12
+ {
13
+ "skill_name": "example-skill",
14
+ "evals": [
15
+ {
16
+ "id": 1,
17
+ "prompt": "ユーザーのサンプルプロンプト",
18
+ "expected_output": "期待される結果の説明",
19
+ "files": ["evals/files/sample1.pdf"],
20
+ "expectations": [
21
+ "出力にXが含まれている",
22
+ "スキルがスクリプトYを使用した"
23
+ ]
24
+ }
25
+ ]
26
+ }
27
+ ```
28
+
29
+ **フィールド:**
30
+ - `skill_name`: スキルのフロントマターと一致する名前
31
+ - `evals[].id`: 一意の整数識別子
32
+ - `evals[].prompt`: 実行するタスク
33
+ - `evals[].expected_output`: 成功を表す人間が読める説明
34
+ - `evals[].files`: 入力ファイルパスのリスト(省略可能。スキルルートからの相対パス)
35
+ - `evals[].expectations`: 検証可能な記述のリスト
36
+
37
+ ---
38
+
39
+ ## history.json
40
+
41
+ 改善モードでのバージョン進行を追跡する。ワークスペースルートに配置。
42
+
43
+ ```json
44
+ {
45
+ "started_at": "2026-01-15T10:30:00Z",
46
+ "skill_name": "pdf",
47
+ "current_best": "v2",
48
+ "iterations": [
49
+ {
50
+ "version": "v0",
51
+ "parent": null,
52
+ "expectation_pass_rate": 0.65,
53
+ "grading_result": "baseline",
54
+ "is_current_best": false
55
+ },
56
+ {
57
+ "version": "v1",
58
+ "parent": "v0",
59
+ "expectation_pass_rate": 0.75,
60
+ "grading_result": "won",
61
+ "is_current_best": false
62
+ },
63
+ {
64
+ "version": "v2",
65
+ "parent": "v1",
66
+ "expectation_pass_rate": 0.85,
67
+ "grading_result": "won",
68
+ "is_current_best": true
69
+ }
70
+ ]
71
+ }
72
+ ```
73
+
74
+ **フィールド:**
75
+ - `started_at`: 改善開始のISOタイムスタンプ
76
+ - `skill_name`: 改善対象のスキル名
77
+ - `current_best`: 最高パフォーマンスのバージョン識別子
78
+ - `iterations[].version`: バージョン識別子(v0、v1、...)
79
+ - `iterations[].parent`: 派生元の親バージョン
80
+ - `iterations[].expectation_pass_rate`: 採点からのパス率
81
+ - `iterations[].grading_result`: "baseline"、"won"、"lost"、または"tie"
82
+ - `iterations[].is_current_best`: 現在の最良バージョンかどうか
83
+
84
+ ---
85
+
86
+ ## grading.json
87
+
88
+ 採点エージェントの出力。`<run-dir>/grading.json`に配置。
89
+
90
+ ```json
91
+ {
92
+ "expectations": [
93
+ {
94
+ "text": "出力に'John Smith'という名前が含まれている",
95
+ "passed": true,
96
+ "evidence": "トランスクリプトのステップ3で発見: 'Extracted names: John Smith, Sarah Johnson'"
97
+ },
98
+ {
99
+ "text": "スプレッドシートのセルB10にSUM数式がある",
100
+ "passed": false,
101
+ "evidence": "スプレッドシートが作成されなかった。出力はテキストファイルだった。"
102
+ }
103
+ ],
104
+ "summary": {
105
+ "passed": 2,
106
+ "failed": 1,
107
+ "total": 3,
108
+ "pass_rate": 0.67
109
+ },
110
+ "execution_metrics": {
111
+ "tool_calls": {
112
+ "Read": 5,
113
+ "Write": 2,
114
+ "Bash": 8
115
+ },
116
+ "total_tool_calls": 15,
117
+ "total_steps": 6,
118
+ "errors_encountered": 0,
119
+ "output_chars": 12450,
120
+ "transcript_chars": 3200
121
+ },
122
+ "timing": {
123
+ "executor_duration_seconds": 165.0,
124
+ "grader_duration_seconds": 26.0,
125
+ "total_duration_seconds": 191.0
126
+ },
127
+ "claims": [
128
+ {
129
+ "claim": "フォームに12個の入力可能フィールドがある",
130
+ "type": "factual",
131
+ "verified": true,
132
+ "evidence": "field_info.jsonで12フィールドを確認"
133
+ }
134
+ ],
135
+ "user_notes_summary": {
136
+ "uncertainties": ["2023年のデータを使用、古い可能性がある"],
137
+ "needs_review": [],
138
+ "workarounds": ["入力不可フィールドにテキストオーバーレイで代替"]
139
+ },
140
+ "eval_feedback": {
141
+ "suggestions": [
142
+ {
143
+ "assertion": "出力に'John Smith'という名前が含まれている",
144
+ "reason": "名前に言及する幻覚ドキュメントでもパスしてしまう"
145
+ }
146
+ ],
147
+ "overall": "アサーションは存在のみをチェックし、正確性をチェックしていない。"
148
+ }
149
+ }
150
+ ```
151
+
152
+ **フィールド:**
153
+ - `expectations[]`: 根拠付きの採点済み期待値
154
+ - `summary`: パス/フェイルの集計カウント
155
+ - `execution_metrics`: ツール使用量と出力サイズ(エグゼキューターのmetrics.jsonから)
156
+ - `timing`: 実行時間(timing.jsonから)
157
+ - `claims`: 出力から抽出・検証されたクレーム
158
+ - `user_notes_summary`: エグゼキューターがフラグした問題
159
+ - `eval_feedback`: (オプション)評価の改善提案。採点エージェントが指摘すべき問題を特定した場合のみ存在
160
+
161
+ ---
162
+
163
+ ## metrics.json
164
+
165
+ エグゼキューターエージェントの出力。`<run-dir>/outputs/metrics.json`に配置。
166
+
167
+ ```json
168
+ {
169
+ "tool_calls": {
170
+ "Read": 5,
171
+ "Write": 2,
172
+ "Bash": 8,
173
+ "Edit": 1,
174
+ "Glob": 2,
175
+ "Grep": 0
176
+ },
177
+ "total_tool_calls": 18,
178
+ "total_steps": 6,
179
+ "files_created": ["filled_form.pdf", "field_values.json"],
180
+ "errors_encountered": 0,
181
+ "output_chars": 12450,
182
+ "transcript_chars": 3200
183
+ }
184
+ ```
185
+
186
+ **フィールド:**
187
+ - `tool_calls`: ツールタイプごとのカウント
188
+ - `total_tool_calls`: 全ツール呼び出しの合計
189
+ - `total_steps`: 主要な実行ステップの数
190
+ - `files_created`: 作成された出力ファイルのリスト
191
+ - `errors_encountered`: 実行中のエラー数
192
+ - `output_chars`: 出力ファイルの合計文字数
193
+ - `transcript_chars`: トランスクリプトの文字数
194
+
195
+ ---
196
+
197
+ ## timing.json
198
+
199
+ 実行の経過時間。`<run-dir>/timing.json`に配置。
200
+
201
+ **キャプチャ方法:** サブエージェントタスクが完了すると、タスク通知に`total_tokens`と`duration_ms`が含まれる。これらは他の場所に永続化されず、事後に復元できないため、即座に保存すること。
202
+
203
+ ```json
204
+ {
205
+ "total_tokens": 84852,
206
+ "duration_ms": 23332,
207
+ "total_duration_seconds": 23.3,
208
+ "executor_start": "2026-01-15T10:30:00Z",
209
+ "executor_end": "2026-01-15T10:32:45Z",
210
+ "executor_duration_seconds": 165.0,
211
+ "grader_start": "2026-01-15T10:32:46Z",
212
+ "grader_end": "2026-01-15T10:33:12Z",
213
+ "grader_duration_seconds": 26.0
214
+ }
215
+ ```
216
+
217
+ ---
218
+
219
+ ## benchmark.json
220
+
221
+ ベンチマークモードの出力。`benchmarks/<timestamp>/benchmark.json`に配置。
222
+
223
+ ```json
224
+ {
225
+ "metadata": {
226
+ "skill_name": "pdf",
227
+ "skill_path": "/path/to/pdf",
228
+ "executor_model": "claude-sonnet-4-20250514",
229
+ "analyzer_model": "most-capable-model",
230
+ "timestamp": "2026-01-15T10:30:00Z",
231
+ "evals_run": [1, 2, 3],
232
+ "runs_per_configuration": 3
233
+ },
234
+
235
+ "runs": [
236
+ {
237
+ "eval_id": 1,
238
+ "eval_name": "Ocean",
239
+ "configuration": "with_skill",
240
+ "run_number": 1,
241
+ "result": {
242
+ "pass_rate": 0.85,
243
+ "passed": 6,
244
+ "failed": 1,
245
+ "total": 7,
246
+ "time_seconds": 42.5,
247
+ "tokens": 3800,
248
+ "tool_calls": 18,
249
+ "errors": 0
250
+ },
251
+ "expectations": [
252
+ {"text": "...", "passed": true, "evidence": "..."}
253
+ ],
254
+ "notes": [
255
+ "2023年のデータを使用、古い可能性がある",
256
+ "入力不可フィールドにテキストオーバーレイで代替"
257
+ ]
258
+ }
259
+ ],
260
+
261
+ "run_summary": {
262
+ "with_skill": {
263
+ "pass_rate": {"mean": 0.85, "stddev": 0.05, "min": 0.80, "max": 0.90},
264
+ "time_seconds": {"mean": 45.0, "stddev": 12.0, "min": 32.0, "max": 58.0},
265
+ "tokens": {"mean": 3800, "stddev": 400, "min": 3200, "max": 4100}
266
+ },
267
+ "without_skill": {
268
+ "pass_rate": {"mean": 0.35, "stddev": 0.08, "min": 0.28, "max": 0.45},
269
+ "time_seconds": {"mean": 32.0, "stddev": 8.0, "min": 24.0, "max": 42.0},
270
+ "tokens": {"mean": 2100, "stddev": 300, "min": 1800, "max": 2500}
271
+ },
272
+ "delta": {
273
+ "pass_rate": "+0.50",
274
+ "time_seconds": "+13.0",
275
+ "tokens": "+1700"
276
+ }
277
+ },
278
+
279
+ "notes": [
280
+ "アサーション '出力はPDFファイルである' は両構成で100%パス - スキルの価値を区別しない可能性",
281
+ "評価3が高いばらつきを示す(50% ± 40%) - 不安定またはモデル依存の可能性",
282
+ "スキルなし実行はテーブル抽出の期待値で一貫して失敗",
283
+ "スキルは平均13秒の実行時間増加だが、パス率を50%改善"
284
+ ]
285
+ }
286
+ ```
287
+
288
+ **フィールド:**
289
+ - `metadata`: ベンチマーク実行に関する情報
290
+ - `skill_name`: スキル名
291
+ - `timestamp`: ベンチマーク実行日時
292
+ - `evals_run`: 評価名またはIDのリスト
293
+ - `runs_per_configuration`: 構成ごとの実行回数(例: 3)
294
+ - `runs[]`: 個別の実行結果
295
+ - `eval_id`: 数値の評価識別子
296
+ - `eval_name`: 人間が読める評価名(ビューアーのセクションヘッダーとして使用)
297
+ - `configuration`: `"with_skill"`または`"without_skill"`でなければならない(ビューアーはこの正確な文字列をグルーピングとカラーコーディングに使用)
298
+ - `run_number`: 整数の実行番号(1、2、3...)
299
+ - `result`: `pass_rate`、`passed`、`failed`、`total`、`time_seconds`、`tokens`、`tool_calls`、`errors`を含むネストされたオブジェクト
300
+ - `run_summary`: 構成ごとの統計集計
301
+ - `with_skill` / `without_skill`: それぞれ`pass_rate`、`time_seconds`、`tokens`オブジェクトを含み、各オブジェクトは`mean`、`stddev`、`min`、`max`フィールドを持つ
302
+ - `delta`: `"+0.50"`、`"+13.0"`、`"+1700"`のような差分文字列
303
+ - `notes`: 分析エージェントからのフリーフォーム観察
304
+
305
+ **重要:** ビューアーはこれらのフィールド名を正確に読み取る。`configuration`の代わりに`config`を使用したり、`pass_rate`を`result`内ではなく実行のトップレベルに配置したりすると、ビューアーは空/ゼロの値を表示する。benchmark.jsonを手動で生成する際は常にこのスキーマを参照すること。
306
+
307
+ ---
308
+
309
+ ## comparison.json
310
+
311
+ ブラインド比較エージェントの出力。`<grading-dir>/comparison-N.json`に配置。
312
+
313
+ ```json
314
+ {
315
+ "winner": "A",
316
+ "reasoning": "出力Aは適切なフォーマットとすべての必須フィールドを備えた完全なソリューションを提供している。出力Bは日付フィールドが欠落しており、フォーマットに不一致がある。",
317
+ "rubric": {
318
+ "A": {
319
+ "content": {
320
+ "correctness": 5,
321
+ "completeness": 5,
322
+ "accuracy": 4
323
+ },
324
+ "structure": {
325
+ "organization": 4,
326
+ "formatting": 5,
327
+ "usability": 4
328
+ },
329
+ "content_score": 4.7,
330
+ "structure_score": 4.3,
331
+ "overall_score": 9.0
332
+ },
333
+ "B": {
334
+ "content": {
335
+ "correctness": 3,
336
+ "completeness": 2,
337
+ "accuracy": 3
338
+ },
339
+ "structure": {
340
+ "organization": 3,
341
+ "formatting": 2,
342
+ "usability": 3
343
+ },
344
+ "content_score": 2.7,
345
+ "structure_score": 2.7,
346
+ "overall_score": 5.4
347
+ }
348
+ },
349
+ "output_quality": {
350
+ "A": {
351
+ "score": 9,
352
+ "strengths": ["完全なソリューション", "適切なフォーマット", "すべてのフィールドが存在"],
353
+ "weaknesses": ["ヘッダーに軽微なスタイル不一致"]
354
+ },
355
+ "B": {
356
+ "score": 5,
357
+ "strengths": ["読みやすい出力", "基本構造が正しい"],
358
+ "weaknesses": ["日付フィールドの欠落", "フォーマットの不一致", "部分的なデータ抽出"]
359
+ }
360
+ },
361
+ "expectation_results": {
362
+ "A": {
363
+ "passed": 4,
364
+ "total": 5,
365
+ "pass_rate": 0.80,
366
+ "details": [
367
+ {"text": "出力に名前が含まれている", "passed": true}
368
+ ]
369
+ },
370
+ "B": {
371
+ "passed": 3,
372
+ "total": 5,
373
+ "pass_rate": 0.60,
374
+ "details": [
375
+ {"text": "出力に名前が含まれている", "passed": true}
376
+ ]
377
+ }
378
+ }
379
+ }
380
+ ```
381
+
382
+ ---
383
+
384
+ ## analysis.json
385
+
386
+ 事後分析エージェントの出力。`<grading-dir>/analysis.json`に配置。
387
+
388
+ ```json
389
+ {
390
+ "comparison_summary": {
391
+ "winner": "A",
392
+ "winner_skill": "path/to/winner/skill",
393
+ "loser_skill": "path/to/loser/skill",
394
+ "comparator_reasoning": "比較エージェントが勝者を選んだ理由の要約"
395
+ },
396
+ "winner_strengths": [
397
+ "複数ページのドキュメント処理に対する明確なステップバイステップの指示",
398
+ "フォーマットエラーを検出する検証スクリプトを含む"
399
+ ],
400
+ "loser_weaknesses": [
401
+ "曖昧な指示「ドキュメントを適切に処理」が一貫性のない動作につながった",
402
+ "検証スクリプトがなく、エージェントが即興で対応"
403
+ ],
404
+ "instruction_following": {
405
+ "winner": {
406
+ "score": 9,
407
+ "issues": ["軽微: オプションのログ記録ステップをスキップ"]
408
+ },
409
+ "loser": {
410
+ "score": 6,
411
+ "issues": [
412
+ "スキルのフォーマットテンプレートを使用しなかった",
413
+ "ステップ3に従わず独自のアプローチを考案した"
414
+ ]
415
+ }
416
+ },
417
+ "improvement_suggestions": [
418
+ {
419
+ "priority": "high",
420
+ "category": "instructions",
421
+ "suggestion": "「ドキュメントを適切に処理」を明示的なステップに置き換え",
422
+ "expected_impact": "一貫性のない動作を引き起こした曖昧さを排除"
423
+ }
424
+ ],
425
+ "transcript_insights": {
426
+ "winner_execution_pattern": "スキルを読む -> 5ステッププロセスに従う -> 検証スクリプトを使用",
427
+ "loser_execution_pattern": "スキルを読む -> アプローチが不明確 -> 3つの異なる方法を試す"
428
+ }
429
+ }
430
+ ```