ai-engineering-init 1.6.0 → 1.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187) hide show
  1. package/.claude/agents/code-reviewer.md +3 -130
  2. package/.claude/hooks/skill-forced-eval.js +46 -60
  3. package/.claude/hooks/stop.js +24 -1
  4. package/.claude/settings.json +10 -1
  5. package/.claude/skills/api-development/SKILL.md +179 -130
  6. package/.claude/skills/architecture-design/SKILL.md +102 -212
  7. package/.claude/skills/backend-annotations/SKILL.md +166 -220
  8. package/.claude/skills/bug-detective/SKILL.md +225 -186
  9. package/.claude/skills/code-patterns/SKILL.md +127 -244
  10. package/.claude/skills/codex-code-review/SKILL.md +327 -0
  11. package/.claude/skills/collaborating-with-codex/SKILL.md +96 -113
  12. package/.claude/skills/crud-development/SKILL.md +226 -307
  13. package/.claude/skills/data-permission/SKILL.md +131 -202
  14. package/.claude/skills/database-ops/SKILL.md +158 -355
  15. package/.claude/skills/error-handler/SKILL.md +224 -285
  16. package/.claude/skills/file-oss-management/SKILL.md +174 -169
  17. package/.claude/skills/git-workflow/SKILL.md +123 -341
  18. package/.claude/skills/json-serialization/SKILL.md +121 -137
  19. package/.claude/skills/leniu-report-customization/SKILL.md +82 -2
  20. package/.claude/skills/leniu-report-standard-customization/SKILL.md +65 -2
  21. package/.claude/skills/loki-log-query/SKILL.md +400 -0
  22. package/.claude/skills/mysql-debug/SKILL.md +58 -22
  23. package/.claude/skills/performance-doctor/SKILL.md +83 -89
  24. package/.claude/skills/redis-cache/SKILL.md +134 -185
  25. package/.claude/skills/scheduled-jobs/SKILL.md +187 -224
  26. package/.claude/skills/security-guard/SKILL.md +168 -276
  27. package/.claude/skills/sms-mail/SKILL.md +266 -228
  28. package/.claude/skills/social-login/SKILL.md +257 -195
  29. package/.claude/skills/sync-back-merge/SKILL.md +66 -0
  30. package/.claude/skills/tenant-management/SKILL.md +172 -188
  31. package/.claude/skills/utils-toolkit/SKILL.md +214 -222
  32. package/.claude/skills/websocket-sse/SKILL.md +251 -172
  33. package/.claude/skills/workflow-engine/SKILL.md +178 -250
  34. package/.claude/skills/yunxiao-task-management/SKILL.md +489 -0
  35. package/.codex/skills/api-development/SKILL.md +179 -130
  36. package/.codex/skills/architecture-design/SKILL.md +102 -212
  37. package/.codex/skills/backend-annotations/SKILL.md +166 -220
  38. package/.codex/skills/bug-detective/SKILL.md +225 -186
  39. package/.codex/skills/code-patterns/SKILL.md +127 -244
  40. package/.codex/skills/collaborating-with-codex/SKILL.md +96 -113
  41. package/.codex/skills/crud-development/SKILL.md +226 -307
  42. package/.codex/skills/data-permission/SKILL.md +131 -202
  43. package/.codex/skills/database-ops/SKILL.md +158 -355
  44. package/.codex/skills/error-handler/SKILL.md +224 -285
  45. package/.codex/skills/file-oss-management/SKILL.md +174 -169
  46. package/.codex/skills/git-workflow/SKILL.md +123 -341
  47. package/.codex/skills/json-serialization/SKILL.md +121 -137
  48. package/.codex/skills/leniu-report-customization/SKILL.md +82 -2
  49. package/.codex/skills/leniu-report-standard-customization/SKILL.md +65 -2
  50. package/.codex/skills/loki-log-query/SKILL.md +400 -0
  51. package/.codex/skills/loki-log-query/environments.json +45 -0
  52. package/.codex/skills/mysql-debug/SKILL.md +58 -22
  53. package/.codex/skills/performance-doctor/SKILL.md +83 -89
  54. package/.codex/skills/redis-cache/SKILL.md +134 -185
  55. package/.codex/skills/scheduled-jobs/SKILL.md +187 -224
  56. package/.codex/skills/security-guard/SKILL.md +168 -276
  57. package/.codex/skills/skill-creator/LICENSE.txt +202 -0
  58. package/.codex/skills/skill-creator/SKILL.md +479 -0
  59. package/.codex/skills/skill-creator/agents/analyzer.md +274 -0
  60. package/.codex/skills/skill-creator/agents/comparator.md +202 -0
  61. package/.codex/skills/skill-creator/agents/grader.md +223 -0
  62. package/.codex/skills/skill-creator/assets/eval_review.html +146 -0
  63. package/.codex/skills/skill-creator/eval-viewer/generate_review.py +471 -0
  64. package/.codex/skills/skill-creator/eval-viewer/viewer.html +1325 -0
  65. package/.codex/skills/skill-creator/references/schemas.md +430 -0
  66. package/.codex/skills/skill-creator/scripts/__init__.py +0 -0
  67. package/.codex/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
  68. package/.codex/skills/skill-creator/scripts/generate_report.py +326 -0
  69. package/.codex/skills/skill-creator/scripts/improve_description.py +248 -0
  70. package/.codex/skills/skill-creator/scripts/package_skill.py +136 -0
  71. package/.codex/skills/skill-creator/scripts/quick_validate.py +103 -0
  72. package/.codex/skills/skill-creator/scripts/run_eval.py +310 -0
  73. package/.codex/skills/skill-creator/scripts/run_loop.py +332 -0
  74. package/.codex/skills/skill-creator/scripts/utils.py +47 -0
  75. package/.codex/skills/sms-mail/SKILL.md +266 -228
  76. package/.codex/skills/social-login/SKILL.md +257 -195
  77. package/.codex/skills/sync-back-merge/SKILL.md +66 -0
  78. package/.codex/skills/tenant-management/SKILL.md +172 -188
  79. package/.codex/skills/utils-toolkit/SKILL.md +214 -222
  80. package/.codex/skills/websocket-sse/SKILL.md +251 -172
  81. package/.codex/skills/workflow-engine/SKILL.md +178 -250
  82. package/.codex/skills/yunxiao-task-management/SKILL.md +489 -0
  83. package/.cursor/hooks/cursor-skill-eval.js +66 -6
  84. package/.cursor/hooks/stop.js +23 -1
  85. package/.cursor/skills/api-development/SKILL.md +179 -130
  86. package/.cursor/skills/architecture-design/SKILL.md +102 -212
  87. package/.cursor/skills/backend-annotations/SKILL.md +166 -220
  88. package/.cursor/skills/bug-detective/SKILL.md +225 -186
  89. package/.cursor/skills/code-patterns/SKILL.md +127 -244
  90. package/.cursor/skills/collaborating-with-codex/SKILL.md +96 -113
  91. package/.cursor/skills/crud-development/SKILL.md +226 -307
  92. package/.cursor/skills/data-permission/SKILL.md +131 -202
  93. package/.cursor/skills/database-ops/SKILL.md +158 -355
  94. package/.cursor/skills/error-handler/SKILL.md +224 -285
  95. package/.cursor/skills/file-oss-management/SKILL.md +174 -169
  96. package/.cursor/skills/git-workflow/SKILL.md +123 -341
  97. package/.cursor/skills/json-serialization/SKILL.md +121 -137
  98. package/.cursor/skills/leniu-report-customization/SKILL.md +82 -2
  99. package/.cursor/skills/leniu-report-standard-customization/SKILL.md +65 -2
  100. package/.cursor/skills/loki-log-query/SKILL.md +400 -0
  101. package/.cursor/skills/loki-log-query/environments.json +45 -0
  102. package/.cursor/skills/mysql-debug/SKILL.md +58 -22
  103. package/.cursor/skills/performance-doctor/SKILL.md +83 -89
  104. package/.cursor/skills/redis-cache/SKILL.md +134 -185
  105. package/.cursor/skills/scheduled-jobs/SKILL.md +187 -224
  106. package/.cursor/skills/security-guard/SKILL.md +168 -276
  107. package/.cursor/skills/skill-creator/LICENSE.txt +202 -0
  108. package/.cursor/skills/skill-creator/SKILL.md +479 -0
  109. package/.cursor/skills/skill-creator/agents/analyzer.md +274 -0
  110. package/.cursor/skills/skill-creator/agents/comparator.md +202 -0
  111. package/.cursor/skills/skill-creator/agents/grader.md +223 -0
  112. package/.cursor/skills/skill-creator/assets/eval_review.html +146 -0
  113. package/.cursor/skills/skill-creator/eval-viewer/generate_review.py +471 -0
  114. package/.cursor/skills/skill-creator/eval-viewer/viewer.html +1325 -0
  115. package/.cursor/skills/skill-creator/references/schemas.md +430 -0
  116. package/.cursor/skills/skill-creator/scripts/__init__.py +0 -0
  117. package/.cursor/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
  118. package/.cursor/skills/skill-creator/scripts/generate_report.py +326 -0
  119. package/.cursor/skills/skill-creator/scripts/improve_description.py +248 -0
  120. package/.cursor/skills/skill-creator/scripts/package_skill.py +136 -0
  121. package/.cursor/skills/skill-creator/scripts/quick_validate.py +103 -0
  122. package/.cursor/skills/skill-creator/scripts/run_eval.py +310 -0
  123. package/.cursor/skills/skill-creator/scripts/run_loop.py +332 -0
  124. package/.cursor/skills/skill-creator/scripts/utils.py +47 -0
  125. package/.cursor/skills/sms-mail/SKILL.md +266 -228
  126. package/.cursor/skills/social-login/SKILL.md +257 -195
  127. package/.cursor/skills/sync-back-merge/SKILL.md +66 -0
  128. package/.cursor/skills/tenant-management/SKILL.md +172 -188
  129. package/.cursor/skills/utils-toolkit/SKILL.md +214 -222
  130. package/.cursor/skills/websocket-sse/SKILL.md +251 -172
  131. package/.cursor/skills/workflow-engine/SKILL.md +178 -250
  132. package/.cursor/skills/yunxiao-task-management/SKILL.md +489 -0
  133. package/AGENTS.md +49 -540
  134. package/CLAUDE.md +73 -119
  135. package/README.md +37 -6
  136. package/bin/index.js +611 -25
  137. package/package.json +1 -1
  138. package/src/platform-map.json +4 -0
  139. package/src/skills/api-development/SKILL.md +179 -130
  140. package/src/skills/architecture-design/SKILL.md +102 -212
  141. package/src/skills/backend-annotations/SKILL.md +166 -220
  142. package/src/skills/bug-detective/SKILL.md +225 -186
  143. package/src/skills/code-patterns/SKILL.md +127 -244
  144. package/src/skills/codex-code-review/SKILL.md +261 -69
  145. package/src/skills/collaborating-with-codex/SKILL.md +96 -113
  146. package/src/skills/crud-development/SKILL.md +226 -307
  147. package/src/skills/data-permission/SKILL.md +131 -202
  148. package/src/skills/database-ops/SKILL.md +158 -355
  149. package/src/skills/error-handler/SKILL.md +224 -285
  150. package/src/skills/file-oss-management/SKILL.md +174 -169
  151. package/src/skills/git-workflow/SKILL.md +123 -341
  152. package/src/skills/json-serialization/SKILL.md +121 -137
  153. package/src/skills/leniu-report-customization/SKILL.md +82 -2
  154. package/src/skills/leniu-report-standard-customization/SKILL.md +65 -2
  155. package/src/skills/loki-log-query/SKILL.md +400 -0
  156. package/src/skills/loki-log-query/environments.json +45 -0
  157. package/src/skills/mysql-debug/SKILL.md +58 -22
  158. package/src/skills/performance-doctor/SKILL.md +83 -89
  159. package/src/skills/redis-cache/SKILL.md +134 -185
  160. package/src/skills/scheduled-jobs/SKILL.md +187 -224
  161. package/src/skills/security-guard/SKILL.md +168 -276
  162. package/src/skills/skill-creator/LICENSE.txt +202 -0
  163. package/src/skills/skill-creator/SKILL.md +479 -0
  164. package/src/skills/skill-creator/agents/analyzer.md +274 -0
  165. package/src/skills/skill-creator/agents/comparator.md +202 -0
  166. package/src/skills/skill-creator/agents/grader.md +223 -0
  167. package/src/skills/skill-creator/assets/eval_review.html +146 -0
  168. package/src/skills/skill-creator/eval-viewer/generate_review.py +471 -0
  169. package/src/skills/skill-creator/eval-viewer/viewer.html +1325 -0
  170. package/src/skills/skill-creator/references/schemas.md +430 -0
  171. package/src/skills/skill-creator/scripts/__init__.py +0 -0
  172. package/src/skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
  173. package/src/skills/skill-creator/scripts/generate_report.py +326 -0
  174. package/src/skills/skill-creator/scripts/improve_description.py +248 -0
  175. package/src/skills/skill-creator/scripts/package_skill.py +136 -0
  176. package/src/skills/skill-creator/scripts/quick_validate.py +103 -0
  177. package/src/skills/skill-creator/scripts/run_eval.py +310 -0
  178. package/src/skills/skill-creator/scripts/run_loop.py +332 -0
  179. package/src/skills/skill-creator/scripts/utils.py +47 -0
  180. package/src/skills/sms-mail/SKILL.md +266 -228
  181. package/src/skills/social-login/SKILL.md +257 -195
  182. package/src/skills/sync-back-merge/SKILL.md +66 -0
  183. package/src/skills/tenant-management/SKILL.md +172 -188
  184. package/src/skills/utils-toolkit/SKILL.md +214 -222
  185. package/src/skills/websocket-sse/SKILL.md +251 -172
  186. package/src/skills/workflow-engine/SKILL.md +178 -250
  187. package/src/skills/yunxiao-task-management/SKILL.md +489 -0
@@ -0,0 +1,401 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Aggregate individual run results into benchmark summary statistics.
4
+
5
+ Reads grading.json files from run directories and produces:
6
+ - run_summary with mean, stddev, min, max for each metric
7
+ - delta between with_skill and without_skill configurations
8
+
9
+ Usage:
10
+ python aggregate_benchmark.py <benchmark_dir>
11
+
12
+ Example:
13
+ python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/
14
+
15
+ The script supports two directory layouts:
16
+
17
+ Workspace layout (from skill-creator iterations):
18
+ <benchmark_dir>/
19
+ └── eval-N/
20
+ ├── with_skill/
21
+ │ ├── run-1/grading.json
22
+ │ └── run-2/grading.json
23
+ └── without_skill/
24
+ ├── run-1/grading.json
25
+ └── run-2/grading.json
26
+
27
+ Legacy layout (with runs/ subdirectory):
28
+ <benchmark_dir>/
29
+ └── runs/
30
+ └── eval-N/
31
+ ├── with_skill/
32
+ │ └── run-1/grading.json
33
+ └── without_skill/
34
+ └── run-1/grading.json
35
+ """
36
+
37
+ import argparse
38
+ import json
39
+ import math
40
+ import sys
41
+ from datetime import datetime, timezone
42
+ from pathlib import Path
43
+
44
+
45
+ def calculate_stats(values: list[float]) -> dict:
46
+ """Calculate mean, stddev, min, max for a list of values."""
47
+ if not values:
48
+ return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}
49
+
50
+ n = len(values)
51
+ mean = sum(values) / n
52
+
53
+ if n > 1:
54
+ variance = sum((x - mean) ** 2 for x in values) / (n - 1)
55
+ stddev = math.sqrt(variance)
56
+ else:
57
+ stddev = 0.0
58
+
59
+ return {
60
+ "mean": round(mean, 4),
61
+ "stddev": round(stddev, 4),
62
+ "min": round(min(values), 4),
63
+ "max": round(max(values), 4)
64
+ }
65
+
66
+
67
+ def load_run_results(benchmark_dir: Path) -> dict:
68
+ """
69
+ Load all run results from a benchmark directory.
70
+
71
+ Returns dict keyed by config name (e.g. "with_skill"/"without_skill",
72
+ or "new_skill"/"old_skill"), each containing a list of run results.
73
+ """
74
+ # Support both layouts: eval dirs directly under benchmark_dir, or under runs/
75
+ runs_dir = benchmark_dir / "runs"
76
+ if runs_dir.exists():
77
+ search_dir = runs_dir
78
+ elif list(benchmark_dir.glob("eval-*")):
79
+ search_dir = benchmark_dir
80
+ else:
81
+ print(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}")
82
+ return {}
83
+
84
+ results: dict[str, list] = {}
85
+
86
+ for eval_idx, eval_dir in enumerate(sorted(search_dir.glob("eval-*"))):
87
+ metadata_path = eval_dir / "eval_metadata.json"
88
+ if metadata_path.exists():
89
+ try:
90
+ with open(metadata_path) as mf:
91
+ eval_id = json.load(mf).get("eval_id", eval_idx)
92
+ except (json.JSONDecodeError, OSError):
93
+ eval_id = eval_idx
94
+ else:
95
+ try:
96
+ eval_id = int(eval_dir.name.split("-")[1])
97
+ except ValueError:
98
+ eval_id = eval_idx
99
+
100
+ # Discover config directories dynamically rather than hardcoding names
101
+ for config_dir in sorted(eval_dir.iterdir()):
102
+ if not config_dir.is_dir():
103
+ continue
104
+ # Skip non-config directories (inputs, outputs, etc.)
105
+ if not list(config_dir.glob("run-*")):
106
+ continue
107
+ config = config_dir.name
108
+ if config not in results:
109
+ results[config] = []
110
+
111
+ for run_dir in sorted(config_dir.glob("run-*")):
112
+ run_number = int(run_dir.name.split("-")[1])
113
+ grading_file = run_dir / "grading.json"
114
+
115
+ if not grading_file.exists():
116
+ print(f"Warning: grading.json not found in {run_dir}")
117
+ continue
118
+
119
+ try:
120
+ with open(grading_file) as f:
121
+ grading = json.load(f)
122
+ except json.JSONDecodeError as e:
123
+ print(f"Warning: Invalid JSON in {grading_file}: {e}")
124
+ continue
125
+
126
+ # Extract metrics
127
+ result = {
128
+ "eval_id": eval_id,
129
+ "run_number": run_number,
130
+ "pass_rate": grading.get("summary", {}).get("pass_rate", 0.0),
131
+ "passed": grading.get("summary", {}).get("passed", 0),
132
+ "failed": grading.get("summary", {}).get("failed", 0),
133
+ "total": grading.get("summary", {}).get("total", 0),
134
+ }
135
+
136
+ # Extract timing — check grading.json first, then sibling timing.json
137
+ timing = grading.get("timing", {})
138
+ result["time_seconds"] = timing.get("total_duration_seconds", 0.0)
139
+ timing_file = run_dir / "timing.json"
140
+ if result["time_seconds"] == 0.0 and timing_file.exists():
141
+ try:
142
+ with open(timing_file) as tf:
143
+ timing_data = json.load(tf)
144
+ result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0)
145
+ result["tokens"] = timing_data.get("total_tokens", 0)
146
+ except json.JSONDecodeError:
147
+ pass
148
+
149
+ # Extract metrics if available
150
+ metrics = grading.get("execution_metrics", {})
151
+ result["tool_calls"] = metrics.get("total_tool_calls", 0)
152
+ if not result.get("tokens"):
153
+ result["tokens"] = metrics.get("output_chars", 0)
154
+ result["errors"] = metrics.get("errors_encountered", 0)
155
+
156
+ # Extract expectations — viewer requires fields: text, passed, evidence
157
+ raw_expectations = grading.get("expectations", [])
158
+ for exp in raw_expectations:
159
+ if "text" not in exp or "passed" not in exp:
160
+ print(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}")
161
+ result["expectations"] = raw_expectations
162
+
163
+ # Extract notes from user_notes_summary
164
+ notes_summary = grading.get("user_notes_summary", {})
165
+ notes = []
166
+ notes.extend(notes_summary.get("uncertainties", []))
167
+ notes.extend(notes_summary.get("needs_review", []))
168
+ notes.extend(notes_summary.get("workarounds", []))
169
+ result["notes"] = notes
170
+
171
+ results[config].append(result)
172
+
173
+ return results
174
+
175
+
176
+ def aggregate_results(results: dict) -> dict:
177
+ """
178
+ Aggregate run results into summary statistics.
179
+
180
+ Returns run_summary with stats for each configuration and delta.
181
+ """
182
+ run_summary = {}
183
+ configs = list(results.keys())
184
+
185
+ for config in configs:
186
+ runs = results.get(config, [])
187
+
188
+ if not runs:
189
+ run_summary[config] = {
190
+ "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
191
+ "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
192
+ "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
193
+ }
194
+ continue
195
+
196
+ pass_rates = [r["pass_rate"] for r in runs]
197
+ times = [r["time_seconds"] for r in runs]
198
+ tokens = [r.get("tokens", 0) for r in runs]
199
+
200
+ run_summary[config] = {
201
+ "pass_rate": calculate_stats(pass_rates),
202
+ "time_seconds": calculate_stats(times),
203
+ "tokens": calculate_stats(tokens)
204
+ }
205
+
206
+ # Calculate delta between the first two configs (if two exist)
207
+ if len(configs) >= 2:
208
+ primary = run_summary.get(configs[0], {})
209
+ baseline = run_summary.get(configs[1], {})
210
+ else:
211
+ primary = run_summary.get(configs[0], {}) if configs else {}
212
+ baseline = {}
213
+
214
+ delta_pass_rate = primary.get("pass_rate", {}).get("mean", 0) - baseline.get("pass_rate", {}).get("mean", 0)
215
+ delta_time = primary.get("time_seconds", {}).get("mean", 0) - baseline.get("time_seconds", {}).get("mean", 0)
216
+ delta_tokens = primary.get("tokens", {}).get("mean", 0) - baseline.get("tokens", {}).get("mean", 0)
217
+
218
+ run_summary["delta"] = {
219
+ "pass_rate": f"{delta_pass_rate:+.2f}",
220
+ "time_seconds": f"{delta_time:+.1f}",
221
+ "tokens": f"{delta_tokens:+.0f}"
222
+ }
223
+
224
+ return run_summary
225
+
226
+
227
+ def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict:
228
+ """
229
+ Generate complete benchmark.json from run results.
230
+ """
231
+ results = load_run_results(benchmark_dir)
232
+ run_summary = aggregate_results(results)
233
+
234
+ # Build runs array for benchmark.json
235
+ runs = []
236
+ for config in results:
237
+ for result in results[config]:
238
+ runs.append({
239
+ "eval_id": result["eval_id"],
240
+ "configuration": config,
241
+ "run_number": result["run_number"],
242
+ "result": {
243
+ "pass_rate": result["pass_rate"],
244
+ "passed": result["passed"],
245
+ "failed": result["failed"],
246
+ "total": result["total"],
247
+ "time_seconds": result["time_seconds"],
248
+ "tokens": result.get("tokens", 0),
249
+ "tool_calls": result.get("tool_calls", 0),
250
+ "errors": result.get("errors", 0)
251
+ },
252
+ "expectations": result["expectations"],
253
+ "notes": result["notes"]
254
+ })
255
+
256
+ # Determine eval IDs from results
257
+ eval_ids = sorted(set(
258
+ r["eval_id"]
259
+ for config in results.values()
260
+ for r in config
261
+ ))
262
+
263
+ benchmark = {
264
+ "metadata": {
265
+ "skill_name": skill_name or "<skill-name>",
266
+ "skill_path": skill_path or "<path/to/skill>",
267
+ "executor_model": "<model-name>",
268
+ "analyzer_model": "<model-name>",
269
+ "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
270
+ "evals_run": eval_ids,
271
+ "runs_per_configuration": 3
272
+ },
273
+ "runs": runs,
274
+ "run_summary": run_summary,
275
+ "notes": [] # To be filled by analyzer
276
+ }
277
+
278
+ return benchmark
279
+
280
+
281
+ def generate_markdown(benchmark: dict) -> str:
282
+ """Generate human-readable benchmark.md from benchmark data."""
283
+ metadata = benchmark["metadata"]
284
+ run_summary = benchmark["run_summary"]
285
+
286
+ # Determine config names (excluding "delta")
287
+ configs = [k for k in run_summary if k != "delta"]
288
+ config_a = configs[0] if len(configs) >= 1 else "config_a"
289
+ config_b = configs[1] if len(configs) >= 2 else "config_b"
290
+ label_a = config_a.replace("_", " ").title()
291
+ label_b = config_b.replace("_", " ").title()
292
+
293
+ lines = [
294
+ f"# Skill Benchmark: {metadata['skill_name']}",
295
+ "",
296
+ f"**Model**: {metadata['executor_model']}",
297
+ f"**Date**: {metadata['timestamp']}",
298
+ f"**Evals**: {', '.join(map(str, metadata['evals_run']))} ({metadata['runs_per_configuration']} runs each per configuration)",
299
+ "",
300
+ "## Summary",
301
+ "",
302
+ f"| Metric | {label_a} | {label_b} | Delta |",
303
+ "|--------|------------|---------------|-------|",
304
+ ]
305
+
306
+ a_summary = run_summary.get(config_a, {})
307
+ b_summary = run_summary.get(config_b, {})
308
+ delta = run_summary.get("delta", {})
309
+
310
+ # Format pass rate
311
+ a_pr = a_summary.get("pass_rate", {})
312
+ b_pr = b_summary.get("pass_rate", {})
313
+ lines.append(f"| Pass Rate | {a_pr.get('mean', 0)*100:.0f}% ± {a_pr.get('stddev', 0)*100:.0f}% | {b_pr.get('mean', 0)*100:.0f}% ± {b_pr.get('stddev', 0)*100:.0f}% | {delta.get('pass_rate', '—')} |")
314
+
315
+ # Format time
316
+ a_time = a_summary.get("time_seconds", {})
317
+ b_time = b_summary.get("time_seconds", {})
318
+ lines.append(f"| Time | {a_time.get('mean', 0):.1f}s ± {a_time.get('stddev', 0):.1f}s | {b_time.get('mean', 0):.1f}s ± {b_time.get('stddev', 0):.1f}s | {delta.get('time_seconds', '—')}s |")
319
+
320
+ # Format tokens
321
+ a_tokens = a_summary.get("tokens", {})
322
+ b_tokens = b_summary.get("tokens", {})
323
+ lines.append(f"| Tokens | {a_tokens.get('mean', 0):.0f} ± {a_tokens.get('stddev', 0):.0f} | {b_tokens.get('mean', 0):.0f} ± {b_tokens.get('stddev', 0):.0f} | {delta.get('tokens', '—')} |")
324
+
325
+ # Notes section
326
+ if benchmark.get("notes"):
327
+ lines.extend([
328
+ "",
329
+ "## Notes",
330
+ ""
331
+ ])
332
+ for note in benchmark["notes"]:
333
+ lines.append(f"- {note}")
334
+
335
+ return "\n".join(lines)
336
+
337
+
338
+ def main():
339
+ parser = argparse.ArgumentParser(
340
+ description="Aggregate benchmark run results into summary statistics"
341
+ )
342
+ parser.add_argument(
343
+ "benchmark_dir",
344
+ type=Path,
345
+ help="Path to the benchmark directory"
346
+ )
347
+ parser.add_argument(
348
+ "--skill-name",
349
+ default="",
350
+ help="Name of the skill being benchmarked"
351
+ )
352
+ parser.add_argument(
353
+ "--skill-path",
354
+ default="",
355
+ help="Path to the skill being benchmarked"
356
+ )
357
+ parser.add_argument(
358
+ "--output", "-o",
359
+ type=Path,
360
+ help="Output path for benchmark.json (default: <benchmark_dir>/benchmark.json)"
361
+ )
362
+
363
+ args = parser.parse_args()
364
+
365
+ if not args.benchmark_dir.exists():
366
+ print(f"Directory not found: {args.benchmark_dir}")
367
+ sys.exit(1)
368
+
369
+ # Generate benchmark
370
+ benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path)
371
+
372
+ # Determine output paths
373
+ output_json = args.output or (args.benchmark_dir / "benchmark.json")
374
+ output_md = output_json.with_suffix(".md")
375
+
376
+ # Write benchmark.json
377
+ with open(output_json, "w") as f:
378
+ json.dump(benchmark, f, indent=2)
379
+ print(f"Generated: {output_json}")
380
+
381
+ # Write benchmark.md
382
+ markdown = generate_markdown(benchmark)
383
+ with open(output_md, "w") as f:
384
+ f.write(markdown)
385
+ print(f"Generated: {output_md}")
386
+
387
+ # Print summary
388
+ run_summary = benchmark["run_summary"]
389
+ configs = [k for k in run_summary if k != "delta"]
390
+ delta = run_summary.get("delta", {})
391
+
392
+ print(f"\nSummary:")
393
+ for config in configs:
394
+ pr = run_summary[config]["pass_rate"]["mean"]
395
+ label = config.replace("_", " ").title()
396
+ print(f" {label}: {pr*100:.1f}% pass rate")
397
+ print(f" Delta: {delta.get('pass_rate', '—')}")
398
+
399
+
400
+ if __name__ == "__main__":
401
+ main()