ai-engineering-init 1.7.0 → 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. package/.claude/agents/bug-analyzer.md +103 -0
  2. package/.claude/agents/code-reviewer.md +115 -5
  3. package/.claude/agents/image-reader.md +154 -0
  4. package/.claude/agents/loki-runner.md +80 -0
  5. package/.claude/agents/mysql-runner.md +81 -0
  6. package/.claude/agents/requirements-analyzer.md +162 -0
  7. package/.claude/agents/task-fetcher.md +75 -0
  8. package/.claude/commands/dev.md +29 -0
  9. package/.claude/commands/next.md +31 -1
  10. package/.claude/commands/progress.md +23 -1
  11. package/.claude/hooks/skill-forced-eval.js +46 -62
  12. package/.claude/settings.json +10 -1
  13. package/.claude/skills/api-development/SKILL.md +179 -130
  14. package/.claude/skills/architecture-design/SKILL.md +102 -212
  15. package/.claude/skills/backend-annotations/SKILL.md +166 -220
  16. package/.claude/skills/bug-detective/SKILL.md +225 -186
  17. package/.claude/skills/code-patterns/SKILL.md +127 -244
  18. package/.claude/skills/collaborating-with-codex/SKILL.md +96 -113
  19. package/.claude/skills/crud-development/SKILL.md +226 -307
  20. package/.claude/skills/data-permission/SKILL.md +131 -202
  21. package/.claude/skills/database-ops/SKILL.md +158 -355
  22. package/.claude/skills/error-handler/SKILL.md +224 -285
  23. package/.claude/skills/file-oss-management/SKILL.md +174 -169
  24. package/.claude/skills/git-workflow/SKILL.md +123 -341
  25. package/.claude/skills/json-serialization/SKILL.md +121 -137
  26. package/.claude/skills/performance-doctor/SKILL.md +83 -89
  27. package/.claude/skills/redis-cache/SKILL.md +134 -185
  28. package/.claude/skills/scheduled-jobs/SKILL.md +187 -224
  29. package/.claude/skills/security-guard/SKILL.md +168 -276
  30. package/.claude/skills/sms-mail/SKILL.md +266 -228
  31. package/.claude/skills/social-login/SKILL.md +257 -195
  32. package/.claude/skills/tenant-management/SKILL.md +172 -188
  33. package/.claude/skills/utils-toolkit/SKILL.md +214 -222
  34. package/.claude/skills/websocket-sse/SKILL.md +251 -172
  35. package/.claude/skills/workflow-engine/SKILL.md +178 -250
  36. package/.codex/skills/api-development/SKILL.md +179 -130
  37. package/.codex/skills/architecture-design/SKILL.md +102 -212
  38. package/.codex/skills/backend-annotations/SKILL.md +166 -220
  39. package/.codex/skills/bug-detective/SKILL.md +225 -186
  40. package/.codex/skills/code-patterns/SKILL.md +127 -244
  41. package/.codex/skills/collaborating-with-codex/SKILL.md +96 -113
  42. package/.codex/skills/crud-development/SKILL.md +226 -307
  43. package/.codex/skills/data-permission/SKILL.md +131 -202
  44. package/.codex/skills/database-ops/SKILL.md +158 -355
  45. package/.codex/skills/dev/SKILL.md +476 -131
  46. package/.codex/skills/error-handler/SKILL.md +224 -285
  47. package/.codex/skills/file-oss-management/SKILL.md +174 -169
  48. package/.codex/skills/git-workflow/SKILL.md +123 -341
  49. package/.codex/skills/json-serialization/SKILL.md +121 -137
  50. package/.codex/skills/next/SKILL.md +186 -42
  51. package/.codex/skills/performance-doctor/SKILL.md +83 -89
  52. package/.codex/skills/progress/SKILL.md +147 -76
  53. package/.codex/skills/redis-cache/SKILL.md +134 -185
  54. package/.codex/skills/scheduled-jobs/SKILL.md +187 -224
  55. package/.codex/skills/security-guard/SKILL.md +168 -276
  56. package/.codex/skills/sms-mail/SKILL.md +266 -228
  57. package/.codex/skills/social-login/SKILL.md +257 -195
  58. package/.codex/skills/tenant-management/SKILL.md +172 -188
  59. package/.codex/skills/utils-toolkit/SKILL.md +214 -222
  60. package/.codex/skills/websocket-sse/SKILL.md +251 -172
  61. package/.codex/skills/workflow-engine/SKILL.md +178 -250
  62. package/.cursor/agents/bug-analyzer.md +102 -0
  63. package/.cursor/agents/code-reviewer.md +80 -97
  64. package/.cursor/agents/image-reader.md +154 -0
  65. package/.cursor/agents/loki-runner.md +80 -0
  66. package/.cursor/agents/mysql-runner.md +81 -0
  67. package/.cursor/agents/project-manager.md +1 -1
  68. package/.cursor/agents/requirements-analyzer.md +141 -0
  69. package/.cursor/agents/task-fetcher.md +75 -0
  70. package/.cursor/hooks/cursor-skill-eval.js +66 -6
  71. package/.cursor/skills/api-development/SKILL.md +179 -130
  72. package/.cursor/skills/architecture-design/SKILL.md +102 -212
  73. package/.cursor/skills/backend-annotations/SKILL.md +166 -220
  74. package/.cursor/skills/bug-detective/SKILL.md +225 -186
  75. package/.cursor/skills/code-patterns/SKILL.md +127 -244
  76. package/.cursor/skills/collaborating-with-codex/SKILL.md +96 -113
  77. package/.cursor/skills/crud-development/SKILL.md +226 -307
  78. package/.cursor/skills/data-permission/SKILL.md +131 -202
  79. package/.cursor/skills/database-ops/SKILL.md +158 -355
  80. package/.cursor/skills/error-handler/SKILL.md +224 -285
  81. package/.cursor/skills/file-oss-management/SKILL.md +174 -169
  82. package/.cursor/skills/git-workflow/SKILL.md +123 -341
  83. package/.cursor/skills/json-serialization/SKILL.md +121 -137
  84. package/.cursor/skills/performance-doctor/SKILL.md +83 -89
  85. package/.cursor/skills/redis-cache/SKILL.md +134 -185
  86. package/.cursor/skills/scheduled-jobs/SKILL.md +187 -224
  87. package/.cursor/skills/security-guard/SKILL.md +168 -276
  88. package/.cursor/skills/sms-mail/SKILL.md +266 -228
  89. package/.cursor/skills/social-login/SKILL.md +257 -195
  90. package/.cursor/skills/tenant-management/SKILL.md +172 -188
  91. package/.cursor/skills/utils-toolkit/SKILL.md +214 -222
  92. package/.cursor/skills/websocket-sse/SKILL.md +251 -172
  93. package/.cursor/skills/workflow-engine/SKILL.md +178 -250
  94. package/AGENTS.md +117 -540
  95. package/CLAUDE.md +105 -117
  96. package/README.md +37 -6
  97. package/bin/index.js +5 -1
  98. package/package.json +1 -1
  99. package/src/skills/api-development/SKILL.md +179 -130
  100. package/src/skills/architecture-design/SKILL.md +102 -212
  101. package/src/skills/backend-annotations/SKILL.md +166 -220
  102. package/src/skills/bug-detective/SKILL.md +225 -186
  103. package/src/skills/code-patterns/SKILL.md +127 -244
  104. package/src/skills/collaborating-with-codex/SKILL.md +96 -113
  105. package/src/skills/crud-development/SKILL.md +226 -307
  106. package/src/skills/data-permission/SKILL.md +131 -202
  107. package/src/skills/database-ops/SKILL.md +158 -355
  108. package/src/skills/error-handler/SKILL.md +224 -285
  109. package/src/skills/file-oss-management/SKILL.md +174 -169
  110. package/src/skills/git-workflow/SKILL.md +123 -341
  111. package/src/skills/json-serialization/SKILL.md +121 -137
  112. package/src/skills/performance-doctor/SKILL.md +83 -89
  113. package/src/skills/redis-cache/SKILL.md +134 -185
  114. package/src/skills/scheduled-jobs/SKILL.md +187 -224
  115. package/src/skills/security-guard/SKILL.md +168 -276
  116. package/src/skills/sms-mail/SKILL.md +266 -228
  117. package/src/skills/social-login/SKILL.md +257 -195
  118. package/src/skills/tenant-management/SKILL.md +172 -188
  119. package/src/skills/utils-toolkit/SKILL.md +214 -222
  120. package/src/skills/websocket-sse/SKILL.md +251 -172
  121. package/src/skills/workflow-engine/SKILL.md +178 -250
  122. package/.claude/skills/skill-creator/LICENSE.txt +0 -202
  123. package/.claude/skills/skill-creator/SKILL.md +0 -479
  124. package/.claude/skills/skill-creator/agents/analyzer.md +0 -274
  125. package/.claude/skills/skill-creator/agents/comparator.md +0 -202
  126. package/.claude/skills/skill-creator/agents/grader.md +0 -223
  127. package/.claude/skills/skill-creator/assets/eval_review.html +0 -146
  128. package/.claude/skills/skill-creator/eval-viewer/generate_review.py +0 -471
  129. package/.claude/skills/skill-creator/eval-viewer/viewer.html +0 -1325
  130. package/.claude/skills/skill-creator/references/schemas.md +0 -430
  131. package/.claude/skills/skill-creator/scripts/__init__.py +0 -0
  132. package/.claude/skills/skill-creator/scripts/aggregate_benchmark.py +0 -401
  133. package/.claude/skills/skill-creator/scripts/generate_report.py +0 -326
  134. package/.claude/skills/skill-creator/scripts/improve_description.py +0 -248
  135. package/.claude/skills/skill-creator/scripts/package_skill.py +0 -136
  136. package/.claude/skills/skill-creator/scripts/quick_validate.py +0 -103
  137. package/.claude/skills/skill-creator/scripts/run_eval.py +0 -310
  138. package/.claude/skills/skill-creator/scripts/run_loop.py +0 -332
  139. package/.claude/skills/skill-creator/scripts/utils.py +0 -47
@@ -1,401 +0,0 @@
1
- #!/usr/bin/env python3
2
- """
3
- Aggregate individual run results into benchmark summary statistics.
4
-
5
- Reads grading.json files from run directories and produces:
6
- - run_summary with mean, stddev, min, max for each metric
7
- - delta between with_skill and without_skill configurations
8
-
9
- Usage:
10
- python aggregate_benchmark.py <benchmark_dir>
11
-
12
- Example:
13
- python aggregate_benchmark.py benchmarks/2026-01-15T10-30-00/
14
-
15
- The script supports two directory layouts:
16
-
17
- Workspace layout (from skill-creator iterations):
18
- <benchmark_dir>/
19
- └── eval-N/
20
- ├── with_skill/
21
- │ ├── run-1/grading.json
22
- │ └── run-2/grading.json
23
- └── without_skill/
24
- ├── run-1/grading.json
25
- └── run-2/grading.json
26
-
27
- Legacy layout (with runs/ subdirectory):
28
- <benchmark_dir>/
29
- └── runs/
30
- └── eval-N/
31
- ├── with_skill/
32
- │ └── run-1/grading.json
33
- └── without_skill/
34
- └── run-1/grading.json
35
- """
36
-
37
- import argparse
38
- import json
39
- import math
40
- import sys
41
- from datetime import datetime, timezone
42
- from pathlib import Path
43
-
44
-
45
def calculate_stats(values: list[float]) -> dict:
    """Summarise numeric values as mean, sample stddev, min and max.

    An empty input yields all-zero statistics.  The standard deviation uses
    the sample (n - 1) denominator and is 0.0 when only one value is given.
    Every figure is rounded to four decimal places.
    """
    if not values:
        return {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0}

    count = len(values)
    average = sum(values) / count

    # A single observation has no spread; otherwise use the sample variance.
    spread = 0.0
    if count > 1:
        squared_deviations = sum((value - average) ** 2 for value in values)
        spread = math.sqrt(squared_deviations / (count - 1))

    summary = {}
    summary["mean"] = round(average, 4)
    summary["stddev"] = round(spread, 4)
    summary["min"] = round(min(values), 4)
    summary["max"] = round(max(values), 4)
    return summary
65
-
66
-
67
def _read_json_object(path: Path):
    """Return the JSON object stored at *path*, or None (after a warning) if unusable."""
    try:
        # JSON files are UTF-8 by specification; do not depend on the locale.
        with open(path, encoding="utf-8") as handle:
            data = json.load(handle)
    except json.JSONDecodeError as e:
        print(f"Warning: Invalid JSON in {path}: {e}")
        return None
    except (OSError, UnicodeDecodeError) as e:
        print(f"Warning: Could not read {path}: {e}")
        return None
    if not isinstance(data, dict):
        print(f"Warning: Expected a JSON object in {path}")
        return None
    return data


def _index_from_name(name: str, fallback: int) -> int:
    """Return the integer following the first "-" in *name*, else *fallback*."""
    try:
        return int(name.split("-")[1])
    except (IndexError, ValueError):
        return fallback


def load_run_results(benchmark_dir: Path) -> dict:
    """
    Load all run results from a benchmark directory.

    Returns dict keyed by config name (e.g. "with_skill"/"without_skill",
    or "new_skill"/"old_skill"), each containing a list of run results.

    Each run result is a dict with the keys eval_id, run_number, pass_rate,
    passed, failed, total, time_seconds, tokens, tool_calls, errors,
    expectations and notes.  Runs whose grading.json is missing, unreadable
    or not a JSON object are skipped with a warning.  An empty dict is
    returned when no eval directories are found.
    """
    # Support both layouts: eval dirs directly under benchmark_dir, or under runs/
    runs_dir = benchmark_dir / "runs"
    if runs_dir.exists():
        search_dir = runs_dir
    elif list(benchmark_dir.glob("eval-*")):
        search_dir = benchmark_dir
    else:
        print(f"No eval directories found in {benchmark_dir} or {benchmark_dir / 'runs'}")
        return {}

    results: dict[str, list] = {}

    # Ignore stray files that merely match the eval-* pattern; iterating
    # them as directories would raise.
    eval_dirs = sorted(p for p in search_dir.glob("eval-*") if p.is_dir())

    for eval_idx, eval_dir in enumerate(eval_dirs):
        metadata_path = eval_dir / "eval_metadata.json"
        if metadata_path.exists():
            metadata = _read_json_object(metadata_path)
            eval_id = eval_idx if metadata is None else metadata.get("eval_id", eval_idx)
        else:
            eval_id = _index_from_name(eval_dir.name, eval_idx)

        # Discover config directories dynamically rather than hardcoding names
        for config_dir in sorted(eval_dir.iterdir()):
            if not config_dir.is_dir():
                continue
            # Skip non-config directories (inputs, outputs, etc.)
            run_dirs = sorted(config_dir.glob("run-*"))
            if not run_dirs:
                continue
            config_runs = results.setdefault(config_dir.name, [])

            for run_idx, run_dir in enumerate(run_dirs, start=1):
                # Fall back to the 1-based position when the directory name
                # carries no numeric suffix (e.g. "run-final").
                run_number = _index_from_name(run_dir.name, run_idx)
                grading_file = run_dir / "grading.json"

                if not grading_file.exists():
                    print(f"Warning: grading.json not found in {run_dir}")
                    continue

                grading = _read_json_object(grading_file)
                if grading is None:
                    continue

                # Extract metrics
                summary = grading.get("summary") or {}
                result = {
                    "eval_id": eval_id,
                    "run_number": run_number,
                    "pass_rate": summary.get("pass_rate", 0.0),
                    "passed": summary.get("passed", 0),
                    "failed": summary.get("failed", 0),
                    "total": summary.get("total", 0),
                }

                # Extract timing — check grading.json first, then sibling timing.json
                timing = grading.get("timing") or {}
                result["time_seconds"] = timing.get("total_duration_seconds", 0.0)
                timing_file = run_dir / "timing.json"
                if result["time_seconds"] == 0.0 and timing_file.exists():
                    timing_data = _read_json_object(timing_file)
                    if timing_data is not None:
                        result["time_seconds"] = timing_data.get("total_duration_seconds", 0.0)
                        result["tokens"] = timing_data.get("total_tokens", 0)

                # Extract metrics if available
                metrics = grading.get("execution_metrics") or {}
                result["tool_calls"] = metrics.get("total_tool_calls", 0)
                if not result.get("tokens"):
                    # No token count from timing.json: fall back to output_chars.
                    result["tokens"] = metrics.get("output_chars", 0)
                result["errors"] = metrics.get("errors_encountered", 0)

                # Extract expectations — viewer requires fields: text, passed, evidence
                raw_expectations = grading.get("expectations") or []
                for exp in raw_expectations:
                    if not isinstance(exp, dict) or any(
                        field not in exp for field in ("text", "passed", "evidence")
                    ):
                        print(f"Warning: expectation in {grading_file} missing required fields (text, passed, evidence): {exp}")
                result["expectations"] = raw_expectations

                # Extract notes from user_notes_summary
                notes_summary = grading.get("user_notes_summary") or {}
                notes = []
                notes.extend(notes_summary.get("uncertainties", []))
                notes.extend(notes_summary.get("needs_review", []))
                notes.extend(notes_summary.get("workarounds", []))
                result["notes"] = notes

                config_runs.append(result)

    return results
174
-
175
-
176
def aggregate_results(results: dict) -> dict:
    """
    Collapse per-run results into per-configuration summary statistics.

    The returned run_summary maps each configuration name to statistics for
    pass_rate, time_seconds and tokens, plus a "delta" entry holding the
    signed, formatted difference of means between the first and the second
    configuration (a missing configuration counts as zero).
    """
    config_names = list(results.keys())
    run_summary = {}

    for name in config_names:
        config_runs = results.get(name, [])

        if config_runs:
            pass_rates = [run["pass_rate"] for run in config_runs]
            durations = [run["time_seconds"] for run in config_runs]
            token_counts = [run.get("tokens", 0) for run in config_runs]
            run_summary[name] = {
                "pass_rate": calculate_stats(pass_rates),
                "time_seconds": calculate_stats(durations),
                "tokens": calculate_stats(token_counts)
            }
        else:
            # A configuration without any runs gets all-zero statistics.
            run_summary[name] = {
                "pass_rate": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "time_seconds": {"mean": 0.0, "stddev": 0.0, "min": 0.0, "max": 0.0},
                "tokens": {"mean": 0, "stddev": 0, "min": 0, "max": 0}
            }

    # The delta compares the first configuration against the second one.
    primary = run_summary.get(config_names[0], {}) if config_names else {}
    baseline = run_summary.get(config_names[1], {}) if len(config_names) >= 2 else {}

    def mean_of(summary, metric):
        return summary.get(metric, {}).get("mean", 0)

    pass_rate_gap = mean_of(primary, "pass_rate") - mean_of(baseline, "pass_rate")
    time_gap = mean_of(primary, "time_seconds") - mean_of(baseline, "time_seconds")
    token_gap = mean_of(primary, "tokens") - mean_of(baseline, "tokens")

    run_summary["delta"] = {
        "pass_rate": f"{pass_rate_gap:+.2f}",
        "time_seconds": f"{time_gap:+.1f}",
        "tokens": f"{token_gap:+.0f}"
    }

    return run_summary
225
-
226
-
227
def generate_benchmark(benchmark_dir: Path, skill_name: str = "", skill_path: str = "") -> dict:
    """
    Generate complete benchmark.json from run results.

    Loads every run under *benchmark_dir*, aggregates it, and returns a dict
    with the keys "metadata", "runs", "run_summary" and "notes".  Empty
    *skill_name* / *skill_path* are replaced by placeholder strings, and the
    model names are always placeholders.  "runs_per_configuration" is the
    largest number of runs found for any (configuration, eval) pair, or 0
    when no runs were loaded.
    """
    results = load_run_results(benchmark_dir)
    run_summary = aggregate_results(results)

    # Build runs array for benchmark.json
    runs = []
    # Count runs per (configuration, eval) so the metadata reflects the data
    # rather than an assumed constant.
    runs_per_pair: dict = {}
    for config, config_runs in results.items():
        for result in config_runs:
            pair = (config, result["eval_id"])
            runs_per_pair[pair] = runs_per_pair.get(pair, 0) + 1
            runs.append({
                "eval_id": result["eval_id"],
                "configuration": config,
                "run_number": result["run_number"],
                "result": {
                    "pass_rate": result["pass_rate"],
                    "passed": result["passed"],
                    "failed": result["failed"],
                    "total": result["total"],
                    "time_seconds": result["time_seconds"],
                    "tokens": result.get("tokens", 0),
                    "tool_calls": result.get("tool_calls", 0),
                    "errors": result.get("errors", 0)
                },
                "expectations": result["expectations"],
                "notes": result["notes"]
            })

    # Determine eval IDs from results.  IDs read from eval_metadata.json may
    # be strings while fallback IDs are ints, so sort numbers first and
    # compare everything else by its string form to avoid a TypeError.
    def _eval_id_order(eval_id):
        if isinstance(eval_id, (int, float)):
            return (0, eval_id, "")
        return (1, 0, str(eval_id))

    eval_ids = sorted({eval_id for _, eval_id in runs_per_pair}, key=_eval_id_order)

    benchmark = {
        "metadata": {
            "skill_name": skill_name or "<skill-name>",
            "skill_path": skill_path or "<path/to/skill>",
            "executor_model": "<model-name>",
            "analyzer_model": "<model-name>",
            "timestamp": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "evals_run": eval_ids,
            "runs_per_configuration": max(runs_per_pair.values(), default=0)
        },
        "runs": runs,
        "run_summary": run_summary,
        "notes": []  # To be filled by analyzer
    }

    return benchmark
279
-
280
-
281
def generate_markdown(benchmark: dict) -> str:
    """Render benchmark data as the human-readable benchmark.md text.

    The report has a title, model/date/evals header lines, a summary table
    comparing the first two configurations found in run_summary (placeholder
    names and zero values stand in for missing ones), and a Notes section
    when the benchmark carries any notes.
    """
    metadata = benchmark["metadata"]
    run_summary = benchmark["run_summary"]

    # Every key other than "delta" names a configuration.
    names = [key for key in run_summary if key != "delta"]
    config_a = names[0] if names else "config_a"
    config_b = names[1] if len(names) > 1 else "config_b"
    label_a = config_a.replace("_", " ").title()
    label_b = config_b.replace("_", " ").title()

    report = []
    report.append(f"# Skill Benchmark: {metadata['skill_name']}")
    report.append("")
    report.append(f"**Model**: {metadata['executor_model']}")
    report.append(f"**Date**: {metadata['timestamp']}")
    report.append(
        f"**Evals**: {', '.join(map(str, metadata['evals_run']))} ({metadata['runs_per_configuration']} runs each per configuration)"
    )
    report.append("")
    report.append("## Summary")
    report.append("")
    report.append(f"| Metric | {label_a} | {label_b} | Delta |")
    report.append("|--------|------------|---------------|-------|")

    summaries = (run_summary.get(config_a, {}), run_summary.get(config_b, {}))
    delta = run_summary.get("delta", {})

    def stat(summary, metric, field):
        # Missing metrics or fields are rendered as zero.
        return summary.get(metric, {}).get(field, 0)

    # Pass rate is stored as a fraction and shown as a percentage.
    pass_a, pass_b = [
        f"{stat(s, 'pass_rate', 'mean')*100:.0f}% ± {stat(s, 'pass_rate', 'stddev')*100:.0f}%"
        for s in summaries
    ]
    report.append(f"| Pass Rate | {pass_a} | {pass_b} | {delta.get('pass_rate', '—')} |")

    time_a, time_b = [
        f"{stat(s, 'time_seconds', 'mean'):.1f}s ± {stat(s, 'time_seconds', 'stddev'):.1f}s"
        for s in summaries
    ]
    report.append(f"| Time | {time_a} | {time_b} | {delta.get('time_seconds', '—')}s |")

    tokens_a, tokens_b = [
        f"{stat(s, 'tokens', 'mean'):.0f} ± {stat(s, 'tokens', 'stddev'):.0f}"
        for s in summaries
    ]
    report.append(f"| Tokens | {tokens_a} | {tokens_b} | {delta.get('tokens', '—')} |")

    # Notes section
    if benchmark.get("notes"):
        report += ["", "## Notes", ""]
        report += [f"- {note}" for note in benchmark["notes"]]

    return "\n".join(report)
336
-
337
-
338
def main():
    """Command-line entry point.

    Parses the arguments, builds the benchmark for the given directory,
    writes benchmark.json plus a sibling .md report, and prints a short
    pass-rate summary.  Exits with status 1 when the benchmark directory
    does not exist or is not a directory.
    """
    parser = argparse.ArgumentParser(
        description="Aggregate benchmark run results into summary statistics"
    )
    parser.add_argument(
        "benchmark_dir",
        type=Path,
        help="Path to the benchmark directory"
    )
    parser.add_argument(
        "--skill-name",
        default="",
        help="Name of the skill being benchmarked"
    )
    parser.add_argument(
        "--skill-path",
        default="",
        help="Path to the skill being benchmarked"
    )
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output path for benchmark.json (default: <benchmark_dir>/benchmark.json)"
    )

    args = parser.parse_args()

    # A regular file would pass exists() but cannot hold eval directories
    # or the default output files.
    if not args.benchmark_dir.is_dir():
        print(f"Directory not found: {args.benchmark_dir}")
        sys.exit(1)

    # Generate benchmark
    benchmark = generate_benchmark(args.benchmark_dir, args.skill_name, args.skill_path)

    # Determine output paths
    output_json = args.output or (args.benchmark_dir / "benchmark.json")
    output_md = output_json.with_suffix(".md")

    # Write benchmark.json
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(benchmark, f, indent=2)
    print(f"Generated: {output_json}")

    # Write benchmark.md — the report contains non-ASCII characters ("±",
    # "—"), so the encoding must not depend on the locale.
    markdown = generate_markdown(benchmark)
    with open(output_md, "w", encoding="utf-8") as f:
        f.write(markdown)
    print(f"Generated: {output_md}")

    # Print summary
    run_summary = benchmark["run_summary"]
    configs = [k for k in run_summary if k != "delta"]
    delta = run_summary.get("delta", {})

    print("\nSummary:")
    for config in configs:
        pr = run_summary[config]["pass_rate"]["mean"]
        label = config.replace("_", " ").title()
        print(f" {label}: {pr*100:.1f}% pass rate")
    print(f" Delta: {delta.get('pass_rate', '—')}")


if __name__ == "__main__":
    main()