codingbuddy-rules 4.5.0 → 5.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. package/.ai-rules/adapters/antigravity.md +6 -6
  2. package/.ai-rules/adapters/claude-code.md +68 -4
  3. package/.ai-rules/adapters/codex.md +5 -5
  4. package/.ai-rules/adapters/cursor.md +2 -2
  5. package/.ai-rules/adapters/kiro.md +8 -8
  6. package/.ai-rules/adapters/opencode.md +7 -7
  7. package/.ai-rules/adapters/q.md +2 -2
  8. package/.ai-rules/agents/README.md +66 -16
  9. package/.ai-rules/agents/accessibility-specialist.json +2 -1
  10. package/.ai-rules/agents/act-mode.json +2 -1
  11. package/.ai-rules/agents/agent-architect.json +8 -7
  12. package/.ai-rules/agents/ai-ml-engineer.json +1 -0
  13. package/.ai-rules/agents/architecture-specialist.json +1 -0
  14. package/.ai-rules/agents/auto-mode.json +4 -2
  15. package/.ai-rules/agents/backend-developer.json +1 -0
  16. package/.ai-rules/agents/code-quality-specialist.json +1 -0
  17. package/.ai-rules/agents/code-reviewer.json +65 -64
  18. package/.ai-rules/agents/data-engineer.json +8 -7
  19. package/.ai-rules/agents/data-scientist.json +10 -9
  20. package/.ai-rules/agents/devops-engineer.json +1 -0
  21. package/.ai-rules/agents/documentation-specialist.json +1 -0
  22. package/.ai-rules/agents/eval-mode.json +20 -19
  23. package/.ai-rules/agents/event-architecture-specialist.json +1 -0
  24. package/.ai-rules/agents/frontend-developer.json +1 -0
  25. package/.ai-rules/agents/i18n-specialist.json +2 -1
  26. package/.ai-rules/agents/integration-specialist.json +1 -0
  27. package/.ai-rules/agents/migration-specialist.json +1 -0
  28. package/.ai-rules/agents/mobile-developer.json +8 -7
  29. package/.ai-rules/agents/observability-specialist.json +1 -0
  30. package/.ai-rules/agents/parallel-orchestrator.json +346 -0
  31. package/.ai-rules/agents/performance-specialist.json +1 -0
  32. package/.ai-rules/agents/plan-mode.json +3 -1
  33. package/.ai-rules/agents/plan-reviewer.json +208 -0
  34. package/.ai-rules/agents/platform-engineer.json +1 -0
  35. package/.ai-rules/agents/security-engineer.json +9 -8
  36. package/.ai-rules/agents/security-specialist.json +2 -1
  37. package/.ai-rules/agents/seo-specialist.json +1 -0
  38. package/.ai-rules/agents/software-engineer.json +1 -0
  39. package/.ai-rules/agents/solution-architect.json +11 -10
  40. package/.ai-rules/agents/systems-developer.json +9 -8
  41. package/.ai-rules/agents/technical-planner.json +11 -10
  42. package/.ai-rules/agents/test-engineer.json +7 -6
  43. package/.ai-rules/agents/test-strategy-specialist.json +1 -0
  44. package/.ai-rules/agents/tooling-engineer.json +4 -3
  45. package/.ai-rules/agents/ui-ux-designer.json +1 -0
  46. package/.ai-rules/keyword-modes.json +4 -4
  47. package/.ai-rules/rules/clarification-guide.md +14 -14
  48. package/.ai-rules/rules/core.md +73 -0
  49. package/.ai-rules/rules/parallel-execution.md +217 -0
  50. package/.ai-rules/skills/README.md +23 -1
  51. package/.ai-rules/skills/agent-design/SKILL.md +5 -0
  52. package/.ai-rules/skills/agent-design/examples/agent-template.json +58 -0
  53. package/.ai-rules/skills/agent-design/references/expertise-guidelines.md +112 -0
  54. package/.ai-rules/skills/agent-discussion/SKILL.md +199 -0
  55. package/.ai-rules/skills/agent-discussion-panel/SKILL.md +448 -0
  56. package/.ai-rules/skills/api-design/SKILL.md +5 -0
  57. package/.ai-rules/skills/api-design/examples/error-response.json +159 -0
  58. package/.ai-rules/skills/api-design/examples/openapi-template.yaml +393 -0
  59. package/.ai-rules/skills/build-fix/SKILL.md +234 -0
  60. package/.ai-rules/skills/code-explanation/SKILL.md +4 -0
  61. package/.ai-rules/skills/context-management/SKILL.md +1 -0
  62. package/.ai-rules/skills/cost-budget/SKILL.md +348 -0
  63. package/.ai-rules/skills/cross-repo-issues/SKILL.md +257 -0
  64. package/.ai-rules/skills/database-migration/SKILL.md +1 -0
  65. package/.ai-rules/skills/deepsearch/SKILL.md +214 -0
  66. package/.ai-rules/skills/deployment-checklist/SKILL.md +1 -0
  67. package/.ai-rules/skills/error-analysis/SKILL.md +1 -0
  68. package/.ai-rules/skills/finishing-a-development-branch/SKILL.md +281 -0
  69. package/.ai-rules/skills/frontend-design/SKILL.md +5 -0
  70. package/.ai-rules/skills/frontend-design/examples/component-template.tsx +203 -0
  71. package/.ai-rules/skills/frontend-design/references/css-patterns.md +243 -0
  72. package/.ai-rules/skills/git-master/SKILL.md +358 -0
  73. package/.ai-rules/skills/incident-response/SKILL.md +1 -0
  74. package/.ai-rules/skills/legacy-modernization/SKILL.md +1 -0
  75. package/.ai-rules/skills/mcp-builder/SKILL.md +7 -0
  76. package/.ai-rules/skills/mcp-builder/examples/resource-example.ts +233 -0
  77. package/.ai-rules/skills/mcp-builder/examples/tool-example.ts +203 -0
  78. package/.ai-rules/skills/mcp-builder/references/protocol-spec.md +215 -0
  79. package/.ai-rules/skills/performance-optimization/SKILL.md +3 -0
  80. package/.ai-rules/skills/plan-and-review/SKILL.md +115 -0
  81. package/.ai-rules/skills/pr-all-in-one/SKILL.md +15 -13
  82. package/.ai-rules/skills/pr-all-in-one/configuration-guide.md +7 -7
  83. package/.ai-rules/skills/pr-all-in-one/pr-templates.md +10 -10
  84. package/.ai-rules/skills/pr-review/SKILL.md +4 -0
  85. package/.ai-rules/skills/receiving-code-review/SKILL.md +347 -0
  86. package/.ai-rules/skills/refactoring/SKILL.md +1 -0
  87. package/.ai-rules/skills/requesting-code-review/SKILL.md +348 -0
  88. package/.ai-rules/skills/rule-authoring/SKILL.md +5 -0
  89. package/.ai-rules/skills/rule-authoring/examples/rule-template.md +142 -0
  90. package/.ai-rules/skills/rule-authoring/examples/trigger-patterns.md +126 -0
  91. package/.ai-rules/skills/security-audit/SKILL.md +4 -0
  92. package/.ai-rules/skills/skill-creator/SKILL.md +461 -0
  93. package/.ai-rules/skills/skill-creator/agents/analyzer.md +206 -0
  94. package/.ai-rules/skills/skill-creator/agents/comparator.md +167 -0
  95. package/.ai-rules/skills/skill-creator/agents/grader.md +152 -0
  96. package/.ai-rules/skills/skill-creator/assets/eval_review.html +289 -0
  97. package/.ai-rules/skills/skill-creator/assets/skill-template.md +43 -0
  98. package/.ai-rules/skills/skill-creator/eval-viewer/generate_review.py +496 -0
  99. package/.ai-rules/skills/skill-creator/references/frontmatter-guide.md +632 -0
  100. package/.ai-rules/skills/skill-creator/references/multi-tool-compat.md +480 -0
  101. package/.ai-rules/skills/skill-creator/references/schemas.md +784 -0
  102. package/.ai-rules/skills/skill-creator/scripts/aggregate_benchmark.py +302 -0
  103. package/.ai-rules/skills/skill-creator/scripts/init_skill.sh +196 -0
  104. package/.ai-rules/skills/skill-creator/scripts/run_loop.py +327 -0
  105. package/.ai-rules/skills/systematic-debugging/SKILL.md +1 -0
  106. package/.ai-rules/skills/tech-debt/SKILL.md +1 -0
  107. package/.ai-rules/skills/test-coverage-gate/SKILL.md +303 -0
  108. package/.ai-rules/skills/tmux-master/SKILL.md +491 -0
  109. package/.ai-rules/skills/using-git-worktrees/SKILL.md +368 -0
  110. package/.ai-rules/skills/verification-before-completion/SKILL.md +234 -0
  111. package/.ai-rules/skills/widget-slot-architecture/SKILL.md +6 -0
  112. package/.ai-rules/skills/widget-slot-architecture/examples/parallel-route-setup.tsx +206 -0
  113. package/.ai-rules/skills/widget-slot-architecture/examples/widget-component.tsx +250 -0
  114. package/.ai-rules/skills/writing-plans/SKILL.md +78 -0
  115. package/bin/cli.js +178 -0
  116. package/lib/init/detect-stack.js +148 -0
  117. package/lib/init/generate-config.js +31 -0
  118. package/lib/init/index.js +86 -0
  119. package/lib/init/prompt.js +60 -0
  120. package/lib/init/scaffold.js +67 -0
  121. package/lib/init/suggest-agent.js +46 -0
  122. package/package.json +10 -2
@@ -0,0 +1,496 @@
1
+ #!/usr/bin/env python3
2
+ """Generate an HTML review page for skill-creator benchmark results.
3
+
4
+ Reads iteration workspace directories containing with_skill/ and baseline/
5
+ results, then produces a self-contained dark-mode HTML report with side-by-side
6
+ comparison, assertion pass/fail coloring, and feedback collection.
7
+
8
+ Usage:
9
+ python generate_review.py <workspace>/iteration-N --skill-name <name> \
10
+ --benchmark <workspace>/iteration-N/benchmark.json \
11
+ [--previous-workspace <path>] [--static <output.html>]
12
+
13
+ Requirements: Python 3.8+, standard library only.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import argparse
19
+ import html
20
+ import json
21
+ import sys
22
+ from pathlib import Path
23
+ from typing import Any, Dict, List, Optional
24
+
25
+
26
def load_json(path: Path) -> Any:
    """Load and parse a UTF-8 encoded JSON file.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON value, or ``None`` when the file cannot be opened,
        is not valid UTF-8, or does not contain valid JSON. A file whose
        content is the JSON literal ``null`` also yields ``None``.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    # ValueError is the common base of json.JSONDecodeError and
    # UnicodeDecodeError, so a file with invalid UTF-8 bytes is treated as
    # unreadable instead of aborting the whole report.
    except (OSError, ValueError):
        return None
33
+
34
+
35
def discover_results(iteration_dir: Path) -> Dict[str, Any]:
    """Collect scenario records for both variants of an iteration.

    Every ``*.json`` file directly inside ``iteration_dir/with_skill`` and
    ``iteration_dir/baseline`` is read in sorted path order via ``load_json``.

    Args:
        iteration_dir: Directory holding the variant sub-directories.

    Returns:
        A dict with the keys ``"with_skill"`` and ``"baseline"``, each mapping
        to a list of the records found. A missing variant directory yields an
        empty list for that key.
    """
    variants = ("with_skill", "baseline")
    found: Dict[str, Any] = {variant: [] for variant in variants}

    for variant in variants:
        source_dir = iteration_dir / variant
        if source_dir.is_dir():
            for result_path in sorted(source_dir.glob("*.json")):
                payload = load_json(result_path)
                # A file may hold a list of scenarios or one scenario object;
                # anything else (including an unreadable file) is skipped.
                if isinstance(payload, list):
                    found[variant] += payload
                elif isinstance(payload, dict):
                    found[variant].append(payload)

    return found
54
+
55
+
56
def count_assertions(scenarios: List[Dict]) -> Dict[str, int]:
    """Count passed, failed and total assertions across scenarios.

    Args:
        scenarios: Scenario dicts; each may carry an ``"assertions"`` list of
            dicts with a ``"passed"`` flag.

    Returns:
        A dict with the integer keys ``"pass"``, ``"fail"`` and ``"total"``.
        An assertion without a truthy ``"passed"`` value counts as a failure.
    """
    counts = {"pass": 0, "fail": 0, "total": 0}
    for scenario in scenarios:
        # ``or []`` also covers an explicit ``"assertions": null`` in the JSON,
        # which ``.get(key, [])`` would hand back as None and fail to iterate.
        for assertion in scenario.get("assertions") or []:
            counts["total"] += 1
            outcome = "pass" if assertion.get("passed", False) else "fail"
            counts[outcome] += 1
    return counts
67
+
68
+
69
def fmt_float(val: Any, decimals: int = 2) -> str:
    """Format a value as a fixed-point number.

    Args:
        val: Value to format; anything accepted by ``float()``.
        decimals: Number of digits after the decimal point.

    Returns:
        The formatted number, or an em dash (U+2014) when ``val`` is ``None``
        or cannot be converted to a float.
    """
    if val is None:
        return "\u2014"
    try:
        return f"{float(val):.{decimals}f}"
    # OverflowError: JSON integers are unbounded in Python, and float() raises
    # it for values too large to represent.
    except (TypeError, ValueError, OverflowError):
        return "\u2014"
77
+
78
+
79
def build_summary_section(benchmark: Optional[Dict]) -> str:
    """Render the summary cards from ``benchmark.json`` data.

    Reads ``pass_rate``, ``tokens`` and ``time`` (the latter two with ``mean``
    and ``stddev``) from the ``with_skill`` entry and shows the matching
    ``baseline`` figures as sub-text.

    Args:
        benchmark: Parsed benchmark data, or ``None`` / empty when unavailable.

    Returns:
        The HTML for the summary section, or an empty string when there is no
        benchmark data or none of the with-skill statistics are present.
    """
    if not benchmark:
        return ""

    def stat_card(label: str, value: str, sub: str = "") -> str:
        # All three text fields are HTML-escaped before being embedded.
        pieces = [
            '<div class="summary-card">',
            f'<div class="summary-label">{html.escape(label)}</div>',
            f'<div class="summary-value">{html.escape(value)}</div>',
        ]
        if sub:
            pieces.append(f'<div class="summary-sub">{html.escape(sub)}</div>')
        pieces.append("</div>")
        return "".join(pieces)

    skill_stats = benchmark.get("with_skill", {})
    base_stats = benchmark.get("baseline", {})
    cards = []

    # Pass rate card.
    skill_rate = skill_stats.get("pass_rate")
    base_rate = base_stats.get("pass_rate")
    if skill_rate is not None:
        cards.append(
            stat_card(
                "Pass Rate (with skill)",
                f"{fmt_float(skill_rate, 1)}%",
                f"baseline: {fmt_float(base_rate, 1)}%",
            )
        )

    # Token and time cards share one layout; only precision and unit differ.
    metric_specs = (
        ("tokens", "Tokens (with skill)", 0, ""),
        ("time", "Time (with skill)", 2, "s"),
    )
    for key, title, decimals, unit in metric_specs:
        skill_metric = skill_stats.get(key, {})
        base_metric = base_stats.get(key, {})
        if skill_metric.get("mean") is None:
            continue
        mean_text = fmt_float(skill_metric.get("mean"), decimals)
        spread_text = fmt_float(skill_metric.get("stddev"), decimals)
        base_text = fmt_float(base_metric.get("mean"), decimals)
        cards.append(
            stat_card(
                title,
                f"{mean_text}{unit}",
                f"\u00b1{spread_text}{unit} | baseline: {base_text}{unit}",
            )
        )

    if not cards:
        return ""

    opening = '<div class="summary-section"><h2>Summary</h2><div class="summary-grid">'
    return opening + "".join(cards) + "</div></div>"
133
+
134
+
135
def build_scenario_rows(scenarios: List[Dict]) -> str:
    """Render the ``<tr>`` rows of a result table.

    Each scenario produces one row per assertion. The scenario name cell
    spans all of that scenario's rows and, when present, shows the token count
    and duration beneath the name. Assertion evidence is rendered inside a
    collapsible ``<details>`` element.

    Args:
        scenarios: Scenario dicts with optional ``name``/``query``,
            ``tokens``, ``time`` and ``assertions`` entries.

    Returns:
        Newline-joined HTML rows; a single placeholder row when ``scenarios``
        is empty.
    """

    def esc(value: Any, fallback: str = "") -> str:
        # Result files are free-form JSON: coerce non-string values before
        # escaping (html.escape only accepts str) and map null to the fallback.
        return html.escape(fallback if value is None else str(value))

    # The table header has four columns (Scenario, Assertion, Result, Detail),
    # so placeholder cells must span 4, or 3 next to a scenario name cell.
    if not scenarios:
        return '<tr><td colspan="4" style="text-align:center;color:var(--text-dim)">No scenarios found</td></tr>'

    rows = []
    for scenario in scenarios:
        name = esc(scenario.get("name", scenario.get("query", "Unknown")), "Unknown")
        # ``or []`` also covers an explicit ``"assertions": null``.
        assertions = scenario.get("assertions") or []

        meta_parts = []
        tokens = scenario.get("tokens")
        if tokens is not None:
            meta_parts.append(f"{tokens} tok")
        time_s = scenario.get("time")
        if time_s is not None:
            meta_parts.append(f"{fmt_float(time_s)}s")
        meta_str = " | ".join(meta_parts)

        if not assertions:
            rows.append(
                f'<tr><td>{name}</td>'
                '<td colspan="3" style="color:var(--text-dim)">No assertions</td></tr>'
            )
            continue

        meta_html = (
            f"<br><span class=meta-inline>{html.escape(meta_str)}</span>"
            if meta_str
            else ""
        )

        for i, assertion in enumerate(assertions):
            label = esc(
                assertion.get("description", assertion.get("name", f"assertion-{i}")),
                f"assertion-{i}",
            )
            passed = assertion.get("passed", False)
            status_cls = "pass" if passed else "fail"
            status_text = "PASS" if passed else "FAIL"
            detail = esc(assertion.get("detail", assertion.get("message", "")))

            # Collapsible evidence
            evidence = assertion.get("evidence", "")
            evidence_html = ""
            if evidence:
                evidence_html = (
                    '<details class="evidence"><summary>Evidence</summary>'
                    f"<pre>{html.escape(str(evidence))}</pre></details>"
                )

            # Only the first row carries the scenario cell; its rowspan covers
            # the remaining assertion rows.
            if i == 0:
                scenario_cell = f'<td rowspan="{len(assertions)}">{name}{meta_html}</td>'
            else:
                scenario_cell = ""

            rows.append(
                f'<tr>{scenario_cell}'
                f'<td>{label}</td>'
                f'<td class="status-{status_cls}">{status_text}</td>'
                f'<td class="detail">{detail}{evidence_html}</td></tr>'
            )

    return "\n".join(rows)
191
+
192
+
193
def build_comparison_section(
    with_skill: List[Dict], baseline: List[Dict]
) -> str:
    """Render the two-panel "With Skill" versus "Baseline" comparison.

    Each panel shows the pass/fail/total assertion counts followed by a result
    table whose rows come from ``build_scenario_rows``.

    Args:
        with_skill: Scenario records produced with the skill enabled.
        baseline: Scenario records produced without the skill.

    Returns:
        The HTML for the comparison grid.
    """

    def render_panel(title: str, scenarios: List[Dict]) -> str:
        # Both panels share this markup; only the heading and data differ.
        tally = count_assertions(scenarios)
        return "\n".join([
            '<div class="comp-panel">',
            f"<h3>{title}</h3>",
            '<div class="comp-stats">',
            f'<span class="stat-pass">{tally["pass"]} pass</span>',
            f'<span class="stat-fail">{tally["fail"]} fail</span>',
            f'<span class="stat-total">{tally["total"]} total</span>',
            "</div>",
            '<table class="result-table">',
            "<thead><tr><th>Scenario</th><th>Assertion</th><th>Result</th><th>Detail</th></tr></thead>",
            f"<tbody>{build_scenario_rows(scenarios)}</tbody>",
            "</table>",
            "</div>",
        ])

    return "\n".join([
        "",
        '<div class="comparison">',
        render_panel("With Skill", with_skill),
        render_panel("Baseline", baseline),
        "</div>",
    ])
227
+
228
+
229
def build_previous_comparison(
    current_dir: Path,
    previous_dir: Path,
    current_benchmark: Optional[Dict] = None,
    previous_benchmark: Optional[Dict] = None,
) -> str:
    """Render the delta cards comparing this iteration with the previous one.

    Pass and fail deltas are computed from the ``with_skill`` assertion counts
    found in each directory. A pass-rate card is added only when both
    benchmark dicts provide ``with_skill.pass_rate``.

    Args:
        current_dir: Iteration directory being reviewed.
        previous_dir: Iteration directory to compare against.
        current_benchmark: Parsed benchmark data for the current iteration.
        previous_benchmark: Parsed benchmark data for the previous iteration.

    Returns:
        The HTML for the delta section.
    """

    def delta_card(label: str, css_class: str, value: str, detail: str) -> str:
        # One statistic card: heading, coloured delta, "previous -> current".
        return "\n".join([
            '<div class="delta-card">',
            f'<div class="delta-label">{label}</div>',
            f'<div class="delta-value {css_class}">{value}</div>',
            f'<div class="delta-detail">{detail}</div>',
            "</div>",
        ])

    curr = discover_results(current_dir)
    prev = discover_results(previous_dir)

    curr_ws = count_assertions(curr["with_skill"])
    prev_ws = count_assertions(prev["with_skill"])

    pass_delta = curr_ws["pass"] - prev_ws["pass"]
    fail_delta = curr_ws["fail"] - prev_ws["fail"]

    pass_sign = "+" if pass_delta >= 0 else ""
    fail_sign = "+" if fail_delta >= 0 else ""
    pass_cls = "delta-positive" if pass_delta >= 0 else "delta-negative"
    # Only an actual increase in failures is a regression. An unchanged count
    # is styled like an unchanged pass count instead of being flagged red.
    fail_cls = "delta-negative" if fail_delta > 0 else "delta-positive"

    # Pass rate delta from benchmark data
    rate_html = ""
    if current_benchmark and previous_benchmark:
        curr_rate = current_benchmark.get("with_skill", {}).get("pass_rate")
        prev_rate = previous_benchmark.get("with_skill", {}).get("pass_rate")
        if curr_rate is not None and prev_rate is not None:
            rate_delta = curr_rate - prev_rate
            rate_sign = "+" if rate_delta >= 0 else ""
            rate_cls = "delta-positive" if rate_delta >= 0 else "delta-negative"
            rate_html = "\n" + delta_card(
                "Pass Rate",
                rate_cls,
                f"{rate_sign}{fmt_float(rate_delta, 1)}%",
                f"{fmt_float(prev_rate, 1)}% &rarr; {fmt_float(curr_rate, 1)}%",
            )

    return "\n".join([
        "",
        '<div class="delta-section">',
        "<h2>Delta vs Previous Iteration</h2>",
        '<div class="delta-stats">',
        rate_html,
        delta_card(
            "Pass",
            pass_cls,
            f"{pass_sign}{pass_delta}",
            f"{prev_ws['pass']} &rarr; {curr_ws['pass']}",
        ),
        delta_card(
            "Fail",
            fail_cls,
            f"{fail_sign}{fail_delta}",
            f"{prev_ws['fail']} &rarr; {curr_ws['fail']}",
        ),
        "</div>",
        "</div>",
    ])
283
+
284
+
285
def generate_html(
    skill_name: str,
    iteration_dir: Path,
    benchmark: Optional[Dict] = None,
    previous_dir: Optional[Path] = None,
    previous_benchmark: Optional[Dict] = None,
) -> str:
    """Generate the complete, self-contained HTML review page.

    The page contains the benchmark summary, an optional delta section against
    a previous iteration, the side-by-side comparison tables, and a feedback
    box whose content can be downloaded as JSON from the browser.

    Args:
        skill_name: Name of the skill shown in the title and feedback JSON.
        iteration_dir: Iteration directory whose results are rendered.
        benchmark: Parsed benchmark statistics for this iteration, if any.
        previous_dir: Previous iteration directory; the delta section is
            rendered only when this is an existing directory.
        previous_benchmark: Parsed benchmark statistics for the previous
            iteration, if any.

    Returns:
        The full HTML document as a string.
    """

    def js_literal(value: Any) -> str:
        # json.dumps yields a valid JavaScript literal but leaves "<" intact,
        # so a value containing "</script>" would end the inline script early.
        # Unicode escapes keep the value identical once the script is parsed.
        return (
            json.dumps(value)
            .replace("<", "\\u003c")
            .replace(">", "\\u003e")
            .replace("&", "\\u0026")
        )

    results = discover_results(iteration_dir)
    iteration_name = iteration_dir.name

    summary_html = build_summary_section(benchmark)

    comparison_html = build_comparison_section(
        results["with_skill"], results["baseline"]
    )

    delta_html = ""
    if previous_dir and previous_dir.is_dir():
        delta_html = build_previous_comparison(
            iteration_dir, previous_dir, benchmark, previous_benchmark
        )

    # Values embedded in HTML text are HTML-escaped; values embedded in the
    # inline script go through js_literal.
    title_html = html.escape(skill_name)
    iteration_html = html.escape(iteration_name)
    source_html = html.escape(str(iteration_dir))
    skill_js = js_literal(skill_name)
    iteration_js = js_literal(iteration_name)
    with_skill_js = js_literal(count_assertions(results["with_skill"]))
    baseline_js = js_literal(count_assertions(results["baseline"]))

    return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Benchmark Review \u2014 {title_html}</title>
<style>
*,*::before,*::after{{box-sizing:border-box;margin:0;padding:0}}
:root{{
--bg:#0d1117;--surface:#161b22;--border:#30363d;--text:#e6edf3;--text-dim:#8b949e;
--pass:#238636;--pass-bg:rgba(35,134,54,.15);
--fail:#da3633;--fail-bg:rgba(218,54,51,.15);
--accent:#58a6ff;--accent-hover:#79c0ff;
--radius:8px;
--font-mono:'SF Mono','Cascadia Code','Fira Code',monospace;
--font-sans:-apple-system,BlinkMacSystemFont,'Segoe UI',Helvetica,Arial,sans-serif;
}}
body{{background:var(--bg);color:var(--text);font-family:var(--font-sans);line-height:1.6;padding:2rem;max-width:1400px;margin:0 auto}}
h1{{font-size:1.6rem;font-weight:700;margin-bottom:.25rem}}
h2{{font-size:1.2rem;font-weight:600;margin:2rem 0 1rem;padding-bottom:.5rem;border-bottom:1px solid var(--border)}}
h3{{font-size:1rem;font-weight:600;margin-bottom:.75rem}}
.subtitle{{color:var(--text-dim);margin-bottom:1.5rem;font-size:.9rem}}

.summary-section{{margin-bottom:2rem}}
.summary-grid{{display:flex;gap:1rem;flex-wrap:wrap}}
.summary-card{{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:1rem 1.5rem;min-width:180px}}
.summary-label{{font-size:.75rem;color:var(--text-dim);text-transform:uppercase;letter-spacing:.05em}}
.summary-value{{font-size:1.4rem;font-weight:700;font-family:var(--font-mono);margin:.25rem 0}}
.summary-sub{{font-size:.8rem;color:var(--text-dim)}}

.comparison{{display:grid;grid-template-columns:1fr 1fr;gap:1.5rem;margin-bottom:2rem}}
@media(max-width:900px){{.comparison{{grid-template-columns:1fr}}}}
.comp-panel{{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:1.25rem}}
.comp-stats{{display:flex;gap:1rem;margin-bottom:1rem;font-family:var(--font-mono);font-size:.85rem}}
.stat-pass{{color:var(--pass)}}
.stat-fail{{color:var(--fail)}}
.stat-total{{color:var(--text-dim)}}

.result-table{{width:100%;border-collapse:collapse;font-size:.85rem}}
.result-table th{{text-align:left;padding:.5rem;border-bottom:2px solid var(--border);color:var(--text-dim);font-weight:600}}
.result-table td{{padding:.5rem;border-bottom:1px solid var(--border);vertical-align:top}}
.result-table tr:last-child td{{border-bottom:none}}
.status-pass{{color:var(--pass);font-weight:700;font-family:var(--font-mono)}}
.status-fail{{color:var(--fail);font-weight:700;font-family:var(--font-mono)}}
.detail{{color:var(--text-dim);font-size:.8rem}}
.meta-inline{{font-size:.75rem;color:var(--text-dim);font-family:var(--font-mono)}}

details.evidence{{margin-top:.4rem}}
details.evidence summary{{cursor:pointer;color:var(--accent);font-size:.8rem}}
details.evidence summary:hover{{color:var(--accent-hover)}}
details.evidence pre{{background:var(--bg);border:1px solid var(--border);border-radius:4px;padding:.5rem;margin-top:.3rem;font-size:.75rem;overflow-x:auto;white-space:pre-wrap;word-break:break-word}}

.delta-section{{margin-bottom:2rem}}
.delta-stats{{display:flex;gap:1rem;flex-wrap:wrap}}
.delta-card{{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:1rem 1.5rem;min-width:140px}}
.delta-label{{font-size:.75rem;color:var(--text-dim);text-transform:uppercase;letter-spacing:.05em}}
.delta-value{{font-size:1.6rem;font-weight:700;font-family:var(--font-mono)}}
.delta-detail{{font-size:.8rem;color:var(--text-dim);margin-top:.25rem}}
.delta-positive{{color:var(--pass)}}
.delta-negative{{color:var(--fail)}}

.feedback-section{{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:1.5rem;margin-top:2rem}}
.feedback-section h2{{margin-top:0;border-bottom:none;padding-bottom:0}}
.feedback-section textarea{{width:100%;min-height:120px;background:var(--bg);color:var(--text);border:1px solid var(--border);border-radius:6px;padding:.75rem;font-family:var(--font-sans);font-size:.9rem;resize:vertical;margin:1rem 0}}
.btn{{background:var(--surface);color:var(--text);border:1px solid var(--border);border-radius:6px;padding:.5rem 1.25rem;cursor:pointer;font-size:.85rem;font-family:var(--font-sans);transition:border-color .15s,background .15s}}
.btn:hover{{border-color:var(--accent);background:#1c2129}}
.btn-primary{{background:var(--accent);color:#000;border-color:var(--accent);font-weight:600}}
.btn-primary:hover{{background:var(--accent-hover)}}
.btn-group{{display:flex;gap:.5rem}}
</style>
</head>
<body>

<h1>Benchmark Review \u2014 {title_html}</h1>
<p class="subtitle">Iteration: {iteration_html} | Generated from: {source_html}</p>

{summary_html}
{delta_html}

<h2>Side-by-Side Comparison</h2>
{comparison_html}

<div class="feedback-section">
<h2>Feedback</h2>
<p style="color:var(--text-dim);font-size:.85rem">Add notes about this iteration's results. Download as JSON for the next improvement cycle.</p>
<textarea id="feedbackText" placeholder="What worked well? What needs improvement? Which assertions need attention?"></textarea>
<div class="btn-group">
<button class="btn btn-primary" onclick="downloadFeedback()">Download Feedback JSON</button>
</div>
</div>

<script>
(function(){{
"use strict";
window.downloadFeedback = function(){{
var feedback = {{
skill_name: {skill_js},
iteration: {iteration_js},
timestamp: new Date().toISOString(),
feedback: document.getElementById("feedbackText").value,
results_summary: {{
with_skill: {with_skill_js},
baseline: {baseline_js}
}}
}};
var blob = new Blob([JSON.stringify(feedback, null, 2)], {{type:"application/json"}});
var a = document.createElement("a");
a.href = URL.createObjectURL(blob);
a.download = "feedback-" + {iteration_js} + ".json";
a.click();
URL.revokeObjectURL(a.href);
}};
}})();
</script>
</body>
</html>"""
424
+
425
+
426
def main() -> int:
    """Parse the command line and write the HTML review page.

    The page goes to the ``--static`` path when given, otherwise to stdout.
    Problems with optional inputs are reported on stderr as warnings and the
    report is still generated without them.

    Returns:
        Process exit code: ``0`` on success, ``1`` when the iteration
        directory does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Generate HTML review for skill-creator benchmark results."
    )
    parser.add_argument(
        "iteration_dir",
        type=Path,
        help="Path to iteration directory (e.g. workspace/iteration-1)",
    )
    parser.add_argument(
        "--skill-name",
        required=True,
        help="Name of the skill being benchmarked",
    )
    parser.add_argument(
        "--benchmark",
        type=Path,
        default=None,
        help="Path to benchmark.json with statistics (pass_rate, tokens, time)",
    )
    parser.add_argument(
        "--previous-workspace",
        type=Path,
        default=None,
        help="Path to previous iteration directory for delta comparison",
    )
    parser.add_argument(
        "--static",
        type=Path,
        default=None,
        help="Output path for static HTML file (default: stdout)",
    )

    args = parser.parse_args()

    if not args.iteration_dir.is_dir():
        print(f"Error: {args.iteration_dir} is not a directory", file=sys.stderr)
        return 1

    benchmark = None
    if args.benchmark:
        benchmark = load_json(args.benchmark)
        if benchmark is None:
            print(f"Warning: could not load {args.benchmark}", file=sys.stderr)
        elif not isinstance(benchmark, dict):
            # The renderers call .get() on the benchmark, so any other JSON
            # type would fail later with an unhelpful AttributeError.
            print(
                f"Warning: {args.benchmark} is not a JSON object; ignoring it",
                file=sys.stderr,
            )
            benchmark = None

    previous_benchmark = None
    if args.previous_workspace:
        if args.previous_workspace.is_dir():
            loaded = load_json(args.previous_workspace / "benchmark.json")
            # A missing or malformed previous benchmark only drops the
            # pass-rate card; the pass/fail delta cards are still rendered.
            previous_benchmark = loaded if isinstance(loaded, dict) else None
        else:
            print(
                f"Warning: {args.previous_workspace} is not a directory; "
                "skipping delta comparison",
                file=sys.stderr,
            )

    html_content = generate_html(
        skill_name=args.skill_name,
        iteration_dir=args.iteration_dir,
        benchmark=benchmark,
        previous_dir=args.previous_workspace,
        previous_benchmark=previous_benchmark,
    )

    if args.static:
        args.static.parent.mkdir(parents=True, exist_ok=True)
        with open(args.static, "w", encoding="utf-8") as f:
            f.write(html_content)
        print(f"Written to {args.static}", file=sys.stderr)
    else:
        print(html_content)

    return 0
493
+
494
+
495
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())