codingbuddy-rules 4.4.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.ai-rules/adapters/antigravity.md +6 -6
- package/.ai-rules/adapters/claude-code.md +107 -4
- package/.ai-rules/adapters/codex.md +5 -5
- package/.ai-rules/adapters/cursor.md +2 -2
- package/.ai-rules/adapters/kiro.md +8 -8
- package/.ai-rules/adapters/opencode.md +7 -7
- package/.ai-rules/adapters/q.md +2 -2
- package/.ai-rules/agents/README.md +66 -16
- package/.ai-rules/agents/accessibility-specialist.json +2 -1
- package/.ai-rules/agents/act-mode.json +2 -1
- package/.ai-rules/agents/agent-architect.json +8 -7
- package/.ai-rules/agents/ai-ml-engineer.json +1 -0
- package/.ai-rules/agents/architecture-specialist.json +1 -0
- package/.ai-rules/agents/auto-mode.json +4 -2
- package/.ai-rules/agents/backend-developer.json +1 -0
- package/.ai-rules/agents/code-quality-specialist.json +1 -0
- package/.ai-rules/agents/code-reviewer.json +65 -64
- package/.ai-rules/agents/data-engineer.json +8 -7
- package/.ai-rules/agents/data-scientist.json +10 -9
- package/.ai-rules/agents/devops-engineer.json +1 -0
- package/.ai-rules/agents/documentation-specialist.json +1 -0
- package/.ai-rules/agents/eval-mode.json +20 -19
- package/.ai-rules/agents/event-architecture-specialist.json +1 -0
- package/.ai-rules/agents/frontend-developer.json +1 -0
- package/.ai-rules/agents/i18n-specialist.json +2 -1
- package/.ai-rules/agents/integration-specialist.json +1 -0
- package/.ai-rules/agents/migration-specialist.json +1 -0
- package/.ai-rules/agents/mobile-developer.json +8 -7
- package/.ai-rules/agents/observability-specialist.json +1 -0
- package/.ai-rules/agents/parallel-orchestrator.json +346 -0
- package/.ai-rules/agents/performance-specialist.json +1 -0
- package/.ai-rules/agents/plan-mode.json +3 -1
- package/.ai-rules/agents/plan-reviewer.json +208 -0
- package/.ai-rules/agents/platform-engineer.json +1 -0
- package/.ai-rules/agents/security-engineer.json +9 -8
- package/.ai-rules/agents/security-specialist.json +2 -1
- package/.ai-rules/agents/seo-specialist.json +1 -0
- package/.ai-rules/agents/software-engineer.json +1 -0
- package/.ai-rules/agents/solution-architect.json +11 -10
- package/.ai-rules/agents/systems-developer.json +9 -8
- package/.ai-rules/agents/technical-planner.json +11 -10
- package/.ai-rules/agents/test-engineer.json +7 -6
- package/.ai-rules/agents/test-strategy-specialist.json +1 -0
- package/.ai-rules/agents/tooling-engineer.json +4 -3
- package/.ai-rules/agents/ui-ux-designer.json +1 -0
- package/.ai-rules/keyword-modes.json +4 -4
- package/.ai-rules/rules/clarification-guide.md +14 -14
- package/.ai-rules/rules/core.md +90 -1
- package/.ai-rules/rules/parallel-execution.md +217 -0
- package/.ai-rules/skills/README.md +23 -1
- package/.ai-rules/skills/agent-design/SKILL.md +5 -0
- package/.ai-rules/skills/agent-design/examples/agent-template.json +58 -0
- package/.ai-rules/skills/agent-design/references/expertise-guidelines.md +112 -0
- package/.ai-rules/skills/agent-discussion/SKILL.md +199 -0
- package/.ai-rules/skills/agent-discussion-panel/SKILL.md +448 -0
- package/.ai-rules/skills/api-design/SKILL.md +5 -0
- package/.ai-rules/skills/api-design/examples/error-response.json +159 -0
- package/.ai-rules/skills/api-design/examples/openapi-template.yaml +393 -0
- package/.ai-rules/skills/build-fix/SKILL.md +234 -0
- package/.ai-rules/skills/code-explanation/SKILL.md +4 -0
- package/.ai-rules/skills/context-management/SKILL.md +1 -0
- package/.ai-rules/skills/cost-budget/SKILL.md +348 -0
- package/.ai-rules/skills/cross-repo-issues/SKILL.md +257 -0
- package/.ai-rules/skills/database-migration/SKILL.md +1 -0
- package/.ai-rules/skills/deepsearch/SKILL.md +214 -0
- package/.ai-rules/skills/deployment-checklist/SKILL.md +1 -0
- package/.ai-rules/skills/error-analysis/SKILL.md +1 -0
- package/.ai-rules/skills/finishing-a-development-branch/SKILL.md +281 -0
- package/.ai-rules/skills/frontend-design/SKILL.md +5 -0
- package/.ai-rules/skills/frontend-design/examples/component-template.tsx +203 -0
- package/.ai-rules/skills/frontend-design/references/css-patterns.md +243 -0
- package/.ai-rules/skills/git-master/SKILL.md +358 -0
- package/.ai-rules/skills/incident-response/SKILL.md +1 -0
- package/.ai-rules/skills/legacy-modernization/SKILL.md +1 -0
- package/.ai-rules/skills/mcp-builder/SKILL.md +7 -0
- package/.ai-rules/skills/mcp-builder/examples/resource-example.ts +233 -0
- package/.ai-rules/skills/mcp-builder/examples/tool-example.ts +203 -0
- package/.ai-rules/skills/mcp-builder/references/protocol-spec.md +215 -0
- package/.ai-rules/skills/performance-optimization/SKILL.md +3 -0
- package/.ai-rules/skills/plan-and-review/SKILL.md +115 -0
- package/.ai-rules/skills/pr-all-in-one/SKILL.md +15 -13
- package/.ai-rules/skills/pr-all-in-one/configuration-guide.md +7 -7
- package/.ai-rules/skills/pr-all-in-one/pr-templates.md +10 -10
- package/.ai-rules/skills/pr-review/SKILL.md +4 -0
- package/.ai-rules/skills/receiving-code-review/SKILL.md +347 -0
- package/.ai-rules/skills/refactoring/SKILL.md +1 -0
- package/.ai-rules/skills/requesting-code-review/SKILL.md +348 -0
- package/.ai-rules/skills/rule-authoring/SKILL.md +5 -0
- package/.ai-rules/skills/rule-authoring/examples/rule-template.md +142 -0
- package/.ai-rules/skills/rule-authoring/examples/trigger-patterns.md +126 -0
- package/.ai-rules/skills/security-audit/SKILL.md +4 -0
- package/.ai-rules/skills/skill-creator/SKILL.md +461 -0
- package/.ai-rules/skills/skill-creator/agents/analyzer.md +206 -0
- package/.ai-rules/skills/skill-creator/agents/comparator.md +167 -0
- package/.ai-rules/skills/skill-creator/agents/grader.md +152 -0
- package/.ai-rules/skills/skill-creator/assets/eval_review.html +289 -0
- package/.ai-rules/skills/skill-creator/assets/skill-template.md +43 -0
- package/.ai-rules/skills/skill-creator/eval-viewer/generate_review.py +496 -0
- package/.ai-rules/skills/skill-creator/references/frontmatter-guide.md +632 -0
- package/.ai-rules/skills/skill-creator/references/multi-tool-compat.md +480 -0
- package/.ai-rules/skills/skill-creator/references/schemas.md +784 -0
- package/.ai-rules/skills/skill-creator/scripts/aggregate_benchmark.py +302 -0
- package/.ai-rules/skills/skill-creator/scripts/init_skill.sh +196 -0
- package/.ai-rules/skills/skill-creator/scripts/run_loop.py +327 -0
- package/.ai-rules/skills/systematic-debugging/SKILL.md +1 -0
- package/.ai-rules/skills/tech-debt/SKILL.md +1 -0
- package/.ai-rules/skills/test-coverage-gate/SKILL.md +303 -0
- package/.ai-rules/skills/tmux-master/SKILL.md +491 -0
- package/.ai-rules/skills/using-git-worktrees/SKILL.md +368 -0
- package/.ai-rules/skills/verification-before-completion/SKILL.md +234 -0
- package/.ai-rules/skills/widget-slot-architecture/SKILL.md +6 -0
- package/.ai-rules/skills/widget-slot-architecture/examples/parallel-route-setup.tsx +206 -0
- package/.ai-rules/skills/widget-slot-architecture/examples/widget-component.tsx +250 -0
- package/.ai-rules/skills/writing-plans/SKILL.md +78 -0
- package/bin/cli.js +178 -0
- package/lib/init/detect-stack.js +148 -0
- package/lib/init/generate-config.js +31 -0
- package/lib/init/index.js +86 -0
- package/lib/init/prompt.js +60 -0
- package/lib/init/scaffold.js +67 -0
- package/lib/init/suggest-agent.js +46 -0
- package/package.json +10 -2
|
@@ -0,0 +1,496 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Generate an HTML review page for skill-creator benchmark results.
|
|
3
|
+
|
|
4
|
+
Reads iteration workspace directories containing with_skill/ and baseline/
|
|
5
|
+
results, then produces a self-contained dark-mode HTML report with side-by-side
|
|
6
|
+
comparison, assertion pass/fail coloring, and feedback collection.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python generate_review.py <workspace>/iteration-N --skill-name <name> \
|
|
10
|
+
--benchmark <workspace>/iteration-N/benchmark.json \
|
|
11
|
+
[--previous-workspace <path>] [--static <output.html>]
|
|
12
|
+
|
|
13
|
+
Requirements: Python 3.8+, standard library only.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
import html
|
|
20
|
+
import json
|
|
21
|
+
import sys
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
from typing import Any, Dict, List, Optional
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def load_json(path: Path) -> Any:
    """Load and parse a JSON file, returning None on any read or parse failure.

    Args:
        path: Location of the JSON document; it is read as UTF-8 text.

    Returns:
        The decoded JSON value, or None when the file cannot be opened, is not
        valid UTF-8, or does not contain valid JSON.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError):
        # ValueError is the common base of json.JSONDecodeError and
        # UnicodeDecodeError, so a file with invalid UTF-8 bytes is treated
        # like any other unreadable file instead of aborting the report.
        return None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def discover_results(iteration_dir: Path) -> Dict[str, Any]:
    """Collect scenario records from the with_skill/ and baseline/ folders.

    Every ``*.json`` file directly inside each variant folder is loaded in
    sorted order. A file holding a list contributes all of its items; a file
    holding a single dict contributes that dict. Files that fail to load, or
    whose top-level value is neither a list nor a dict, contribute nothing.

    Args:
        iteration_dir: Directory expected to contain the variant folders.

    Returns:
        A dict with the keys "with_skill" and "baseline", each mapping to the
        list of records found for that variant (empty if the folder is absent).
    """
    variants = ("with_skill", "baseline")
    found: Dict[str, Any] = {variant: [] for variant in variants}

    for variant in variants:
        folder = iteration_dir / variant
        if folder.is_dir():
            for candidate in sorted(folder.glob("*.json")):
                payload = load_json(candidate)
                if isinstance(payload, list):
                    # One file may bundle several scenarios.
                    found[variant].extend(payload)
                elif isinstance(payload, dict):
                    # ...or describe exactly one scenario.
                    found[variant].append(payload)

    return found
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def count_assertions(scenarios: List[Dict]) -> Dict[str, int]:
    """Tally assertion outcomes over every scenario.

    An assertion counts as passed when its "passed" entry is truthy; a missing
    or falsy entry counts as failed. Scenarios without an "assertions" entry
    contribute nothing.

    Args:
        scenarios: Scenario dicts, each optionally carrying an "assertions" list.

    Returns:
        A dict with the integer counts "pass", "fail" and "total".
    """
    outcomes = [
        bool(check.get("passed", False))
        for entry in scenarios
        for check in entry.get("assertions", [])
    ]
    passed = sum(outcomes)
    return {"pass": passed, "fail": len(outcomes) - passed, "total": len(outcomes)}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def fmt_float(val: Any, decimals: int = 2) -> str:
    """Format a value as a fixed-point number, or an em dash if it is unusable.

    Args:
        val: The value to format; anything ``float()`` accepts.
        decimals: Number of digits after the decimal point.

    Returns:
        The formatted number, or "\u2014" (em dash) when ``val`` is None or
        cannot be converted to a float.
    """
    if val is None:
        return "\u2014"
    try:
        return f"{float(val):.{decimals}f}"
    except (TypeError, ValueError, OverflowError):
        # OverflowError: JSON integers are unbounded, and float() rejects ones
        # too large for a double; show the placeholder rather than crash.
        return "\u2014"
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def build_summary_section(benchmark: Optional[Dict]) -> str:
    """Build the summary statistics section from benchmark.json data.

    Up to three cards are rendered (pass rate, tokens, time), each only when
    the corresponding with-skill figure is present. The baseline figure is
    shown as a sub-line and falls back to an em dash when missing.

    Args:
        benchmark: Parsed benchmark data. The sections "with_skill" and
            "baseline" are read, and within them "pass_rate", "tokens" and
            "time" (the latter two with "mean"/"stddev" entries).

    Returns:
        The HTML for the summary section, or an empty string when there is no
        usable benchmark data.
    """
    # benchmark.json is user-supplied: a top-level list/scalar or a null
    # section must yield "no summary" rather than an AttributeError.
    if not benchmark or not isinstance(benchmark, dict):
        return ""

    def as_mapping(value: Any) -> Dict:
        """Return value if it is a dict, otherwise an empty dict."""
        return value if isinstance(value, dict) else {}

    def stat_card(label: str, value: str, sub: str = "") -> str:
        """Render one summary card; all text is HTML-escaped."""
        sub_html = f'<div class="summary-sub">{html.escape(sub)}</div>' if sub else ""
        return (
            f'<div class="summary-card">'
            f'<div class="summary-label">{html.escape(label)}</div>'
            f'<div class="summary-value">{html.escape(value)}</div>'
            f'{sub_html}</div>'
        )

    cards = []

    # Pass rate
    ws = as_mapping(benchmark.get("with_skill"))
    bl = as_mapping(benchmark.get("baseline"))
    ws_rate = ws.get("pass_rate")
    bl_rate = bl.get("pass_rate")
    if ws_rate is not None:
        cards.append(stat_card(
            "Pass Rate (with skill)",
            f"{fmt_float(ws_rate, 1)}%",
            f"baseline: {fmt_float(bl_rate, 1)}%",
        ))

    # Tokens
    ws_tokens = as_mapping(ws.get("tokens"))
    bl_tokens = as_mapping(bl.get("tokens"))
    if ws_tokens.get("mean") is not None:
        cards.append(stat_card(
            "Tokens (with skill)",
            f"{fmt_float(ws_tokens.get('mean'), 0)}",
            f"\u00b1{fmt_float(ws_tokens.get('stddev'), 0)} | baseline: {fmt_float(bl_tokens.get('mean'), 0)}",
        ))

    # Time
    ws_time = as_mapping(ws.get("time"))
    bl_time = as_mapping(bl.get("time"))
    if ws_time.get("mean") is not None:
        cards.append(stat_card(
            "Time (with skill)",
            f"{fmt_float(ws_time.get('mean'))}s",
            f"\u00b1{fmt_float(ws_time.get('stddev'))}s | baseline: {fmt_float(bl_time.get('mean'))}s",
        ))

    if not cards:
        return ""

    return (
        '<div class="summary-section">'
        '<h2>Summary</h2>'
        '<div class="summary-grid">' + "".join(cards) + '</div>'
        '</div>'
    )
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def build_scenario_rows(scenarios: List[Dict]) -> str:
    """Build HTML table rows for scenario assertions with collapsible evidence.

    The rows target a four-column table (Scenario, Assertion, Result, Detail).
    The first assertion row of a scenario carries the scenario cell, which
    spans all of that scenario's assertion rows and shows the token/time
    metadata when available.

    Args:
        scenarios: Scenario dicts. Read keys: "name" (falling back to "query"),
            "assertions", "tokens", "time"; per assertion: "description"
            (falling back to "name"), "passed", "detail" (falling back to
            "message") and "evidence".

    Returns:
        The ``<tr>`` elements joined by newlines, or a single placeholder row
        when ``scenarios`` is empty.
    """
    if not scenarios:
        # colspan matches the 4-column table header.
        return '<tr><td colspan="4" style="text-align:center;color:var(--text-dim)">No scenarios found</td></tr>'

    def esc(value: Any) -> str:
        """HTML-escape any JSON value; null renders as an empty string."""
        # Result files are free-form JSON, so names/details may be numbers or
        # null; html.escape() itself only accepts str.
        return "" if value is None else html.escape(str(value))

    rows = []
    for scenario in scenarios:
        name = esc(scenario.get("name") or scenario.get("query") or "Unknown")
        assertions = scenario.get("assertions") or []  # tolerate "assertions": null
        tokens = scenario.get("tokens")
        time_s = scenario.get("time")
        meta_parts = []
        if tokens is not None:
            meta_parts.append(f"{tokens} tok")
        if time_s is not None:
            meta_parts.append(f"{fmt_float(time_s)}s")
        meta_str = " | ".join(meta_parts)

        if not assertions:
            # One name cell + a cell spanning the remaining three columns.
            rows.append(
                f'<tr><td>{name}</td>'
                '<td colspan="3" style="color:var(--text-dim)">No assertions</td></tr>'
            )
            continue

        meta_html = (
            "<br><span class=meta-inline>" + html.escape(meta_str) + "</span>"
            if meta_str
            else ""
        )

        for i, assertion in enumerate(assertions):
            label = esc(
                assertion.get("description")
                or assertion.get("name")
                or f"assertion-{i}"
            )
            passed = assertion.get("passed", False)
            status_cls = "pass" if passed else "fail"
            status_text = "PASS" if passed else "FAIL"
            detail = esc(assertion.get("detail", assertion.get("message", "")))

            # Collapsible evidence
            evidence = assertion.get("evidence", "")
            evidence_html = ""
            if evidence:
                evidence_html = (
                    f'<details class="evidence"><summary>Evidence</summary>'
                    f'<pre>{html.escape(str(evidence))}</pre></details>'
                )

            # Only the first row of a scenario emits the row-spanning cell.
            scenario_cell = (
                f'<td rowspan="{len(assertions)}">{name}{meta_html}</td>'
                if i == 0
                else ""
            )

            rows.append(
                f'<tr>{scenario_cell}'
                f'<td>{label}</td>'
                f'<td class="status-{status_cls}">{status_text}</td>'
                f'<td class="detail">{detail}{evidence_html}</td></tr>'
            )

    return "\n".join(rows)
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def build_comparison_section(
    with_skill: List[Dict], baseline: List[Dict]
) -> str:
    """Render the two result panels (with skill, baseline) side by side.

    Each panel shows the pass/fail/total assertion counts for its scenarios
    followed by a table of per-assertion rows.

    Args:
        with_skill: Scenario dicts produced with the skill enabled.
        baseline: Scenario dicts produced without the skill.

    Returns:
        The HTML for the comparison container holding both panels.
    """

    def panel(title: str, scenarios: List[Dict]) -> str:
        """Render one titled panel: count badges plus the result table."""
        tally = count_assertions(scenarios)
        return "\n".join([
            '<div class="comp-panel">',
            f'<h3>{title}</h3>',
            '<div class="comp-stats">',
            f'<span class="stat-pass">{tally["pass"]} pass</span>',
            f'<span class="stat-fail">{tally["fail"]} fail</span>',
            f'<span class="stat-total">{tally["total"]} total</span>',
            '</div>',
            '<table class="result-table">',
            '<thead><tr><th>Scenario</th><th>Assertion</th><th>Result</th><th>Detail</th></tr></thead>',
            f'<tbody>{build_scenario_rows(scenarios)}</tbody>',
            '</table>',
            '</div>',
        ])

    # The leading empty element reproduces the newline that opens the markup.
    return "\n".join([
        "",
        '<div class="comparison">',
        panel("With Skill", with_skill),
        panel("Baseline", baseline),
        '</div>',
    ])
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def build_previous_comparison(
    current_dir: Path,
    previous_dir: Path,
    current_benchmark: Optional[Dict] = None,
    previous_benchmark: Optional[Dict] = None,
) -> str:
    """Build a delta section comparing current vs previous iteration.

    Pass/fail deltas are computed from the with-skill results discovered in
    both directories. A pass-rate delta card is added when both benchmarks
    provide a numeric with-skill "pass_rate".

    Args:
        current_dir: Directory of the iteration being reviewed.
        previous_dir: Directory of the iteration to compare against.
        current_benchmark: Parsed benchmark data for the current iteration.
        previous_benchmark: Parsed benchmark data for the previous iteration.

    Returns:
        The HTML for the delta section.
    """

    def with_skill_rate(bench: Any) -> Any:
        """Return the with-skill pass rate if it is a number, else None."""
        # Benchmark files are user-supplied: tolerate a non-dict document, a
        # null section, or a non-numeric rate instead of raising.
        if not isinstance(bench, dict):
            return None
        section = bench.get("with_skill")
        if not isinstance(section, dict):
            return None
        rate = section.get("pass_rate")
        return rate if isinstance(rate, (int, float)) else None

    curr = discover_results(current_dir)
    prev = discover_results(previous_dir)

    curr_ws = count_assertions(curr["with_skill"])
    prev_ws = count_assertions(prev["with_skill"])

    pass_delta = curr_ws["pass"] - prev_ws["pass"]
    fail_delta = curr_ws["fail"] - prev_ws["fail"]

    pass_sign = "+" if pass_delta >= 0 else ""
    fail_sign = "+" if fail_delta >= 0 else ""
    pass_cls = "delta-positive" if pass_delta >= 0 else "delta-negative"
    # Only an increase in failures is a regression; an unchanged count is
    # styled like an unchanged pass count (not as a regression).
    fail_cls = "delta-negative" if fail_delta > 0 else "delta-positive"

    # Pass rate delta from benchmark data
    rate_html = ""
    curr_rate = with_skill_rate(current_benchmark)
    prev_rate = with_skill_rate(previous_benchmark)
    if curr_rate is not None and prev_rate is not None:
        rate_delta = curr_rate - prev_rate
        rate_sign = "+" if rate_delta >= 0 else ""
        rate_cls = "delta-positive" if rate_delta >= 0 else "delta-negative"
        rate_html = f"""
<div class="delta-card">
<div class="delta-label">Pass Rate</div>
<div class="delta-value {rate_cls}">{rate_sign}{fmt_float(rate_delta, 1)}%</div>
<div class="delta-detail">{fmt_float(prev_rate, 1)}% → {fmt_float(curr_rate, 1)}%</div>
</div>"""

    return f"""
<div class="delta-section">
<h2>Delta vs Previous Iteration</h2>
<div class="delta-stats">
{rate_html}
<div class="delta-card">
<div class="delta-label">Pass</div>
<div class="delta-value {pass_cls}">{pass_sign}{pass_delta}</div>
<div class="delta-detail">{prev_ws['pass']} → {curr_ws['pass']}</div>
</div>
<div class="delta-card">
<div class="delta-label">Fail</div>
<div class="delta-value {fail_cls}">{fail_sign}{fail_delta}</div>
<div class="delta-detail">{prev_ws['fail']} → {curr_ws['fail']}</div>
</div>
</div>
</div>"""
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def generate_html(
    skill_name: str,
    iteration_dir: Path,
    benchmark: Optional[Dict] = None,
    previous_dir: Optional[Path] = None,
    previous_benchmark: Optional[Dict] = None,
) -> str:
    """Generate the complete HTML review page.

    The page contains, in order: the benchmark summary (if any), the delta
    against the previous iteration (if ``previous_dir`` is an existing
    directory), the side-by-side comparison, and a feedback form whose
    contents can be downloaded as JSON from the browser.

    Args:
        skill_name: Name shown in the title and stored in the feedback JSON.
        iteration_dir: Iteration directory whose results are rendered.
        benchmark: Parsed benchmark data for this iteration, if available.
        previous_dir: Previous iteration directory for the delta section.
        previous_benchmark: Parsed benchmark data of the previous iteration.

    Returns:
        The full HTML document as a string.
    """

    def js_literal(value: Any) -> str:
        """Serialize value as a JavaScript literal safe inside <script>."""
        # json.dumps leaves "<", ">" and "&" untouched, so a name containing
        # "</script>" would terminate the script element early. The \uXXXX
        # forms are equivalent inside JS string literals.
        return (
            json.dumps(value)
            .replace("<", "\\u003c")
            .replace(">", "\\u003e")
            .replace("&", "\\u0026")
        )

    results = discover_results(iteration_dir)
    iteration_name = iteration_dir.name

    summary_html = build_summary_section(benchmark)

    comparison_html = build_comparison_section(
        results["with_skill"], results["baseline"]
    )

    delta_html = ""
    if previous_dir and previous_dir.is_dir():
        delta_html = build_previous_comparison(
            iteration_dir, previous_dir, benchmark, previous_benchmark
        )

    # Values interpolated into the inline script are serialized up front.
    skill_js = js_literal(skill_name)
    iteration_js = js_literal(iteration_name)
    with_skill_js = js_literal(count_assertions(results["with_skill"]))
    baseline_js = js_literal(count_assertions(results["baseline"]))

    # Literal braces in the CSS/JS below are doubled because this is an f-string.
    return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Benchmark Review \u2014 {html.escape(skill_name)}</title>
<style>
*,*::before,*::after{{box-sizing:border-box;margin:0;padding:0}}
:root{{
--bg:#0d1117;--surface:#161b22;--border:#30363d;--text:#e6edf3;--text-dim:#8b949e;
--pass:#238636;--pass-bg:rgba(35,134,54,.15);
--fail:#da3633;--fail-bg:rgba(218,54,51,.15);
--accent:#58a6ff;--accent-hover:#79c0ff;
--radius:8px;
--font-mono:'SF Mono','Cascadia Code','Fira Code',monospace;
--font-sans:-apple-system,BlinkMacSystemFont,'Segoe UI',Helvetica,Arial,sans-serif;
}}
body{{background:var(--bg);color:var(--text);font-family:var(--font-sans);line-height:1.6;padding:2rem;max-width:1400px;margin:0 auto}}
h1{{font-size:1.6rem;font-weight:700;margin-bottom:.25rem}}
h2{{font-size:1.2rem;font-weight:600;margin:2rem 0 1rem;padding-bottom:.5rem;border-bottom:1px solid var(--border)}}
h3{{font-size:1rem;font-weight:600;margin-bottom:.75rem}}
.subtitle{{color:var(--text-dim);margin-bottom:1.5rem;font-size:.9rem}}

.summary-section{{margin-bottom:2rem}}
.summary-grid{{display:flex;gap:1rem;flex-wrap:wrap}}
.summary-card{{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:1rem 1.5rem;min-width:180px}}
.summary-label{{font-size:.75rem;color:var(--text-dim);text-transform:uppercase;letter-spacing:.05em}}
.summary-value{{font-size:1.4rem;font-weight:700;font-family:var(--font-mono);margin:.25rem 0}}
.summary-sub{{font-size:.8rem;color:var(--text-dim)}}

.comparison{{display:grid;grid-template-columns:1fr 1fr;gap:1.5rem;margin-bottom:2rem}}
@media(max-width:900px){{.comparison{{grid-template-columns:1fr}}}}
.comp-panel{{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:1.25rem}}
.comp-stats{{display:flex;gap:1rem;margin-bottom:1rem;font-family:var(--font-mono);font-size:.85rem}}
.stat-pass{{color:var(--pass)}}
.stat-fail{{color:var(--fail)}}
.stat-total{{color:var(--text-dim)}}

.result-table{{width:100%;border-collapse:collapse;font-size:.85rem}}
.result-table th{{text-align:left;padding:.5rem;border-bottom:2px solid var(--border);color:var(--text-dim);font-weight:600}}
.result-table td{{padding:.5rem;border-bottom:1px solid var(--border);vertical-align:top}}
.result-table tr:last-child td{{border-bottom:none}}
.status-pass{{color:var(--pass);font-weight:700;font-family:var(--font-mono)}}
.status-fail{{color:var(--fail);font-weight:700;font-family:var(--font-mono)}}
.detail{{color:var(--text-dim);font-size:.8rem}}
.meta-inline{{font-size:.75rem;color:var(--text-dim);font-family:var(--font-mono)}}

details.evidence{{margin-top:.4rem}}
details.evidence summary{{cursor:pointer;color:var(--accent);font-size:.8rem}}
details.evidence summary:hover{{color:var(--accent-hover)}}
details.evidence pre{{background:var(--bg);border:1px solid var(--border);border-radius:4px;padding:.5rem;margin-top:.3rem;font-size:.75rem;overflow-x:auto;white-space:pre-wrap;word-break:break-word}}

.delta-section{{margin-bottom:2rem}}
.delta-stats{{display:flex;gap:1rem;flex-wrap:wrap}}
.delta-card{{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:1rem 1.5rem;min-width:140px}}
.delta-label{{font-size:.75rem;color:var(--text-dim);text-transform:uppercase;letter-spacing:.05em}}
.delta-value{{font-size:1.6rem;font-weight:700;font-family:var(--font-mono)}}
.delta-detail{{font-size:.8rem;color:var(--text-dim);margin-top:.25rem}}
.delta-positive{{color:var(--pass)}}
.delta-negative{{color:var(--fail)}}

.feedback-section{{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:1.5rem;margin-top:2rem}}
.feedback-section h2{{margin-top:0;border-bottom:none;padding-bottom:0}}
.feedback-section textarea{{width:100%;min-height:120px;background:var(--bg);color:var(--text);border:1px solid var(--border);border-radius:6px;padding:.75rem;font-family:var(--font-sans);font-size:.9rem;resize:vertical;margin:1rem 0}}
.btn{{background:var(--surface);color:var(--text);border:1px solid var(--border);border-radius:6px;padding:.5rem 1.25rem;cursor:pointer;font-size:.85rem;font-family:var(--font-sans);transition:border-color .15s,background .15s}}
.btn:hover{{border-color:var(--accent);background:#1c2129}}
.btn-primary{{background:var(--accent);color:#000;border-color:var(--accent);font-weight:600}}
.btn-primary:hover{{background:var(--accent-hover)}}
.btn-group{{display:flex;gap:.5rem}}
</style>
</head>
<body>

<h1>Benchmark Review \u2014 {html.escape(skill_name)}</h1>
<p class="subtitle">Iteration: {html.escape(iteration_name)} | Generated from: {html.escape(str(iteration_dir))}</p>

{summary_html}
{delta_html}

<h2>Side-by-Side Comparison</h2>
{comparison_html}

<div class="feedback-section">
<h2>Feedback</h2>
<p style="color:var(--text-dim);font-size:.85rem">Add notes about this iteration's results. Download as JSON for the next improvement cycle.</p>
<textarea id="feedbackText" placeholder="What worked well? What needs improvement? Which assertions need attention?"></textarea>
<div class="btn-group">
<button class="btn btn-primary" onclick="downloadFeedback()">Download Feedback JSON</button>
</div>
</div>

<script>
(function(){{
"use strict";
window.downloadFeedback = function(){{
var feedback = {{
skill_name: {skill_js},
iteration: {iteration_js},
timestamp: new Date().toISOString(),
feedback: document.getElementById("feedbackText").value,
results_summary: {{
with_skill: {with_skill_js},
baseline: {baseline_js}
}}
}};
var blob = new Blob([JSON.stringify(feedback, null, 2)], {{type:"application/json"}});
var a = document.createElement("a");
a.href = URL.createObjectURL(blob);
a.download = "feedback-" + {iteration_js} + ".json";
a.click();
URL.revokeObjectURL(a.href);
}};
}})();
</script>
</body>
</html>"""
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def main() -> int:
    """Command-line entry point: parse arguments and emit the review page.

    Returns:
        1 when the iteration directory does not exist, otherwise 0 after the
        page has been written to the ``--static`` path or printed to stdout.
    """

    def build_parser() -> argparse.ArgumentParser:
        """Declare the command-line interface."""
        cli = argparse.ArgumentParser(
            description="Generate HTML review for skill-creator benchmark results."
        )
        cli.add_argument(
            "iteration_dir",
            type=Path,
            help="Path to iteration directory (e.g. workspace/iteration-1)",
        )
        cli.add_argument(
            "--skill-name",
            required=True,
            help="Name of the skill being benchmarked",
        )
        cli.add_argument(
            "--benchmark",
            type=Path,
            default=None,
            help="Path to benchmark.json with statistics (pass_rate, tokens, time)",
        )
        cli.add_argument(
            "--previous-workspace",
            type=Path,
            default=None,
            help="Path to previous iteration directory for delta comparison",
        )
        cli.add_argument(
            "--static",
            type=Path,
            default=None,
            help="Output path for static HTML file (default: stdout)",
        )
        return cli

    args = build_parser().parse_args()

    # Nothing can be rendered without the iteration directory.
    if not args.iteration_dir.is_dir():
        print(f"Error: {args.iteration_dir} is not a directory", file=sys.stderr)
        return 1

    # A benchmark file that fails to load is reported but not fatal.
    benchmark = load_json(args.benchmark) if args.benchmark else None
    if args.benchmark and benchmark is None:
        print(f"Warning: could not load {args.benchmark}", file=sys.stderr)

    # The previous iteration's benchmark is looked up inside its directory.
    previous_benchmark = (
        load_json(args.previous_workspace / "benchmark.json")
        if args.previous_workspace
        else None
    )

    page = generate_html(
        skill_name=args.skill_name,
        iteration_dir=args.iteration_dir,
        benchmark=benchmark,
        previous_dir=args.previous_workspace,
        previous_benchmark=previous_benchmark,
    )

    if not args.static:
        print(page)
        return 0

    args.static.parent.mkdir(parents=True, exist_ok=True)
    args.static.write_text(page, encoding="utf-8")
    print(f"Written to {args.static}", file=sys.stderr)
    return 0
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
# Run the CLI only when executed as a script; main()'s return value becomes
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|