myaidev-method 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +0 -1
- package/.env.example +5 -4
- package/CHANGELOG.md +2 -2
- package/CONTENT_CREATION_GUIDE.md +489 -3211
- package/DEVELOPER_USE_CASES.md +1 -1
- package/MODULAR_INSTALLATION.md +2 -2
- package/README.md +39 -33
- package/TECHNICAL_ARCHITECTURE.md +1 -1
- package/USER_GUIDE.md +242 -190
- package/agents/content-editor-agent.md +90 -0
- package/agents/content-planner-agent.md +97 -0
- package/agents/content-research-agent.md +62 -0
- package/agents/content-seo-agent.md +101 -0
- package/agents/content-writer-agent.md +69 -0
- package/agents/infographic-analyzer-agent.md +63 -0
- package/agents/infographic-designer-agent.md +72 -0
- package/bin/cli.js +776 -422
- package/{content-rules.example.md → content-rules-example.md} +2 -2
- package/dist/mcp/health-check.js +82 -68
- package/dist/mcp/mcp-config.json +8 -0
- package/dist/mcp/openstack-server.js +1746 -1262
- package/dist/server/.tsbuildinfo +1 -1
- package/extension.json +21 -4
- package/package.json +181 -184
- package/skills/company-config/SKILL.md +133 -0
- package/skills/configure/SKILL.md +1 -1
- package/skills/myai-configurator/SKILL.md +77 -0
- package/skills/myai-configurator/content-creation-configurator/SKILL.md +516 -0
- package/skills/myai-configurator/content-maintenance-configurator/SKILL.md +397 -0
- package/skills/myai-content-enrichment/SKILL.md +114 -0
- package/skills/myai-content-ideation/SKILL.md +288 -0
- package/skills/myai-content-ideation/evals/evals.json +182 -0
- package/skills/myai-content-production-coordinator/SKILL.md +946 -0
- package/skills/{content-rules-setup → myai-content-rules-setup}/SKILL.md +1 -1
- package/skills/{content-verifier → myai-content-verifier}/SKILL.md +1 -1
- package/skills/myai-content-writer/SKILL.md +333 -0
- package/skills/{infographic → myai-infographic}/SKILL.md +1 -1
- package/skills/myai-proprietary-content-verifier/SKILL.md +175 -0
- package/skills/myai-proprietary-content-verifier/evals/evals.json +36 -0
- package/skills/myai-skill-builder/SKILL.md +699 -0
- package/skills/myai-skill-builder/agents/analyzer-agent.md +137 -0
- package/skills/myai-skill-builder/agents/comparator-agent.md +77 -0
- package/skills/myai-skill-builder/agents/grader-agent.md +103 -0
- package/skills/myai-skill-builder/assets/eval_review.html +131 -0
- package/skills/myai-skill-builder/references/schemas.md +211 -0
- package/skills/myai-skill-builder/scripts/aggregate_benchmark.py +190 -0
- package/skills/myai-skill-builder/scripts/generate_review.py +381 -0
- package/skills/myai-skill-builder/scripts/package_skill.py +91 -0
- package/skills/myai-skill-builder/scripts/run_eval.py +105 -0
- package/skills/myai-skill-builder/scripts/run_loop.py +211 -0
- package/skills/myai-skill-builder/scripts/utils.py +123 -0
- package/skills/myai-visual-generator/SKILL.md +125 -0
- package/skills/myai-visual-generator/evals/evals.json +155 -0
- package/skills/myai-visual-generator/references/infographic-pipeline.md +73 -0
- package/skills/myai-visual-generator/references/research-visuals.md +57 -0
- package/skills/myai-visual-generator/references/services.md +89 -0
- package/skills/myai-visual-generator/scripts/visual-generation-utils.js +1272 -0
- package/skills/myaidev-figma/SKILL.md +212 -0
- package/skills/myaidev-figma/capture.js +133 -0
- package/skills/myaidev-figma/crawl.js +130 -0
- package/skills/myaidev-figma-configure/SKILL.md +130 -0
- package/skills/openstack-manager/SKILL.md +1 -1
- package/skills/payloadcms-publisher/SKILL.md +141 -77
- package/skills/payloadcms-publisher/references/field-mapping.md +142 -0
- package/skills/payloadcms-publisher/references/lexical-format.md +97 -0
- package/skills/security-auditor/SKILL.md +1 -1
- package/src/cli/commands/addon.js +105 -7
- package/src/config/workflows.js +172 -228
- package/src/lib/ascii-banner.js +197 -182
- package/src/lib/{content-coordinator.js → content-production-coordinator.js} +649 -459
- package/src/lib/installation-detector.js +93 -59
- package/src/lib/payloadcms-utils.js +285 -510
- package/src/lib/workflow-installer.js +55 -0
- package/src/mcp/health-check.js +82 -68
- package/src/mcp/openstack-server.js +1746 -1262
- package/src/scripts/configure-visual-apis.js +224 -173
- package/src/scripts/configure-wordpress-mcp.js +96 -66
- package/src/scripts/init/install.js +109 -85
- package/src/scripts/init-project.js +138 -67
- package/src/scripts/utils/write-content.js +67 -52
- package/src/scripts/wordpress/publish-to-wordpress.js +128 -128
- package/src/templates/claude/CLAUDE.md +19 -12
- package/hooks/hooks.json +0 -26
- package/skills/content-coordinator/SKILL.md +0 -130
- package/skills/content-enrichment/SKILL.md +0 -80
- package/skills/content-writer/SKILL.md +0 -285
- package/skills/skill-builder/SKILL.md +0 -417
- package/skills/visual-generator/SKILL.md +0 -140
- /package/skills/{content-writer → myai-content-writer}/agents/editor-agent.md +0 -0
- /package/skills/{content-writer → myai-content-writer}/agents/planner-agent.md +0 -0
- /package/skills/{content-writer → myai-content-writer}/agents/research-agent.md +0 -0
- /package/skills/{content-writer → myai-content-writer}/agents/seo-agent.md +0 -0
- /package/skills/{content-writer → myai-content-writer}/agents/visual-planner-agent.md +0 -0
- /package/skills/{content-writer → myai-content-writer}/agents/writer-agent.md +0 -0
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Aggregate grading.json and timing.json files into a benchmark.json summary.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python3 aggregate_benchmark.py <iteration-dir>
|
|
6
|
+
python3 aggregate_benchmark.py <workspace-dir> --multi-run
|
|
7
|
+
|
|
8
|
+
Single iteration mode:
|
|
9
|
+
Reads all grading.json and timing.json from an iteration directory,
|
|
10
|
+
produces benchmark.json with pass rates, tokens, and timing per config.
|
|
11
|
+
|
|
12
|
+
Multi-run mode (--multi-run):
|
|
13
|
+
Reads benchmark.json from multiple iteration directories,
|
|
14
|
+
computes mean ± stddev across runs.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import sys
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
# Add script directory to path for utils import
|
|
22
|
+
sys.path.insert(0, str(Path(__file__).parent))
|
|
23
|
+
from utils import (
|
|
24
|
+
load_json, save_json, collect_grading_files, collect_timing_files,
|
|
25
|
+
mean, stddev, now_iso
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def aggregate_iteration(iteration_dir):
    """Aggregate results from a single iteration into benchmark.json.

    Reads every grading.json/timing.json collected under *iteration_dir*,
    groups them by config name (e.g. "with_skill" / "without_skill"),
    computes per-config summary statistics, and — when both configs are
    present — a head-to-head comparison. The summary is written to
    <iteration_dir>/benchmark.json and also returned.

    Exits with status 1 when no grading files are found.
    """
    iteration_dir = Path(iteration_dir)
    gradings = collect_grading_files(iteration_dir)
    timings = collect_timing_files(iteration_dir)

    if not gradings:
        print(f"No grading.json files found in {iteration_dir}", file=sys.stderr)
        sys.exit(1)

    # Group gradings by config; timings are only kept for configs that
    # actually produced gradings.
    configs = {}
    for g in gradings:
        config = g.get("config", "unknown")
        configs.setdefault(config, {"gradings": [], "timings": []})["gradings"].append(g)

    for t in timings:
        config = t.get("config", "unknown")
        if config in configs:
            configs[config]["timings"].append(t)

    # Parse the trailing number of names like "iteration-3"; fall back to 1
    # for unexpected directory names instead of crashing on int().
    try:
        iteration_num = int(iteration_dir.name.rsplit("-", 1)[1])
    except (IndexError, ValueError):
        iteration_num = 1

    benchmark = {
        "iteration": iteration_num,
        "timestamp": now_iso(),
        "configs": {},
        "comparison": {}
    }

    for config_name, data in configs.items():
        pass_rates = [g["pass_rate"] for g in data["gradings"]]
        tokens = [t["total_tokens"] for t in data["timings"]]
        durations = [t["duration_ms"] for t in data["timings"]]

        # Index timings once (keeping the first entry per eval_id, matching
        # the previous first-match behavior) so the per-grading lookup below
        # is O(1) instead of a scan per grading.
        timing_by_eval = {}
        for t in data["timings"]:
            timing_by_eval.setdefault(t["eval_id"], t)

        evals = []
        for g in data["gradings"]:
            eval_entry = {
                "eval_id": g["eval_id"],
                "pass_rate": g["pass_rate"],
                "pass_count": g["pass_count"],
                "fail_count": g["fail_count"],
            }
            timing = timing_by_eval.get(g["eval_id"])
            if timing is not None:
                eval_entry["tokens"] = timing["total_tokens"]
                eval_entry["duration_ms"] = timing["duration_ms"]
            evals.append(eval_entry)

        benchmark["configs"][config_name] = {
            "overall_pass_rate": mean(pass_rates),
            "total_tokens_mean": mean(tokens) if tokens else 0,
            "total_tokens_stddev": stddev(tokens) if tokens else 0,
            "duration_ms_mean": mean(durations) if durations else 0,
            "duration_ms_stddev": stddev(durations) if durations else 0,
            "evals": evals
        }

    # Head-to-head comparison is only meaningful when both configs ran.
    if "with_skill" in benchmark["configs"] and "without_skill" in benchmark["configs"]:
        ws = benchmark["configs"]["with_skill"]
        wo = benchmark["configs"]["without_skill"]
        benchmark["comparison"] = {
            "pass_rate_delta": ws["overall_pass_rate"] - wo["overall_pass_rate"],
            "token_overhead_percent": (
                ((ws["total_tokens_mean"] - wo["total_tokens_mean"]) / wo["total_tokens_mean"] * 100)
                if wo["total_tokens_mean"] > 0 else 0
            ),
            "time_overhead_percent": (
                ((ws["duration_ms_mean"] - wo["duration_ms_mean"]) / wo["duration_ms_mean"] * 100)
                if wo["duration_ms_mean"] > 0 else 0
            ),
        }

        # Evals that pass 100% both with and without the skill do not help
        # discriminate the skill's value — flag them for review.
        non_disc = []
        ws_evals = {e["eval_id"]: e for e in ws["evals"]}
        wo_evals = {e["eval_id"]: e for e in wo["evals"]}
        for eval_id in ws_evals:
            if eval_id in wo_evals:
                if ws_evals[eval_id]["pass_rate"] == 1.0 and wo_evals[eval_id]["pass_rate"] == 1.0:
                    non_disc.append(eval_id)
        benchmark["comparison"]["non_discriminating_evals"] = non_disc

    output_path = iteration_dir / "benchmark.json"
    save_json(output_path, benchmark)
    return benchmark
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def aggregate_multi_run(workspace_dir):
    """Aggregate benchmark.json files from multiple iterations for statistical analysis."""
    root = Path(workspace_dir)

    # Collect every per-iteration summary, in sorted directory order.
    benchmarks = [
        load_json(child / "benchmark.json")
        for child in sorted(root.iterdir())
        if child.is_dir()
        and child.name.startswith("iteration-")
        and (child / "benchmark.json").exists()
    ]

    if len(benchmarks) < 2:
        print(f"Need at least 2 iteration benchmarks for multi-run analysis, found {len(benchmarks)}", file=sys.stderr)
        sys.exit(1)

    result = {"timestamp": now_iso(), "num_runs": len(benchmarks), "configs": {}}

    # Fold each config's per-run means into cross-run mean ± stddev.
    for config_name in ("with_skill", "without_skill"):
        relevant = [b["configs"][config_name] for b in benchmarks if config_name in b.get("configs", {})]
        pass_rates = [cfg["overall_pass_rate"] for cfg in relevant]
        tokens = [cfg["total_tokens_mean"] for cfg in relevant]
        durations = [cfg["duration_ms_mean"] for cfg in relevant]

        if pass_rates:
            result["configs"][config_name] = {
                "pass_rate": {"mean": mean(pass_rates), "stddev": stddev(pass_rates)},
                "tokens": {"mean": mean(tokens), "stddev": stddev(tokens)},
                "duration_ms": {"mean": mean(durations), "stddev": stddev(durations)},
            }

    save_json(root / "benchmark.json", result)
    return result
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def main():
    """CLI entry point: aggregate eval results and print a short summary.

    With --multi-run, *path* is a workspace directory holding several
    iteration-N subdirectories; otherwise it is a single iteration directory.
    """
    parser = argparse.ArgumentParser(description="Aggregate eval results into benchmark summary")
    parser.add_argument("path", help="Iteration directory or workspace directory (with --multi-run)")
    parser.add_argument("--multi-run", action="store_true", help="Aggregate across multiple iterations")
    args = parser.parse_args()

    if args.multi_run:
        result = aggregate_multi_run(args.path)
        print(f"\nMulti-run benchmark ({result['num_runs']} runs):")
        for config, stats in result["configs"].items():
            pr = stats["pass_rate"]
            print(f"  {config}: pass_rate={pr['mean']:.2f} ± {pr['stddev']:.2f}")
    else:
        result = aggregate_iteration(args.path)
        print(f"\nIteration {result['iteration']} benchmark:")
        for config, stats in result["configs"].items():
            print(f"  {config}: pass_rate={stats['overall_pass_rate']:.2f}, "
                  f"tokens={stats['total_tokens_mean']:.0f}, "
                  f"time={stats['duration_ms_mean']:.0f}ms")
        if result.get("comparison"):
            c = result["comparison"]
            print("\n  Comparison:")
            # ":+.2f" emits the sign for both directions; the previous
            # hard-coded "+" prefix printed "+-0.10" for negative deltas.
            print(f"    Pass rate delta: {c['pass_rate_delta']:+.2f}")
            print(f"    Token overhead: {c['token_overhead_percent']:.1f}%")
            print(f"    Time overhead: {c['time_overhead_percent']:.1f}%")
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
if __name__ == "__main__":
|
|
190
|
+
main()
|
|
@@ -0,0 +1,381 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Generate an HTML review page for eval results.
|
|
3
|
+
|
|
4
|
+
Creates an interactive HTML page showing eval outputs, grading results,
|
|
5
|
+
and benchmark data for human review.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
python3 generate_review.py <workspace-dir>
|
|
9
|
+
python3 generate_review.py <workspace-dir> --static output.html
|
|
10
|
+
python3 generate_review.py <workspace-dir> --open
|
|
11
|
+
|
|
12
|
+
Options:
|
|
13
|
+
--static <path> Generate a standalone HTML file instead of serving
|
|
14
|
+
--open Open the generated HTML in the default browser
|
|
15
|
+
--iteration <n> Show specific iteration (default: latest)
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
import json
|
|
20
|
+
import os
|
|
21
|
+
import sys
|
|
22
|
+
import webbrowser
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
sys.path.insert(0, str(Path(__file__).parent))
|
|
26
|
+
from utils import load_json, find_latest_iteration
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def collect_eval_data(iteration_dir):
    """Collect all eval data from an iteration directory."""
    root = Path(iteration_dir)

    def read_config_dir(config_dir):
        # Gather the outputs, grading, and timing artifacts for one config.
        entry = {"outputs": {}, "grading": None, "timing": None}

        outputs_dir = config_dir / "outputs"
        if outputs_dir.exists():
            for artifact in sorted(outputs_dir.iterdir()):
                if not artifact.is_file():
                    continue
                try:
                    # Cap embedded output at 5000 chars to keep the page small.
                    entry["outputs"][artifact.name] = artifact.read_text()[:5000]
                except (UnicodeDecodeError, PermissionError):
                    entry["outputs"][artifact.name] = "(binary file)"

        grading_path = config_dir / "grading.json"
        if grading_path.exists():
            entry["grading"] = load_json(grading_path)

        timing_path = config_dir / "timing.json"
        if timing_path.exists():
            entry["timing"] = load_json(timing_path)

        return entry

    evals = []
    for eval_dir in sorted(root.iterdir()):
        if eval_dir.is_dir() and eval_dir.name.startswith("eval-"):
            record = {"id": eval_dir.name.replace("eval-", ""), "configs": {}}
            for config in ("with_skill", "without_skill"):
                config_dir = eval_dir / config
                if config_dir.exists():
                    record["configs"][config] = read_config_dir(config_dir)
            evals.append(record)

    # Iteration-level benchmark summary, when it has been generated.
    benchmark_path = root / "benchmark.json"
    benchmark = load_json(benchmark_path) if benchmark_path.exists() else None

    return evals, benchmark
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def generate_html(evals, benchmark, iteration_num, workspace_dir):
    """Render the interactive HTML review page as a single string.

    Parameters:
        evals: list of per-eval dicts as produced by collect_eval_data().
        benchmark: parsed benchmark.json dict, or None when not generated yet.
        iteration_num: iteration number shown in the page title and header.
        workspace_dir: workspace path displayed in the header.

    Returns a standalone HTML document with the data embedded inline.
    """
    # The data is embedded inside an inline <script> block. A literal
    # "</script>" in the serialized JSON (eval outputs are arbitrary text)
    # would terminate the script element early and break the page, so escape
    # "</" as "<\/" — a valid JSON/JS string escape that parses to the same
    # value.
    evals_json = json.dumps(evals, indent=2).replace("</", "<\\/")
    benchmark_json = json.dumps(benchmark, indent=2).replace("</", "<\\/") if benchmark else "null"

    html = f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Skill Eval Review - Iteration {iteration_num}</title>
<style>
* {{ margin: 0; padding: 0; box-sizing: border-box; }}
body {{ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, monospace; background: #0d1117; color: #c9d1d9; }}
.header {{ background: #161b22; border-bottom: 1px solid #30363d; padding: 16px 24px; display: flex; justify-content: space-between; align-items: center; }}
.header h1 {{ font-size: 20px; color: #f0f6fc; }}
.tabs {{ display: flex; gap: 0; border-bottom: 1px solid #30363d; background: #161b22; padding: 0 24px; }}
.tab {{ padding: 12px 20px; cursor: pointer; border-bottom: 2px solid transparent; color: #8b949e; font-size: 14px; }}
.tab:hover {{ color: #c9d1d9; }}
.tab.active {{ color: #f0f6fc; border-bottom-color: #f78166; }}
.content {{ padding: 24px; max-width: 1400px; margin: 0 auto; }}
.eval-card {{ background: #161b22; border: 1px solid #30363d; border-radius: 6px; margin-bottom: 16px; overflow: hidden; }}
.eval-header {{ padding: 12px 16px; background: #1c2128; border-bottom: 1px solid #30363d; display: flex; justify-content: space-between; align-items: center; }}
.eval-header h3 {{ font-size: 16px; color: #f0f6fc; }}
.badge {{ padding: 2px 8px; border-radius: 12px; font-size: 12px; font-weight: 600; }}
.badge.pass {{ background: #1a3a2a; color: #3fb950; }}
.badge.fail {{ background: #3a1a1a; color: #f85149; }}
.badge.mixed {{ background: #3a2a1a; color: #d29922; }}
.eval-body {{ padding: 16px; }}
.config-columns {{ display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }}
.config-col {{ background: #0d1117; border: 1px solid #30363d; border-radius: 6px; padding: 12px; }}
.config-col h4 {{ font-size: 14px; margin-bottom: 8px; color: #8b949e; }}
.assertion {{ padding: 8px; margin: 4px 0; border-radius: 4px; font-size: 13px; }}
.assertion.pass {{ background: #1a3a2a; border-left: 3px solid #3fb950; }}
.assertion.fail {{ background: #3a1a1a; border-left: 3px solid #f85149; }}
.evidence {{ color: #8b949e; font-size: 12px; margin-top: 4px; }}
.output-block {{ background: #0d1117; border: 1px solid #30363d; border-radius: 4px; padding: 12px; margin: 8px 0; max-height: 300px; overflow-y: auto; font-size: 13px; white-space: pre-wrap; }}
.benchmark-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); gap: 16px; }}
.metric-card {{ background: #161b22; border: 1px solid #30363d; border-radius: 6px; padding: 16px; text-align: center; }}
.metric-value {{ font-size: 32px; font-weight: 700; color: #f0f6fc; margin: 8px 0; }}
.metric-label {{ font-size: 13px; color: #8b949e; }}
.metric-sub {{ font-size: 12px; color: #8b949e; margin-top: 4px; }}
.feedback-section {{ margin-top: 16px; }}
.feedback-section textarea {{ width: 100%; height: 80px; background: #0d1117; border: 1px solid #30363d; border-radius: 6px; padding: 8px; color: #c9d1d9; font-family: inherit; font-size: 13px; resize: vertical; }}
.feedback-section select {{ background: #0d1117; border: 1px solid #30363d; border-radius: 6px; padding: 6px 12px; color: #c9d1d9; font-size: 13px; margin-right: 8px; }}
.btn {{ padding: 8px 16px; border-radius: 6px; border: none; cursor: pointer; font-size: 14px; font-weight: 600; }}
.btn-primary {{ background: #238636; color: #fff; }}
.btn-primary:hover {{ background: #2ea043; }}
.submit-bar {{ position: fixed; bottom: 0; left: 0; right: 0; background: #161b22; border-top: 1px solid #30363d; padding: 12px 24px; display: flex; justify-content: flex-end; gap: 12px; }}
.hidden {{ display: none; }}
.bar-chart {{ display: flex; align-items: flex-end; gap: 8px; height: 120px; margin: 16px 0; }}
.bar {{ flex: 1; border-radius: 4px 4px 0 0; min-width: 30px; position: relative; }}
.bar.with {{ background: #238636; }}
.bar.without {{ background: #6e7681; }}
.bar-label {{ position: absolute; bottom: -20px; left: 50%; transform: translateX(-50%); font-size: 10px; white-space: nowrap; }}
</style>
</head>
<body>
<div class="header">
<h1>Skill Eval Review — Iteration {iteration_num}</h1>
<span style="color: #8b949e; font-size: 13px;">{workspace_dir}</span>
</div>
<div class="tabs">
<div class="tab active" onclick="showTab('outputs')">Outputs</div>
<div class="tab" onclick="showTab('benchmark')">Benchmark</div>
</div>
<div id="outputs-tab" class="content"></div>
<div id="benchmark-tab" class="content hidden"></div>
<div class="submit-bar">
<select id="action-select">
<option value="iterate">Iterate (improve & rerun)</option>
<option value="publish">Publish (submit to marketplace)</option>
<option value="stop">Stop (done for now)</option>
</select>
<button class="btn btn-primary" onclick="submitFeedback()">Submit All Reviews</button>
</div>

<script>
const evals = {evals_json};
const benchmark = {benchmark_json};

function showTab(name) {{
  document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
  document.querySelectorAll('.content').forEach(c => c.classList.add('hidden'));
  event.target.classList.add('active');
  document.getElementById(name + '-tab').classList.remove('hidden');
}}

function renderOutputs() {{
  const container = document.getElementById('outputs-tab');
  let html = '';

  evals.forEach(ev => {{
    const wsGrading = ev.configs.with_skill?.grading;
    const woGrading = ev.configs.without_skill?.grading;
    const wsRate = wsGrading ? wsGrading.pass_rate : null;
    const woRate = woGrading ? woGrading.pass_rate : null;

    let badgeClass = 'mixed';
    let badgeText = 'N/A';
    if (wsRate !== null) {{
      if (wsRate === 1.0) {{ badgeClass = 'pass'; badgeText = 'ALL PASS'; }}
      else if (wsRate === 0) {{ badgeClass = 'fail'; badgeText = 'ALL FAIL'; }}
      else {{ badgeText = Math.round(wsRate * 100) + '% pass'; }}
    }}

    html += '<div class="eval-card">';
    html += '<div class="eval-header">';
    html += '<h3>eval-' + ev.id + '</h3>';
    html += '<span class="badge ' + badgeClass + '">' + badgeText + '</span>';
    html += '</div>';
    html += '<div class="eval-body">';
    html += '<div class="config-columns">';

    ['with_skill', 'without_skill'].forEach(config => {{
      const data = ev.configs[config];
      html += '<div class="config-col">';
      html += '<h4>' + config.replace('_', ' ') + '</h4>';

      if (data && data.grading) {{
        data.grading.expectations.forEach(exp => {{
          html += '<div class="assertion ' + exp.verdict.toLowerCase() + '">';
          html += '<strong>' + exp.verdict + '</strong>: ' + exp.assertion;
          html += '<div class="evidence">' + (exp.evidence || '') + '</div>';
          html += '</div>';
        }});
      }}

      if (data && Object.keys(data.outputs).length > 0) {{
        html += '<details><summary style="cursor:pointer;margin-top:8px;color:#8b949e">Show outputs</summary>';
        Object.entries(data.outputs).forEach(([name, content]) => {{
          html += '<div style="margin-top:4px;font-size:12px;color:#8b949e">' + name + '</div>';
          html += '<div class="output-block">' + escapeHtml(content) + '</div>';
        }});
        html += '</details>';
      }}

      if (data && data.timing) {{
        html += '<div style="margin-top:8px;font-size:12px;color:#8b949e">';
        html += 'Tokens: ' + data.timing.total_tokens + ' | Time: ' + data.timing.duration_ms + 'ms';
        html += '</div>';
      }}

      html += '</div>';
    }});

    html += '</div>';
    html += '<div class="feedback-section">';
    html += '<select data-eval="' + ev.id + '" class="eval-rating">';
    html += '<option value="good">Good</option>';
    html += '<option value="needs-work">Needs Work</option>';
    html += '<option value="bad">Bad</option>';
    html += '</select>';
    html += '<textarea data-eval="' + ev.id + '" class="eval-comment" placeholder="Feedback for this eval..."></textarea>';
    html += '</div>';
    html += '</div></div>';
  }});

  container.innerHTML = html;
}}

function renderBenchmark() {{
  const container = document.getElementById('benchmark-tab');
  if (!benchmark) {{
    container.innerHTML = '<p style="color:#8b949e">No benchmark data available. Run evals first.</p>';
    return;
  }}

  let html = '<div class="benchmark-grid">';

  const configs = benchmark.configs || {{}};
  ['with_skill', 'without_skill'].forEach(config => {{
    const data = configs[config];
    if (!data) return;

    html += '<div class="metric-card">';
    html += '<div class="metric-label">' + config.replace('_', ' ') + ' — Pass Rate</div>';
    html += '<div class="metric-value">' + Math.round(data.overall_pass_rate * 100) + '%</div>';
    html += '</div>';

    html += '<div class="metric-card">';
    html += '<div class="metric-label">' + config.replace('_', ' ') + ' — Avg Tokens</div>';
    html += '<div class="metric-value">' + Math.round(data.total_tokens_mean) + '</div>';
    if (data.total_tokens_stddev) html += '<div class="metric-sub">± ' + Math.round(data.total_tokens_stddev) + '</div>';
    html += '</div>';

    html += '<div class="metric-card">';
    html += '<div class="metric-label">' + config.replace('_', ' ') + ' — Avg Time</div>';
    html += '<div class="metric-value">' + Math.round(data.duration_ms_mean / 1000 * 10) / 10 + 's</div>';
    if (data.duration_ms_stddev) html += '<div class="metric-sub">± ' + Math.round(data.duration_ms_stddev) + 'ms</div>';
    html += '</div>';
  }});

  if (benchmark.comparison) {{
    const c = benchmark.comparison;
    html += '<div class="metric-card">';
    html += '<div class="metric-label">Pass Rate Delta</div>';
    html += '<div class="metric-value" style="color:' + (c.pass_rate_delta >= 0 ? '#3fb950' : '#f85149') + '">';
    html += (c.pass_rate_delta >= 0 ? '+' : '') + Math.round(c.pass_rate_delta * 100) + '%';
    html += '</div></div>';

    html += '<div class="metric-card">';
    html += '<div class="metric-label">Token Overhead</div>';
    html += '<div class="metric-value">' + Math.round(c.token_overhead_percent) + '%</div>';
    html += '</div>';
  }}

  html += '</div>';
  container.innerHTML = html;
}}

function escapeHtml(text) {{
  const div = document.createElement('div');
  div.textContent = text;
  return div.innerHTML;
}}

function submitFeedback() {{
  const feedback = {{
    timestamp: new Date().toISOString(),
    iteration: {iteration_num},
    eval_feedback: [],
    action: document.getElementById('action-select').value
  }};

  document.querySelectorAll('.eval-rating').forEach(select => {{
    const evalId = select.dataset.eval;
    const comment = document.querySelector('.eval-comment[data-eval="' + evalId + '"]').value;
    feedback.eval_feedback.push({{
      eval_id: evalId,
      rating: select.value,
      comment: comment
    }});
  }});

  // Save as downloadable file
  const blob = new Blob([JSON.stringify(feedback, null, 2)], {{ type: 'application/json' }});
  const a = document.createElement('a');
  a.href = URL.createObjectURL(blob);
  a.download = 'feedback.json';
  a.click();
  alert('Feedback saved! Place feedback.json in the workspace directory.');
}}

renderOutputs();
renderBenchmark();
</script>
</body>
</html>"""
    return html
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def main():
    """CLI entry point: generate the HTML review page for one iteration.

    Exits with status 1 when the workspace, the requested iteration
    directory, or the eval data cannot be found.
    """
    parser = argparse.ArgumentParser(description="Generate HTML eval review page")
    parser.add_argument("workspace", help="Path to skill workspace directory")
    parser.add_argument("--static", help="Output path for standalone HTML file")
    parser.add_argument("--open", action="store_true", help="Open in browser")
    parser.add_argument("--iteration", type=int, help="Specific iteration to show")
    args = parser.parse_args()

    workspace = Path(args.workspace)
    if not workspace.exists():
        print(f"Error: {workspace} not found", file=sys.stderr)
        sys.exit(1)

    # Find iteration: an explicit --iteration wins; otherwise use the newest
    # one in the workspace.  Compare against None (not truthiness) so an
    # explicit "--iteration 0" is not silently replaced by the latest.
    if args.iteration is not None:
        iteration_num = args.iteration
    else:
        iteration_num = find_latest_iteration(workspace)

    if iteration_num == 0:
        print("Error: No iterations found in workspace", file=sys.stderr)
        sys.exit(1)

    iteration_dir = workspace / f"iteration-{iteration_num}"
    if not iteration_dir.exists():
        print(f"Error: {iteration_dir} not found", file=sys.stderr)
        sys.exit(1)

    # Collect data
    evals, benchmark = collect_eval_data(iteration_dir)
    if not evals:
        print("Error: No eval data found", file=sys.stderr)
        sys.exit(1)

    # Generate HTML
    html = generate_html(evals, benchmark, iteration_num, str(workspace))

    # Output.  Write UTF-8 explicitly: the page embeds arbitrary eval text,
    # and the platform-default encoding (e.g. cp1252 on Windows) could raise
    # UnicodeEncodeError for non-ASCII content.
    output_path = args.static or str(workspace / "review.html")
    Path(output_path).write_text(html, encoding="utf-8")
    print(f"Generated: {output_path}")

    if args.open:
        webbrowser.open(f"file://{os.path.abspath(output_path)}")
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Package a skill directory into a distributable archive.
|
|
3
|
+
|
|
4
|
+
Creates a .tar.gz archive of a skill directory, excluding workspace
|
|
5
|
+
and temporary files.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
python3 package_skill.py .claude/skills/my-skill
|
|
9
|
+
python3 package_skill.py .claude/skills/my-skill --output my-skill-v1.tar.gz
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import argparse
import fnmatch
import os
import sys
import tarfile
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
# Directories and patterns to exclude from the archive.
# Matched against the final path component only (see should_exclude);
# entries may use a leading or trailing "*" wildcard, anything else
# must match the name exactly.
EXCLUDE_PATTERNS = {
    "*-workspace",  # per-skill scratch/iteration workspaces
    "__pycache__",  # Python bytecode cache directories
    "*.pyc",  # compiled Python files
    ".DS_Store",  # macOS Finder metadata
    "description_eval.json",  # eval artifact from skill development
    "feedback.json",  # reviewer feedback saved by the review page
    "review.html",  # generated review page
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def should_exclude(path):
    """Check if a path should be excluded from the archive.

    The final path component is matched against every entry in
    EXCLUDE_PATTERNS using shell-style globbing (fnmatch), which covers
    the original prefix-star / suffix-star / exact-name cases and also
    generalizes to arbitrary glob patterns (e.g. "foo*bar", "?.tmp")
    that the hand-rolled startswith/endswith checks could not express.
    """
    name = Path(path).name
    # fnmatchcase keeps matching case-sensitive on every platform, which
    # mirrors the original string comparisons exactly.
    return any(fnmatch.fnmatchcase(name, pattern) for pattern in EXCLUDE_PATTERNS)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def package_skill(skill_dir, output_path=None):
    """Create a .tar.gz archive of the skill directory.

    Paths are stored relative to the skill's parent directory, so
    unpacking recreates the "<slug>/..." tree.  Files and directories
    matching EXCLUDE_PATTERNS (via should_exclude) are skipped.  Exits
    the process with status 1 when the directory or its SKILL.md is
    missing.  Returns the archive path as a string.
    """
    skill_root = Path(skill_dir).resolve()
    if not skill_root.exists():
        print(f"Error: {skill_root} not found", file=sys.stderr)
        sys.exit(1)

    # A skill is only valid if it carries a SKILL.md manifest.
    if not (skill_root / "SKILL.md").exists():
        print(f"Error: No SKILL.md found in {skill_root}", file=sys.stderr)
        sys.exit(1)

    # Default archive name is "<slug>.tar.gz" next to the skill directory.
    archive_path = (
        skill_root.parent / f"{skill_root.name}.tar.gz"
        if output_path is None
        else Path(output_path)
    )

    added = 0
    with tarfile.open(archive_path, "w:gz") as tar:
        for current_dir, subdirs, filenames in os.walk(skill_root):
            # Prune excluded directories in place so os.walk never
            # descends into them.
            subdirs[:] = [d for d in subdirs if not should_exclude(d)]

            for filename in filenames:
                full_path = Path(current_dir) / filename
                if should_exclude(full_path):
                    continue

                # Store relative to the parent so the slug is the
                # top-level entry in the archive.
                tar.add(
                    full_path,
                    arcname=str(full_path.relative_to(skill_root.parent)),
                )
                added += 1

    size_kb = archive_path.stat().st_size / 1024
    print(f"Packaged {added} files into {archive_path} ({size_kb:.1f} KB)")
    return str(archive_path)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def main():
    """Parse CLI arguments and package the requested skill."""
    cli = argparse.ArgumentParser(description="Package a skill for distribution")
    cli.add_argument("skill_dir", help="Path to skill directory")
    cli.add_argument("--output", "-o", help="Output archive path")
    options = cli.parse_args()

    package_skill(options.skill_dir, options.output)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|