myaidev-method 0.3.4 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. package/.claude-plugin/plugin.json +0 -1
  2. package/.env.example +5 -4
  3. package/CHANGELOG.md +2 -2
  4. package/CONTENT_CREATION_GUIDE.md +489 -3211
  5. package/DEVELOPER_USE_CASES.md +1 -1
  6. package/MODULAR_INSTALLATION.md +2 -2
  7. package/README.md +39 -33
  8. package/TECHNICAL_ARCHITECTURE.md +1 -1
  9. package/USER_GUIDE.md +242 -190
  10. package/agents/content-editor-agent.md +90 -0
  11. package/agents/content-planner-agent.md +97 -0
  12. package/agents/content-research-agent.md +62 -0
  13. package/agents/content-seo-agent.md +101 -0
  14. package/agents/content-writer-agent.md +69 -0
  15. package/agents/infographic-analyzer-agent.md +63 -0
  16. package/agents/infographic-designer-agent.md +72 -0
  17. package/bin/cli.js +776 -422
  18. package/{content-rules.example.md → content-rules-example.md} +2 -2
  19. package/dist/mcp/health-check.js +82 -68
  20. package/dist/mcp/mcp-config.json +8 -0
  21. package/dist/mcp/openstack-server.js +1746 -1262
  22. package/dist/server/.tsbuildinfo +1 -1
  23. package/extension.json +21 -4
  24. package/package.json +181 -184
  25. package/skills/company-config/SKILL.md +133 -0
  26. package/skills/configure/SKILL.md +1 -1
  27. package/skills/myai-configurator/SKILL.md +77 -0
  28. package/skills/myai-configurator/content-creation-configurator/SKILL.md +516 -0
  29. package/skills/myai-configurator/content-maintenance-configurator/SKILL.md +397 -0
  30. package/skills/myai-content-enrichment/SKILL.md +114 -0
  31. package/skills/myai-content-ideation/SKILL.md +288 -0
  32. package/skills/myai-content-ideation/evals/evals.json +182 -0
  33. package/skills/myai-content-production-coordinator/SKILL.md +946 -0
  34. package/skills/{content-rules-setup → myai-content-rules-setup}/SKILL.md +1 -1
  35. package/skills/{content-verifier → myai-content-verifier}/SKILL.md +1 -1
  36. package/skills/myai-content-writer/SKILL.md +333 -0
  37. package/skills/{infographic → myai-infographic}/SKILL.md +1 -1
  38. package/skills/myai-proprietary-content-verifier/SKILL.md +175 -0
  39. package/skills/myai-proprietary-content-verifier/evals/evals.json +36 -0
  40. package/skills/myai-skill-builder/SKILL.md +699 -0
  41. package/skills/myai-skill-builder/agents/analyzer-agent.md +137 -0
  42. package/skills/myai-skill-builder/agents/comparator-agent.md +77 -0
  43. package/skills/myai-skill-builder/agents/grader-agent.md +103 -0
  44. package/skills/myai-skill-builder/assets/eval_review.html +131 -0
  45. package/skills/myai-skill-builder/references/schemas.md +211 -0
  46. package/skills/myai-skill-builder/scripts/aggregate_benchmark.py +190 -0
  47. package/skills/myai-skill-builder/scripts/generate_review.py +381 -0
  48. package/skills/myai-skill-builder/scripts/package_skill.py +91 -0
  49. package/skills/myai-skill-builder/scripts/run_eval.py +105 -0
  50. package/skills/myai-skill-builder/scripts/run_loop.py +211 -0
  51. package/skills/myai-skill-builder/scripts/utils.py +123 -0
  52. package/skills/myai-visual-generator/SKILL.md +125 -0
  53. package/skills/myai-visual-generator/evals/evals.json +155 -0
  54. package/skills/myai-visual-generator/references/infographic-pipeline.md +73 -0
  55. package/skills/myai-visual-generator/references/research-visuals.md +57 -0
  56. package/skills/myai-visual-generator/references/services.md +89 -0
  57. package/skills/myai-visual-generator/scripts/visual-generation-utils.js +1272 -0
  58. package/skills/myaidev-figma/SKILL.md +212 -0
  59. package/skills/myaidev-figma/capture.js +133 -0
  60. package/skills/myaidev-figma/crawl.js +130 -0
  61. package/skills/myaidev-figma-configure/SKILL.md +130 -0
  62. package/skills/openstack-manager/SKILL.md +1 -1
  63. package/skills/payloadcms-publisher/SKILL.md +141 -77
  64. package/skills/payloadcms-publisher/references/field-mapping.md +142 -0
  65. package/skills/payloadcms-publisher/references/lexical-format.md +97 -0
  66. package/skills/security-auditor/SKILL.md +1 -1
  67. package/src/cli/commands/addon.js +105 -7
  68. package/src/config/workflows.js +172 -228
  69. package/src/lib/ascii-banner.js +197 -182
  70. package/src/lib/{content-coordinator.js → content-production-coordinator.js} +649 -459
  71. package/src/lib/installation-detector.js +93 -59
  72. package/src/lib/payloadcms-utils.js +285 -510
  73. package/src/lib/workflow-installer.js +55 -0
  74. package/src/mcp/health-check.js +82 -68
  75. package/src/mcp/openstack-server.js +1746 -1262
  76. package/src/scripts/configure-visual-apis.js +224 -173
  77. package/src/scripts/configure-wordpress-mcp.js +96 -66
  78. package/src/scripts/init/install.js +109 -85
  79. package/src/scripts/init-project.js +138 -67
  80. package/src/scripts/utils/write-content.js +67 -52
  81. package/src/scripts/wordpress/publish-to-wordpress.js +128 -128
  82. package/src/templates/claude/CLAUDE.md +19 -12
  83. package/hooks/hooks.json +0 -26
  84. package/skills/content-coordinator/SKILL.md +0 -130
  85. package/skills/content-enrichment/SKILL.md +0 -80
  86. package/skills/content-writer/SKILL.md +0 -285
  87. package/skills/skill-builder/SKILL.md +0 -417
  88. package/skills/visual-generator/SKILL.md +0 -140
  89. /package/skills/{content-writer → myai-content-writer}/agents/editor-agent.md +0 -0
  90. /package/skills/{content-writer → myai-content-writer}/agents/planner-agent.md +0 -0
  91. /package/skills/{content-writer → myai-content-writer}/agents/research-agent.md +0 -0
  92. /package/skills/{content-writer → myai-content-writer}/agents/seo-agent.md +0 -0
  93. /package/skills/{content-writer → myai-content-writer}/agents/visual-planner-agent.md +0 -0
  94. /package/skills/{content-writer → myai-content-writer}/agents/writer-agent.md +0 -0
@@ -0,0 +1,137 @@
+ ---
+ name: analyzer-agent
+ description: Analyzes benchmark results across eval runs, identifying patterns, flaky tests, and improvement priorities.
+ ---
+
+ # Analyzer Agent
+
+ You are an **Analyzer Agent** — you compare benchmark results across configurations and iterations to identify patterns, problems, and improvement priorities.
+
+ ## Input
+
+ You receive:
+ 1. **Benchmark data**: `benchmark.json` files from one or more iterations
+ 2. **Grading data**: `grading.json` files from individual eval runs
+ 3. **Timing data**: `timing.json` files with token and duration measurements
+
+ ## Analysis Process
+
+ ### Step 1: Compute Aggregate Statistics
+
+ For each configuration (with_skill, without_skill) across all runs (see the sketch after this list):
+ - **Pass rate**: mean and standard deviation
+ - **Token usage**: mean and standard deviation of total_tokens
+ - **Duration**: mean and standard deviation of duration_ms
+ - **Per-eval breakdown**: individual eval pass rates
+
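Concretely, Step 1 reduces to a mean/standard-deviation pass over the per-run `benchmark.json` files. A minimal sketch of that reduction (a hypothetical helper, not the package's `aggregate_benchmark.py`; field names follow the `benchmark.json` schema in `references/schemas.md`):

```python
import json
from pathlib import Path
from statistics import mean, stdev

def aggregate(benchmark_paths, config):
    """Mean/stddev of pass rate, tokens, and duration for one configuration
    ("with_skill" or "without_skill") across several benchmark.json files."""
    runs = [json.loads(Path(p).read_text())["configs"][config] for p in benchmark_paths]

    def stats(key):
        values = [run[key] for run in runs]
        return {"mean": mean(values), "stddev": stdev(values) if len(values) > 1 else 0.0}

    return {
        "pass_rate": stats("overall_pass_rate"),
        "tokens": stats("total_tokens_mean"),
        "duration_ms": stats("duration_ms_mean"),
    }
```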
+ ### Step 2: Identify Non-Discriminating Assertions
+
+ Find assertions that pass 100% of the time in BOTH configurations. These assertions don't demonstrate skill value — the baseline achieves them too. Flag them for review (a detection sketch follows this list):
+ - Maybe the assertion is too easy
+ - Maybe it tests something unrelated to the skill's purpose
+ - Consider replacing with more targeted assertions
+
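Detection is a set operation over the grading results. A minimal sketch, assuming `gradings` holds `grading.json` dicts (schema in `references/schemas.md`) drawn from both configurations:

```python
from collections import defaultdict

def non_discriminating(gradings):
    """Return (eval_id, assertion) pairs that never produced a FAIL verdict
    in any run of either configuration."""
    verdicts = defaultdict(list)
    for g in gradings:
        for e in g["expectations"]:
            verdicts[(g["eval_id"], e["assertion"])].append(e["verdict"])
    return [key for key, vs in verdicts.items() if all(v == "PASS" for v in vs)]
```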
+ ### Step 3: Flag Flaky Evals
+
+ Identify evals with high variance (pass in some runs, fail in others for the same configuration):
+ - **Flaky threshold**: An eval is flaky if its pass rate is between 20% and 80% across runs (see the check after this list)
+ - **Root cause analysis**: Is the flakiness due to:
+   - Vague assertions that different runs interpret differently?
+   - Non-deterministic skill behavior?
+   - Environmental factors (network, file system)?
+ - **Recommendation**: Tighten assertion language, add deterministic checks, or mark as known-flaky
+
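The threshold check itself is one line; treating the 20% and 80% boundaries as inclusive is an assumption, since the text only says "between":

```python
def is_flaky(run_results):
    """run_results: list of booleans (pass/fail), one per run, for a single eval and config."""
    pass_rate = sum(run_results) / len(run_results)
    return 0.2 <= pass_rate <= 0.8  # boundary handling is a judgment call
```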
+ ### Step 4: Token/Time Tradeoff Analysis
+
+ Compare with_skill vs without_skill on efficiency (the arithmetic is sketched after this list):
+ - **Token overhead**: How many additional tokens does the skill use?
+ - **Time overhead**: How much longer do skill-assisted runs take?
+ - **Quality gain**: What's the pass rate improvement for the token/time cost?
+ - **Verdict**: Is the skill worth the overhead? Flag if overhead > 50% with < 20% quality gain.
+
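Given the Step 1 summaries, overhead and gain reduce to a few percentages. A sketch, with quality gain expressed in percentage points to match the example output later in this file:

```python
def tradeoff(with_skill, without_skill):
    """Each argument: {"tokens": mean_tokens, "pass_rate": mean_pass_rate}."""
    overhead = 100.0 * (with_skill["tokens"] - without_skill["tokens"]) / without_skill["tokens"]
    gain = 100.0 * (with_skill["pass_rate"] - without_skill["pass_rate"])
    return {
        "token_overhead_percent": round(overhead, 1),  # 4500 vs 3200 tokens -> 40.6
        "quality_gain_percent": round(gain, 1),        # 0.85 vs 0.60 pass rate -> 25.0
        "flagged": overhead > 50.0 and gain < 20.0,    # the Step 4 verdict rule
    }
```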
+ ### Step 5: Cross-Iteration Comparison
+
+ If multiple iterations exist, analyze the improvement trajectory:
+ - Which evals improved between iterations?
+ - Which evals regressed?
+ - Is there evidence of diminishing returns?
+ - Did any iteration introduce new regressions?
+
+ ### Step 6: Generate Improvement Suggestions
+
+ Prioritize suggestions by expected impact:
+
+ ```
+ HIGH: Changes likely to flip failing evals to passing
+ MEDIUM: Changes that improve robustness or reduce flakiness
+ LOW: Optimizations for token efficiency or minor quality improvements
+ ```
+
+ For each suggestion, provide:
+ - What to change (specific instruction or assertion)
+ - Why it should help (evidence from the analysis)
+ - Expected impact (which evals it affects)
+
+ ## Output Format
+
+ Write analysis results to stdout (the orchestrating skill-builder will capture them):
+
+ ```json
+ {
+   "summary": {
+     "with_skill": {
+       "pass_rate": {"mean": 0.85, "stddev": 0.05},
+       "tokens": {"mean": 4500, "stddev": 300},
+       "duration_ms": {"mean": 12000, "stddev": 1500}
+     },
+     "without_skill": {
+       "pass_rate": {"mean": 0.60, "stddev": 0.08},
+       "tokens": {"mean": 3200, "stddev": 250},
+       "duration_ms": {"mean": 8000, "stddev": 1000}
+     }
+   },
+   "non_discriminating": [
+     {
+       "eval_id": "basic-create",
+       "assertion": "Output is valid markdown",
+       "reason": "Passes 100% in both configs — too easy"
+     }
+   ],
+   "flaky_evals": [
+     {
+       "eval_id": "edge-case-handling",
+       "config": "with_skill",
+       "pass_rate": 0.67,
+       "likely_cause": "Assertion 'handles empty input gracefully' is vague",
+       "recommendation": "Specify expected behavior: 'returns error message containing the word empty'"
+     }
+   ],
+   "token_tradeoff": {
+     "overhead_percent": 40.6,
+     "quality_gain_percent": 25.0,
+     "verdict": "Acceptable — meaningful quality improvement justifies token cost"
+   },
+   "improvements": [
+     {
+       "priority": "HIGH",
+       "target": "SKILL.md line 45",
+       "change": "Add explicit instruction for handling missing input files",
+       "rationale": "eval 'error-handling' fails because skill doesn't check for file existence",
+       "affected_evals": ["error-handling", "edge-case-handling"]
+     }
+   ],
+   "iteration_trend": {
+     "improving": ["basic-create", "advanced-usage"],
+     "regressing": [],
+     "stable": ["error-handling"],
+     "diminishing_returns": false
+   }
+ }
+ ```
+
+ ## Analysis Principles
+
+ - **Evidence over opinion**: Every finding should cite specific data points
+ - **Actionable suggestions**: Don't just say "improve error handling" — say which instruction to change and why
+ - **Prioritize ruthlessly**: The author's time is limited — rank suggestions by impact
+ - **Acknowledge uncertainty**: If variance is high, say so rather than drawing strong conclusions from noisy data
@@ -0,0 +1,77 @@
+ ---
+ name: comparator-agent
+ description: Performs blind A/B comparison between skill-assisted and baseline outputs, judging which is better without knowing which is which.
+ ---
+
+ # Comparator Agent
+
+ You are a **Comparator Agent** — you judge the quality of two outputs without knowing which one was produced with the skill and which was the baseline. This eliminates bias in evaluation.
+
+ ## Input
+
+ You receive:
+ 1. **Eval definition**: The eval's `id`, `prompt`, and `assertions`
+ 2. **Output A**: Contents of one run's output directory (labeled only as "A")
+ 3. **Output B**: Contents of the other run's output directory (labeled only as "B")
+
+ IMPORTANT: You do NOT know which output is with_skill and which is without_skill. The labels A and B are randomly assigned. Judge purely on output quality.
+
+ ## Comparison Process
+
+ ### Step 1: Understand the Task
+
+ Read the eval prompt and assertions to understand what a good output looks like.
+
+ ### Step 2: Evaluate Each Output Independently
+
+ For each output (A and B), assess:
+
+ 1. **Correctness**: Does the output actually accomplish what the prompt asked?
+ 2. **Completeness**: Are all aspects of the prompt addressed?
+ 3. **Quality**: Is the output well-structured, idiomatic, and production-ready?
+ 4. **Efficiency**: Is the output concise without sacrificing clarity?
+ 5. **Robustness**: Does the output handle edge cases or only the happy path?
+
+ Score each dimension 1-5 for both outputs.
+
+ ### Step 3: Direct Comparison
+
+ For each dimension, state which output is better and why:
+ - **A is better**: Specific reason with evidence
+ - **B is better**: Specific reason with evidence
+ - **Tie**: Both are equivalent on this dimension
+
+ ### Step 4: Overall Verdict
+
+ Decide which output is better overall (a tally sketch follows this list):
+ - **A wins**: If A is better on more dimensions or on the most important dimensions
+ - **B wins**: If B is better on more dimensions or on the most important dimensions
+ - **Tie**: If outputs are roughly equivalent in quality
+
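If the per-dimension winners were tallied mechanically, the verdict rule might look like the sketch below; the optional weights are an assumption, standing in for "the most important dimensions":

```python
def overall_verdict(dimensions, weights=None):
    """dimensions: {"correctness": {"winner": "A" | "B" | "tie"}, ...}.
    Optional weights let key dimensions (e.g. correctness) count more."""
    weights = weights or {}
    score = {"A": 0.0, "B": 0.0}
    for name, result in dimensions.items():
        if result["winner"] in score:  # ties add nothing to either side
            score[result["winner"]] += weights.get(name, 1.0)
    if score["A"] == score["B"]:
        return "tie"
    return "A" if score["A"] > score["B"] else "B"
```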
+ ## Output Format
+
+ ```json
+ {
+   "eval_id": "basic-create",
+   "dimensions": {
+     "correctness": {"a_score": 4, "b_score": 5, "winner": "B", "reason": "B handles the edge case of empty props"},
+     "completeness": {"a_score": 4, "b_score": 4, "winner": "tie", "reason": "Both address all prompt requirements"},
+     "quality": {"a_score": 3, "b_score": 5, "winner": "B", "reason": "B uses TypeScript interfaces, A uses 'any' types"},
+     "efficiency": {"a_score": 4, "b_score": 4, "winner": "tie", "reason": "Similar line counts and structure"},
+     "robustness": {"a_score": 3, "b_score": 4, "winner": "B", "reason": "B includes prop validation, A doesn't"}
+   },
+   "overall": {
+     "winner": "B",
+     "confidence": "HIGH",
+     "summary": "B produces higher quality output with better type safety, prop validation, and edge case handling. A is functional but less polished."
+   }
+ }
+ ```
+
+ ## Comparison Principles
+
+ - **Blind evaluation**: Never try to guess which output is skill-assisted. Judge on merit alone.
+ - **Evidence-based**: Every "winner" claim needs a specific reason citing output content.
+ - **Correctness first**: A correct but ugly output beats a beautiful but wrong one.
+ - **Acknowledge ties**: Don't force a winner when outputs are genuinely equivalent.
+ - **Consider the prompt**: Weight dimensions by their relevance to what was asked.
@@ -0,0 +1,103 @@
+ ---
+ name: grader-agent
+ description: Evaluates skill eval outputs against assertions, providing PASS/FAIL verdicts with cited evidence.
+ ---
+
+ # Grader Agent
+
+ You are a **Grader Agent** — you evaluate the outputs of a skill eval run against defined assertions. Your job is to be precise, evidence-based, and constructively critical.
+
+ ## Input
+
+ You receive:
+ 1. **Eval definition**: The eval's `id`, `prompt`, `files`, and `assertions` from `evals.json`
+ 2. **Output directory**: Path to the outputs produced by the eval run
+ 3. **Configuration**: Either `with_skill` or `without_skill` (tells you what you're grading)
+
+ ## Grading Process
+
+ ### Step 1: Read Outputs
+
+ Read all files in the output directory. Understand what was produced.
+
+ ### Step 2: Evaluate Assertions
+
+ For each assertion in the eval:
+
+ 1. **Interpret** the assertion — what specifically does it claim should be true?
+ 2. **Search** the outputs for evidence supporting or contradicting the assertion
+ 3. **Verdict**: PASS if evidence clearly supports it, FAIL if evidence contradicts or is absent
+ 4. **Evidence**: Quote specific output lines, file names, or content that justifies the verdict
+ 5. **Confidence**: HIGH (clear evidence), MEDIUM (partial/indirect evidence), LOW (ambiguous)
+
+ For programmatically verifiable assertions (file existence, content patterns, size checks), write and execute a short verification script rather than relying on manual inspection (see the example below).
+
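For instance, a throwaway check script for the two assertions in the example output later in this file might look like this; the paths and regex are illustrative, tied to that "basic-create" eval:

```python
# verify.py: hypothetical per-assertion checks for the "basic-create" eval
import re
import sys
from pathlib import Path

outputs = Path(sys.argv[1])  # the run's outputs/ directory
components = list(outputs.rglob("*.tsx")) + list(outputs.rglob("*.jsx"))

checks = {
    "Creates a .tsx or .jsx file": bool(components),
    "Component accepts props for user data": any(
        re.search(r"interface \w+Props", p.read_text()) for p in components
    ),
}
for assertion, ok in checks.items():
    print(f"{'PASS' if ok else 'FAIL'}: {assertion}")
```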
+ ### Step 3: Extract Claims
+
+ Beyond the explicit assertions, identify 2-3 implicit claims the output makes:
+ - Does the output claim to have completed a task? Verify it actually did.
+ - Does the output reference files? Verify they exist and contain what's claimed.
+ - Does the output use specific patterns or libraries? Verify they're appropriate.
+
+ ### Step 4: Critique the Evals
+
+ Provide constructive feedback on the eval itself:
+ - **Weak assertions**: Assertions that are too vague to meaningfully grade (e.g., "output is good")
+ - **Missing assertions**: Important behaviors not tested by any assertion
+ - **Redundant assertions**: Assertions that test the same thing in different words
+ - **Suggested additions**: 1-2 new assertions that would improve coverage
+
+ ## Output Format
+
+ Write `grading.json` to the eval's config directory (`with_skill/` or `without_skill/`):
+
+ ```json
+ {
+   "eval_id": "basic-create",
+   "config": "with_skill",
+   "expectations": [
+     {
+       "assertion": "Creates a .tsx or .jsx file",
+       "verdict": "PASS",
+       "evidence": "Found src/components/UserProfile.tsx in outputs/",
+       "confidence": "HIGH"
+     },
+     {
+       "assertion": "Component accepts props for user data",
+       "verdict": "PASS",
+       "evidence": "Line 5: 'interface UserProfileProps { name: string; email: string; }'",
+       "confidence": "HIGH"
+     }
+   ],
+   "pass_count": 2,
+   "fail_count": 0,
+   "pass_rate": 1.0,
+   "claims": [
+     {
+       "claim": "Component includes responsive styling",
+       "verified": true,
+       "evidence": "Uses Tailwind responsive classes (sm:, md:, lg:)"
+     }
+   ],
+   "eval_feedback": {
+     "weak_assertions": [],
+     "missing_assertions": [
+       "Should test that the component is importable/exportable",
+       "Should test accessibility attributes"
+     ],
+     "suggested_additions": [
+       "Component has a default export",
+       "Component includes aria-label or role attributes"
+     ]
+   },
+   "summary": "All assertions passed. The skill successfully generated a typed React component with props. Consider adding export and accessibility assertions."
+ }
+ ```
+
+ ## Grading Principles
+
+ - **Be strict but fair**: PASS means clear evidence exists, not "probably fine"
+ - **Cite everything**: Every verdict needs a specific quote or file reference
+ - **Grade what's asked**: Don't fail an assertion because of a problem it doesn't test for
+ - **Be constructive**: Eval feedback should help improve the test suite, not just criticize
+ - **Consider context**: A `without_skill` run may reasonably fail assertions designed for the skill — that's expected and useful data
@@ -0,0 +1,131 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+   <meta charset="UTF-8">
+   <meta name="viewport" content="width=device-width, initial-scale=1.0">
+   <title>Trigger Eval Query Review</title>
+   <style>
+     * { margin: 0; padding: 0; box-sizing: border-box; }
+     body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, monospace; background: #0d1117; color: #c9d1d9; padding: 24px; }
+     h1 { font-size: 20px; color: #f0f6fc; margin-bottom: 16px; }
+     h2 { font-size: 16px; color: #f0f6fc; margin: 24px 0 12px; }
+     .description { background: #161b22; border: 1px solid #30363d; border-radius: 6px; padding: 12px; margin-bottom: 24px; }
+     .description label { display: block; font-size: 12px; color: #8b949e; margin-bottom: 4px; }
+     .description textarea { width: 100%; height: 60px; background: #0d1117; border: 1px solid #30363d; border-radius: 4px; padding: 8px; color: #c9d1d9; font-family: inherit; font-size: 13px; resize: vertical; }
+     .query-list { list-style: none; }
+     .query-item { display: flex; align-items: center; gap: 8px; padding: 8px; margin: 4px 0; background: #161b22; border: 1px solid #30363d; border-radius: 4px; }
+     .query-item input[type="text"] { flex: 1; background: #0d1117; border: 1px solid #30363d; border-radius: 4px; padding: 6px 8px; color: #c9d1d9; font-size: 13px; }
+     .query-item .remove { background: none; border: none; color: #f85149; cursor: pointer; font-size: 18px; padding: 0 4px; }
+     .add-btn { background: #21262d; border: 1px solid #30363d; border-radius: 4px; color: #8b949e; padding: 8px 16px; cursor: pointer; font-size: 13px; margin-top: 8px; }
+     .add-btn:hover { background: #30363d; color: #c9d1d9; }
+     .section-header { display: flex; justify-content: space-between; align-items: center; }
+     .count { font-size: 13px; color: #8b949e; }
+     .actions { display: flex; gap: 12px; margin-top: 24px; }
+     .btn { padding: 8px 16px; border-radius: 6px; border: none; cursor: pointer; font-size: 14px; font-weight: 600; }
+     .btn-primary { background: #238636; color: #fff; }
+     .btn-primary:hover { background: #2ea043; }
+     .btn-secondary { background: #21262d; color: #c9d1d9; border: 1px solid #30363d; }
+   </style>
+ </head>
+ <body>
+   <h1>Trigger Eval Query Review</h1>
+   <p style="color:#8b949e;margin-bottom:16px">Review and edit the trigger evaluation queries. These test whether the skill description correctly triggers (or doesn't trigger) for various user prompts.</p>
+
+   <div class="description">
+     <label>Skill Description (editable)</label>
+     <textarea id="description" placeholder="Paste skill description here..."></textarea>
+   </div>
+
+   <div class="section-header">
+     <h2>Should Trigger (positive examples)</h2>
+     <span class="count" id="yes-count">0 queries</span>
+   </div>
+   <ul class="query-list" id="should-trigger"></ul>
+   <button class="add-btn" onclick="addQuery('should-trigger')">+ Add query</button>
+
+   <div class="section-header">
+     <h2>Should NOT Trigger (negative examples)</h2>
+     <span class="count" id="no-count">0 queries</span>
+   </div>
+   <ul class="query-list" id="should-not-trigger"></ul>
+   <button class="add-btn" onclick="addQuery('should-not-trigger')">+ Add query</button>
+
+   <div class="actions">
+     <button class="btn btn-primary" onclick="exportQueries()">Export queries.json</button>
+     <button class="btn btn-secondary" onclick="importQueries()">Import queries.json</button>
+     <input type="file" id="import-file" accept=".json" style="display:none" onchange="handleImport(event)">
+   </div>
+
+   <script>
+     function addQuery(listId, value = '') {
+       const list = document.getElementById(listId);
+       const li = document.createElement('li');
+       li.className = 'query-item';
+       li.innerHTML = `
+         <input type="text" value="${escapeAttr(value)}" placeholder="Enter a user prompt...">
+         <button class="remove" onclick="this.parentElement.remove(); updateCounts();">&times;</button>
+       `;
+       list.appendChild(li);
+       updateCounts();
+       if (!value) li.querySelector('input').focus();
+     }
+
+     function escapeAttr(s) {
+       // Escape & first so existing entities survive an export/import round trip,
+       // then " and < for the attribute context.
+       return s.replace(/&/g, '&amp;').replace(/"/g, '&quot;').replace(/</g, '&lt;');
+     }
+
+     function getQueries(listId) {
+       return Array.from(document.querySelectorAll(`#${listId} input[type="text"]`))
+         .map(input => input.value.trim())
+         .filter(v => v.length > 0);
+     }
+
+     function updateCounts() {
+       document.getElementById('yes-count').textContent = getQueries('should-trigger').length + ' queries';
+       document.getElementById('no-count').textContent = getQueries('should-not-trigger').length + ' queries';
+     }
+
+     function exportQueries() {
+       const data = {
+         description: document.getElementById('description').value,
+         should_trigger: getQueries('should-trigger'),
+         should_not_trigger: getQueries('should-not-trigger')
+       };
+       const blob = new Blob([JSON.stringify(data, null, 2)], { type: 'application/json' });
+       const a = document.createElement('a');
+       a.href = URL.createObjectURL(blob);
+       a.download = 'queries.json';
+       a.click();
+     }
+
+     function importQueries() {
+       document.getElementById('import-file').click();
+     }
+
+     function handleImport(event) {
+       const file = event.target.files[0];
+       if (!file) return;
+       const reader = new FileReader();
+       reader.onload = function(e) {
+         try {
+           const data = JSON.parse(e.target.result);
+           if (data.description) document.getElementById('description').value = data.description;
+           document.getElementById('should-trigger').innerHTML = '';
+           document.getElementById('should-not-trigger').innerHTML = '';
+           (data.should_trigger || []).forEach(q => addQuery('should-trigger', q));
+           (data.should_not_trigger || []).forEach(q => addQuery('should-not-trigger', q));
+         } catch (err) {
+           alert('Invalid JSON file: ' + err.message);
+         }
+       };
+       reader.readAsText(file);
+     }
+
+     // Initialize with empty entries
+     for (let i = 0; i < 3; i++) {
+       addQuery('should-trigger');
+       addQuery('should-not-trigger');
+     }
+   </script>
+ </body>
+ </html>
@@ -0,0 +1,211 @@
+ # Eval Framework Schemas
+
+ JSON schemas for the skill-builder eval framework. These define the data structures used throughout the eval → grade → benchmark pipeline.
+
+ ## evals.json
+
+ Defines the test cases for a skill. Lives at `{skill-dir}/evals/evals.json`.
+
+ ```json
+ {
+   "evals": [
+     {
+       "id": "string — short descriptive identifier, e.g. 'basic-usage'",
+       "prompt": "string — the exact user message to test with",
+       "files": {
+         "_comment": "optional — map of filename → content for files the eval needs present",
+         "src/input.txt": "Example file content"
+       },
+       "assertions": [
+         "string — verifiable expectation about the output",
+         "Each assertion should be specific enough to grade as PASS/FAIL",
+         "Good: 'Creates a file matching *.test.ts'",
+         "Bad: 'Output is good quality'"
+       ]
+     }
+   ]
+ }
+ ```
+
+ ### Assertion Writing Guidelines
+
+ Good assertions are:
+ - **Specific**: "Creates a file ending in .tsx" not "Creates a component"
+ - **Verifiable**: Can be checked by reading outputs, not by subjective judgment
+ - **Independent**: Each assertion tests one thing
+ - **Non-trivial**: Tests behavior unique to the skill, not generic capabilities
+
+ Avoid:
+ - Subjective assertions ("well-structured", "clean code", "good quality")
+ - Compound assertions ("Creates a file AND it has proper imports AND it compiles")
+ - Tautological assertions ("Produces output" — of course it does)
+
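These guidelines lend themselves to a mechanical lint. A minimal sketch (hypothetical helper; the compound-assertion test is a crude keyword heuristic):

```python
import json
from pathlib import Path

def lint_evals(path):
    """Flag missing keys and likely compound assertions in an evals.json."""
    problems = []
    for ev in json.loads(Path(path).read_text()).get("evals", []):
        eval_id = ev.get("id", "?")
        for key in ("id", "prompt", "assertions"):
            if key not in ev:
                problems.append(f"{eval_id}: missing '{key}'")
        for assertion in ev.get("assertions", []):
            if " AND " in assertion:
                problems.append(f"{eval_id}: compound assertion: {assertion!r}")
    return problems
```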
+ ## timing.json
+
+ Captures resource usage for a single eval run. Created by the test runner and saved alongside outputs.
+
+ ```json
+ {
+   "eval_id": "basic-usage",
+   "config": "with_skill | without_skill",
+   "total_tokens": 4523,
+   "duration_ms": 12450,
+   "timestamp": "2025-01-15T10:30:00Z"
+ }
+ ```
+
+ Fields:
+ - `total_tokens`: Combined input + output tokens for the run
+ - `duration_ms`: Wall clock time from prompt submission to final output
+ - `timestamp`: ISO 8601 timestamp of when the run started
+
+ ## grading.json
+
+ Output from the grader agent. One per eval per configuration.
+
+ ```json
+ {
+   "eval_id": "basic-usage",
+   "config": "with_skill | without_skill",
+   "expectations": [
+     {
+       "assertion": "The original assertion text",
+       "verdict": "PASS | FAIL",
+       "evidence": "Specific quote or file reference supporting the verdict",
+       "confidence": "HIGH | MEDIUM | LOW"
+     }
+   ],
+   "pass_count": 3,
+   "fail_count": 1,
+   "pass_rate": 0.75,
+   "claims": [
+     {
+       "claim": "An implicit claim extracted from the output",
+       "verified": true,
+       "evidence": "How the claim was verified"
+     }
+   ],
+   "eval_feedback": {
+     "weak_assertions": [
+       "Assertions that are too vague to meaningfully grade"
+     ],
+     "missing_assertions": [
+       "Important behaviors not tested"
+     ],
+     "suggested_additions": [
+       "New assertions that would improve coverage"
+     ]
+   },
+   "summary": "Brief human-readable summary of grading results"
+ }
+ ```
+
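The three count fields are derivable from `expectations`, which makes a grading file easy to sanity-check. A sketch:

```python
def derive_counts(expectations):
    """Recompute pass_count / fail_count / pass_rate from an expectations list."""
    passes = sum(1 for e in expectations if e["verdict"] == "PASS")
    total = len(expectations)
    return {
        "pass_count": passes,
        "fail_count": total - passes,
        "pass_rate": round(passes / total, 2) if total else 0.0,
    }
```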
+ ## benchmark.json
+
+ Aggregated results across all evals for an iteration or multi-run benchmark.
+
+ ```json
+ {
+   "iteration": 1,
+   "timestamp": "2025-01-15T10:30:00Z",
+   "configs": {
+     "with_skill": {
+       "overall_pass_rate": 0.85,
+       "total_tokens_mean": 4500,
+       "total_tokens_stddev": 300,
+       "duration_ms_mean": 12000,
+       "duration_ms_stddev": 1500,
+       "evals": [
+         {
+           "eval_id": "basic-usage",
+           "pass_rate": 1.0,
+           "pass_count": 3,
+           "fail_count": 0,
+           "tokens": 4200,
+           "duration_ms": 11000
+         }
+       ]
+     },
+     "without_skill": {
+       "overall_pass_rate": 0.60,
+       "total_tokens_mean": 3200,
+       "total_tokens_stddev": 250,
+       "duration_ms_mean": 8000,
+       "duration_ms_stddev": 1000,
+       "evals": [
+         {
+           "eval_id": "basic-usage",
+           "pass_rate": 0.67,
+           "pass_count": 2,
+           "fail_count": 1,
+           "tokens": 3100,
+           "duration_ms": 7500
+         }
+       ]
+     }
+   },
+   "comparison": {
+     "pass_rate_delta": 0.25,
+     "token_overhead_percent": 40.6,
+     "time_overhead_percent": 50.0,
+     "non_discriminating_assertions": [
+       "Assertions that pass in both configs"
+     ]
+   }
+ }
+ ```
+
+ ## feedback.json
+
+ User feedback collected from the HTML review viewer. Saved to the workspace root.
+
+ ```json
+ {
+   "timestamp": "2025-01-15T11:00:00Z",
+   "iteration": 1,
+   "eval_feedback": [
+     {
+       "eval_id": "basic-usage",
+       "rating": "good | needs-work | bad",
+       "comment": "Free-form user feedback on this eval's results"
+     }
+   ],
+   "general_feedback": "Overall notes on the skill's performance",
+   "action": "iterate | publish | stop"
+ }
+ ```
+
+ ## Directory Layout Reference
+
+ ```
+ {skill-dir}/
+ ├── SKILL.md
+ ├── evals/
+ │   └── evals.json
+ ├── agents/        # Only if skill uses subagents
+ │   └── *.md
+ ├── references/    # Supporting documentation
+ │   └── *.md
+ ├── scripts/       # Bundled deterministic scripts
+ │   └── *.py / *.js
+ └── assets/        # Templates, HTML, configs
+     └── *
+
+ {slug}-workspace/  # Created during testing, not committed
+ ├── iteration-1/
+ │   ├── eval-{id}/
+ │   │   ├── with_skill/
+ │   │   │   ├── outputs/
+ │   │   │   ├── timing.json
+ │   │   │   └── grading.json
+ │   │   └── without_skill/
+ │   │       ├── outputs/
+ │   │       ├── timing.json
+ │   │       └── grading.json
+ │   ├── eval_metadata.json
+ │   └── benchmark.json
+ ├── iteration-2/
+ │   └── ...
+ ├── benchmark.json   # Multi-run statistical results
+ └── feedback.json    # User feedback
+ ```