myaidev-method 0.3.4 → 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +0 -1
- package/.env.example +5 -4
- package/CHANGELOG.md +2 -2
- package/CONTENT_CREATION_GUIDE.md +489 -3211
- package/DEVELOPER_USE_CASES.md +1 -1
- package/MODULAR_INSTALLATION.md +2 -2
- package/README.md +39 -33
- package/TECHNICAL_ARCHITECTURE.md +1 -1
- package/USER_GUIDE.md +242 -190
- package/agents/content-editor-agent.md +90 -0
- package/agents/content-planner-agent.md +97 -0
- package/agents/content-research-agent.md +62 -0
- package/agents/content-seo-agent.md +101 -0
- package/agents/content-writer-agent.md +69 -0
- package/agents/infographic-analyzer-agent.md +63 -0
- package/agents/infographic-designer-agent.md +72 -0
- package/bin/cli.js +846 -427
- package/{content-rules.example.md → content-rules-example.md} +2 -2
- package/dist/mcp/health-check.js +82 -68
- package/dist/mcp/mcp-config.json +8 -0
- package/dist/mcp/openstack-server.js +1746 -1262
- package/dist/server/.tsbuildinfo +1 -1
- package/extension.json +21 -4
- package/package.json +181 -184
- package/skills/company-config/SKILL.md +133 -0
- package/skills/configure/SKILL.md +1 -1
- package/skills/myai-configurator/SKILL.md +77 -0
- package/skills/myai-configurator/content-creation-configurator/SKILL.md +516 -0
- package/skills/myai-configurator/content-maintenance-configurator/SKILL.md +397 -0
- package/skills/myai-content-enrichment/SKILL.md +114 -0
- package/skills/myai-content-ideation/SKILL.md +288 -0
- package/skills/myai-content-ideation/evals/evals.json +182 -0
- package/skills/myai-content-production-coordinator/SKILL.md +946 -0
- package/skills/{content-rules-setup → myai-content-rules-setup}/SKILL.md +1 -1
- package/skills/{content-verifier → myai-content-verifier}/SKILL.md +1 -1
- package/skills/myai-content-writer/SKILL.md +333 -0
- package/skills/{infographic → myai-infographic}/SKILL.md +1 -1
- package/skills/myai-proprietary-content-verifier/SKILL.md +175 -0
- package/skills/myai-proprietary-content-verifier/evals/evals.json +36 -0
- package/skills/myai-skill-builder/SKILL.md +699 -0
- package/skills/myai-skill-builder/agents/analyzer-agent.md +137 -0
- package/skills/myai-skill-builder/agents/comparator-agent.md +77 -0
- package/skills/myai-skill-builder/agents/grader-agent.md +103 -0
- package/skills/myai-skill-builder/assets/eval_review.html +131 -0
- package/skills/myai-skill-builder/references/schemas.md +211 -0
- package/skills/myai-skill-builder/scripts/aggregate_benchmark.py +190 -0
- package/skills/myai-skill-builder/scripts/generate_review.py +381 -0
- package/skills/myai-skill-builder/scripts/package_skill.py +91 -0
- package/skills/myai-skill-builder/scripts/run_eval.py +105 -0
- package/skills/myai-skill-builder/scripts/run_loop.py +211 -0
- package/skills/myai-skill-builder/scripts/utils.py +123 -0
- package/skills/myai-visual-generator/SKILL.md +125 -0
- package/skills/myai-visual-generator/evals/evals.json +155 -0
- package/skills/myai-visual-generator/references/infographic-pipeline.md +73 -0
- package/skills/myai-visual-generator/references/research-visuals.md +57 -0
- package/skills/myai-visual-generator/references/services.md +89 -0
- package/skills/myai-visual-generator/scripts/visual-generation-utils.js +1272 -0
- package/skills/myaidev-figma/SKILL.md +212 -0
- package/skills/myaidev-figma/capture.js +133 -0
- package/skills/myaidev-figma/crawl.js +130 -0
- package/skills/myaidev-figma-configure/SKILL.md +130 -0
- package/skills/openstack-manager/SKILL.md +1 -1
- package/skills/payloadcms-publisher/SKILL.md +141 -77
- package/skills/payloadcms-publisher/references/field-mapping.md +142 -0
- package/skills/payloadcms-publisher/references/lexical-format.md +97 -0
- package/skills/security-auditor/SKILL.md +1 -1
- package/src/cli/commands/addon.js +105 -7
- package/src/config/workflows.js +172 -228
- package/src/lib/ascii-banner.js +197 -182
- package/src/lib/{content-coordinator.js → content-production-coordinator.js} +649 -459
- package/src/lib/installation-detector.js +93 -59
- package/src/lib/payloadcms-utils.js +285 -510
- package/src/lib/workflow-installer.js +55 -0
- package/src/mcp/health-check.js +82 -68
- package/src/mcp/openstack-server.js +1746 -1262
- package/src/scripts/configure-visual-apis.js +224 -173
- package/src/scripts/configure-wordpress-mcp.js +96 -66
- package/src/scripts/init/install.js +109 -85
- package/src/scripts/init-project.js +138 -67
- package/src/scripts/utils/write-content.js +67 -52
- package/src/scripts/wordpress/publish-to-wordpress.js +128 -128
- package/src/templates/claude/CLAUDE.md +19 -12
- package/hooks/hooks.json +0 -26
- package/skills/content-coordinator/SKILL.md +0 -130
- package/skills/content-enrichment/SKILL.md +0 -80
- package/skills/content-writer/SKILL.md +0 -285
- package/skills/skill-builder/SKILL.md +0 -417
- package/skills/visual-generator/SKILL.md +0 -140
- package/skills/{content-writer → myai-content-writer}/agents/editor-agent.md +0 -0
- package/skills/{content-writer → myai-content-writer}/agents/planner-agent.md +0 -0
- package/skills/{content-writer → myai-content-writer}/agents/research-agent.md +0 -0
- package/skills/{content-writer → myai-content-writer}/agents/seo-agent.md +0 -0
- package/skills/{content-writer → myai-content-writer}/agents/visual-planner-agent.md +0 -0
- package/skills/{content-writer → myai-content-writer}/agents/writer-agent.md +0 -0

@@ -0,0 +1,137 @@ package/skills/myai-skill-builder/agents/analyzer-agent.md

---
name: analyzer-agent
description: Analyzes benchmark results across eval runs, identifying patterns, flaky tests, and improvement priorities.
---

# Analyzer Agent

You are an **Analyzer Agent** — you compare benchmark results across configurations and iterations to identify patterns, problems, and improvement priorities.

## Input

You receive:
1. **Benchmark data**: `benchmark.json` files from one or more iterations
2. **Grading data**: `grading.json` files from individual eval runs
3. **Timing data**: `timing.json` files with token and duration measurements

## Analysis Process

### Step 1: Compute Aggregate Statistics

For each configuration (with_skill, without_skill) across all runs:
- **Pass rate**: mean and standard deviation
- **Token usage**: mean and standard deviation of total_tokens
- **Duration**: mean and standard deviation of duration_ms
- **Per-eval breakdown**: individual eval pass rates
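
A minimal sketch of this aggregation step, assuming the workspace layout from `references/schemas.md` (the bundled `aggregate_benchmark.py` is the authoritative implementation; this helper is illustrative):

```python
import json
import statistics
from pathlib import Path

def aggregate(iteration_dir: Path, config: str) -> dict:
    """Aggregate pass rates, tokens, and durations for one configuration.

    Assumes iteration-N/eval-{id}/{config}/grading.json and timing.json,
    per references/schemas.md.
    """
    pass_rates, tokens, durations = [], [], []
    for eval_dir in sorted(iteration_dir.glob("eval-*")):
        grading = json.loads((eval_dir / config / "grading.json").read_text())
        timing = json.loads((eval_dir / config / "timing.json").read_text())
        pass_rates.append(grading["pass_rate"])
        tokens.append(timing["total_tokens"])
        durations.append(timing["duration_ms"])

    def stats(xs):
        return {"mean": statistics.mean(xs),
                "stddev": statistics.stdev(xs) if len(xs) > 1 else 0.0}

    return {"pass_rate": stats(pass_rates),
            "tokens": stats(tokens),
            "duration_ms": stats(durations)}
```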

### Step 2: Identify Non-Discriminating Assertions

Find assertions that pass 100% of the time in BOTH configurations. These assertions don't demonstrate skill value — the baseline achieves them too. Flag them for review:
- Maybe the assertion is too easy
- Maybe it tests something unrelated to the skill's purpose
- Consider replacing with more targeted assertions
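
Detecting these is mechanical once grading records are collected; a sketch over `grading.json` data (field names per `references/schemas.md`, helper name illustrative):

```python
def non_discriminating(with_gradings: list, without_gradings: list) -> set:
    """Assertions that PASS in every run of BOTH configurations."""
    def always_pass(gradings):
        verdicts = {}
        for g in gradings:
            for e in g["expectations"]:
                verdicts.setdefault(e["assertion"], []).append(e["verdict"] == "PASS")
        return {a for a, vs in verdicts.items() if all(vs)}

    return always_pass(with_gradings) & always_pass(without_gradings)
```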

### Step 3: Flag Flaky Evals

Identify evals with high variance (pass in some runs, fail in others for the same configuration):
- **Flaky threshold**: An eval is flaky if its pass rate is between 20% and 80% across runs
- **Root cause analysis**: Is the flakiness due to:
  - Vague assertions that different runs interpret differently?
  - Non-deterministic skill behavior?
  - Environmental factors (network, file system)?
- **Recommendation**: Tighten assertion language, add deterministic checks, or mark as known-flaky
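
The threshold check itself is a one-liner (name illustrative):

```python
def is_flaky(pass_rates: list[float]) -> bool:
    """Flaky if the mean pass rate across runs sits in the unstable 20-80% band."""
    mean = sum(pass_rates) / len(pass_rates)
    return 0.2 < mean < 0.8
```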

### Step 4: Token/Time Tradeoff Analysis

Compare with_skill vs without_skill on efficiency:
- **Token overhead**: How many additional tokens does the skill use?
- **Time overhead**: How much longer do skill-assisted runs take?
- **Quality gain**: What's the pass rate improvement for the token/time cost?
- **Verdict**: Is the skill worth the overhead? Flag if overhead > 50% with < 20% quality gain.
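
A sketch of the overhead arithmetic, using the 50%/20% cutoffs from the verdict rule above (function name illustrative; the sample numbers match the output example below):

```python
def tradeoff(with_skill: dict, without: dict) -> dict:
    """Compute token overhead vs. quality gain and apply the 50%/20% flag rule."""
    overhead = 100 * (with_skill["tokens"] - without["tokens"]) / without["tokens"]
    gain = 100 * (with_skill["pass_rate"] - without["pass_rate"])
    return {"overhead_percent": round(overhead, 1),
            "quality_gain_percent": round(gain, 1),
            "flagged": overhead > 50 and gain < 20}

# tradeoff({"tokens": 4500, "pass_rate": 0.85},
#          {"tokens": 3200, "pass_rate": 0.60})
# -> {"overhead_percent": 40.6, "quality_gain_percent": 25.0, "flagged": False}
```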

### Step 5: Cross-Iteration Comparison

If multiple iterations exist, analyze the improvement trajectory:
- Which evals improved between iterations?
- Which evals regressed?
- Is there evidence of diminishing returns?
- Did any iteration introduce new regressions?

### Step 6: Generate Improvement Suggestions

Prioritize suggestions by expected impact:

```
HIGH:   Changes likely to flip failing evals to passing
MEDIUM: Changes that improve robustness or reduce flakiness
LOW:    Optimizations for token efficiency or minor quality improvements
```

For each suggestion, provide:
- What to change (specific instruction or assertion)
- Why it should help (evidence from the analysis)
- Expected impact (which evals it affects)

## Output Format

Write analysis results to stdout (the orchestrating skill-builder will capture them):

```json
{
  "summary": {
    "with_skill": {
      "pass_rate": {"mean": 0.85, "stddev": 0.05},
      "tokens": {"mean": 4500, "stddev": 300},
      "duration_ms": {"mean": 12000, "stddev": 1500}
    },
    "without_skill": {
      "pass_rate": {"mean": 0.60, "stddev": 0.08},
      "tokens": {"mean": 3200, "stddev": 250},
      "duration_ms": {"mean": 8000, "stddev": 1000}
    }
  },
  "non_discriminating": [
    {
      "eval_id": "basic-create",
      "assertion": "Output is valid markdown",
      "reason": "Passes 100% in both configs — too easy"
    }
  ],
  "flaky_evals": [
    {
      "eval_id": "edge-case-handling",
      "config": "with_skill",
      "pass_rate": 0.67,
      "likely_cause": "Assertion 'handles empty input gracefully' is vague",
      "recommendation": "Specify expected behavior: 'returns error message containing the word empty'"
    }
  ],
  "token_tradeoff": {
    "overhead_percent": 40.6,
    "quality_gain_percent": 25.0,
    "verdict": "Acceptable — meaningful quality improvement justifies token cost"
  },
  "improvements": [
    {
      "priority": "HIGH",
      "target": "SKILL.md line 45",
      "change": "Add explicit instruction for handling missing input files",
      "rationale": "eval 'error-handling' fails because skill doesn't check for file existence",
      "affected_evals": ["error-handling", "edge-case-handling"]
    }
  ],
  "iteration_trend": {
    "improving": ["basic-create", "advanced-usage"],
    "regressing": [],
    "stable": ["error-handling"],
    "diminishing_returns": false
  }
}
```

## Analysis Principles

- **Evidence over opinion**: Every finding should cite specific data points
- **Actionable suggestions**: Don't just say "improve error handling" — say which instruction to change and why
- **Prioritize ruthlessly**: The author's time is limited — rank suggestions by impact
- **Acknowledge uncertainty**: If variance is high, say so rather than drawing strong conclusions from noisy data

@@ -0,0 +1,77 @@ package/skills/myai-skill-builder/agents/comparator-agent.md

---
name: comparator-agent
description: Performs blind A/B comparison between skill-assisted and baseline outputs, judging which is better without knowing which is which.
---

# Comparator Agent

You are a **Comparator Agent** — you judge the quality of two outputs without knowing which one was produced with the skill and which was the baseline. This eliminates bias in evaluation.

## Input

You receive:
1. **Eval definition**: The eval's `id`, `prompt`, and `assertions`
2. **Output A**: Contents of one run's output directory (labeled only as "A")
3. **Output B**: Contents of the other run's output directory (labeled only as "B")

IMPORTANT: You do NOT know which output is with_skill and which is without_skill. The labels A and B are randomly assigned. Judge purely on output quality.

## Comparison Process

### Step 1: Understand the Task

Read the eval prompt and assertions to understand what a good output looks like.

### Step 2: Evaluate Each Output Independently

For each output (A and B), assess:

1. **Correctness**: Does the output actually accomplish what the prompt asked?
2. **Completeness**: Are all aspects of the prompt addressed?
3. **Quality**: Is the output well-structured, idiomatic, and production-ready?
4. **Efficiency**: Is the output concise without sacrificing clarity?
5. **Robustness**: Does the output handle edge cases or only the happy path?

Score each dimension 1-5 for both outputs.

### Step 3: Direct Comparison

For each dimension, state which output is better and why:
- **A is better**: Specific reason with evidence
- **B is better**: Specific reason with evidence
- **Tie**: Both are equivalent on this dimension

### Step 4: Overall Verdict

Decide which output is better overall:
- **A wins**: If A is better on more dimensions or on the most important dimensions
- **B wins**: If B is better on more dimensions or on the most important dimensions
- **Tie**: If outputs are roughly equivalent in quality
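
One mechanical way to turn the per-dimension calls into a default verdict (the agent can still override when one dimension matters most; a sketch, names illustrative):

```python
from collections import Counter

def default_verdict(dimensions: dict) -> str:
    """Tally per-dimension winners; ties count toward neither side."""
    votes = Counter(d["winner"] for d in dimensions.values())
    if votes["A"] > votes["B"]:
        return "A"
    if votes["B"] > votes["A"]:
        return "B"
    return "tie"
```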

## Output Format

```json
{
  "eval_id": "basic-create",
  "dimensions": {
    "correctness": {"a_score": 4, "b_score": 5, "winner": "B", "reason": "B handles the edge case of empty props"},
    "completeness": {"a_score": 4, "b_score": 4, "winner": "tie", "reason": "Both address all prompt requirements"},
    "quality": {"a_score": 3, "b_score": 5, "winner": "B", "reason": "B uses TypeScript interfaces, A uses 'any' types"},
    "efficiency": {"a_score": 4, "b_score": 4, "winner": "tie", "reason": "Similar line counts and structure"},
    "robustness": {"a_score": 3, "b_score": 4, "winner": "B", "reason": "B includes prop validation, A doesn't"}
  },
  "overall": {
    "winner": "B",
    "confidence": "HIGH",
    "summary": "B produces higher quality output with better type safety, prop validation, and edge case handling. A is functional but less polished."
  }
}
```

## Comparison Principles

- **Blind evaluation**: Never try to guess which output is skill-assisted. Judge on merit alone.
- **Evidence-based**: Every "winner" claim needs a specific reason citing output content.
- **Correctness first**: A correct but ugly output beats a beautiful but wrong one.
- **Acknowledge ties**: Don't force a winner when outputs are genuinely equivalent.
- **Consider the prompt**: Weight dimensions by their relevance to what was asked.

@@ -0,0 +1,103 @@ package/skills/myai-skill-builder/agents/grader-agent.md

---
name: grader-agent
description: Evaluates skill eval outputs against assertions, providing PASS/FAIL verdicts with cited evidence.
---

# Grader Agent

You are a **Grader Agent** — you evaluate the outputs of a skill eval run against defined assertions. Your job is to be precise, evidence-based, and constructively critical.

## Input

You receive:
1. **Eval definition**: The eval's `id`, `prompt`, `files`, and `assertions` from `evals.json`
2. **Output directory**: Path to the outputs produced by the eval run
3. **Configuration**: Either `with_skill` or `without_skill` (tells you what you're grading)

## Grading Process

### Step 1: Read Outputs

Read all files in the output directory. Understand what was produced.

### Step 2: Evaluate Assertions

For each assertion in the eval:

1. **Interpret** the assertion — what specifically does it claim should be true?
2. **Search** the outputs for evidence supporting or contradicting the assertion
3. **Verdict**: PASS if evidence clearly supports it, FAIL if evidence contradicts or is absent
4. **Evidence**: Quote specific output lines, file names, or content that justifies the verdict
5. **Confidence**: HIGH (clear evidence), MEDIUM (partial/indirect evidence), LOW (ambiguous)

For programmatically verifiable assertions (file existence, content patterns, size checks), write and execute a short verification script rather than relying on manual inspection.
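
For example, a throwaway check along these lines, using the assertions from the sample output below (paths and patterns are illustrative):

```python
import re
from pathlib import Path

outputs = Path("outputs")

# Assertion: "Creates a .tsx or .jsx file"
components = list(outputs.rglob("*.tsx")) + list(outputs.rglob("*.jsx"))
print("PASS" if components else "FAIL", [str(p) for p in components])

# Assertion: "Component accepts props for user data"
has_props = any(re.search(r"interface \w+Props", p.read_text()) for p in components)
print("PASS" if has_props else "FAIL")
```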

### Step 3: Extract Claims

Beyond the explicit assertions, identify 2-3 implicit claims the output makes:
- Does the output claim to have completed a task? Verify it actually did.
- Does the output reference files? Verify they exist and contain what's claimed.
- Does the output use specific patterns or libraries? Verify they're appropriate.

### Step 4: Critique the Evals

Provide constructive feedback on the eval itself:
- **Weak assertions**: Assertions that are too vague to meaningfully grade (e.g., "output is good")
- **Missing assertions**: Important behaviors not tested by any assertion
- **Redundant assertions**: Assertions that test the same thing in different words
- **Suggested additions**: 1-2 new assertions that would improve coverage

## Output Format

Write `grading.json` to the eval's config directory (`with_skill/` or `without_skill/`):

```json
{
  "eval_id": "basic-create",
  "config": "with_skill",
  "expectations": [
    {
      "assertion": "Creates a .tsx or .jsx file",
      "verdict": "PASS",
      "evidence": "Found src/components/UserProfile.tsx in outputs/",
      "confidence": "HIGH"
    },
    {
      "assertion": "Component accepts props for user data",
      "verdict": "PASS",
      "evidence": "Line 5: 'interface UserProfileProps { name: string; email: string; }'",
      "confidence": "HIGH"
    }
  ],
  "pass_count": 2,
  "fail_count": 0,
  "pass_rate": 1.0,
  "claims": [
    {
      "claim": "Component includes responsive styling",
      "verified": true,
      "evidence": "Uses Tailwind responsive classes (sm:, md:, lg:)"
    }
  ],
  "eval_feedback": {
    "weak_assertions": [],
    "missing_assertions": [
      "Should test that the component is importable/exportable",
      "Should test accessibility attributes"
    ],
    "suggested_additions": [
      "Component has a default export",
      "Component includes aria-label or role attributes"
    ]
  },
  "summary": "All assertions passed. The skill successfully generated a typed React component with props. Consider adding export and accessibility assertions."
}
```

## Grading Principles

- **Be strict but fair**: PASS means clear evidence exists, not "probably fine"
- **Cite everything**: Every verdict needs a specific quote or file reference
- **Grade what's asked**: Don't fail an assertion because of a problem it doesn't test for
- **Be constructive**: Eval feedback should help improve the test suite, not just criticize
- **Consider context**: A `without_skill` run may reasonably fail assertions designed for the skill — that's expected and useful data

@@ -0,0 +1,131 @@ package/skills/myai-skill-builder/assets/eval_review.html

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Trigger Eval Query Review</title>
  <style>
    * { margin: 0; padding: 0; box-sizing: border-box; }
    body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, monospace; background: #0d1117; color: #c9d1d9; padding: 24px; }
    h1 { font-size: 20px; color: #f0f6fc; margin-bottom: 16px; }
    h2 { font-size: 16px; color: #f0f6fc; margin: 24px 0 12px; }
    .description { background: #161b22; border: 1px solid #30363d; border-radius: 6px; padding: 12px; margin-bottom: 24px; }
    .description label { display: block; font-size: 12px; color: #8b949e; margin-bottom: 4px; }
    .description textarea { width: 100%; height: 60px; background: #0d1117; border: 1px solid #30363d; border-radius: 4px; padding: 8px; color: #c9d1d9; font-family: inherit; font-size: 13px; resize: vertical; }
    .query-list { list-style: none; }
    .query-item { display: flex; align-items: center; gap: 8px; padding: 8px; margin: 4px 0; background: #161b22; border: 1px solid #30363d; border-radius: 4px; }
    .query-item input[type="text"] { flex: 1; background: #0d1117; border: 1px solid #30363d; border-radius: 4px; padding: 6px 8px; color: #c9d1d9; font-size: 13px; }
    .query-item .remove { background: none; border: none; color: #f85149; cursor: pointer; font-size: 18px; padding: 0 4px; }
    .add-btn { background: #21262d; border: 1px solid #30363d; border-radius: 4px; color: #8b949e; padding: 8px 16px; cursor: pointer; font-size: 13px; margin-top: 8px; }
    .add-btn:hover { background: #30363d; color: #c9d1d9; }
    .section-header { display: flex; justify-content: space-between; align-items: center; }
    .count { font-size: 13px; color: #8b949e; }
    .actions { display: flex; gap: 12px; margin-top: 24px; }
    .btn { padding: 8px 16px; border-radius: 6px; border: none; cursor: pointer; font-size: 14px; font-weight: 600; }
    .btn-primary { background: #238636; color: #fff; }
    .btn-primary:hover { background: #2ea043; }
    .btn-secondary { background: #21262d; color: #c9d1d9; border: 1px solid #30363d; }
  </style>
</head>
<body>
  <h1>Trigger Eval Query Review</h1>
  <p style="color:#8b949e;margin-bottom:16px">Review and edit the trigger evaluation queries. These test whether the skill description correctly triggers (or doesn't trigger) for various user prompts.</p>

  <div class="description">
    <label>Skill Description (editable)</label>
    <textarea id="description" placeholder="Paste skill description here..."></textarea>
  </div>

  <div class="section-header">
    <h2>Should Trigger (positive examples)</h2>
    <span class="count" id="yes-count">0 queries</span>
  </div>
  <ul class="query-list" id="should-trigger"></ul>
  <button class="add-btn" onclick="addQuery('should-trigger')">+ Add query</button>

  <div class="section-header">
    <h2>Should NOT Trigger (negative examples)</h2>
    <span class="count" id="no-count">0 queries</span>
  </div>
  <ul class="query-list" id="should-not-trigger"></ul>
  <button class="add-btn" onclick="addQuery('should-not-trigger')">+ Add query</button>

  <div class="actions">
    <button class="btn btn-primary" onclick="exportQueries()">Export queries.json</button>
    <button class="btn btn-secondary" onclick="importQueries()">Import queries.json</button>
    <input type="file" id="import-file" accept=".json" style="display:none" onchange="handleImport(event)">
  </div>

  <script>
    function addQuery(listId, value = '') {
      const list = document.getElementById(listId);
      const li = document.createElement('li');
      li.className = 'query-item';
      li.innerHTML = `
        <input type="text" value="${escapeAttr(value)}" placeholder="Enter a user prompt...">
        <button class="remove" onclick="this.parentElement.remove(); updateCounts();">×</button>
      `;
      list.appendChild(li);
      updateCounts();
      if (!value) li.querySelector('input').focus();
    }

    // Escape quotes and angle brackets before interpolating into the value attribute
    function escapeAttr(s) {
      return s.replace(/"/g, '&quot;').replace(/</g, '&lt;');
    }

    function getQueries(listId) {
      return Array.from(document.querySelectorAll(`#${listId} input[type="text"]`))
        .map(input => input.value.trim())
        .filter(v => v.length > 0);
    }

    function updateCounts() {
      document.getElementById('yes-count').textContent = getQueries('should-trigger').length + ' queries';
      document.getElementById('no-count').textContent = getQueries('should-not-trigger').length + ' queries';
    }

    function exportQueries() {
      const data = {
        description: document.getElementById('description').value,
        should_trigger: getQueries('should-trigger'),
        should_not_trigger: getQueries('should-not-trigger')
      };
      const blob = new Blob([JSON.stringify(data, null, 2)], { type: 'application/json' });
      const a = document.createElement('a');
      a.href = URL.createObjectURL(blob);
      a.download = 'queries.json';
      a.click();
    }

    function importQueries() {
      document.getElementById('import-file').click();
    }

    function handleImport(event) {
      const file = event.target.files[0];
      if (!file) return;
      const reader = new FileReader();
      reader.onload = function(e) {
        try {
          const data = JSON.parse(e.target.result);
          if (data.description) document.getElementById('description').value = data.description;
          document.getElementById('should-trigger').innerHTML = '';
          document.getElementById('should-not-trigger').innerHTML = '';
          (data.should_trigger || []).forEach(q => addQuery('should-trigger', q));
          (data.should_not_trigger || []).forEach(q => addQuery('should-not-trigger', q));
        } catch (err) {
          alert('Invalid JSON file: ' + err.message);
        }
      };
      reader.readAsText(file);
    }

    // Initialize with empty entries
    for (let i = 0; i < 3; i++) {
      addQuery('should-trigger');
      addQuery('should-not-trigger');
    }
  </script>
</body>
</html>

@@ -0,0 +1,211 @@ package/skills/myai-skill-builder/references/schemas.md

# Eval Framework Schemas

JSON schemas for the skill-builder eval framework. These define the data structures used throughout the eval → grade → benchmark pipeline.

## evals.json

Defines the test cases for a skill. Lives at `{skill-dir}/evals/evals.json`.

```json
{
  "evals": [
    {
      "id": "string — short descriptive identifier, e.g. 'basic-usage'",
      "prompt": "string — the exact user message to test with",
      "files": {
        "src/input.txt": "optional — map of filename → content for files the eval needs present; e.g. example file content"
      },
      "assertions": [
        "string — verifiable expectation about the output",
        "Each assertion should be specific enough to grade as PASS/FAIL",
        "Good: 'Creates a file matching *.test.ts'",
        "Bad: 'Output is good quality'"
      ]
    }
  ]
}
```

### Assertion Writing Guidelines

Good assertions are:
- **Specific**: "Creates a file ending in .tsx" not "Creates a component"
- **Verifiable**: Can be checked by reading outputs, not by subjective judgment
- **Independent**: Each assertion tests one thing
- **Non-trivial**: Tests behavior unique to the skill, not generic capabilities

Avoid:
- Subjective assertions ("well-structured", "clean code", "good quality")
- Compound assertions ("Creates a file AND it has proper imports AND it compiles")
- Tautological assertions ("Produces output" — of course it does)

## timing.json

Captures resource usage for a single eval run. Created by the test runner and saved alongside outputs.

```json
{
  "eval_id": "basic-usage",
  "config": "with_skill | without_skill",
  "total_tokens": 4523,
  "duration_ms": 12450,
  "timestamp": "2025-01-15T10:30:00Z"
}
```

Fields:
- `total_tokens`: Combined input + output tokens for the run
- `duration_ms`: Wall clock time from prompt submission to final output
- `timestamp`: ISO 8601 timestamp of when the run started

## grading.json

Output from the grader agent. One per eval per configuration.

```json
{
  "eval_id": "basic-usage",
  "config": "with_skill | without_skill",
  "expectations": [
    {
      "assertion": "The original assertion text",
      "verdict": "PASS | FAIL",
      "evidence": "Specific quote or file reference supporting the verdict",
      "confidence": "HIGH | MEDIUM | LOW"
    }
  ],
  "pass_count": 3,
  "fail_count": 1,
  "pass_rate": 0.75,
  "claims": [
    {
      "claim": "An implicit claim extracted from the output",
      "verified": true,
      "evidence": "How the claim was verified"
    }
  ],
  "eval_feedback": {
    "weak_assertions": [
      "Assertions that are too vague to meaningfully grade"
    ],
    "missing_assertions": [
      "Important behaviors not tested"
    ],
    "suggested_additions": [
      "New assertions that would improve coverage"
    ]
  },
  "summary": "Brief human-readable summary of grading results"
}
```

## benchmark.json

Aggregated results across all evals for an iteration or multi-run benchmark.

```json
{
  "iteration": 1,
  "timestamp": "2025-01-15T10:30:00Z",
  "configs": {
    "with_skill": {
      "overall_pass_rate": 0.85,
      "total_tokens_mean": 4500,
      "total_tokens_stddev": 300,
      "duration_ms_mean": 12000,
      "duration_ms_stddev": 1500,
      "evals": [
        {
          "eval_id": "basic-usage",
          "pass_rate": 1.0,
          "pass_count": 3,
          "fail_count": 0,
          "tokens": 4200,
          "duration_ms": 11000
        }
      ]
    },
    "without_skill": {
      "overall_pass_rate": 0.60,
      "total_tokens_mean": 3200,
      "total_tokens_stddev": 250,
      "duration_ms_mean": 8000,
      "duration_ms_stddev": 1000,
      "evals": [
        {
          "eval_id": "basic-usage",
          "pass_rate": 0.67,
          "pass_count": 2,
          "fail_count": 1,
          "tokens": 3100,
          "duration_ms": 7500
        }
      ]
    }
  },
  "comparison": {
    "pass_rate_delta": 0.25,
    "token_overhead_percent": 40.6,
    "time_overhead_percent": 50.0,
    "non_discriminating_assertions": [
      "Assertions that pass in both configs"
    ]
  }
}
```
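
The `comparison` block is derived arithmetic over the two config summaries; a minimal sketch (with the sample means above it reproduces 0.25, 40.6%, and 50.0%; the helper is illustrative, not the bundled script):

```python
def comparison(with_skill: dict, without: dict) -> dict:
    """Derive the comparison block from the two config summaries."""
    def pct(new, old):
        # Percentage overhead of the skill-assisted run over the baseline
        return round(100 * (new - old) / old, 1)

    return {
        "pass_rate_delta": round(with_skill["overall_pass_rate"]
                                 - without["overall_pass_rate"], 2),
        "token_overhead_percent": pct(with_skill["total_tokens_mean"],
                                      without["total_tokens_mean"]),
        "time_overhead_percent": pct(with_skill["duration_ms_mean"],
                                     without["duration_ms_mean"]),
    }
```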

## feedback.json

User feedback collected from the HTML review viewer. Saved to the workspace root.

```json
{
  "timestamp": "2025-01-15T11:00:00Z",
  "iteration": 1,
  "eval_feedback": [
    {
      "eval_id": "basic-usage",
      "rating": "good | needs-work | bad",
      "comment": "Free-form user feedback on this eval's results"
    }
  ],
  "general_feedback": "Overall notes on the skill's performance",
  "action": "iterate | publish | stop"
}
```

## Directory Layout Reference

```
{skill-dir}/
├── SKILL.md
├── evals/
│   └── evals.json
├── agents/           # Only if skill uses subagents
│   └── *.md
├── references/       # Supporting documentation
│   └── *.md
├── scripts/          # Bundled deterministic scripts
│   └── *.py / *.js
└── assets/           # Templates, HTML, configs
    └── *

{slug}-workspace/     # Created during testing, not committed
├── iteration-1/
│   ├── eval-{id}/
│   │   ├── with_skill/
│   │   │   ├── outputs/
│   │   │   ├── timing.json
│   │   │   └── grading.json
│   │   └── without_skill/
│   │       ├── outputs/
│   │       ├── timing.json
│   │       └── grading.json
│   ├── eval_metadata.json
│   └── benchmark.json
├── iteration-2/
│   └── ...
├── benchmark.json    # Multi-run statistical results
└── feedback.json     # User feedback
```