supered 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/plugin.json +1 -1
- package/.codex-plugin/plugin.json +1 -1
- package/.cursor-plugin/plugin.json +1 -1
- package/README.md +28 -1
- package/bin/supered.mjs +45 -23
- package/docs/evals/README.md +34 -0
- package/docs/evals/baseline-results.json +132 -0
- package/docs/evals/scenarios.json +212 -0
- package/docs/index.html +23 -0
- package/docs/install.md +16 -0
- package/docs/roadmap.md +1 -0
- package/docs/styles.css +40 -0
- package/docs/which-skill.md +33 -0
- package/gemini-extension.json +1 -1
- package/install.sh +37 -1
- package/lib/eval-pack.js +101 -0
- package/lib/host-install.js +88 -0
- package/lib/install-doctor.js +181 -0
- package/lib/manifest.js +7 -73
- package/lib/package-verification.js +120 -0
- package/lib/release-bundle.js +135 -0
- package/lib/supered-policy.js +32 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -30,6 +30,12 @@ Install with npx:
|
|
|
30
30
|
npx supered install --target codex
|
|
31
31
|
```
|
|
32
32
|
|
|
33
|
+
Check the install:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
npx supered doctor --target codex
|
|
37
|
+
```
|
|
38
|
+
|
|
33
39
|
Or install the default Codex skill set with curl:
|
|
34
40
|
|
|
35
41
|
```bash
|
|
@@ -50,6 +56,7 @@ cd Supered
|
|
|
50
56
|
npm test
|
|
51
57
|
npm run validate
|
|
52
58
|
node ./bin/supered.mjs install --target codex
|
|
59
|
+
node ./bin/supered.mjs doctor --target codex
|
|
53
60
|
```
|
|
54
61
|
|
|
55
62
|
For Claude or Gemini, replace `codex` with `claude` or `gemini`.
|
|
@@ -63,6 +70,25 @@ Host-specific notes:
|
|
|
63
70
|
|
|
64
71
|
Marketplace readiness lives in [docs/marketplace-checklist.md](docs/marketplace-checklist.md).
|
|
65
72
|
|
|
73
|
+
## Which Skill Should I Use?
|
|
74
|
+
|
|
75
|
+
Use [docs/which-skill.md](docs/which-skill.md) as the quick router:
|
|
76
|
+
|
|
77
|
+
- unclear task: `shape-the-task`
|
|
78
|
+
- clear but large task: `make-a-map`
|
|
79
|
+
- planned implementation: `build-in-slices`
|
|
80
|
+
- broken or flaky behavior: `trace-the-fault`
|
|
81
|
+
- completion claim: `prove-the-change`
|
|
82
|
+
- commit, publish, or handoff: `ship-the-work`
|
|
83
|
+
|
|
84
|
+
## Eval Pack
|
|
85
|
+
|
|
86
|
+
Supered includes a public eval pack so the skills can be judged against realistic work instead of vibes.
|
|
87
|
+
|
|
88
|
+
- [Scenario catalog](docs/evals/scenarios.json): 10 realistic coding-agent scenarios.
|
|
89
|
+
- [Baseline results](docs/evals/baseline-results.json): scores across clarity, actionability, guardrails, evidence, and outcome.
|
|
90
|
+
- [Eval report](docs/evals/README.md): how to use the scenarios to improve the skills.
|
|
91
|
+
|
|
66
92
|
## CLI
|
|
67
93
|
|
|
68
94
|
```bash
|
|
@@ -72,13 +98,14 @@ npm run smoke-install
|
|
|
72
98
|
npm run verify-site
|
|
73
99
|
npm run verify-package
|
|
74
100
|
node ./bin/supered.mjs skills --json
|
|
101
|
+
node ./bin/supered.mjs doctor --target codex --json
|
|
75
102
|
```
|
|
76
103
|
|
|
77
104
|
The validator checks package metadata, plugin metadata, and skill frontmatter so the public repo does not drift into a half-installable state.
|
|
78
105
|
|
|
79
106
|
`npm run verify-site` opens the landing page in Chromium at desktop and mobile sizes, checks the logo and workflow text, and writes screenshots to `artifacts/site/`.
|
|
80
107
|
|
|
81
|
-
`npm run verify-package` builds the npm tarball and verifies `npx`-style installs for every supported host target.
|
|
108
|
+
`npm run verify-package` builds the npm tarball and verifies `npx`-style installs plus Doctor checks for every supported host target.
|
|
82
109
|
|
|
83
110
|
## Design Principles
|
|
84
111
|
|
package/bin/supered.mjs
CHANGED
|
@@ -1,19 +1,14 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
-
import { cp, mkdir } from "node:fs/promises";
|
|
3
2
|
import { dirname, resolve } from "node:path";
|
|
4
3
|
import { fileURLToPath } from "node:url";
|
|
5
4
|
|
|
6
|
-
import {
|
|
5
|
+
import { installSuperedSkills } from "../lib/host-install.js";
|
|
6
|
+
import { inspectSuperedInstall } from "../lib/install-doctor.js";
|
|
7
|
+
import { listSkills } from "../lib/manifest.js";
|
|
8
|
+
import { validateReleaseBundle } from "../lib/release-bundle.js";
|
|
7
9
|
|
|
8
10
|
const root = resolve(dirname(fileURLToPath(import.meta.url)), "..");
|
|
9
11
|
const [command, ...args] = process.argv.slice(2);
|
|
10
|
-
const installTargets = {
|
|
11
|
-
codex: ".codex/skills",
|
|
12
|
-
claude: ".claude/skills",
|
|
13
|
-
cursor: ".cursor/skills",
|
|
14
|
-
gemini: ".gemini/skills",
|
|
15
|
-
opencode: ".opencode/skills"
|
|
16
|
-
};
|
|
17
12
|
|
|
18
13
|
function printHelp() {
|
|
19
14
|
console.log(`Supered
|
|
@@ -22,10 +17,12 @@ Usage:
|
|
|
22
17
|
supered skills [--json]
|
|
23
18
|
supered validate
|
|
24
19
|
supered install --target <codex|claude|cursor|gemini|opencode> [--dest <path>]
|
|
20
|
+
supered doctor --target <codex|claude|cursor|gemini|opencode> [--dest <path>] [--json]
|
|
25
21
|
|
|
26
22
|
Examples:
|
|
27
23
|
npx supered install --target codex
|
|
28
24
|
npx supered install --target gemini --dest ~/.gemini/skills
|
|
25
|
+
supered doctor --target codex
|
|
29
26
|
`);
|
|
30
27
|
}
|
|
31
28
|
|
|
@@ -42,7 +39,7 @@ async function skillsCommand() {
|
|
|
42
39
|
}
|
|
43
40
|
|
|
44
41
|
async function validateCommand() {
|
|
45
|
-
const result = await
|
|
42
|
+
const result = await validateReleaseBundle(root);
|
|
46
43
|
if (result.errors.length > 0) {
|
|
47
44
|
for (const error of result.errors) {
|
|
48
45
|
console.error(`- ${error}`);
|
|
@@ -54,29 +51,52 @@ async function validateCommand() {
|
|
|
54
51
|
console.log(`Supered bundle is valid: ${result.skills.length} skills, ${result.checked.length} files checked.`);
|
|
55
52
|
}
|
|
56
53
|
|
|
57
|
-
function
|
|
58
|
-
const
|
|
59
|
-
|
|
60
|
-
|
|
54
|
+
async function installCommand() {
|
|
55
|
+
const targetIndex = args.indexOf("--target");
|
|
56
|
+
const destIndex = args.indexOf("--dest");
|
|
57
|
+
const target = targetIndex === -1 ? "" : args[targetIndex + 1];
|
|
58
|
+
const dest = destIndex === -1 ? undefined : args[destIndex + 1];
|
|
59
|
+
|
|
60
|
+
if (!target || (destIndex !== -1 && !dest)) {
|
|
61
|
+
throw new Error("Install requires --target <codex|claude|cursor|gemini|opencode>.");
|
|
61
62
|
}
|
|
62
63
|
|
|
63
|
-
|
|
64
|
-
|
|
64
|
+
const result = await installSuperedSkills({ root, target, dest });
|
|
65
|
+
console.log(`Installed Supered skills for ${result.target} at ${result.dest}.`);
|
|
65
66
|
}
|
|
66
67
|
|
|
67
|
-
async function
|
|
68
|
+
async function doctorCommand() {
|
|
68
69
|
const targetIndex = args.indexOf("--target");
|
|
69
70
|
const destIndex = args.indexOf("--dest");
|
|
70
71
|
const target = targetIndex === -1 ? "" : args[targetIndex + 1];
|
|
71
|
-
const dest = destIndex === -1 ?
|
|
72
|
+
const dest = destIndex === -1 ? undefined : args[destIndex + 1];
|
|
73
|
+
const json = args.includes("--json");
|
|
72
74
|
|
|
73
|
-
if (!target || !dest) {
|
|
74
|
-
throw new Error("
|
|
75
|
+
if (!target || (destIndex !== -1 && !dest)) {
|
|
76
|
+
throw new Error("Doctor requires --target <codex|claude|cursor|gemini|opencode>.");
|
|
75
77
|
}
|
|
76
78
|
|
|
77
|
-
await
|
|
78
|
-
|
|
79
|
-
|
|
79
|
+
const result = await inspectSuperedInstall({ root, target, dest });
|
|
80
|
+
if (json) {
|
|
81
|
+
console.log(JSON.stringify(result, null, 2));
|
|
82
|
+
if (result.status !== "ok") {
|
|
83
|
+
process.exitCode = 1;
|
|
84
|
+
}
|
|
85
|
+
return;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
if (result.status === "ok") {
|
|
89
|
+
console.log(`Supered doctor passed for ${result.target} at ${result.dest}.`);
|
|
90
|
+
console.log(`${result.installedSkills.length} skills installed and current.`);
|
|
91
|
+
return;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
console.log(`Supered doctor found ${result.issues.length} issue${result.issues.length === 1 ? "" : "s"} for ${result.target} at ${result.dest}.`);
|
|
95
|
+
for (const installIssue of result.issues) {
|
|
96
|
+
console.log(`- [${installIssue.code}] ${installIssue.message}`);
|
|
97
|
+
}
|
|
98
|
+
console.log(`Fix: ${result.fixCommand}`);
|
|
99
|
+
process.exitCode = 1;
|
|
80
100
|
}
|
|
81
101
|
|
|
82
102
|
try {
|
|
@@ -88,6 +108,8 @@ try {
|
|
|
88
108
|
await validateCommand();
|
|
89
109
|
} else if (command === "install") {
|
|
90
110
|
await installCommand();
|
|
111
|
+
} else if (command === "doctor") {
|
|
112
|
+
await doctorCommand();
|
|
91
113
|
} else {
|
|
92
114
|
throw new Error(`Unknown command: ${command}`);
|
|
93
115
|
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Supered Eval Pack
|
|
2
|
+
|
|
3
|
+
This pack gives Supered a visible usefulness standard: 10 realistic coding-agent scenarios, a simple scoring rubric, and baseline results for the current skill set.
|
|
4
|
+
|
|
5
|
+
The goal is not to claim scientific benchmark status. The goal is to make the product falsifiable: a reader can see what the skills are supposed to help with, how outcomes are judged, and where the current library is strong or still dependent on project context.
|
|
6
|
+
|
|
7
|
+
## Files
|
|
8
|
+
|
|
9
|
+
- [scenarios.json](scenarios.json): 10 realistic coding-agent scenarios with prompts, context, primary skills, success criteria, and expected evidence.
|
|
10
|
+
- [baseline-results.json](baseline-results.json): maintainer-scored baseline results for Supered v0.2.0.
|
|
11
|
+
|
|
12
|
+
## Scoring
|
|
13
|
+
|
|
14
|
+
Each scenario is scored from 1 to 5 across five dimensions:
|
|
15
|
+
|
|
16
|
+
- `clarity`: the skill helps the agent understand what kind of work this is.
|
|
17
|
+
- `actionability`: the skill tells the agent what to do next, not just how to think.
|
|
18
|
+
- `guardrails`: the skill prevents common bad moves such as scope creep, guessing, or premature claims.
|
|
19
|
+
- `evidence`: the skill asks for proof before completion or handoff.
|
|
20
|
+
- `outcome`: the skill increases the chance of a useful user-facing result.
|
|
21
|
+
|
|
22
|
+
## Baseline Summary
|
|
23
|
+
|
|
24
|
+
The current baseline average is `4.64 / 5` across the catalog. Strongest areas are release handoff, verification, and fault tracing. The scenarios with lower scores are intentionally product-heavy or environment-dependent, where a skill can guide the agent but cannot replace user priorities, browser access, database access, or security review.
|
|
25
|
+
|
|
26
|
+
## How To Use It
|
|
27
|
+
|
|
28
|
+
1. Pick a scenario close to your real task.
|
|
29
|
+
2. Ask an agent to use the recommended Supered skill.
|
|
30
|
+
3. Score the result against the five dimensions.
|
|
31
|
+
4. Add notes when the skill was vague, too strict, or missing a useful recovery path.
|
|
32
|
+
5. Propose a skill improvement or a new scenario when the score exposes a real gap.
|
|
33
|
+
|
|
34
|
+
This gives contributors a practical way to improve Supered without turning the project into a pile of opinions.
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
{
|
|
2
|
+
"product": "Supered",
|
|
3
|
+
"catalogVersion": "0.1",
|
|
4
|
+
"summary": {
|
|
5
|
+
"averageScore": 4.64,
|
|
6
|
+
"scenarioCount": 10,
|
|
7
|
+
"scoreScale": "1-5",
|
|
8
|
+
"method": "Maintainer review of the v0.2.0 Supered skill bodies against the scenario catalog. Scores are intentionally conservative until external user studies exist."
|
|
9
|
+
},
|
|
10
|
+
"results": [
|
|
11
|
+
{
|
|
12
|
+
"scenarioId": "S01",
|
|
13
|
+
"recommendedSkill": "shape-the-task",
|
|
14
|
+
"scores": {
|
|
15
|
+
"clarity": 5,
|
|
16
|
+
"actionability": 5,
|
|
17
|
+
"guardrails": 4,
|
|
18
|
+
"evidence": 4,
|
|
19
|
+
"outcome": 5
|
|
20
|
+
},
|
|
21
|
+
"notes": "The skill asks for users, constraints, assumptions, and acceptance signals before implementation, which directly prevents dashboard overreach. Evidence is strong but still depends on user confirmation."
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"scenarioId": "S02",
|
|
25
|
+
"recommendedSkill": "make-a-map",
|
|
26
|
+
"scores": {
|
|
27
|
+
"clarity": 5,
|
|
28
|
+
"actionability": 5,
|
|
29
|
+
"guardrails": 5,
|
|
30
|
+
"evidence": 4,
|
|
31
|
+
"outcome": 5
|
|
32
|
+
},
|
|
33
|
+
"notes": "The mapping playbook fits multi-file auth work well because it forces affected areas, checkpoints, and verification commands before edits. External security review may still be needed."
|
|
34
|
+
},
|
|
35
|
+
{
|
|
36
|
+
"scenarioId": "S03",
|
|
37
|
+
"recommendedSkill": "build-in-slices",
|
|
38
|
+
"scores": {
|
|
39
|
+
"clarity": 4,
|
|
40
|
+
"actionability": 5,
|
|
41
|
+
"guardrails": 5,
|
|
42
|
+
"evidence": 4,
|
|
43
|
+
"outcome": 4
|
|
44
|
+
},
|
|
45
|
+
"notes": "The slice discipline prevents a settings redesign from swallowing the small save-confirmation task. The score is slightly lower because visual quality still needs domain judgment."
|
|
46
|
+
},
|
|
47
|
+
{
|
|
48
|
+
"scenarioId": "S04",
|
|
49
|
+
"recommendedSkill": "trace-the-fault",
|
|
50
|
+
"scores": {
|
|
51
|
+
"clarity": 5,
|
|
52
|
+
"actionability": 5,
|
|
53
|
+
"guardrails": 5,
|
|
54
|
+
"evidence": 5,
|
|
55
|
+
"outcome": 4
|
|
56
|
+
},
|
|
57
|
+
"notes": "The fault-tracing skill is strongest on this scenario: it demands symptom capture, hypotheses, probes, and proof before patches. The remaining risk is platform access to mobile Safari."
|
|
58
|
+
},
|
|
59
|
+
{
|
|
60
|
+
"scenarioId": "S05",
|
|
61
|
+
"recommendedSkill": "prove-the-change",
|
|
62
|
+
"scores": {
|
|
63
|
+
"clarity": 5,
|
|
64
|
+
"actionability": 4,
|
|
65
|
+
"guardrails": 5,
|
|
66
|
+
"evidence": 5,
|
|
67
|
+
"outcome": 5
|
|
68
|
+
},
|
|
69
|
+
"notes": "The verification skill explicitly blocks premature completion claims and requires fresh proof. Actionability is high, though the exact export check depends on the app's local tooling."
|
|
70
|
+
},
|
|
71
|
+
{
|
|
72
|
+
"scenarioId": "S06",
|
|
73
|
+
"recommendedSkill": "ship-the-work",
|
|
74
|
+
"scores": {
|
|
75
|
+
"clarity": 5,
|
|
76
|
+
"actionability": 5,
|
|
77
|
+
"guardrails": 5,
|
|
78
|
+
"evidence": 5,
|
|
79
|
+
"outcome": 5
|
|
80
|
+
},
|
|
81
|
+
"notes": "The shipping skill aligns directly with public release work: clean status, intended staging, verification, push, release, and public read-back. It is the best fit in the catalog."
|
|
82
|
+
},
|
|
83
|
+
{
|
|
84
|
+
"scenarioId": "S07",
|
|
85
|
+
"recommendedSkill": "using-supered",
|
|
86
|
+
"scores": {
|
|
87
|
+
"clarity": 4,
|
|
88
|
+
"actionability": 5,
|
|
89
|
+
"guardrails": 4,
|
|
90
|
+
"evidence": 4,
|
|
91
|
+
"outcome": 4
|
|
92
|
+
},
|
|
93
|
+
"notes": "The entrypoint helps route a messy request into a first useful workflow instead of trying everything. The score is lower because the request still requires product prioritization."
|
|
94
|
+
},
|
|
95
|
+
{
|
|
96
|
+
"scenarioId": "S08",
|
|
97
|
+
"recommendedSkill": "trace-the-fault",
|
|
98
|
+
"scores": {
|
|
99
|
+
"clarity": 5,
|
|
100
|
+
"actionability": 5,
|
|
101
|
+
"guardrails": 5,
|
|
102
|
+
"evidence": 5,
|
|
103
|
+
"outcome": 4
|
|
104
|
+
},
|
|
105
|
+
"notes": "CI recovery benefits from the same diagnosis discipline: start from logs, reproduce, isolate, then patch. Outcome depends on whether the CI failure can be reproduced locally."
|
|
106
|
+
},
|
|
107
|
+
{
|
|
108
|
+
"scenarioId": "S09",
|
|
109
|
+
"recommendedSkill": "make-a-map",
|
|
110
|
+
"scores": {
|
|
111
|
+
"clarity": 4,
|
|
112
|
+
"actionability": 5,
|
|
113
|
+
"guardrails": 4,
|
|
114
|
+
"evidence": 5,
|
|
115
|
+
"outcome": 4
|
|
116
|
+
},
|
|
117
|
+
"notes": "The map-and-slice combination handles migration risk well by separating schema, data, and route compatibility. Database-specific rollback details remain project dependent."
|
|
118
|
+
},
|
|
119
|
+
{
|
|
120
|
+
"scenarioId": "S10",
|
|
121
|
+
"recommendedSkill": "ship-the-work",
|
|
122
|
+
"scores": {
|
|
123
|
+
"clarity": 5,
|
|
124
|
+
"actionability": 4,
|
|
125
|
+
"guardrails": 5,
|
|
126
|
+
"evidence": 5,
|
|
127
|
+
"outcome": 4
|
|
128
|
+
},
|
|
129
|
+
"notes": "The handoff scenario is well covered because the skill values honest status over false completion. It needs local judgment about whether to commit partial work or leave notes only."
|
|
130
|
+
}
|
|
131
|
+
]
|
|
132
|
+
}
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
{
|
|
2
|
+
"product": "Supered",
|
|
3
|
+
"version": "0.1",
|
|
4
|
+
"description": "A compact public catalog of realistic coding-agent scenarios for checking whether Supered skills are useful in practice.",
|
|
5
|
+
"scoring": {
|
|
6
|
+
"maxScore": 5,
|
|
7
|
+
"dimensions": [
|
|
8
|
+
"clarity",
|
|
9
|
+
"actionability",
|
|
10
|
+
"guardrails",
|
|
11
|
+
"evidence",
|
|
12
|
+
"outcome"
|
|
13
|
+
],
|
|
14
|
+
"rubric": {
|
|
15
|
+
"1": "Absent or actively misleading.",
|
|
16
|
+
"2": "Present but too vague to guide reliable work.",
|
|
17
|
+
"3": "Usable with noticeable gaps or extra human correction.",
|
|
18
|
+
"4": "Strong enough for normal work with minor judgment required.",
|
|
19
|
+
"5": "Excellent: specific, safe, evidence-driven, and ready to apply."
|
|
20
|
+
}
|
|
21
|
+
},
|
|
22
|
+
"scenarios": [
|
|
23
|
+
{
|
|
24
|
+
"id": "S01",
|
|
25
|
+
"title": "Turn a vague feature request into a buildable slice",
|
|
26
|
+
"prompt": "Add dashboards for customers so they can understand account health, but keep it simple and do not overbuild it.",
|
|
27
|
+
"context": "The repo has a React admin app, no product requirements document, three possible user roles, and no obvious owner for metric definitions. The agent must avoid inventing a large analytics system.",
|
|
28
|
+
"primarySkills": [
|
|
29
|
+
"using-supered",
|
|
30
|
+
"shape-the-task"
|
|
31
|
+
],
|
|
32
|
+
"successCriteria": [
|
|
33
|
+
"Clarifies the target user, first workflow, and non-goals.",
|
|
34
|
+
"Produces a short brief that can be accepted or corrected.",
|
|
35
|
+
"Identifies the smallest useful dashboard slice before implementation."
|
|
36
|
+
],
|
|
37
|
+
"expectedEvidence": [
|
|
38
|
+
"A concise task brief with assumptions called out.",
|
|
39
|
+
"A list of decisions that need user confirmation before code changes."
|
|
40
|
+
]
|
|
41
|
+
},
|
|
42
|
+
{
|
|
43
|
+
"id": "S02",
|
|
44
|
+
"title": "Map a multi-file authentication change",
|
|
45
|
+
"prompt": "Add magic-link login to the existing app and make sure the old password login still works.",
|
|
46
|
+
"context": "The app has API routes, an email sender, session middleware, and browser tests. The request touches security-sensitive paths and must be broken into reviewable changes.",
|
|
47
|
+
"primarySkills": [
|
|
48
|
+
"make-a-map",
|
|
49
|
+
"prove-the-change"
|
|
50
|
+
],
|
|
51
|
+
"successCriteria": [
|
|
52
|
+
"Lists affected files and ownership boundaries before editing.",
|
|
53
|
+
"Sequences backend, email, session, and UI work into small checkpoints.",
|
|
54
|
+
"Names verification commands for old and new login paths."
|
|
55
|
+
],
|
|
56
|
+
"expectedEvidence": [
|
|
57
|
+
"A short execution map with ordered slices.",
|
|
58
|
+
"Verification notes covering both magic-link and password login."
|
|
59
|
+
]
|
|
60
|
+
},
|
|
61
|
+
{
|
|
62
|
+
"id": "S03",
|
|
63
|
+
"title": "Implement a narrow UI improvement without scope creep",
|
|
64
|
+
"prompt": "Make the settings page easier to scan and add a save confirmation when preferences change.",
|
|
65
|
+
"context": "The settings page is already used by customers, has mixed form components, and has a brittle screenshot test. The agent should improve the workflow without redesigning the app.",
|
|
66
|
+
"primarySkills": [
|
|
67
|
+
"build-in-slices"
|
|
68
|
+
],
|
|
69
|
+
"successCriteria": [
|
|
70
|
+
"Starts with one visible behavior and one verification path.",
|
|
71
|
+
"Keeps layout and interaction edits separable.",
|
|
72
|
+
"Avoids unrelated visual redesign or component churn."
|
|
73
|
+
],
|
|
74
|
+
"expectedEvidence": [
|
|
75
|
+
"A small diff grouped around the settings workflow.",
|
|
76
|
+
"Passing component or browser verification for the save confirmation."
|
|
77
|
+
]
|
|
78
|
+
},
|
|
79
|
+
{
|
|
80
|
+
"id": "S04",
|
|
81
|
+
"title": "Diagnose an intermittent checkout failure",
|
|
82
|
+
"prompt": "Checkout sometimes fails with a generic payment error, mostly on mobile Safari. Fix it.",
|
|
83
|
+
"context": "Logs show several possible causes: token refresh, double-submit, gateway declines, and a recent UI refactor. The agent must investigate before changing code.",
|
|
84
|
+
"primarySkills": [
|
|
85
|
+
"trace-the-fault"
|
|
86
|
+
],
|
|
87
|
+
"successCriteria": [
|
|
88
|
+
"Separates symptom, reproduction attempts, and hypotheses.",
|
|
89
|
+
"Finds evidence for one root cause before patching.",
|
|
90
|
+
"Adds or updates a regression check for the observed failure."
|
|
91
|
+
],
|
|
92
|
+
"expectedEvidence": [
|
|
93
|
+
"A fault trace showing ruled-out hypotheses.",
|
|
94
|
+
"A failing-then-passing check or a documented reproduction result."
|
|
95
|
+
]
|
|
96
|
+
},
|
|
97
|
+
{
|
|
98
|
+
"id": "S05",
|
|
99
|
+
"title": "Prove a production bug fix before claiming completion",
|
|
100
|
+
"prompt": "The export button is fixed now, right? Tell the customer we are done and include the exact proof that the filtered CSV download works.",
|
|
101
|
+
"context": "A prior agent changed CSV generation but did not run the browser workflow. The customer issue involves filters, permissions, and a downloaded file.",
|
|
102
|
+
"primarySkills": [
|
|
103
|
+
"prove-the-change"
|
|
104
|
+
],
|
|
105
|
+
"successCriteria": [
|
|
106
|
+
"Refuses to claim completion from code changes alone.",
|
|
107
|
+
"Runs the relevant automated or manual export verification.",
|
|
108
|
+
"Reports the actual evidence and any untested risk."
|
|
109
|
+
],
|
|
110
|
+
"expectedEvidence": [
|
|
111
|
+
"Fresh command output or browser verification notes.",
|
|
112
|
+
"A final status that distinguishes verified behavior from assumptions."
|
|
113
|
+
]
|
|
114
|
+
},
|
|
115
|
+
{
|
|
116
|
+
"id": "S06",
|
|
117
|
+
"title": "Prepare a small public release",
|
|
118
|
+
"prompt": "Commit this library update, push it, tag a release, and make sure the public install instructions still work.",
|
|
119
|
+
"context": "The repo is public, has CI, a package manifest, generated screenshots, and possible uncommitted user files. The agent must not overwrite unrelated work.",
|
|
120
|
+
"primarySkills": [
|
|
121
|
+
"ship-the-work",
|
|
122
|
+
"prove-the-change"
|
|
123
|
+
],
|
|
124
|
+
"successCriteria": [
|
|
125
|
+
"Checks the worktree and stages only intended files.",
|
|
126
|
+
"Runs release-relevant validation before tagging or publishing.",
|
|
127
|
+
"Reads back the public release or install path after shipping."
|
|
128
|
+
],
|
|
129
|
+
"expectedEvidence": [
|
|
130
|
+
"Git status, commit, push, and release URLs.",
|
|
131
|
+
"Public install or package registry verification."
|
|
132
|
+
]
|
|
133
|
+
},
|
|
134
|
+
{
|
|
135
|
+
"id": "S07",
|
|
136
|
+
"title": "Choose the right workflow for a mixed request",
|
|
137
|
+
"prompt": "The app is slow, the nav is confusing, and we should probably add AI summaries. Can you just improve it?",
|
|
138
|
+
"context": "The request mixes performance, design, and new AI features. The agent needs to route the session instead of starting a broad refactor.",
|
|
139
|
+
"primarySkills": [
|
|
140
|
+
"using-supered",
|
|
141
|
+
"shape-the-task"
|
|
142
|
+
],
|
|
143
|
+
"successCriteria": [
|
|
144
|
+
"Identifies separate work types and risks.",
|
|
145
|
+
"Chooses a first skill based on the highest uncertainty.",
|
|
146
|
+
"Offers a narrow next action instead of starting all tasks at once."
|
|
147
|
+
],
|
|
148
|
+
"expectedEvidence": [
|
|
149
|
+
"A routing decision with the chosen Supered skill.",
|
|
150
|
+
"A short set of questions or assumptions that unblock the first slice."
|
|
151
|
+
]
|
|
152
|
+
},
|
|
153
|
+
{
|
|
154
|
+
"id": "S08",
|
|
155
|
+
"title": "Recover from a failing CI run",
|
|
156
|
+
"prompt": "CI is red after the last push. Please inspect the failed job, fix the real cause, and keep the intended feature intact.",
|
|
157
|
+
"context": "The failure could come from tests, packaging, linting, or browser setup. The agent must inspect the failed job and avoid local-only guesses.",
|
|
158
|
+
"primarySkills": [
|
|
159
|
+
"trace-the-fault",
|
|
160
|
+
"build-in-slices"
|
|
161
|
+
],
|
|
162
|
+
"successCriteria": [
|
|
163
|
+
"Starts from the failing CI evidence.",
|
|
164
|
+
"Reproduces or narrows the failure locally before editing.",
|
|
165
|
+
"Makes the smallest fix that preserves the intended feature."
|
|
166
|
+
],
|
|
167
|
+
"expectedEvidence": [
|
|
168
|
+
"CI log excerpt or command that identifies the failure point.",
|
|
169
|
+
"A passing local check and, when possible, a green rerun."
|
|
170
|
+
]
|
|
171
|
+
},
|
|
172
|
+
{
|
|
173
|
+
"id": "S09",
|
|
174
|
+
"title": "Add tests around a risky data migration",
|
|
175
|
+
"prompt": "Add a migration that backfills missing organization slugs without breaking existing URLs.",
|
|
176
|
+
"context": "The database has legacy rows, duplicate display names, and routing code that assumes slugs are stable. The migration needs a reversible plan and proof.",
|
|
177
|
+
"primarySkills": [
|
|
178
|
+
"make-a-map",
|
|
179
|
+
"build-in-slices",
|
|
180
|
+
"prove-the-change"
|
|
181
|
+
],
|
|
182
|
+
"successCriteria": [
|
|
183
|
+
"Maps schema, data, and route changes before implementation.",
|
|
184
|
+
"Builds the migration and app compatibility in separate slices.",
|
|
185
|
+
"Verifies duplicate and legacy-row cases explicitly."
|
|
186
|
+
],
|
|
187
|
+
"expectedEvidence": [
|
|
188
|
+
"Migration test cases or dry-run output for edge cases.",
|
|
189
|
+
"Application route checks for old and new organization URLs."
|
|
190
|
+
]
|
|
191
|
+
},
|
|
192
|
+
{
|
|
193
|
+
"id": "S10",
|
|
194
|
+
"title": "Hand off unfinished work honestly",
|
|
195
|
+
"prompt": "I need to stop here. Leave the branch in a state another engineer can continue from.",
|
|
196
|
+
"context": "The agent has partial implementation, one failing test, and two unverified assumptions. The handoff must be useful without pretending the work is done.",
|
|
197
|
+
"primarySkills": [
|
|
198
|
+
"ship-the-work",
|
|
199
|
+
"prove-the-change"
|
|
200
|
+
],
|
|
201
|
+
"successCriteria": [
|
|
202
|
+
"Separates completed, partial, and unstarted work.",
|
|
203
|
+
"Lists failing or skipped checks clearly.",
|
|
204
|
+
"Leaves next steps and evidence paths for the next engineer."
|
|
205
|
+
],
|
|
206
|
+
"expectedEvidence": [
|
|
207
|
+
"A handoff note with known status and blockers.",
|
|
208
|
+
"Command output showing the current failing or passing checks."
|
|
209
|
+
]
|
|
210
|
+
}
|
|
211
|
+
]
|
|
212
|
+
}
|
package/docs/index.html
CHANGED
|
@@ -15,6 +15,7 @@
|
|
|
15
15
|
</a>
|
|
16
16
|
<nav>
|
|
17
17
|
<a href="#skills">Skills</a>
|
|
18
|
+
<a href="#proof">Proof</a>
|
|
18
19
|
<a href="#install">Install</a>
|
|
19
20
|
<a href="https://github.com/fhajjej-ship-it/Supered">GitHub</a>
|
|
20
21
|
</nav>
|
|
@@ -65,6 +66,28 @@
|
|
|
65
66
|
</div>
|
|
66
67
|
</section>
|
|
67
68
|
|
|
69
|
+
<section id="proof" class="proof" aria-labelledby="proof-title">
|
|
70
|
+
<div class="section-head">
|
|
71
|
+
<p class="eyebrow">Proof</p>
|
|
72
|
+
<h2 id="proof-title">Eval pack, not vibes.</h2>
|
|
73
|
+
<p>Supered ships with 10 realistic coding-agent scenarios and a baseline scoring pass across clarity, actionability, guardrails, evidence, and outcome.</p>
|
|
74
|
+
</div>
|
|
75
|
+
<div class="proof-grid">
|
|
76
|
+
<a href="https://github.com/fhajjej-ship-it/Supered/blob/main/docs/which-skill.md">
|
|
77
|
+
<span>Guide</span>
|
|
78
|
+
<strong>Which skill should I use?</strong>
|
|
79
|
+
</a>
|
|
80
|
+
<a href="https://github.com/fhajjej-ship-it/Supered/blob/main/docs/evals/scenarios.json">
|
|
81
|
+
<span>Catalog</span>
|
|
82
|
+
<strong>10 realistic coding-agent scenarios</strong>
|
|
83
|
+
</a>
|
|
84
|
+
<a href="https://github.com/fhajjej-ship-it/Supered/blob/main/docs/evals/README.md">
|
|
85
|
+
<span>Eval pack</span>
|
|
86
|
+
<strong>How usefulness is scored</strong>
|
|
87
|
+
</a>
|
|
88
|
+
</div>
|
|
89
|
+
</section>
|
|
90
|
+
|
|
68
91
|
<section id="install" class="install" aria-labelledby="install-title">
|
|
69
92
|
<div>
|
|
70
93
|
<p class="eyebrow">Install</p>
|
package/docs/install.md
CHANGED
|
@@ -6,12 +6,14 @@ Supered ships plain skill folders plus lightweight plugin manifests.
|
|
|
6
6
|
|
|
7
7
|
```bash
|
|
8
8
|
npx supered install --target codex
|
|
9
|
+
npx supered doctor --target codex
|
|
9
10
|
```
|
|
10
11
|
|
|
11
12
|
Choose another host:
|
|
12
13
|
|
|
13
14
|
```bash
|
|
14
15
|
npx supered install --target opencode
|
|
16
|
+
npx supered doctor --target opencode
|
|
15
17
|
```
|
|
16
18
|
|
|
17
19
|
## One-line install
|
|
@@ -30,20 +32,34 @@ curl -fsSL https://raw.githubusercontent.com/fhajjej-ship-it/Supered/main/instal
|
|
|
30
32
|
|
|
31
33
|
```bash
|
|
32
34
|
node ./bin/supered.mjs install --target codex
|
|
35
|
+
node ./bin/supered.mjs doctor --target codex
|
|
33
36
|
```
|
|
34
37
|
|
|
35
38
|
## Claude
|
|
36
39
|
|
|
37
40
|
```bash
|
|
38
41
|
node ./bin/supered.mjs install --target claude
|
|
42
|
+
node ./bin/supered.mjs doctor --target claude
|
|
39
43
|
```
|
|
40
44
|
|
|
41
45
|
## Gemini
|
|
42
46
|
|
|
43
47
|
```bash
|
|
44
48
|
node ./bin/supered.mjs install --target gemini
|
|
49
|
+
node ./bin/supered.mjs doctor --target gemini
|
|
45
50
|
```
|
|
46
51
|
|
|
52
|
+
## Doctor
|
|
53
|
+
|
|
54
|
+
Doctor checks Install Health without changing files:
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
supered doctor --target codex
|
|
58
|
+
supered doctor --target codex --json
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
It reports missing skills, changed skill files, missing destinations, and unsafe symlinks. The fix it prints is a reinstall command for the same target and destination.
|
|
62
|
+
|
|
47
63
|
## Manual
|
|
48
64
|
|
|
49
65
|
Copy each folder in `skills/` into the skill directory used by your agent host.
|