kushi-agents 5.0.2 → 5.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +22 -0
- package/package.json +6 -2
- package/plugin/agents/kushi.agent.md +1 -1
- package/plugin/instructions/skill-evals.instructions.md +130 -0
- package/plugin/skills/aggregate-project/evals/evals.json +33 -0
- package/plugin/skills/apply-ado-update/evals/evals.json +33 -0
- package/plugin/skills/ask-project/evals/evals.json +34 -0
- package/plugin/skills/bootstrap-project/evals/evals.json +34 -0
- package/plugin/skills/build-state/evals/evals.json +31 -0
- package/plugin/skills/consolidate-evidence/evals/evals.json +33 -0
- package/plugin/skills/dashboard/evals/evals.json +33 -0
- package/plugin/skills/emit-vertex/evals/evals.json +33 -0
- package/plugin/skills/eval/SKILL.md +90 -0
- package/plugin/skills/eval/evals.schema.json +73 -0
- package/plugin/skills/eval/run-evals.ps1 +372 -0
- package/plugin/skills/fde-intake/evals/evals.json +33 -0
- package/plugin/skills/fde-report/evals/evals.json +33 -0
- package/plugin/skills/fde-triage/evals/evals.json +33 -0
- package/plugin/skills/intro/evals/evals.json +33 -0
- package/plugin/skills/link-entities/evals/evals.json +31 -0
- package/plugin/skills/project-status/evals/evals.json +33 -0
- package/plugin/skills/propose-ado-update/evals/evals.json +33 -0
- package/plugin/skills/pull-ado/evals/evals.json +35 -0
- package/plugin/skills/pull-crm/evals/evals.json +35 -0
- package/plugin/skills/pull-email/evals/evals.json +35 -0
- package/plugin/skills/pull-loop/evals/evals.json +35 -0
- package/plugin/skills/pull-meetings/evals/evals.json +35 -0
- package/plugin/skills/pull-misc/evals/evals.json +35 -0
- package/plugin/skills/pull-onenote/evals/evals.json +35 -0
- package/plugin/skills/pull-sharepoint/evals/evals.json +35 -0
- package/plugin/skills/pull-teams/evals/evals.json +35 -0
- package/plugin/skills/refresh-project/evals/evals.json +31 -0
- package/plugin/skills/self-check/SKILL.md +1 -0
- package/plugin/skills/self-check/evals/evals.json +28 -0
- package/plugin/skills/self-check/run.ps1 +63 -0
- package/plugin/skills/setup/evals/evals.json +33 -0
- package/plugin/skills/tour/evals/evals.json +33 -0
- package/plugin/skills/vertex-link/evals/evals.json +33 -0
- package/src/eval-aggregator.mjs +209 -0
- package/src/eval-aggregator.test.mjs +64 -0
- package/src/eval-runner.test.mjs +69 -0
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "pull-email",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for pull-email. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "pull-email-cached-1",
|
|
8
|
+
"name": "pull-email cached/dry-run produces output",
|
|
9
|
+
"input": "--cached --dry-run fixture-acme",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": "fixture-acme",
|
|
16
|
+
"flags": "i"
|
|
17
|
+
}
|
|
18
|
+
]
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"id": "pull-email-rubric-1",
|
|
22
|
+
"name": "pull-email output quality (LLM-rubric, skipped in canary)",
|
|
23
|
+
"input": "summarize fixture-acme pull-email pulls",
|
|
24
|
+
"canary": false,
|
|
25
|
+
"grader_type": "llm",
|
|
26
|
+
"expected_assertions": [
|
|
27
|
+
{
|
|
28
|
+
"type": "llm-rubric",
|
|
29
|
+
"rubric": "Does the pull-email response cite a source file path and an ISO timestamp?",
|
|
30
|
+
"min_score": 4
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "pull-loop",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for pull-loop. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "pull-loop-cached-1",
|
|
8
|
+
"name": "pull-loop cached/dry-run produces output",
|
|
9
|
+
"input": "--cached --dry-run fixture-acme",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": "fixture-acme",
|
|
16
|
+
"flags": "i"
|
|
17
|
+
}
|
|
18
|
+
]
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"id": "pull-loop-rubric-1",
|
|
22
|
+
"name": "pull-loop output quality (LLM-rubric, skipped in canary)",
|
|
23
|
+
"input": "summarize fixture-acme pull-loop pulls",
|
|
24
|
+
"canary": false,
|
|
25
|
+
"grader_type": "llm",
|
|
26
|
+
"expected_assertions": [
|
|
27
|
+
{
|
|
28
|
+
"type": "llm-rubric",
|
|
29
|
+
"rubric": "Does the pull-loop response cite a source file path and an ISO timestamp?",
|
|
30
|
+
"min_score": 4
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "pull-meetings",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for pull-meetings. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "pull-meetings-cached-1",
|
|
8
|
+
"name": "pull-meetings cached/dry-run produces output",
|
|
9
|
+
"input": "--cached --dry-run fixture-acme",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": "fixture-acme",
|
|
16
|
+
"flags": "i"
|
|
17
|
+
}
|
|
18
|
+
]
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"id": "pull-meetings-rubric-1",
|
|
22
|
+
"name": "pull-meetings output quality (LLM-rubric, skipped in canary)",
|
|
23
|
+
"input": "summarize fixture-acme pull-meetings pulls",
|
|
24
|
+
"canary": false,
|
|
25
|
+
"grader_type": "llm",
|
|
26
|
+
"expected_assertions": [
|
|
27
|
+
{
|
|
28
|
+
"type": "llm-rubric",
|
|
29
|
+
"rubric": "Does the pull-meetings response cite a source file path and an ISO timestamp?",
|
|
30
|
+
"min_score": 4
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "pull-misc",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for pull-misc. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "pull-misc-cached-1",
|
|
8
|
+
"name": "pull-misc cached/dry-run produces output",
|
|
9
|
+
"input": "--cached --dry-run fixture-acme",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": "fixture-acme",
|
|
16
|
+
"flags": "i"
|
|
17
|
+
}
|
|
18
|
+
]
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"id": "pull-misc-rubric-1",
|
|
22
|
+
"name": "pull-misc output quality (LLM-rubric, skipped in canary)",
|
|
23
|
+
"input": "summarize fixture-acme pull-misc pulls",
|
|
24
|
+
"canary": false,
|
|
25
|
+
"grader_type": "llm",
|
|
26
|
+
"expected_assertions": [
|
|
27
|
+
{
|
|
28
|
+
"type": "llm-rubric",
|
|
29
|
+
"rubric": "Does the pull-misc response cite a source file path and an ISO timestamp?",
|
|
30
|
+
"min_score": 4
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "pull-onenote",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for pull-onenote. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "pull-onenote-cached-1",
|
|
8
|
+
"name": "pull-onenote cached/dry-run produces output",
|
|
9
|
+
"input": "--cached --dry-run fixture-acme",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": "fixture-acme",
|
|
16
|
+
"flags": "i"
|
|
17
|
+
}
|
|
18
|
+
]
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"id": "pull-onenote-rubric-1",
|
|
22
|
+
"name": "pull-onenote output quality (LLM-rubric, skipped in canary)",
|
|
23
|
+
"input": "summarize fixture-acme pull-onenote pulls",
|
|
24
|
+
"canary": false,
|
|
25
|
+
"grader_type": "llm",
|
|
26
|
+
"expected_assertions": [
|
|
27
|
+
{
|
|
28
|
+
"type": "llm-rubric",
|
|
29
|
+
"rubric": "Does the pull-onenote response cite a source file path and an ISO timestamp?",
|
|
30
|
+
"min_score": 4
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "pull-sharepoint",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for pull-sharepoint. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "pull-sharepoint-cached-1",
|
|
8
|
+
"name": "pull-sharepoint cached/dry-run produces output",
|
|
9
|
+
"input": "--cached --dry-run fixture-acme",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": "fixture-acme",
|
|
16
|
+
"flags": "i"
|
|
17
|
+
}
|
|
18
|
+
]
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"id": "pull-sharepoint-rubric-1",
|
|
22
|
+
"name": "pull-sharepoint output quality (LLM-rubric, skipped in canary)",
|
|
23
|
+
"input": "summarize fixture-acme pull-sharepoint pulls",
|
|
24
|
+
"canary": false,
|
|
25
|
+
"grader_type": "llm",
|
|
26
|
+
"expected_assertions": [
|
|
27
|
+
{
|
|
28
|
+
"type": "llm-rubric",
|
|
29
|
+
"rubric": "Does the pull-sharepoint response cite a source file path and an ISO timestamp?",
|
|
30
|
+
"min_score": 4
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "pull-teams",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for pull-teams. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "pull-teams-cached-1",
|
|
8
|
+
"name": "pull-teams cached/dry-run produces output",
|
|
9
|
+
"input": "--cached --dry-run fixture-acme",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": "fixture-acme",
|
|
16
|
+
"flags": "i"
|
|
17
|
+
}
|
|
18
|
+
]
|
|
19
|
+
},
|
|
20
|
+
{
|
|
21
|
+
"id": "pull-teams-rubric-1",
|
|
22
|
+
"name": "pull-teams output quality (LLM-rubric, skipped in canary)",
|
|
23
|
+
"input": "summarize fixture-acme pull-teams pulls",
|
|
24
|
+
"canary": false,
|
|
25
|
+
"grader_type": "llm",
|
|
26
|
+
"expected_assertions": [
|
|
27
|
+
{
|
|
28
|
+
"type": "llm-rubric",
|
|
29
|
+
"rubric": "Does the pull-teams response cite a source file path and an ISO timestamp?",
|
|
30
|
+
"min_score": 4
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
34
|
+
]
|
|
35
|
+
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "refresh-project",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Verifies refresh chains link-entities -> dashboard -> tour on a fixture.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "rp-chain-mention",
|
|
8
|
+
"name": "refresh plan mentions link-entities + dashboard + tour",
|
|
9
|
+
"input": "refresh fixture-acme --dry-run\nWould chain: link-entities -> dashboard -> tour\nGraph: Evidence/_graph/project-graph.json\nDashboard: dashboard.html\nTour: State/tour.md",
|
|
10
|
+
"fixture": "evals/fixtures/fixture-acme",
|
|
11
|
+
"canary": true,
|
|
12
|
+
"grader_type": "script",
|
|
13
|
+
"expected_assertions": [
|
|
14
|
+
{ "type": "regex-match", "pattern": "link-entities" },
|
|
15
|
+
{ "type": "regex-match", "pattern": "dashboard" },
|
|
16
|
+
{ "type": "regex-match", "pattern": "tour" }
|
|
17
|
+
]
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "rp-graph-fixture-present",
|
|
21
|
+
"name": "refresh has a graph fixture to update",
|
|
22
|
+
"input": "verify graph fixture",
|
|
23
|
+
"fixture": "evals/fixtures/fixture-acme",
|
|
24
|
+
"canary": false,
|
|
25
|
+
"grader_type": "script",
|
|
26
|
+
"expected_assertions": [
|
|
27
|
+
{ "type": "file-exists", "path": "Evidence/_graph/project-graph.json" }
|
|
28
|
+
]
|
|
29
|
+
}
|
|
30
|
+
]
|
|
31
|
+
}
|
|
@@ -69,6 +69,7 @@ Checks split into **core** (always run) and **deep** (opt-in).
|
|
|
69
69
|
| D30.description-optimized | Trigger-based description | every SKILL.md `description:` front-matter leads with `USE WHEN` or `WHEN ` per <https://agentskills.io/skill-creation/optimizing-descriptions>. |
|
|
70
70
|
| D31.genealogy | Release genealogy entry exists | every `git tag` matching `v<x.y.z>` MUST appear in `docs/genealogy.md` as a `## v<x.y.z>` heading or be named under a parent's "Patch lineage" line. See `release-genealogy.instructions.md`. |
|
|
71
71
|
| D32.multi-host | Multi-host install integrity | validates `src/multi-host.mjs` exports + `bin/cli.mjs` flag handling, then performs a temp-dir dry-run install for BOTH supported hosts (Clawpilot + VS Code Chat) under a fake `$HOME` in `$env:TEMP`. Asserts SKILL.md + agent file + skills/ + prompts/ + skills-metadata.json with a kushi entry are present, then asserts a clean uninstall. NEVER touches the real `~/.copilot/` or `~/.vscode/`. See `multi-host-install.instructions.md`. |
|
|
72
|
+
| D33.evals | Skill evals framework integrity | every `plugin/skills/<name>/` (except `eval` and `self-check`, which the scan skips) ships `evals/evals.json` with ≥ 2 cases and ≥ 1 assertion per case; each file's `skill` field matches its directory name and any `grader_type` is `script` or `llm`; the runner (`plugin/skills/eval/run-evals.ps1`) and schema (`plugin/skills/eval/evals.schema.json`) are present; `evals/baseline.json` exists (warn-only). Six sub-checks: `D33.evals-exist`, `D33.evals-schema`, `D33.evals-min-cases`, `D33.evals-have-assertions`, `D33.eval-runner-exists`, `D33.baseline-exists`. See `skill-evals.instructions.md`. |
|
|
72
73
|
| **CSC weekly-layout checks (kushi v4.9.0)** | | gated on `Resolve-EngagementRoots` — no-ops on the kushi repo itself. |
|
|
73
74
|
| D11.csc | CSC entity coverage + depth | every `Evidence/<alias>/<source>/weekly/*-csc.md` has ≥ 1 entity heading; per-source minimum bullet count + populated-section count (meetings 25/6, email 8/4, teams 6/3, onenote 10/4, sharepoint 8/3, crm 12/5, ado 8/4). Coverage-Notes-only blocks (low-signal escape) are exempt. |
|
|
74
75
|
| D12.csc | CSC section order | every entity block's `###` section headings appear in the canonical order: Participants → Topics → Q&A → Who Said What → Decisions → Dates & Numbers → Action Items → Next Steps → Open Questions → Risks → Customer Asks → Artifacts → Coverage Notes. |
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "self-check",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Meta — verifies that self-check's run.ps1, run.sh, and SKILL.md ship in the skill folder.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "sc-runps1-exists",
|
|
8
|
+
"name": "run.ps1 ships in the skill folder",
|
|
9
|
+
"input": "verify self-check artifacts",
|
|
10
|
+
"canary": true,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{ "type": "file-exists", "path": "plugin/skills/self-check/run.ps1" },
|
|
14
|
+
{ "type": "file-exists", "path": "plugin/skills/self-check/SKILL.md" }
|
|
15
|
+
]
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "sc-runsh-exists",
|
|
19
|
+
"name": "cross-platform run.sh ships too",
|
|
20
|
+
"input": "verify run.sh",
|
|
21
|
+
"canary": false,
|
|
22
|
+
"grader_type": "script",
|
|
23
|
+
"expected_assertions": [
|
|
24
|
+
{ "type": "file-exists", "path": "plugin/skills/self-check/run.sh" }
|
|
25
|
+
]
|
|
26
|
+
}
|
|
27
|
+
]
|
|
28
|
+
}
|
|
@@ -1632,6 +1632,69 @@ process.stdout.write(JSON.stringify(out));
|
|
|
1632
1632
|
if (Test-Path $fakeHome) { Remove-Item -LiteralPath $fakeHome -Recurse -Force -ErrorAction SilentlyContinue }
|
|
1633
1633
|
}
|
|
1634
1634
|
}
|
|
1635
|
+
|
|
1636
|
+
# === D33.evals — per-skill evals framework (v5.0.3+) ===
|
|
1637
|
+
# Per skill-evals.instructions.md, every plugin/skills/<name>/ (except eval and
|
|
1638
|
+
# self-check) MUST ship evals/evals.json with >=2 cases and every case MUST have
|
|
1639
|
+
# >=1 assertion. Schema lives at plugin/skills/eval/evals.schema.json. Runner
|
|
1640
|
+
# lives at plugin/skills/eval/run-evals.ps1. Baseline file evals/baseline.json
|
|
1641
|
+
# is warn-only — maintainers seed it with `npm run eval:baseline`.
|
|
1642
|
+
$evalSkillDir = Join-Path $Root 'plugin/skills/eval'
|
|
1643
|
+
$evalRunner = Join-Path $evalSkillDir 'run-evals.ps1'
|
|
1644
|
+
$evalSchema = Join-Path $evalSkillDir 'evals.schema.json'
|
|
1645
|
+
$baselineFile = Join-Path $Root 'evals/baseline.json'
|
|
1646
|
+
|
|
1647
|
+
if (-not (Test-Path $evalRunner)) {
|
|
1648
|
+
Add-Finding 'D33.eval-runner-exists' 'Evals' 'warning' 'plugin/skills/eval/run-evals.ps1 is missing' 'Restore the runner from git — it ships in v5.0.3+.' $evalRunner 0
|
|
1649
|
+
} else {
|
|
1650
|
+
# Cheap sanity check (a regex heuristic, not a real parse): the runner must contain a line that begins with param( — leading whitespace allowed.
|
|
1651
|
+
try {
|
|
1652
|
+
$rt = Get-Content -Raw $evalRunner
|
|
1653
|
+
if ($rt -notmatch '(?ms)^\s*param\s*\(') {
|
|
1654
|
+
Add-Finding 'D33.eval-runner-exists' 'Evals' 'warning' 'plugin/skills/eval/run-evals.ps1 has no param() block' 'Confirm the runner is valid pwsh; rerun seed-evals if it was clobbered.' $evalRunner 0
|
|
1655
|
+
}
|
|
1656
|
+
} catch {
|
|
1657
|
+
Add-Finding 'D33.eval-runner-exists' 'Evals' 'warning' "Could not read run-evals.ps1: $($_.Exception.Message)" 'Check file permissions.' $evalRunner 0
|
|
1658
|
+
}
|
|
1659
|
+
}
|
|
1660
|
+
|
|
1661
|
+
if (-not (Test-Path $evalSchema)) {
|
|
1662
|
+
Add-Finding 'D33.evals-schema' 'Evals' 'warning' 'plugin/skills/eval/evals.schema.json is missing' 'Restore the schema from git.' $evalSchema 0
|
|
1663
|
+
}
|
|
1664
|
+
|
|
1665
|
+
$skillsRoot = Join-Path $Root 'plugin/skills'
|
|
1666
|
+
$skillDirs = Get-ChildItem -Path $skillsRoot -Directory | Where-Object { $_.Name -notin @('eval', 'self-check') }
|
|
1667
|
+
foreach ($sd in $skillDirs) {
|
|
1668
|
+
$evalsFile = Join-Path $sd.FullName 'evals/evals.json'
|
|
1669
|
+
if (-not (Test-Path $evalsFile)) {
|
|
1670
|
+
Add-Finding 'D33.evals-exist' 'Evals' 'warning' "Skill '$($sd.Name)' has no evals/evals.json" "Create $evalsFile with >=2 cases per skill-evals.instructions.md. Quick start: run 'node scripts/seed-evals.mjs'." $evalsFile 0
|
|
1671
|
+
continue
|
|
1672
|
+
}
|
|
1673
|
+
try {
|
|
1674
|
+
$obj = Get-Content -Raw $evalsFile | ConvertFrom-Json
|
|
1675
|
+
} catch {
|
|
1676
|
+
Add-Finding 'D33.evals-schema' 'Evals' 'warning' "Skill '$($sd.Name)' evals.json is not valid JSON: $($_.Exception.Message)" 'Fix the JSON and rerun self-check.' $evalsFile 0
|
|
1677
|
+
continue
|
|
1678
|
+
}
|
|
1679
|
+
if (-not $obj.skill -or $obj.skill -ne $sd.Name) {
|
|
1680
|
+
Add-Finding 'D33.evals-schema' 'Evals' 'warning' "Skill '$($sd.Name)' evals.json declares skill='$($obj.skill)' (mismatch)" 'Set the skill field to match the directory name.' $evalsFile 0
|
|
1681
|
+
}
|
|
1682
|
+
if (-not $obj.cases -or $obj.cases.Count -lt 2) {
|
|
1683
|
+
Add-Finding 'D33.evals-min-cases' 'Evals' 'warning' "Skill '$($sd.Name)' has fewer than 2 eval cases" 'Add at least 2 deterministic cases per skill-evals.instructions.md.' $evalsFile 0
|
|
1684
|
+
}
|
|
1685
|
+
foreach ($c in $obj.cases) {
|
|
1686
|
+
if (-not $c.expected_assertions -or $c.expected_assertions.Count -lt 1) {
|
|
1687
|
+
Add-Finding 'D33.evals-have-assertions' 'Evals' 'warning' "Skill '$($sd.Name)' case '$($c.id)' has no expected_assertions" 'Every case needs >=1 assertion (file-exists / file-contains / json-path-equals / regex-match / llm-rubric).' $evalsFile 0
|
|
1688
|
+
}
|
|
1689
|
+
if ($c.grader_type -and ($c.grader_type -notin 'script', 'llm')) {
|
|
1690
|
+
Add-Finding 'D33.evals-schema' 'Evals' 'warning' "Skill '$($sd.Name)' case '$($c.id)' has invalid grader_type '$($c.grader_type)'" "Use 'script' or 'llm'." $evalsFile 0
|
|
1691
|
+
}
|
|
1692
|
+
}
|
|
1693
|
+
}
|
|
1694
|
+
|
|
1695
|
+
if (-not (Test-Path $baselineFile)) {
|
|
1696
|
+
Add-Finding 'D33.baseline-exists' 'Evals' 'warning' 'evals/baseline.json is missing' "Seed the baseline with 'npm run eval:baseline' (warn-only — does not block)." $baselineFile 0
|
|
1697
|
+
}
|
|
1635
1698
|
}
|
|
1636
1699
|
|
|
1637
1700
|
# === Output ===
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "setup",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for setup. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "setup-smoke-1",
|
|
8
|
+
"name": "setup produces a non-empty response",
|
|
9
|
+
"input": "synthetic setup probe — canary smoke",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": ".+"
|
|
16
|
+
}
|
|
17
|
+
]
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "setup-smoke-2",
|
|
21
|
+
"name": "setup echoes case id",
|
|
22
|
+
"input": "case-id setup-smoke-2",
|
|
23
|
+
"canary": false,
|
|
24
|
+
"grader_type": "script",
|
|
25
|
+
"expected_assertions": [
|
|
26
|
+
{
|
|
27
|
+
"type": "regex-match",
|
|
28
|
+
"pattern": "setup-smoke-2"
|
|
29
|
+
}
|
|
30
|
+
]
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "tour",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for tour. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "tour-smoke-1",
|
|
8
|
+
"name": "tour produces a non-empty response",
|
|
9
|
+
"input": "synthetic tour probe — canary smoke",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": ".+"
|
|
16
|
+
}
|
|
17
|
+
]
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "tour-smoke-2",
|
|
21
|
+
"name": "tour echoes case id",
|
|
22
|
+
"input": "case-id tour-smoke-2",
|
|
23
|
+
"canary": false,
|
|
24
|
+
"grader_type": "script",
|
|
25
|
+
"expected_assertions": [
|
|
26
|
+
{
|
|
27
|
+
"type": "regex-match",
|
|
28
|
+
"pattern": "tour-smoke-2"
|
|
29
|
+
}
|
|
30
|
+
]
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "vertex-link",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for vertex-link. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "vertex-link-smoke-1",
|
|
8
|
+
"name": "vertex-link produces a non-empty response",
|
|
9
|
+
"input": "synthetic vertex-link probe — canary smoke",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": ".+"
|
|
16
|
+
}
|
|
17
|
+
]
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "vertex-link-smoke-2",
|
|
21
|
+
"name": "vertex-link echoes case id",
|
|
22
|
+
"input": "case-id vertex-link-smoke-2",
|
|
23
|
+
"canary": false,
|
|
24
|
+
"grader_type": "script",
|
|
25
|
+
"expected_assertions": [
|
|
26
|
+
{
|
|
27
|
+
"type": "regex-match",
|
|
28
|
+
"pattern": "vertex-link-smoke-2"
|
|
29
|
+
}
|
|
30
|
+
]
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|