kushi-agents 5.0.2 → 5.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +35 -0
- package/bin/cli.mjs +103 -0
- package/package.json +6 -2
- package/plugin/agents/kushi.agent.md +3 -1
- package/plugin/instructions/skill-authoring.instructions.md +147 -0
- package/plugin/instructions/skill-evals.instructions.md +130 -0
- package/plugin/skills/aggregate-project/evals/evals.json +33 -0
- package/plugin/skills/apply-ado-update/evals/evals.json +33 -0
- package/plugin/skills/ask-project/SKILL.md +10 -0
- package/plugin/skills/ask-project/evals/evals.json +34 -0
- package/plugin/skills/bootstrap-project/evals/evals.json +34 -0
- package/plugin/skills/build-state/evals/evals.json +31 -0
- package/plugin/skills/consolidate-evidence/evals/evals.json +33 -0
- package/plugin/skills/dashboard/evals/evals.json +33 -0
- package/plugin/skills/emit-vertex/evals/evals.json +33 -0
- package/plugin/skills/eval/SKILL.md +90 -0
- package/plugin/skills/eval/evals.schema.json +73 -0
- package/plugin/skills/eval/run-evals.ps1 +372 -0
- package/plugin/skills/fde-intake/evals/evals.json +33 -0
- package/plugin/skills/fde-report/evals/evals.json +33 -0
- package/plugin/skills/fde-triage/evals/evals.json +33 -0
- package/plugin/skills/intro/SKILL.md +160 -451
- package/plugin/skills/intro/evals/evals.json +33 -0
- package/plugin/skills/intro/references/walkthrough.md +310 -0
- package/plugin/skills/link-entities/evals/evals.json +31 -0
- package/plugin/skills/project-status/SKILL.md +10 -1
- package/plugin/skills/project-status/evals/evals.json +33 -0
- package/plugin/skills/propose-ado-update/evals/evals.json +33 -0
- package/plugin/skills/pull-ado/evals/evals.json +35 -0
- package/plugin/skills/pull-crm/evals/evals.json +35 -0
- package/plugin/skills/pull-email/evals/evals.json +35 -0
- package/plugin/skills/pull-loop/evals/evals.json +35 -0
- package/plugin/skills/pull-meetings/evals/evals.json +35 -0
- package/plugin/skills/pull-misc/evals/evals.json +35 -0
- package/plugin/skills/pull-onenote/evals/evals.json +35 -0
- package/plugin/skills/pull-sharepoint/evals/evals.json +35 -0
- package/plugin/skills/pull-teams/evals/evals.json +35 -0
- package/plugin/skills/refresh-project/evals/evals.json +31 -0
- package/plugin/skills/self-check/SKILL.md +2 -0
- package/plugin/skills/self-check/evals/evals.json +28 -0
- package/plugin/skills/self-check/run.ps1 +144 -0
- package/plugin/skills/setup/SKILL.md +10 -0
- package/plugin/skills/setup/evals/evals.json +33 -0
- package/plugin/skills/skill-checker/SKILL.md +136 -0
- package/plugin/skills/skill-checker/check-skill.ps1 +416 -0
- package/plugin/skills/skill-checker/evals/evals.json +41 -0
- package/plugin/skills/skill-creator/SKILL.md +134 -0
- package/plugin/skills/skill-creator/evals/evals.json +40 -0
- package/plugin/skills/skill-creator/generate-eval-review.ps1 +101 -0
- package/plugin/skills/skill-creator/optimize-description.ps1 +87 -0
- package/plugin/skills/skill-creator/scaffold.ps1 +180 -0
- package/plugin/skills/skill-creator/templates/evals-starter.template.json +27 -0
- package/plugin/skills/skill-creator/templates/gotchas-stub.template.md +9 -0
- package/plugin/skills/skill-creator/templates/skill-skeleton.template.md +28 -0
- package/plugin/skills/tour/evals/evals.json +33 -0
- package/plugin/skills/vertex-link/SKILL.md +10 -0
- package/plugin/skills/vertex-link/evals/evals.json +33 -0
- package/src/eval-aggregator.mjs +209 -0
- package/src/eval-aggregator.test.mjs +64 -0
- package/src/eval-runner.test.mjs +69 -0
- package/src/skill-checker.test.mjs +118 -0
- package/src/skill-creator.test.mjs +92 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "build-state",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Karpathy State layout — index.md + log.md + per-category folders.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "bs-state-index",
|
|
8
|
+
"name": "fixture State/index.md has kushi_state_page front-matter",
|
|
9
|
+
"input": "validate state fixture",
|
|
10
|
+
"fixture": "evals/fixtures/fixture-acme",
|
|
11
|
+
"canary": true,
|
|
12
|
+
"grader_type": "script",
|
|
13
|
+
"args": { "read_fixture": "State/index.md" },
|
|
14
|
+
"expected_assertions": [
|
|
15
|
+
{ "type": "regex-match", "pattern": "kushi_state_page:\\s*true" }
|
|
16
|
+
]
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"id": "bs-state-log-exists",
|
|
20
|
+
"name": "fixture State/log.md exists",
|
|
21
|
+
"input": "verify log",
|
|
22
|
+
"fixture": "evals/fixtures/fixture-acme",
|
|
23
|
+
"canary": false,
|
|
24
|
+
"grader_type": "script",
|
|
25
|
+
"expected_assertions": [
|
|
26
|
+
{ "type": "file-exists", "path": "State/log.md" },
|
|
27
|
+
{ "type": "file-exists", "path": "State/index.md" }
|
|
28
|
+
]
|
|
29
|
+
}
|
|
30
|
+
]
|
|
31
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "consolidate-evidence",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for consolidate-evidence. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "consolidate-evidence-smoke-1",
|
|
8
|
+
"name": "consolidate-evidence produces a non-empty response",
|
|
9
|
+
"input": "synthetic consolidate-evidence probe — canary smoke",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": ".+"
|
|
16
|
+
}
|
|
17
|
+
]
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "consolidate-evidence-smoke-2",
|
|
21
|
+
"name": "consolidate-evidence echoes case id",
|
|
22
|
+
"input": "case-id consolidate-evidence-smoke-2",
|
|
23
|
+
"canary": false,
|
|
24
|
+
"grader_type": "script",
|
|
25
|
+
"expected_assertions": [
|
|
26
|
+
{
|
|
27
|
+
"type": "regex-match",
|
|
28
|
+
"pattern": "consolidate-evidence-smoke-2"
|
|
29
|
+
}
|
|
30
|
+
]
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "dashboard",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for dashboard. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "dashboard-smoke-1",
|
|
8
|
+
"name": "dashboard produces a non-empty response",
|
|
9
|
+
"input": "synthetic dashboard probe — canary smoke",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": ".+"
|
|
16
|
+
}
|
|
17
|
+
]
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "dashboard-smoke-2",
|
|
21
|
+
"name": "dashboard echoes case id",
|
|
22
|
+
"input": "case-id dashboard-smoke-2",
|
|
23
|
+
"canary": false,
|
|
24
|
+
"grader_type": "script",
|
|
25
|
+
"expected_assertions": [
|
|
26
|
+
{
|
|
27
|
+
"type": "regex-match",
|
|
28
|
+
"pattern": "dashboard-smoke-2"
|
|
29
|
+
}
|
|
30
|
+
]
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "emit-vertex",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Auto-seeded evals for emit-vertex. Replace with real cases as the skill matures.",
|
|
5
|
+
"cases": [
|
|
6
|
+
{
|
|
7
|
+
"id": "emit-vertex-smoke-1",
|
|
8
|
+
"name": "emit-vertex produces a non-empty response",
|
|
9
|
+
"input": "synthetic emit-vertex probe — canary smoke",
|
|
10
|
+
"canary": false,
|
|
11
|
+
"grader_type": "script",
|
|
12
|
+
"expected_assertions": [
|
|
13
|
+
{
|
|
14
|
+
"type": "regex-match",
|
|
15
|
+
"pattern": ".+"
|
|
16
|
+
}
|
|
17
|
+
]
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"id": "emit-vertex-smoke-2",
|
|
21
|
+
"name": "emit-vertex echoes case id",
|
|
22
|
+
"input": "case-id emit-vertex-smoke-2",
|
|
23
|
+
"canary": false,
|
|
24
|
+
"grader_type": "script",
|
|
25
|
+
"expected_assertions": [
|
|
26
|
+
{
|
|
27
|
+
"type": "regex-match",
|
|
28
|
+
"pattern": "emit-vertex-smoke-2"
|
|
29
|
+
}
|
|
30
|
+
]
|
|
31
|
+
}
|
|
32
|
+
]
|
|
33
|
+
}
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: "eval"
|
|
3
|
+
version: "1.0.0"
|
|
4
|
+
description: "USE WHEN the user says \"run evals\", \"eval canary\", \"eval ask-project\", \"check skill regression\", \"update eval baseline\", or before tagging a release. DO NOT USE for evidence validation of a real project (use ask-project / project-status). Capability: runs per-skill evals (deterministic script graders + optional LLM-rubric graders) from each skill's evals/evals.json, aggregates pass-rate / latency / token metrics, and compares against evals/baseline.json to flag regressions. Synthetic fixtures only — never live customer data."
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Skill: eval
|
|
8
|
+
|
|
9
|
+
The objective regression signal for every other kushi skill. Spec: <https://agentskills.io/skill-creation/evaluating-skills>. Doctrine: `instructions/skill-evals.instructions.md`.
|
|
10
|
+
|
|
11
|
+
User triggers: "run evals", "eval canary", "eval <skill>", "eval all", "update eval baseline", "check skill regression".
|
|
12
|
+
|
|
13
|
+
## USE WHEN
|
|
14
|
+
|
|
15
|
+
- About to commit a behavioral change to any `plugin/skills/<name>/SKILL.md` or its companion script.
|
|
16
|
+
- Reviewing a PR that touches skills, prompts, or shared instructions.
|
|
17
|
+
- Tagging a release (full suite is implied by the release script).
|
|
18
|
+
- Investigating "did this skill get worse?" after a refactor.
|
|
19
|
+
|
|
20
|
+
## DO NOT USE FOR
|
|
21
|
+
|
|
22
|
+
- Validating real customer evidence (use `ask-project` / `project-status`).
|
|
23
|
+
- Live M365 / ADO / CRM probes (use `pull-*` directly).
|
|
24
|
+
- Generating new fixtures from real engagements — that violates the privacy rule.
|
|
25
|
+
|
|
26
|
+
## Gotchas
|
|
27
|
+
|
|
28
|
+
1. **Canary ≠ full.** `npm run eval:canary` runs ~6 skills. Use `npm run eval:all` before tagging.
|
|
29
|
+
2. **Synthetic fixtures only.** Never copy real customer evidence into `evals/fixtures/`. The runner does NOT enforce this — humans do.
|
|
30
|
+
3. **`pull-*` skills run in `--cached` / `--dry-run`.** Live network calls are explicitly disabled in canary mode; the LLM-rubric subset is skipped unless `-Live` is passed.
|
|
31
|
+
4. **Baseline drift is OK after intentional changes.** Refresh with `npm run eval:baseline`. NEVER auto-update baseline in CI.
|
|
32
|
+
5. **`Evidence/_evals/` is gitignored.** If you need to share a run, copy the JSON manually.
|
|
33
|
+
6. **Tokens are estimates.** When a grader can't measure tokens (e.g. a pwsh-only run), it records `0` and the aggregator excludes those from the mean.
|
|
34
|
+
|
|
35
|
+
## Step checklist
|
|
36
|
+
|
|
37
|
+
- [ ] Pick mode: `-Skill <name>` (one skill) · `-Canary` (fast subset) · `-All` (full suite).
|
|
38
|
+
- [ ] Confirm `evals/baseline.json` exists; if missing, run with `-UpdateBaseline` first.
|
|
39
|
+
- [ ] Run: `pwsh plugin/skills/eval/run-evals.ps1 -Canary` (or other mode).
|
|
40
|
+
- [ ] Inspect the output JSON under `Evidence/_evals/` and the `benchmark.json` summary.
|
|
41
|
+
- [ ] If regressions flagged: re-run the specific skill with `-Skill <name>` for detail.
|
|
42
|
+
- [ ] If intentional change: bump the skill version, then `npm run eval:baseline`.
|
|
43
|
+
|
|
44
|
+
## Validation loop
|
|
45
|
+
|
|
46
|
+
After running, verify:
|
|
47
|
+
|
|
48
|
+
- Output JSON exists at the path printed by the runner.
|
|
49
|
+
- Every case has `pass: true|false` AND `duration_ms` AND a per-assertion breakdown.
|
|
50
|
+
- `benchmark.json` has `summary.regressions` array (empty if clean).
|
|
51
|
+
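- Exit code: `0` = all green vs baseline; `1` = at least one regression (or, with `-StrictExit`, at least one failed case); `2` = invalid flag combination or schema problems in an `evals.json`.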
|
|
52
|
+
|
|
53
|
+
If a case errors out before any assertion runs (e.g. fixture missing), it counts as `pass: false` with `error: "<message>"` so totals stay honest.
|
|
54
|
+
|
|
55
|
+
## What the runner does (`run-evals.ps1`)
|
|
56
|
+
|
|
57
|
+
1. Discovers `plugin/skills/<name>/evals/evals.json` for the requested skills.
|
|
58
|
+
2. Checks each file's shape (required fields, a valid `grader_type`, at least one assertion) — a lightweight subset of `plugin/skills/eval/evals.schema.json`.
|
|
59
|
+
3. For each case:
|
|
60
|
+
- Resolves the fixture path (`fixture` field, relative to repo root).
|
|
61
|
+
- Dispatches per `grader_type`:
|
|
62
|
+
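- `script` → invokes the skill's per-skill probe (`evals/probe.ps1`, otherwise `evals/probe.mjs`) with the case input; when neither exists, the built-in fixture-echo probe emits the file named by `args.read_fixture`, or echoes the input if no such file is found.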
|
|
63
|
+
- `llm` → only when `-Live` is set; otherwise marked `skipped`.
|
|
64
|
+
- Captures `stdout`, `stderr`, `duration_ms`.
|
|
65
|
+
- Runs every assertion in `expected_assertions`; collects per-assertion pass/fail.
|
|
66
|
+
- Case `pass` = all assertions pass AND no error.
|
|
67
|
+
4. Writes per-run JSON to `Evidence/_evals/<utc-timestamp>.json` (or `-Output` override).
|
|
68
|
+
5. Calls `node src/eval-aggregator.mjs` to compute `benchmark.json` and compare against `evals/baseline.json`.
|
|
69
|
+
6. Prints a one-screen summary; exits 0 on clean, 1 on regression.
|
|
70
|
+
|
|
71
|
+
## Arguments
|
|
72
|
+
|
|
73
|
+
| Flag | Purpose |
|
|
74
|
+
|---|---|
|
|
75
|
+
| `-Skill <name>` | Run only one skill's evals. |
|
|
76
|
+
| `-All` | Run every `plugin/skills/<name>/evals/evals.json`. |
|
|
77
|
+
| `-Canary` | Run only cases marked `"canary": true`. |
|
|
78
|
+
| `-Output <path>` | Override per-run JSON path. Default `Evidence/_evals/<ts>.json`. |
|
|
79
|
+
| `-Baseline` | Compare against `evals/baseline.json` (default ON). |
|
|
80
|
+
| `-UpdateBaseline` | Write current run's metrics into `evals/baseline.json`. |
|
|
81
|
+
| `-Live` | Allow LLM-rubric cases (requires `m_*` tools / sub-agent). Not implemented in the OSS runner: such cases are still reported as skipped. |
|
|
82
|
+
| `-StrictExit` | Exit 1 on any case failure (CI mode). |
|
|
83
|
+
|
|
84
|
+
## References
|
|
85
|
+
|
|
86
|
+
- `plugin/instructions/skill-evals.instructions.md` (doctrine)
|
|
87
|
+
- `plugin/instructions/agentskills-compliance.instructions.md` (size + section caps for SKILL.md)
|
|
88
|
+
- `src/eval-aggregator.mjs` (mean/stddev + regression detection)
|
|
89
|
+
- `evals/baseline.json` (committed baseline)
|
|
90
|
+
- <https://agentskills.io/skill-creation/evaluating-skills>
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
{
|
|
2
|
+
"$schema": "http://json-schema.org/draft-07/schema#",
|
|
3
|
+
"$id": "https://gim-home.github.io/kushi/schemas/evals.schema.json",
|
|
4
|
+
"title": "Kushi per-skill evals file",
|
|
5
|
+
"description": "Schema for plugin/skills/<name>/evals/evals.json. Validated by self-check D33.evals-schema.",
|
|
6
|
+
"type": "object",
|
|
7
|
+
"required": ["skill", "cases"],
|
|
8
|
+
"additionalProperties": false,
|
|
9
|
+
"properties": {
|
|
10
|
+
"skill": {
|
|
11
|
+
"type": "string",
|
|
12
|
+
"minLength": 1,
|
|
13
|
+
"description": "Skill name (must match the parent directory and SKILL.md frontmatter name)."
|
|
14
|
+
},
|
|
15
|
+
"version": { "type": "string" },
|
|
16
|
+
"description": { "type": "string" },
|
|
17
|
+
"cases": {
|
|
18
|
+
"type": "array",
|
|
19
|
+
"minItems": 1,
|
|
20
|
+
"items": { "$ref": "#/definitions/case" }
|
|
21
|
+
}
|
|
22
|
+
},
|
|
23
|
+
"definitions": {
|
|
24
|
+
"case": {
|
|
25
|
+
"type": "object",
|
|
26
|
+
"required": ["id", "name", "input", "expected_assertions", "grader_type"],
|
|
27
|
+
"additionalProperties": false,
|
|
28
|
+
"properties": {
|
|
29
|
+
"id": { "type": "string", "pattern": "^[a-z0-9][a-z0-9-]*$" },
|
|
30
|
+
"name": { "type": "string", "minLength": 1 },
|
|
31
|
+
"input": {
|
|
32
|
+
"anyOf": [{ "type": "string" }, { "type": "object" }]
|
|
33
|
+
},
|
|
34
|
+
"fixture": { "type": "string" },
|
|
35
|
+
"canary": { "type": "boolean", "default": false },
|
|
36
|
+
"skip": { "type": "boolean", "default": false },
|
|
37
|
+
"skip_reason": { "type": "string" },
|
|
38
|
+
"timeout_ms": { "type": "integer", "minimum": 100 },
|
|
39
|
+
"args": { "type": "object" },
|
|
40
|
+
"grader_type": { "enum": ["script", "llm"] },
|
|
41
|
+
"expected_assertions": {
|
|
42
|
+
"type": "array",
|
|
43
|
+
"minItems": 1,
|
|
44
|
+
"items": { "$ref": "#/definitions/assertion" }
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
},
|
|
48
|
+
"assertion": {
|
|
49
|
+
"type": "object",
|
|
50
|
+
"required": ["type"],
|
|
51
|
+
"properties": {
|
|
52
|
+
"type": {
|
|
53
|
+
"enum": [
|
|
54
|
+
"file-exists",
|
|
55
|
+
"file-contains",
|
|
56
|
+
"json-path-equals",
|
|
57
|
+
"regex-match",
|
|
58
|
+
"llm-rubric"
|
|
59
|
+
]
|
|
60
|
+
},
|
|
61
|
+
"path": { "type": "string" },
|
|
62
|
+
"needle": { "type": "string" },
|
|
63
|
+
"pattern": { "type": "string" },
|
|
64
|
+
"flags": { "type": "string" },
|
|
65
|
+
"json_path": { "type": "string" },
|
|
66
|
+
"equals": {},
|
|
67
|
+
"rubric": { "type": "string" },
|
|
68
|
+
"min_score": { "type": "number" }
|
|
69
|
+
},
|
|
70
|
+
"additionalProperties": false
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
}
|
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
<#
|
|
2
|
+
.SYNOPSIS
|
|
3
|
+
Kushi skill evals runner (v5.0.3+).
|
|
4
|
+
|
|
5
|
+
.DESCRIPTION
|
|
6
|
+
Discovers per-skill evals at `plugin/skills/<name>/evals/evals.json`,
|
|
7
|
+
validates against `plugin/skills/eval/evals.schema.json`, dispatches each case,
|
|
8
|
+
runs the assertions, and writes a per-run JSON to `Evidence/_evals/<ts>.json`.
|
|
9
|
+
|
|
10
|
+
Per the doctrine in `plugin/instructions/skill-evals.instructions.md`, the
|
|
11
|
+
runner NEVER makes live M365 / ADO / CRM calls. Skills that genuinely require
|
|
12
|
+
live calls (`pull-*`) ship cached fixtures.
|
|
13
|
+
|
|
14
|
+
Dispatch:
|
|
15
|
+
- `grader_type: "script"` cases are dispatched to:
|
|
16
|
+
1. `plugin/skills/<skill>/evals/probe.ps1` (preferred, per-skill probe)
|
|
17
|
+
2. `plugin/skills/<skill>/evals/probe.mjs` (Node probe)
|
|
18
|
+
3. The built-in "fixture-echo" probe, which reads `case.args.read_fixture`
|
|
19
|
+
and emits its content. Lets simple read-only assertions work with no
|
|
20
|
+
per-skill code at all.
|
|
21
|
+
- `grader_type: "llm"` cases are SKIPPED unless `-Live` is set; under `-Live`
|
|
22
|
+
they require an `m_*` sub-agent dispatcher (not implemented in OSS — flagged
|
|
23
|
+
as skipped with reason).
|
|
24
|
+
|
|
25
|
+
.PARAMETER Skill
|
|
26
|
+
Single skill to run. Conflicts with -All / -Canary.
|
|
27
|
+
|
|
28
|
+
.PARAMETER All
|
|
29
|
+
Run every plugin/skills/<name>/evals/evals.json (except `eval/`; `self-check/` is included because it ships its own meta-evals).
|
|
30
|
+
|
|
31
|
+
.PARAMETER Canary
|
|
32
|
+
Run only cases marked `"canary": true`.
|
|
33
|
+
|
|
34
|
+
.PARAMETER Output
|
|
35
|
+
Override the per-run JSON output path.
|
|
36
|
+
|
|
37
|
+
.PARAMETER Baseline
|
|
38
|
+
Compare against `evals/baseline.json` (default ON).
|
|
39
|
+
|
|
40
|
+
.PARAMETER UpdateBaseline
|
|
41
|
+
Write the current run's metrics into `evals/baseline.json` instead of comparing.
|
|
42
|
+
|
|
43
|
+
.PARAMETER Live
|
|
44
|
+
Allow LLM-rubric cases to attempt to dispatch (not implemented in OSS; will skip).
|
|
45
|
+
|
|
46
|
+
.PARAMETER StrictExit
|
|
47
|
+
Exit code 1 if any case fails (default: exit 1 only on regression vs baseline).
|
|
48
|
+
|
|
49
|
+
.PARAMETER Root
|
|
50
|
+
Repo root (default: two levels above this script).
|
|
51
|
+
#>
|
|
52
|
+
[CmdletBinding()]
|
|
53
|
+
param(
|
|
54
|
+
[string]$Skill,
|
|
55
|
+
[switch]$All,
|
|
56
|
+
[switch]$Canary,
|
|
57
|
+
[string]$Output,
|
|
58
|
+
[switch]$Baseline = $true,
|
|
59
|
+
[switch]$UpdateBaseline,
|
|
60
|
+
[switch]$Live,
|
|
61
|
+
[switch]$StrictExit,
|
|
62
|
+
[string]$Root = (Resolve-Path (Join-Path $PSScriptRoot "..\..\..")).Path
|
|
63
|
+
)
|
|
64
|
+
|
|
65
|
+
$ErrorActionPreference = 'Stop'
|
|
66
|
+
|
|
67
|
+
# Console logging helpers. Each one prefixes the message with "kushi-eval: "
# and hands it to Write-Host; they differ only in the foreground color used.

# Informational message, default console color.
function Write-Info($m) {
    $line = "kushi-eval: $m"
    Write-Host -Object $line
}

# Warning message, printed in yellow.
function Write-Warn($m) {
    $line = "kushi-eval: $m"
    Write-Host -Object $line -ForegroundColor Yellow
}

# Error message, printed in red. Does not throw or set an exit code.
function Write-Err($m) {
    $line = "kushi-eval: $m"
    Write-Host -Object $line -ForegroundColor Red
}
|
|
70
|
+
|
|
71
|
+
# ---------- Discover skills ----------
|
|
72
|
+
|
|
73
|
+
# Returns the evals.json path(s) to run.
#   -Root  : repo root; skills are looked up under <Root>/plugin/skills.
#   -Skill : optional single skill name. When given, exactly that skill's
#            evals/evals.json is returned (as a one-element array) and a
#            missing file is a terminating error.
# Without -Skill, every skill directory that contains evals/evals.json is
# emitted, except the 'eval' directory (the runner itself). self-check is
# not filtered out, so its meta-evals are included.
function Get-EvalFiles {
    param([string]$Root, [string]$Skill)

    $skillsRoot = Join-Path $Root 'plugin/skills'

    if (-not $Skill) {
        # Discovery mode: walk the skill directories and emit each evals file
        # that actually exists on disk.
        foreach ($dir in (Get-ChildItem -Path $skillsRoot -Directory)) {
            if ($dir.Name -eq 'eval') { continue }
            $candidate = Join-Path $dir.FullName 'evals/evals.json'
            if (Test-Path $candidate) { $candidate }
        }
        return
    }

    # Single-skill mode.
    $single = Join-Path $skillsRoot "$Skill/evals/evals.json"
    if (Test-Path $single) {
        # Leading comma keeps the result an array even with one element.
        return ,$single
    }
    throw "No evals.json for skill '$Skill' at $single"
}
|
|
89
|
+
|
|
90
|
+
# ---------- Lightweight JSON-Schema-ish validator (just the cases we care about) ----------
|
|
91
|
+
|
|
92
|
+
# Lightweight shape check for a parsed evals.json object.
#   -Obj  : the object produced by ConvertFrom-Json for one evals.json file.
#   -Path : the file path, used only as a prefix in problem messages.
# Returns the list of problem strings (nothing when the file is well-formed).
# Checks: top-level 'skill' and at least one case; per case the required
# properties, a grader_type of 'script' or 'llm', and at least one assertion.
function Test-EvalsShape {
    param([Parameter(Mandatory)] $Obj, [string]$Path)
    $problems = New-Object System.Collections.Generic.List[string]
    if (-not $Obj.skill) { $problems.Add("${Path}: missing 'skill'") }
    if (-not $Obj.cases -or $Obj.cases.Count -lt 1) { $problems.Add("${Path}: needs >=1 case") }
    foreach ($c in $Obj.cases) {
        foreach ($req in 'id', 'name', 'input', 'grader_type', 'expected_assertions') {
            if (-not $c.PSObject.Properties[$req]) { $problems.Add("${Path}: case missing '$req'") }
        }
        if ($c.grader_type -and ($c.grader_type -notin 'script', 'llm')) {
            $problems.Add("${Path}: case '$($c.id)' has invalid grader_type '$($c.grader_type)'")
        }
        # Test for the property's presence, not its truthiness: an empty array
        # converts to $false, so a truthiness guard would never report
        # "expected_assertions": [] and the case would later pass with zero
        # assertions. A missing property is already reported by the loop above.
        if ($c.PSObject.Properties['expected_assertions']) {
            $assertions = $c.expected_assertions
            if (($null -eq $assertions) -or (@($assertions).Count -lt 1)) {
                $problems.Add("${Path}: case '$($c.id)' needs >=1 assertion")
            }
        }
    }
    return $problems
}
|
|
110
|
+
|
|
111
|
+
# ---------- Dispatch a single case ----------
|
|
112
|
+
|
|
113
|
+
# Runs a single eval case and returns its result record as a PSCustomObject.
#   -Root      : repo root; the skill directory and the case's 'fixture' are
#                resolved relative to it.
#   -SkillName : skill whose evals/probe.ps1 or evals/probe.mjs is used, if present.
#   -Case      : one parsed entry from the 'cases' array of an evals.json.
# The record carries id, name, canary, grader_type, pass, duration_ms,
# tokens_in/tokens_out (rough 4-chars-per-token estimates), stdout, stderr,
# the per-assertion results and an 'error' string.
# Skips are reported through 'error' values that start with "skipped".
# Reads the script-level $Live switch to decide how llm cases are skipped.
function Invoke-Case {
    param(
        [string]$Root,
        [string]$SkillName,
        $Case
    )
    $skillDir = Join-Path $Root "plugin/skills/$SkillName"
    $probePs1 = Join-Path $skillDir 'evals/probe.ps1'
    $probeMjs = Join-Path $skillDir 'evals/probe.mjs'

    $result = [ordered]@{
        id          = $Case.id
        name        = $Case.name
        canary      = [bool]$Case.canary
        grader_type = $Case.grader_type
        pass        = $false
        duration_ms = 0
        tokens_in   = 0
        tokens_out  = 0
        stdout      = ''
        stderr      = ''
        assertions  = @()
        error       = $null
    }

    # Explicitly skipped cases and llm cases never dispatch.
    if ($Case.skip) {
        $result.error = "skipped: $($Case.skip_reason)"
        return [pscustomobject]$result
    }
    if ($Case.grader_type -eq 'llm' -and -not $Live) {
        $result.error = 'skipped: llm-rubric cases require -Live'
        return [pscustomobject]$result
    }
    if ($Case.grader_type -eq 'llm' -and $Live) {
        $result.error = 'skipped: llm dispatch not implemented in OSS runner'
        return [pscustomobject]$result
    }

    $fixturePath = $null
    if ($Case.fixture) { $fixturePath = Join-Path $Root $Case.fixture }

    # A declared fixture that is not on disk is an error, not something to
    # paper over: without this check the built-in probe would fall back to
    # echoing the input and permissive assertions (e.g. /.+/) would pass.
    # The case is returned as pass = $false with an explanatory error and no
    # assertions are run.
    if ($fixturePath -and -not (Test-Path $fixturePath)) {
        $result.error = "fixture missing: $fixturePath"
        return [pscustomobject]$result
    }

    # Object inputs are serialized to compact JSON before being handed to a probe.
    $inputStr = if ($Case.input -is [string]) { $Case.input } else { ($Case.input | ConvertTo-Json -Compress -Depth 8) }

    $sw = [System.Diagnostics.Stopwatch]::StartNew()
    try {
        if (Test-Path $probePs1) {
            # Per-skill PowerShell probe; stderr is merged into the captured output.
            $output = & pwsh -NoLogo -NoProfile -File $probePs1 -Input $inputStr -Fixture $fixturePath -CaseId $Case.id 2>&1
            $result.stdout = ($output | Out-String).Trim()
        } elseif (Test-Path $probeMjs) {
            # Per-skill Node probe.
            $output = & node $probeMjs --input $inputStr --fixture ($fixturePath ?? '') --case-id $Case.id 2>&1
            $result.stdout = ($output | Out-String).Trim()
        } else {
            # Built-in fixture-echo probe: emit the file named by
            # args.read_fixture (relative to the fixture, else to the root).
            $readPath = $null
            if ($Case.args -and $Case.args.read_fixture) {
                if ($fixturePath) { $readPath = Join-Path $fixturePath $Case.args.read_fixture }
                else { $readPath = Join-Path $Root $Case.args.read_fixture }
            }
            if ($readPath -and (Test-Path $readPath)) {
                $result.stdout = Get-Content -Raw -Path $readPath
            } else {
                # Default: echo the input as the "answer".
                $result.stdout = $inputStr
            }
        }
    } catch {
        $result.error = $_.Exception.Message
    } finally {
        $sw.Stop()
        $result.duration_ms = [int]$sw.Elapsed.TotalMilliseconds
    }

    # Rough token estimate: 4 chars/token.
    $result.tokens_in = [int][math]::Ceiling($inputStr.Length / 4.0)
    $result.tokens_out = [int][math]::Ceiling($result.stdout.Length / 4.0)

    # Run assertions; the case passes only if all of them pass and no error was recorded.
    $allPass = $true
    foreach ($a in $Case.expected_assertions) {
        $ares = Invoke-Assertion -Assertion $a -CaseResult $result -FixturePath $fixturePath -Root $Root
        $result.assertions += $ares
        if (-not $ares.pass) { $allPass = $false }
    }
    if ($result.error) { $allPass = $false }
    $result.pass = $allPass
    return [pscustomobject]$result
}
|
|
200
|
+
|
|
201
|
+
# ---------- Assertion implementations ----------
|
|
202
|
+
|
|
203
|
+
# Turns an assertion's 'path' into a concrete filesystem path.
#   -P           : the path as written in the assertion.
#   -FixturePath : the case's resolved fixture directory (may be empty).
#   -Root        : repo root, used as the fallback base.
# Resolution order: a rooted path is returned untouched; otherwise the path is
# taken relative to the fixture when that location exists; otherwise it is
# taken relative to the root (whether or not it exists there).
function Resolve-AssertPath {
    param([string]$P, [string]$FixturePath, [string]$Root)

    if ([System.IO.Path]::IsPathRooted($P)) {
        return $P
    }

    if ($FixturePath) {
        $underFixture = Join-Path $FixturePath $P
        if (Test-Path $underFixture) {
            return Join-Path $FixturePath $P
        }
    }

    return (Join-Path $Root $P)
}
|
|
209
|
+
|
|
210
|
+
# Evaluates one assertion and returns { type, pass, reason } as a PSCustomObject.
#   -Assertion   : one entry of a case's expected_assertions.
#   -CaseResult  : the case's result record; regex-match reads its stdout/stderr.
#   -FixturePath : the case's resolved fixture directory (may be empty).
#   -Root        : repo root, the fallback base for relative paths.
# Supported types:
#   file-exists       - the resolved 'path' exists.
#   file-contains     - the file at 'path' contains the literal 'needle'.
#   regex-match       - 'pattern' matches stdout + newline + stderr; 'flags' may
#                       contain i (ignore case) and/or m (multiline).
#   json-path-equals  - walks dotted 'json_path' through the JSON file at 'path'
#                       and compares the value, as a string, with 'equals'.
#   llm-rubric        - always fails here; not evaluable by the script runner.
# Any exception raised while evaluating is reported as a failed assertion.
function Invoke-Assertion {
    param($Assertion, $CaseResult, [string]$FixturePath, [string]$Root)
    $ar = [ordered]@{ type = $Assertion.type; pass = $false; reason = '' }
    try {
        # Path-based assertions need a 'path'. Without this guard an absent
        # path resolves to the fixture directory itself, so file-exists would
        # pass without checking anything.
        $needsPath = $Assertion.type -in 'file-exists', 'file-contains', 'json-path-equals'
        if ($needsPath -and [string]::IsNullOrWhiteSpace([string]$Assertion.path)) {
            $ar.reason = "assertion '$($Assertion.type)' requires 'path'"
            return [pscustomobject]$ar
        }

        switch ($Assertion.type) {
            'file-exists' {
                $p = Resolve-AssertPath -P $Assertion.path -FixturePath $FixturePath -Root $Root
                $ar.pass = Test-Path $p
                if (-not $ar.pass) { $ar.reason = "missing: $p" }
            }
            'file-contains' {
                $p = Resolve-AssertPath -P $Assertion.path -FixturePath $FixturePath -Root $Root
                if (-not (Test-Path $p)) { $ar.reason = "missing: $p" }
                else {
                    $txt = Get-Content -Raw -Path $p
                    # Empty files read as $null/empty and therefore never match.
                    $ar.pass = $txt -and $txt.Contains([string]$Assertion.needle)
                    if (-not $ar.pass) { $ar.reason = "needle not found in $p" }
                }
            }
            'regex-match' {
                $flags = if ($Assertion.flags) { $Assertion.flags } else { '' }
                $opts = [System.Text.RegularExpressions.RegexOptions]::None
                if ($flags -match 'i') { $opts = $opts -bor [System.Text.RegularExpressions.RegexOptions]::IgnoreCase }
                if ($flags -match 'm') { $opts = $opts -bor [System.Text.RegularExpressions.RegexOptions]::Multiline }
                $rx = [System.Text.RegularExpressions.Regex]::new($Assertion.pattern, $opts)
                # The haystack is stdout and stderr joined by a newline.
                $hay = "$($CaseResult.stdout)`n$($CaseResult.stderr)"
                $ar.pass = $rx.IsMatch($hay)
                if (-not $ar.pass) { $ar.reason = "no match for /$($Assertion.pattern)/" }
            }
            'json-path-equals' {
                $p = Resolve-AssertPath -P $Assertion.path -FixturePath $FixturePath -Root $Root
                if (-not (Test-Path $p)) { $ar.reason = "missing: $p" }
                else {
                    $j = Get-Content -Raw -Path $p | ConvertFrom-Json
                    $val = $j
                    # Strip an optional leading "$." and walk the dotted segments.
                    foreach ($seg in ($Assertion.json_path -replace '^\$\.?', '' -split '\.' | Where-Object { $_ })) {
                        if ($null -eq $val) { break }
                        $val = $val.$seg
                    }
                    # Arrays are compared as a comma-joined string.
                    if ($val -is [array]) { $val = ($val -join ',') }
                    $ar.pass = ("$val" -eq "$($Assertion.equals)")
                    if (-not $ar.pass) { $ar.reason = "expected '$($Assertion.equals)', got '$val'" }
                }
            }
            'llm-rubric' {
                $ar.pass = $false
                $ar.reason = 'llm-rubric not evaluable in script runner'
            }
            default {
                $ar.reason = "unknown assertion type: $($Assertion.type)"
            }
        }
    } catch {
        $ar.reason = "assertion error: $($_.Exception.Message)"
    }
    return [pscustomobject]$ar
}
|
|
267
|
+
|
|
268
|
+
# ---------- Main ----------
|
|
269
|
+
|
|
270
|
+
# Mode validation: exactly one of -Skill, or -All / -Canary. Usage errors exit 2.
if ($Skill -and ($All -or $Canary)) {
    Write-Err "-Skill cannot be combined with -All / -Canary"; exit 2
}
if (-not $Skill -and -not $All -and -not $Canary) {
    Write-Err "Pick a mode: -Skill <name> | -All | -Canary"; exit 2
}

$evalFiles = Get-EvalFiles -Root $Root -Skill $Skill

# Default output: a UTC-timestamped file under Evidence/_evals.
if (-not $Output) {
    $stamp = (Get-Date).ToUniversalTime().ToString('yyyyMMdd-HHmmss')
    $Output = Join-Path $Root "Evidence/_evals/$stamp.json"
}
New-Item -ItemType Directory -Force -Path (Split-Path $Output -Parent) | Out-Null

$runRecord = [ordered]@{
    schema       = 'kushi.evals.run/v1'
    generated_at = (Get-Date).ToUniversalTime().ToString('o')
    mode         = if ($Skill) { "skill:$Skill" } elseif ($Canary) { 'canary' } else { 'all' }
    root         = $Root
    skills       = @()
}
$schemaProblems = New-Object System.Collections.Generic.List[string]
$anyCaseFailed = $false

foreach ($f in $evalFiles) {
    $obj = Get-Content -Raw $f | ConvertFrom-Json
    $problems = Test-EvalsShape -Obj $obj -Path $f
    foreach ($p in $problems) { $schemaProblems.Add($p) }
    # Malformed files are reported after the loop; their cases are not run.
    if ($problems.Count -gt 0) { continue }

    $cases = $obj.cases
    if ($Canary) { $cases = $cases | Where-Object { $_.canary } }
    $skillEntry = [ordered]@{
        skill         = $obj.skill
        evals_file    = (Resolve-Path -Relative $f)
        cases         = @()
        pass_count    = 0
        fail_count    = 0
        skipped_count = 0
    }

    foreach ($c in $cases) {
        Write-Info "[$($obj.skill)] $($c.id) ..."
        $r = Invoke-Case -Root $Root -SkillName $obj.skill -Case $c
        $skillEntry.cases += $r
        # Skips are signalled through an error string that starts with "skipped".
        if ($r.error -and $r.error.StartsWith('skipped')) { $skillEntry.skipped_count++ }
        elseif ($r.pass) { $skillEntry.pass_count++ }
        else { $skillEntry.fail_count++; $anyCaseFailed = $true }
    }
    $runRecord.skills += $skillEntry
}

# Any shape problem aborts the run with exit 2 before the run JSON is written.
if ($schemaProblems.Count -gt 0) {
    Write-Err "Schema problems:"
    foreach ($p in $schemaProblems) { Write-Err "  - $p" }
    exit 2
}

$runRecord | ConvertTo-Json -Depth 10 | Set-Content -LiteralPath $Output -Encoding UTF8
Write-Info "Run written: $Output"

# Aggregator + baseline compare
$aggregator = Join-Path $Root 'src/eval-aggregator.mjs'
$baselineFile = Join-Path $Root 'evals/baseline.json'
$benchFile = Join-Path $Root 'Evidence/_evals/benchmark.json'

if (Test-Path $aggregator) {
    $aggArgs = @($aggregator, '--run', $Output, '--bench', $benchFile)
    # Honor the -Baseline switch (default ON): the baseline is handed to the
    # aggregator only when comparison is enabled and the file exists, or when
    # the baseline is being rewritten. Previously -Baseline was never read, so
    # -Baseline:$false had no effect.
    $compareBaseline = $Baseline -and (Test-Path $baselineFile -ErrorAction SilentlyContinue)
    if ($compareBaseline -or $UpdateBaseline) {
        $aggArgs += @('--baseline', $baselineFile)
    }
    if ($UpdateBaseline) { $aggArgs += '--update-baseline' }
    & node @aggArgs
    $aggExit = $LASTEXITCODE
} else {
    Write-Warn 'src/eval-aggregator.mjs not found — skipping aggregation'
    $aggExit = 0
}

# Print a one-screen summary
$total = 0; $pass = 0; $fail = 0; $skip = 0
foreach ($s in $runRecord.skills) {
    $total += $s.pass_count + $s.fail_count + $s.skipped_count
    $pass += $s.pass_count
    $fail += $s.fail_count
    $skip += $s.skipped_count
}
Write-Host ""
Write-Host "### Kushi evals — $($runRecord.mode)"
Write-Host "| Skill | Pass | Fail | Skip |"
Write-Host "|-------|-----:|-----:|-----:|"
foreach ($s in $runRecord.skills) {
    Write-Host ("| {0} | {1} | {2} | {3} |" -f $s.skill, $s.pass_count, $s.fail_count, $s.skipped_count)
}
Write-Host ""
Write-Host "Total: $total · Pass: $pass · Fail: $fail · Skip: $skip"
Write-Host "Run JSON: $Output"
if (Test-Path $benchFile) { Write-Host "Benchmark: $benchFile" }

# Exit status: -StrictExit turns any failed case into exit 1; otherwise the
# aggregator's exit code decides, and a clean run exits 0.
if ($StrictExit -and $anyCaseFailed) { exit 1 }
if ($aggExit -ne 0) { exit $aggExit }
exit 0
|