kushi-agents 6.1.2 → 6.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/plugin/instructions/agentskills-compliance.instructions.md +144 -0
- package/plugin/instructions/dashboard-artifact.instructions.md +132 -0
- package/plugin/instructions/guided-tour.instructions.md +100 -0
- package/plugin/instructions/karpathy-state-layout.instructions.md +124 -0
- package/plugin/instructions/schema-evolve.instructions.md +73 -0
- package/plugin/instructions/skill-authoring.instructions.md +147 -0
- package/plugin/instructions/skill-evals.instructions.md +130 -0
- package/plugin/runners/bootstrap.mjs +55 -22
- package/plugin/runners/lib/runlog.mjs +153 -6
- package/plugin/runners/migrate-to-v550.mjs +192 -0
- package/plugin/runners/pull-email.mjs +194 -3
- package/plugin/runners/pull-meetings.mjs +220 -4
- package/plugin/runners/pull-onenote.mjs +253 -3
- package/plugin/runners/pull-sharepoint.mjs +284 -3
- package/plugin/runners/pull-state.mjs +297 -0
- package/plugin/runners/pull-teams.mjs +183 -3
- package/plugin/runners/refresh.mjs +9 -1
- package/plugin/runners/test/fixtures/email-abn-amro.json +13 -0
- package/plugin/runners/test/fixtures/email-novel-error.json +9 -0
- package/plugin/runners/test/fixtures/meetings-abn-amro.json +10 -0
- package/plugin/runners/test/fixtures/meetings-body-unavailable.json +10 -0
- package/plugin/runners/test/fixtures/onenote-abn-amro.json +30 -0
- package/plugin/runners/test/fixtures/onenote-partial.json +21 -0
- package/plugin/runners/test/fixtures/refresh-dir/email.json +7 -4
- package/plugin/runners/test/fixtures/refresh-dir/teams.json +6 -4
- package/plugin/runners/test/fixtures/sharepoint-abn-amro.json +12 -0
- package/plugin/runners/test/fixtures/teams-abn-amro.json +11 -0
- package/plugin/runners/test/integration/migrate-to-v550.integration.test.mjs +138 -0
- package/plugin/runners/test/integration/pull-email.integration.test.mjs +149 -0
- package/plugin/runners/test/integration/pull-meetings.integration.test.mjs +92 -0
- package/plugin/runners/test/integration/pull-onenote.integration.test.mjs +86 -0
- package/plugin/runners/test/integration/pull-sharepoint.integration.test.mjs +93 -0
- package/plugin/runners/test/integration/pull-teams.integration.test.mjs +91 -0
- package/plugin/runners/test/unit/runlog.test.mjs +1 -1
- package/plugin/skills/build-state/SKILL.md +195 -0
- package/plugin/skills/build-state/evals/evals.json +31 -0
- package/plugin/skills/dashboard/SKILL.md +132 -0
- package/plugin/skills/dashboard/evals/evals.json +33 -0
- package/plugin/skills/lint-state/.created-by-skill-creator +0 -0
- package/plugin/skills/lint-state/SKILL.md +98 -0
- package/plugin/skills/lint-state/evals/evals.json +34 -0
- package/plugin/skills/lint-state/lint.ps1 +218 -0
- package/plugin/skills/promote/.created-by-skill-creator +1 -0
- package/plugin/skills/promote/SKILL.md +125 -0
- package/plugin/skills/promote/evals/evals.json +35 -0
- package/plugin/skills/schema-evolve/.created-by-skill-creator +0 -0
- package/plugin/skills/schema-evolve/SKILL.md +106 -0
- package/plugin/skills/schema-evolve/evals/evals.json +37 -0
- package/plugin/skills/skill-checker/SKILL.md +136 -0
- package/plugin/skills/skill-checker/check-skill.ps1 +416 -0
- package/plugin/skills/skill-checker/evals/evals.json +41 -0
- package/plugin/skills/skill-creator/SKILL.md +134 -0
- package/plugin/skills/skill-creator/evals/evals.json +40 -0
- package/plugin/skills/skill-creator/generate-eval-review.ps1 +101 -0
- package/plugin/skills/skill-creator/optimize-description.ps1 +87 -0
- package/plugin/skills/skill-creator/scaffold.ps1 +180 -0
- package/plugin/skills/skill-creator/templates/evals-starter.template.json +27 -0
- package/plugin/skills/skill-creator/templates/gotchas-stub.template.md +9 -0
- package/plugin/skills/skill-creator/templates/skill-skeleton.template.md +28 -0
- package/plugin/skills/teach/.created-by-skill-creator +0 -0
- package/plugin/skills/teach/SKILL.md +79 -0
- package/plugin/skills/teach/evals/evals.json +59 -0
- package/plugin/skills/tour/SKILL.md +85 -0
- package/plugin/skills/tour/build-tour.ps1 +185 -0
- package/plugin/skills/tour/evals/evals.json +33 -0
- package/plugin/templates/state/00_overview.template.md +44 -0
- package/plugin/templates/state/01_decisions.template.md +41 -0
- package/plugin/templates/state/02_stakeholders.template.md +48 -0
- package/plugin/templates/state/03_architecture-and-solution.template.md +56 -0
- package/plugin/templates/state/04_workshops-and-key-meetings.template.md +43 -0
- package/plugin/templates/state/05_action-items.template.md +29 -0
- package/plugin/templates/state/06_risks-and-issues.template.md +43 -0
- package/plugin/templates/state/07_timeline-and-milestones.template.md +45 -0
- package/plugin/templates/state/08_artifacts-and-deliverables.template.md +55 -0
- package/plugin/templates/state/09_open-questions.template.md +62 -0
- package/plugin/templates/state/AGENTS.template.md +33 -0
- package/plugin/templates/state/CLAUDE.template.md +33 -0
- package/plugin/templates/state/README.md +41 -0
- package/plugin/templates/state/answers.README.md +7 -0
- package/plugin/templates/state/hot.template.md +12 -0
- package/plugin/templates/state/index.template.md +41 -0
- package/plugin/templates/state/log.template.md +14 -0
- package/plugin/templates/state/page.template.md +22 -0
- package/plugin/templates/state/review-queue.template.md +10 -0
- package/plugin/runners/test/integration/csc-pull.integration.test.mjs +0 -160
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: "skill-authoring"
|
|
3
|
+
description: "v5.0.4 — How to author a new kushi skill so it ships conformant to the agentskills.io blueprint on day one. Codifies the required SKILL.md sections, file layout, evals starter, naming, and the description-optimization rules. Read this before running `npx kushi-agents create-skill`. Enforced by self-check D34.creator-output-conforms + by `kushi check-skill --lint`."
|
|
4
|
+
applies_to: "every plugin/skills/<name>/ created from v5.0.4 onward; existing skills are audited via the dogfood gate"
|
|
5
|
+
since: "kushi v5.0.4"
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# skill-authoring — doctrine
|
|
9
|
+
|
|
10
|
+
> Inspired by **Anthropic's [skill-creator](https://github.com/anthropics/skills/blob/main/skills/skill-creator/SKILL.md)**. Adapted to kushi's PowerShell-first stack, our 2-host install matrix, and the reality that the first 30 kushi skills were authored before any harness existed — hence the **retrofit** path in `skill-checker`.
|
|
11
|
+
|
|
12
|
+
## Why this exists
|
|
13
|
+
|
|
14
|
+
A SKILL.md is the prompt that loads into the agent the moment its trigger fires. Drift between intent and spec is silent until evals catch it (and only if there are evals). This doctrine + the `skill-creator` + `skill-checker` skills make conformant authoring the default, not an afterthought.
|
|
15
|
+
|
|
16
|
+
## Required files per skill
|
|
17
|
+
|
|
18
|
+
```text
|
|
19
|
+
plugin/skills/<name>/
|
|
20
|
+
├── SKILL.md ← REQUIRED — agent-loaded prompt; ≤500 lines, ≤5000 tokens
|
|
21
|
+
├── evals/
|
|
22
|
+
│ └── evals.json ← REQUIRED — ≥2 cases, each with ≥1 assertion
|
|
23
|
+
├── references/ ← OPTIONAL — load-on-trigger bulk content (>500 lines splits here)
|
|
24
|
+
└── .created-by-skill-creator ← OPTIONAL marker — set by `scaffold.ps1`; opts into the strict D34 gate
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
A skill MAY also ship a runner (`run.ps1`, `run.sh`, `*.mjs`) when it's a tool-skill (`self-check`, `eval`, `skill-checker`, etc.). Pure prompt-skills don't need one.
|
|
28
|
+
|
|
29
|
+
## Required SKILL.md sections
|
|
30
|
+
|
|
31
|
+
Every SKILL.md MUST have:
|
|
32
|
+
|
|
33
|
+
1. **YAML frontmatter** — `name` (kebab-case, matches dir) + `description` (lead with `USE WHEN`).
|
|
34
|
+
2. **`# Skill: <name>`** H1.
|
|
35
|
+
3. **One-paragraph purpose** immediately after the H1.
|
|
36
|
+
4. **At least one** of these procedure-shape sections, picked by skill **type**:
|
|
37
|
+
- `## Gotchas` — REQUIRED for `pull-*` and discovery skills (top-5 failure modes).
|
|
38
|
+
- `## Step checklist` — REQUIRED for orchestrators (`bootstrap-project`, `refresh-project`, `build-state`, `link-entities`, `dashboard`, `tour`, etc.); use GitHub `- [ ]` checkboxes.
|
|
39
|
+
- `## Validation loop` — REQUIRED for writer skills (anything that writes to `Evidence/`, `State/`, `_graph/`, `dashboards/`, `tours/`).
|
|
40
|
+
- `## Steps` or `## Procedure` — acceptable for other skills, plus one of the three above where it applies.
|
|
41
|
+
|
|
42
|
+
Skills can have more than one (e.g. `eval` ships all three). The checker is satisfied by **at least one** of `Gotchas` / `Step checklist` / `Validation loop` plus type-specific rules.
|
|
43
|
+
|
|
44
|
+
## Description optimization (per agentskills.io)
|
|
45
|
+
|
|
46
|
+
The `description:` is the sole trigger signal. Optimize it:
|
|
47
|
+
|
|
48
|
+
```yaml
|
|
49
|
+
description: "USE WHEN <situational trigger> AND <precondition>. DO NOT USE for <near-miss>. <one-line capability summary>."
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Rules (enforced by `D30.description-optimized` + `skill-checker --optimize-description`):
|
|
53
|
+
|
|
54
|
+
- Lead with `USE WHEN` — the phrase must appear within the first 160 characters of the description.
|
|
55
|
+
- Include a `DO NOT USE` clause for the most likely near-miss invocation.
|
|
56
|
+
- Be specific about the trigger (concrete user phrases or file/state conditions).
|
|
57
|
+
- No marketing fluff ("powerful", "comprehensive", "blazing").
|
|
58
|
+
- ≤1024 characters total.
|
|
59
|
+
|
|
60
|
+
| Bad | Good |
|
|
61
|
+
|---|---|
|
|
62
|
+
| `"Pulls OneNote pages."` | `"USE WHEN refreshing project evidence for a known kushi project AND boundaries.onenote.section_ids is non-empty. DO NOT USE for global OneNote search."` |
|
|
63
|
+
| `"Comprehensive eval framework."` | `"USE WHEN the user says 'run evals', 'eval canary', or before tagging a release. DO NOT USE for evidence validation of a real project."` |
|
|
64
|
+
|
|
65
|
+
## Size caps
|
|
66
|
+
|
|
67
|
+
- ≤ 500 lines (`D30.skill-size`)
|
|
68
|
+
- ≤ 5000 tokens (~20 KB)
|
|
69
|
+
|
|
70
|
+
When you exceed either, **split into `references/<topic>.md` files** and cite them with explicit triggers:
|
|
71
|
+
|
|
72
|
+
```markdown
|
|
73
|
+
Load `references/canonical-prompts.md` when constructing the WorkIQ query.
|
|
74
|
+
Load `references/error-modes.md` if the API returns non-200.
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Passive links (`[see foo](references/foo.md)`) do NOT count as explicit triggers — state when each file should be loaded. In addition, if `references/` exists, the checker requires the literal substring `references/<file>.md` somewhere in SKILL.md.
|
|
78
|
+
|
|
79
|
+
## Evals (≥2 cases)
|
|
80
|
+
|
|
81
|
+
Every skill MUST ship `evals/evals.json` validated against `plugin/skills/eval/evals.schema.json`. Per `skill-evals.instructions.md`:
|
|
82
|
+
|
|
83
|
+
- Each case carries `id`, `name`, `input`, `expected_assertions[]` (≥1 entry), and `grader_type` (`script` | `llm`).
|
|
84
|
+
- ≥2 cases. Mark canary-worthy ones `"canary": true`.
|
|
85
|
+
- Synthetic fixtures only — never real customer data.
|
|
86
|
+
|
|
87
|
+
The `create-skill` scaffold emits a starter `evals.json` with one `file-exists` and one `regex-match` case so the skill ships green from minute one.
|
|
88
|
+
|
|
89
|
+
## Naming conventions
|
|
90
|
+
|
|
91
|
+
- **Skill directory + frontmatter `name`** — kebab-case, verb-led (`pull-onenote`, `consolidate-evidence`, `apply-ado-update`).
|
|
92
|
+
- **Skill types** (informational; pick at create time so the scaffold picks the right sections):
|
|
93
|
+
- `pull` — fetches evidence from a source.
|
|
94
|
+
- `writer` — writes files to `Evidence/`, `State/`, `_graph/`, `dashboards/`, or `tours/`.
|
|
95
|
+
- `orchestrator` — coordinates other skills.
|
|
96
|
+
- `other` — utility / tool / meta.
|
|
97
|
+
- **Instruction files** — `<topic>.instructions.md` in `plugin/instructions/`. Frontmatter `name:` matches the filename minus `.instructions.md`.
|
|
98
|
+
|
|
99
|
+
## Contributor workflow
|
|
100
|
+
|
|
101
|
+
```powershell
|
|
102
|
+
# 1. Scaffold
|
|
103
|
+
npx kushi-agents create-skill my-new-skill
|
|
104
|
+
# → answers: type (pull|writer|orchestrator|other), one-liner description
|
|
105
|
+
# → emits plugin/skills/my-new-skill/{SKILL.md, evals/evals.json}
|
|
106
|
+
# → marker file .created-by-skill-creator is written
|
|
107
|
+
|
|
108
|
+
# 2. Fill in the placeholders (search for "TODO(skill-creator)")
|
|
109
|
+
|
|
110
|
+
# 3. Validate
|
|
111
|
+
npx kushi-agents check-skill my-new-skill # lint mode
|
|
112
|
+
npm run eval -- my-new-skill # run evals
|
|
113
|
+
|
|
114
|
+
# 4. Optimize description before PR
|
|
115
|
+
npx kushi-agents optimize-description my-new-skill
|
|
116
|
+
# → emits a rewritten description; you decide whether to apply
|
|
117
|
+
|
|
118
|
+
# 5. Self-check + commit
|
|
119
|
+
pwsh plugin/skills/self-check/run.ps1 -Deep
|
|
120
|
+
git add plugin/skills/my-new-skill/
|
|
121
|
+
git commit -m "v<x.y.z>: my-new-skill"
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Retrofit (existing skills predating the harness)
|
|
125
|
+
|
|
126
|
+
Run `npx kushi-agents check-skill --all --retrofit` to identify gaps in legacy skills. `--apply` adds missing section stubs with `<!-- TODO(retrofit): fill in -->` markers — never overwrites existing content. The v5.0.4 dogfood report at `docs/audits/v5.0.4-skill-creator-dogfood.md` records the baseline.
|
|
127
|
+
|
|
128
|
+
## Enforcement
|
|
129
|
+
|
|
130
|
+
| Check | What it does |
|
|
131
|
+
|---|---|
|
|
132
|
+
| `D34.skill-creator-exists` | `plugin/skills/skill-creator/scaffold.ps1` is parseable. |
|
|
133
|
+
| `D34.skill-checker-exists` | `plugin/skills/skill-checker/check-skill.ps1` is parseable. |
|
|
134
|
+
| `D34.creator-output-conforms` | Every skill carrying `.created-by-skill-creator` passes `check-skill --lint`. |
|
|
135
|
+
| `D34.retrofit-clean` | `check-skill --all --retrofit --dry-run` shows no unresolved non-additive gaps. |
|
|
136
|
+
| `D34.dogfood-report-fresh` | `docs/audits/v5.0.4-skill-creator-dogfood.md` was touched within 14 days (warn-only). |
|
|
137
|
+
|
|
138
|
+
## References
|
|
139
|
+
|
|
140
|
+
- `plugin/instructions/agentskills-compliance.instructions.md` — the spec rules this builds on.
|
|
141
|
+
- `plugin/instructions/skill-evals.instructions.md` — the evals doctrine.
|
|
142
|
+
- `plugin/skills/skill-creator/SKILL.md` — the scaffolder.
|
|
143
|
+
- `plugin/skills/skill-checker/SKILL.md` — the linter / retrofitter.
|
|
144
|
+
- `docs/contributing/skill-authoring.md` — the human walkthrough.
|
|
145
|
+
- <https://github.com/anthropics/skills/blob/main/skills/skill-creator/SKILL.md> — upstream inspiration.
|
|
146
|
+
- <https://agentskills.io/skill-creation/best-practices>
|
|
147
|
+
- <https://agentskills.io/skill-creation/optimizing-descriptions>
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
---
|
|
2
|
+
description: "v5.0.3 — Skill evals doctrine, adapted from https://agentskills.io/skill-creation/evaluating-skills. Every skill MUST ship an evals/ folder with at least 2 deterministic cases plus structured assertions; a per-skill pass-rate is the objective regression signal. Canary subset runs on every PR; full suite runs on demand. Real customer data is FORBIDDEN in fixtures — use synthetic data only."
|
|
3
|
+
---
|
|
4
|
+
|
|
5
|
+
# Skill evals — doctrine
|
|
6
|
+
|
|
7
|
+
> Inspired by **<https://agentskills.io/skill-creation/evaluating-skills>**. Adapted to kushi's PowerShell + Node test stack and to our 2-host install matrix.
|
|
8
|
+
|
|
9
|
+
## Why
|
|
10
|
+
|
|
11
|
+
Skills are prompts plus a runner. Prompts drift silently. Without an objective per-skill regression signal, every change is a gamble. Evals make that signal cheap:
|
|
12
|
+
|
|
13
|
+
- **Per-skill pass-rate** is the headline metric.
|
|
14
|
+
- **Latency** and **tokens** are secondary metrics. Against the baseline, a ≥ 50 % increase in either — or a ≥ 10 pp drop in pass-rate — is flagged as a regression.
|
|
15
|
+
- A **canary subset** runs on every PR (target: < 60s wall clock); the **full suite** runs on demand (`npm run eval:all`).
|
|
16
|
+
|
|
17
|
+
## Where evals live
|
|
18
|
+
|
|
19
|
+
```text
|
|
20
|
+
plugin/skills/<name>/
|
|
21
|
+
├── SKILL.md
|
|
22
|
+
└── evals/
|
|
23
|
+
├── evals.json ← REQUIRED — case list + assertions
|
|
24
|
+
└── fixtures/ ← OPTIONAL per-skill fixtures
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
Cross-skill fixtures live at the repo root:
|
|
28
|
+
|
|
29
|
+
```text
|
|
30
|
+
evals/
|
|
31
|
+
├── baseline.json ← Committed; maintainer updates with `npm run eval:baseline`
|
|
32
|
+
└── fixtures/ ← Tiny synthetic evidence trees, ADO fixtures, etc.
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Per-run output goes to `Evidence/_evals/<timestamp>.json` (gitignored; not customer data).
|
|
36
|
+
|
|
37
|
+
## Case schema
|
|
38
|
+
|
|
39
|
+
```jsonc
|
|
40
|
+
{
|
|
41
|
+
"skill": "<skill-name>",
|
|
42
|
+
"cases": [
|
|
43
|
+
{
|
|
44
|
+
"id": "ap-citations-format",
|
|
45
|
+
"name": "ask-project emits weekly-csc citation form",
|
|
46
|
+
"input": "what was decided about MACC for fixture-acme?",
|
|
47
|
+
"fixture": "evals/fixtures/fixture-acme", // optional
|
|
48
|
+
"canary": true,
|
|
49
|
+
"grader_type": "script", // "script" | "llm"
|
|
50
|
+
"expected_assertions": [
|
|
51
|
+
{ "type": "regex-match", "pattern": "\\[source:\\s*fixture-acme/email/weekly/" },
|
|
52
|
+
{ "type": "regex-match", "pattern": "Source-layout:\\s*weekly-csc" }
|
|
53
|
+
]
|
|
54
|
+
}
|
|
55
|
+
]
|
|
56
|
+
}
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Required fields per case
|
|
60
|
+
|
|
61
|
+
- `id` — unique within the skill; kebab-case.
|
|
62
|
+
- `name` — human-readable.
|
|
63
|
+
- `input` — what gets passed to the skill (string OR object).
|
|
64
|
+
- `expected_assertions` — array, **≥ 1** entry (enforced by `D33.evals-have-assertions`).
|
|
65
|
+
- `grader_type` — `"script"` for deterministic graders, `"llm"` for rubric-based.
|
|
66
|
+
|
|
67
|
+
### Optional fields
|
|
68
|
+
|
|
69
|
+
- `fixture` — repo-relative path to the fixture to point the skill at.
|
|
70
|
+
- `canary` — `true` to include in the fast CI subset.
|
|
71
|
+
- `args` — extra args forwarded to the skill script (e.g. `{ "DryRun": true }`).
|
|
72
|
+
- `skip` — `true` to skip (must include `skip_reason`).
|
|
73
|
+
- `timeout_ms` — override the runner default (30 000 ms).
|
|
74
|
+
|
|
75
|
+
## Assertion types
|
|
76
|
+
|
|
77
|
+
| Type | Shape | Passes when |
|
|
78
|
+
|---|---|---|
|
|
79
|
+
| `file-exists` | `{ "type": "file-exists", "path": "..." }` | Path exists post-run (relative to fixture or evidence dir). |
|
|
80
|
+
| `file-contains` | `{ "type": "file-contains", "path": "...", "needle": "..." }` | File exists and substring is present. |
|
|
81
|
+
| `json-path-equals` | `{ "type": "json-path-equals", "path": "...", "json_path": "$.foo.bar", "equals": "v" }` | JSON file parses; dotted path value === expected. |
|
|
82
|
+
| `regex-match` | `{ "type": "regex-match", "pattern": "...", "flags": "i" }` | Captured stdout matches the regex. |
|
|
83
|
+
| `llm-rubric` | `{ "type": "llm-rubric", "rubric": "...", "min_score": 4 }` | LLM grader scores ≥ min on a 1–5 rubric. |
|
|
84
|
+
|
|
85
|
+
## Run modes
|
|
86
|
+
|
|
87
|
+
The runner (`plugin/skills/eval/run-evals.ps1`) supports three dispatch modes:
|
|
88
|
+
|
|
89
|
+
1. **Direct invocation** (default for `script` graders). Runs the skill's executable artifact (`run.ps1`, `*.mjs`, or a small probe stub) with the given input and fixture. Purely deterministic.
|
|
90
|
+
2. **Sub-agent dispatch** (optional, gated by `-Live`). Forwards the case to a sub-agent. Used only for `llm-rubric` cases. Skipped in canary mode.
|
|
91
|
+
3. **Recorded fixture replay** (for `pull-*` skills). Reads a recorded `--cached` output of a real pull and asserts against that, so no live M365 calls are needed.
|
|
92
|
+
|
|
93
|
+
For each case the runner records: `pass`, `duration_ms`, `tokens_in`, `tokens_out`, `stdout`, `stderr`, per-assertion `pass`/`reason`. The aggregate is a JSON file under `Evidence/_evals/` plus a one-line `benchmark.json` summary.
|
|
94
|
+
|
|
95
|
+
## Canary set
|
|
96
|
+
|
|
97
|
+
Marked with `"canary": true`. Kept tiny so PRs stay fast.
|
|
98
|
+
|
|
99
|
+
Default canary set (v5.0.3):
|
|
100
|
+
|
|
101
|
+
- `ask-project`
|
|
102
|
+
- `bootstrap-project`
|
|
103
|
+
- `refresh-project`
|
|
104
|
+
- `link-entities`
|
|
105
|
+
- `build-state`
|
|
106
|
+
- `self-check`
|
|
107
|
+
|
|
108
|
+
## Baseline + regression detection
|
|
109
|
+
|
|
110
|
+
- `evals/baseline.json` is **committed**.
|
|
111
|
+
- Each per-skill record carries the last green `pass_rate`, `mean_duration_ms`, and `mean_tokens_total`.
|
|
112
|
+
- `src/eval-aggregator.mjs` flags **regressions**:
|
|
113
|
+
- `pass_rate` drop ≥ 10 percentage points
|
|
114
|
+
- `mean_duration_ms` increase ≥ 50 %
|
|
115
|
+
- `mean_tokens_total` increase ≥ 50 %
|
|
116
|
+
- Maintainers refresh the baseline with `npm run eval:baseline` after deliberate behavior changes.
|
|
117
|
+
|
|
118
|
+
## Privacy + safety
|
|
119
|
+
|
|
120
|
+
- **No real customer data** in any fixture. Use `fixture-acme`-style synthetic names.
|
|
121
|
+
- `Evidence/_evals/` is in `.gitignore`.
|
|
122
|
+
- `pull-*` evals NEVER hit live M365 endpoints in canary mode. Use recorded `--cached` payloads or `--dry-run`.
|
|
123
|
+
- Tenant IDs / GUIDs in fixtures must be obviously fake (e.g. `00000000-...`).
|
|
124
|
+
|
|
125
|
+
## References
|
|
126
|
+
|
|
127
|
+
- [agentskills.io — evaluating skills](https://agentskills.io/skill-creation/evaluating-skills) (source of truth)
|
|
128
|
+
- `plugin/skills/eval/SKILL.md` (the runner skill)
|
|
129
|
+
- `plugin/skills/eval/evals.schema.json` (JSON schema; self-check D33.evals-schema)
|
|
130
|
+
- `plugin/instructions/agentskills-compliance.instructions.md` (sibling doctrine — size + section caps)
|
|
@@ -26,7 +26,7 @@ import {
|
|
|
26
26
|
aliasRoot, projectSharedFile, userFile, USER_FILES,
|
|
27
27
|
} from './lib/layout.mjs';
|
|
28
28
|
import { writeAtomic, pathExists } from './lib/evidence.mjs';
|
|
29
|
-
import { writeRefreshReport, appendRunLog } from './lib/runlog.mjs';
|
|
29
|
+
import { writeRefreshReport, writeBootstrapStatus, appendRunLog } from './lib/runlog.mjs';
|
|
30
30
|
|
|
31
31
|
function parseArgs(argv) {
|
|
32
32
|
const args = { force: false, dryRun: false, lookbackDays: null, interactive: false };
|
|
@@ -329,11 +329,11 @@ async function main() {
|
|
|
329
329
|
}
|
|
330
330
|
|
|
331
331
|
const startedAt = new Date(); // capture when scaffold started (before report)
|
|
332
|
-
// v6.
|
|
333
|
-
//
|
|
334
|
-
//
|
|
335
|
-
// dateFloor + interactive outcomes. Diagnostics-only; never blocks.
|
|
332
|
+
// v6.2.0: write bootstrap report per run-reports.instructions.md (proper
|
|
333
|
+
// sectioned format) AND write bootstrap-status.md per
|
|
334
|
+
// bootstrap-status-format.instructions.md. Diagnostics-only; never blocks.
|
|
336
335
|
let reportPath = null;
|
|
336
|
+
let statusPath = null;
|
|
337
337
|
if (!args.dryRun) {
|
|
338
338
|
try {
|
|
339
339
|
const created = log.created.map(p => path.relative(root, p) || '.');
|
|
@@ -342,30 +342,37 @@ async function main() {
|
|
|
342
342
|
(dateFloorReport && dateFloorReport.fields?.length ? ` dateFloor=${dateFloorReport.dateFloor}` : '') +
|
|
343
343
|
(interactiveReport && interactiveReport.fields?.length ? ` interactive=${interactiveReport.fields.length}` : '');
|
|
344
344
|
const details = {
|
|
345
|
-
mode: 'bootstrap',
|
|
346
345
|
contributor: args.alias,
|
|
347
346
|
started: startedAt.toISOString(),
|
|
348
347
|
ended: new Date().toISOString(),
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
},
|
|
360
|
-
created,
|
|
361
|
-
existed,
|
|
362
|
-
date_floor: dateFloorReport || null,
|
|
363
|
-
interactive: interactiveReport || null,
|
|
348
|
+
window: 'scaffold-only (no HTTP, no pulls)',
|
|
349
|
+
profile: 'standard',
|
|
350
|
+
what_was_done_table: [
|
|
351
|
+
{ source: 'scaffold', action: 'create-or-skip', items: `${created.length} created / ${existed.length} existed`, outcome: 'completed', notes: 'project + Evidence trees + per-user dirs' },
|
|
352
|
+
...(dateFloorReport ? [{ source: 'm365-auth', action: 'stamp dateFloor', items: (dateFloorReport.fields || []).length, outcome: dateFloorReport.updated ? 'completed' : 'skipped', notes: dateFloorReport.reason || dateFloorReport.dateFloor || '' }] : []),
|
|
353
|
+
...(interactiveReport ? [{ source: 'interactive', action: 'prompt user', items: (interactiveReport.fields || []).length, outcome: 'completed', notes: 'fields stamped into m365-auth.json' }] : []),
|
|
354
|
+
],
|
|
355
|
+
resolutions: [],
|
|
356
|
+
cleanups: [],
|
|
357
|
+
learnings: [],
|
|
364
358
|
skips_and_gaps: [
|
|
365
359
|
'Bootstrap is scaffold-only — no source pulls performed.',
|
|
366
360
|
'Run `kushi discover <project>` next to populate boundaries.yml/integrations.yml.',
|
|
367
|
-
'Then `kushi refresh <project>` to capture per-source
|
|
361
|
+
'Then `kushi refresh <project>` to capture per-source weekly evidence.',
|
|
362
|
+
],
|
|
363
|
+
created,
|
|
364
|
+
existed,
|
|
365
|
+
next_steps: [
|
|
366
|
+
'Run `kushi discover <project>` to map boundaries (email folders, Teams chats, OneNote sections, SharePoint sites).',
|
|
367
|
+
'Run `kushi refresh <project>` to pull the current week of evidence from M365 + CRM + ADO.',
|
|
368
|
+
'Review `Evidence/<alias>/bootstrap-status.md` for the current durable state snapshot.',
|
|
368
369
|
],
|
|
370
|
+
date_floor: dateFloorReport || null,
|
|
371
|
+
interactive: interactiveReport || null,
|
|
372
|
+
counts: {
|
|
373
|
+
created: created.length,
|
|
374
|
+
existed: existed.length,
|
|
375
|
+
},
|
|
369
376
|
};
|
|
370
377
|
const r = await writeRefreshReport(args.project, args.alias, {
|
|
371
378
|
type: 'bootstrap',
|
|
@@ -373,6 +380,30 @@ async function main() {
|
|
|
373
380
|
details,
|
|
374
381
|
});
|
|
375
382
|
reportPath = r?.path || null;
|
|
383
|
+
|
|
384
|
+
// Bootstrap-status.md: durable per-user fast-orientation artifact.
|
|
385
|
+
try {
|
|
386
|
+
const s = await writeBootstrapStatus(args.project, args.alias, {
|
|
387
|
+
project: path.basename(root),
|
|
388
|
+
customer_hint: '',
|
|
389
|
+
mode: 'bootstrap',
|
|
390
|
+
lookback_days: args.lookbackDays || null,
|
|
391
|
+
preflight: [
|
|
392
|
+
{ check: 'integrations.yml present', status: existed.includes('integrations.yml') ? 'present' : 'created', notes: '' },
|
|
393
|
+
{ check: 'project-info.md present', status: existed.includes('project-info.md') ? 'present' : 'created', notes: '' },
|
|
394
|
+
{ check: 'Evidence/_shared scaffold', status: 'present', notes: SHARED_DIRS.join(', ') },
|
|
395
|
+
{ check: `Evidence/${args.alias}/ scaffold`, status: 'present', notes: USER_DIRS.join(', ') },
|
|
396
|
+
],
|
|
397
|
+
artifacts: [],
|
|
398
|
+
outcome: [
|
|
399
|
+
'Project scaffold complete. No pulls performed.',
|
|
400
|
+
'Run `kushi discover` next to map boundaries from M365 signals.',
|
|
401
|
+
],
|
|
402
|
+
access_limitations: [],
|
|
403
|
+
});
|
|
404
|
+
statusPath = s?.path || null;
|
|
405
|
+
} catch { /* status is diagnostics-only */ }
|
|
406
|
+
|
|
376
407
|
try {
|
|
377
408
|
await appendRunLog(args.project, {
|
|
378
409
|
mode: 'bootstrap',
|
|
@@ -380,6 +411,7 @@ async function main() {
|
|
|
380
411
|
status: 'ok',
|
|
381
412
|
summary: summaryLine,
|
|
382
413
|
report: reportPath ? path.relative(root, reportPath) : null,
|
|
414
|
+
status_md: statusPath ? path.relative(root, statusPath) : null,
|
|
383
415
|
});
|
|
384
416
|
} catch { /* run-log is diagnostics-only */ }
|
|
385
417
|
} catch { /* bootstrap-report is diagnostics-only, never block */ }
|
|
@@ -393,6 +425,7 @@ async function main() {
|
|
|
393
425
|
existed: log.existed.map(p => path.relative(root, p) || '.'),
|
|
394
426
|
dry_run: args.dryRun,
|
|
395
427
|
...(reportPath ? { report: path.relative(root, reportPath) } : {}),
|
|
428
|
+
...(statusPath ? { status_md: path.relative(root, statusPath) } : {}),
|
|
396
429
|
...(dateFloorReport ? { date_floor: dateFloorReport } : {}),
|
|
397
430
|
...(interactiveReport ? { interactive: interactiveReport } : {}),
|
|
398
431
|
});
|
|
@@ -46,16 +46,163 @@ export async function writeRefreshReport(project, alias, { type, summary, detail
|
|
|
46
46
|
}
|
|
47
47
|
|
|
48
48
|
function renderRefreshReport({ type, ts, summary, details }) {
|
|
49
|
+
const d = details || {};
|
|
50
|
+
const localTs = ts.toLocaleString('sv-SE').slice(0, 16); // "YYYY-MM-DD HH:mm"
|
|
49
51
|
const lines = [
|
|
50
|
-
`# ${type
|
|
52
|
+
`# ${type ? type.replace(/^./, c => c.toUpperCase()) : 'Refresh'} Report — ${localTs}`,
|
|
51
53
|
'',
|
|
52
|
-
|
|
54
|
+
`- **Mode**: ${type || 'refresh'}`,
|
|
55
|
+
`- **Contributor**: ${d.contributor ?? '_unknown_'}`,
|
|
56
|
+
`- **Window**: ${d.window ?? d.week ?? '_n/a_'}`,
|
|
57
|
+
`- **Profile**: ${d.profile ?? 'standard'}`,
|
|
58
|
+
`- **Started**: ${d.started ?? ts.toISOString()}`,
|
|
59
|
+
`- **Ended**: ${d.ended ?? ts.toISOString()}`,
|
|
60
|
+
`- **Duration**: ${d.duration ?? _computeDuration(d.started, d.ended)}`,
|
|
53
61
|
'',
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
62
|
+
];
|
|
63
|
+
if (summary) lines.push(summary, '');
|
|
64
|
+
|
|
65
|
+
// ## What was done
|
|
66
|
+
lines.push('## What was done', '');
|
|
67
|
+
if (Array.isArray(d.what_was_done_table) && d.what_was_done_table.length) {
|
|
68
|
+
lines.push('| Source | Action | Items pulled | Outcome | Notes |',
|
|
69
|
+
'|---|---|---|---|---|');
|
|
70
|
+
for (const r of d.what_was_done_table) {
|
|
71
|
+
lines.push(`| ${r.source ?? ''} | ${r.action ?? ''} | ${r.items ?? ''} | ${r.outcome ?? ''} | ${r.notes ?? ''} |`);
|
|
72
|
+
}
|
|
73
|
+
lines.push('');
|
|
74
|
+
} else if (d.what_was_done) {
|
|
75
|
+
lines.push('```yaml', YAML.stringify(d.what_was_done).trimEnd(), '```', '');
|
|
76
|
+
} else {
|
|
77
|
+
lines.push('_None this run._', '');
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
// ## Resolutions this run
|
|
81
|
+
lines.push('## Resolutions this run', '');
|
|
82
|
+
if (Array.isArray(d.resolutions) && d.resolutions.length) {
|
|
83
|
+
for (const r of d.resolutions) lines.push(`- ${typeof r === 'string' ? r : YAML.stringify(r).trim()}`);
|
|
84
|
+
lines.push('');
|
|
85
|
+
} else { lines.push('_None this run._', ''); }
|
|
86
|
+
|
|
87
|
+
// ## Cleanups this run
|
|
88
|
+
lines.push('## Cleanups this run', '');
|
|
89
|
+
if (Array.isArray(d.cleanups) && d.cleanups.length) {
|
|
90
|
+
for (const r of d.cleanups) lines.push(`- ${typeof r === 'string' ? r : YAML.stringify(r).trim()}`);
|
|
91
|
+
lines.push('');
|
|
92
|
+
} else { lines.push('_None this run._', ''); }
|
|
93
|
+
|
|
94
|
+
// ## Learnings appended
|
|
95
|
+
lines.push('## Learnings appended', '');
|
|
96
|
+
if (Array.isArray(d.learnings) && d.learnings.length) {
|
|
97
|
+
for (const r of d.learnings) lines.push(`- ${typeof r === 'string' ? r : YAML.stringify(r).trim()}`);
|
|
98
|
+
lines.push('');
|
|
99
|
+
} else { lines.push('_None this run._', ''); }
|
|
100
|
+
|
|
101
|
+
// ## Skips & gaps
|
|
102
|
+
lines.push('## Skips & gaps', '');
|
|
103
|
+
if (Array.isArray(d.skips_and_gaps) && d.skips_and_gaps.length) {
|
|
104
|
+
for (const r of d.skips_and_gaps) lines.push(`- ${typeof r === 'string' ? r : YAML.stringify(r).trim()}`);
|
|
105
|
+
lines.push('');
|
|
106
|
+
} else { lines.push('_None this run._', ''); }
|
|
107
|
+
|
|
108
|
+
// ## Files written
|
|
109
|
+
lines.push('## Files written', '');
|
|
110
|
+
const created = Array.isArray(d.created) ? d.created : (Array.isArray(d.files_written) ? d.files_written : []);
|
|
111
|
+
if (created.length) {
|
|
112
|
+
lines.push('| Path | Type |', '|---|---|');
|
|
113
|
+
for (const f of created) {
|
|
114
|
+
const p = typeof f === 'string' ? f : (f.path ?? '');
|
|
115
|
+
const t = typeof f === 'object' ? (f.type ?? '') : '';
|
|
116
|
+
lines.push(`| \`${p}\` | ${t} |`);
|
|
117
|
+
}
|
|
118
|
+
lines.push('');
|
|
119
|
+
} else { lines.push('_None this run._', ''); }
|
|
120
|
+
|
|
121
|
+
// ## Next steps
|
|
122
|
+
lines.push('## Next steps for this contributor', '');
|
|
123
|
+
if (Array.isArray(d.next_steps) && d.next_steps.length) {
|
|
124
|
+
for (const r of d.next_steps) lines.push(`- ${typeof r === 'string' ? r : YAML.stringify(r).trim()}`);
|
|
125
|
+
lines.push('');
|
|
126
|
+
} else { lines.push('_None this run._', ''); }
|
|
127
|
+
|
|
128
|
+
// ## Run summary (counts)
|
|
129
|
+
if (d.counts && typeof d.counts === 'object') {
|
|
130
|
+
lines.push('## Run summary', '',
|
|
131
|
+
'| Metric | Count |', '|---|---|');
|
|
132
|
+
for (const [k, v] of Object.entries(d.counts)) lines.push(`| ${k} | ${v} |`);
|
|
133
|
+
lines.push('');
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
// ## Raw details (for machine readers / debugging)
|
|
137
|
+
lines.push('## Raw details', '', '```yaml', YAML.stringify(d).trimEnd(), '```', '');
|
|
138
|
+
return lines.join('\n');
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
/**
 * Format the elapsed time between two timestamps as `<h>h <m>m <s>s`.
 *
 * Both arguments are handed to `new Date(...)`. The placeholder `_n/a_` is
 * returned when either argument is falsy, when the difference is not a finite
 * number (e.g. an unparseable date), when the end precedes the start, or when
 * the date arithmetic throws.
 *
 * @param {string} startIso - start timestamp
 * @param {string} endIso - end timestamp
 * @returns {string} e.g. `1h 2m 3s`, or `_n/a_`
 */
function _computeDuration(startIso, endIso) {
  const NOT_AVAILABLE = '_n/a_';
  if (!(startIso && endIso)) return NOT_AVAILABLE;

  try {
    const elapsedMs = new Date(endIso) - new Date(startIso);
    if (!(Number.isFinite(elapsedMs) && elapsedMs >= 0)) return NOT_AVAILABLE;

    // Round to whole seconds first so the h/m/s parts always add up.
    const totalSeconds = Math.round(elapsedMs / 1000);
    const hours = Math.floor(totalSeconds / 3600);
    const minutes = Math.floor((totalSeconds % 3600) / 60);
    const seconds = totalSeconds % 60;
    return `${hours}h ${minutes}m ${seconds}s`;
  } catch {
    return NOT_AVAILABLE;
  }
}
|
|
151
|
+
|
|
152
|
+
/**
 * Persist the bootstrap status summary for one alias.
 *
 * Target file: Evidence/<alias>/bootstrap-status.md, laid out per
 * bootstrap-status-format.instructions.md. The document is a short, scannable
 * summary of durable project state — NOT a run narrative.
 *
 * The file is written through writeAtomic() with `skipIfUnchanged` explicitly
 * turned off.
 *
 * @param project - forwarded to aliasRoot() to locate the alias directory
 * @param alias - forwarded to aliasRoot() to locate the alias directory
 * @param status - status record handed to renderBootstrapStatus()
 * @returns {Promise<{path: string}>} the path that was written
 */
export async function writeBootstrapStatus(project, alias, status) {
  const target = path.join(aliasRoot(project, alias), USER_FILES.bootstrapStatus);
  await writeAtomic(target, renderBootstrapStatus(status), { skipIfUnchanged: false });
  return { path: target };
}
|
|
162
|
+
|
|
163
|
+
/**
 * Render the Markdown text of the bootstrap status document.
 *
 * Every field of the status record is optional:
 *   - project, customer_hint, mode (default `bootstrap`), lookback_days (default `_n/a_`)
 *   - preflight[]:          rows of { check, status, notes }
 *   - artifacts[]:          rows of { artifact, status, notes }
 *   - outcome[]:            bullet entries (a single string is treated as one bullet)
 *   - access_limitations[]: rows of { system, status, reason, workaround }
 *
 * A table section with no rows gets a single `_none recorded_` row; an empty or
 * missing outcome list gets a single `_None this run._` bullet.
 *
 * @param {object} [s] - status record; null/undefined is treated as an empty record
 * @returns {string} Markdown joined with '\n', ending with a newline
 */
function renderBootstrapStatus(s) {
  const status = s ?? {};

  // Local wall-clock time as `YYYY-MM-DD HH:mm`, assembled by hand so the result
  // does not depend on which locale data the runtime ships.
  const now = new Date();
  const pad2 = (n) => String(n).padStart(2, '0');
  const datePart = `${now.getFullYear()}-${pad2(now.getMonth() + 1)}-${pad2(now.getDate())}`;
  const timePart = `${pad2(now.getHours())}:${pad2(now.getMinutes())}`;
  const localTs = `${datePart} ${timePart}`;

  // Anything that is not an array is treated as "no rows" instead of throwing.
  const rowsOf = (value) => (Array.isArray(value) ? value : []);

  // A raw `|` or line break inside a cell would split the table row, so escape
  // pipes (unless already escaped) and turn line breaks into `<br>`.
  const cell = (value) => String(value ?? '')
    .replace(/(?<!\\)\|/g, '\\|')
    .replace(/\r\n|\r|\n/g, '<br>');

  const lines = [
    '# Bootstrap Status',
    '',
    `- Bootstrap Date: ${localTs}`,
    `- Project: ${status.project ?? ''}`,
    `- Customer Hint: ${status.customer_hint ?? ''}`,
    `- Mode: ${status.mode ?? 'bootstrap'}`,
    `- Lookback Window: ${status.lookback_days ?? '_n/a_'}`,
    '',
  ];

  // Emit one `## title` section holding a table; `columns` pairs each header
  // label with the row property it displays.
  const pushTable = (title, columns, rows) => {
    lines.push(
      `## ${title}`,
      '',
      `| ${columns.map(([label]) => label).join(' | ')} |`,
      `|${columns.map(() => '---').join('|')}|`,
    );
    for (const row of rows) {
      lines.push(`| ${columns.map(([, key]) => cell(row?.[key])).join(' | ')} |`);
    }
    if (rows.length === 0) {
      lines.push(`| _none recorded_ |${' |'.repeat(columns.length - 1)}`);
    }
    lines.push('');
  };

  pushTable(
    'Preflight Checks',
    [['Check', 'check'], ['Status', 'status'], ['Notes', 'notes']],
    rowsOf(status.preflight),
  );

  pushTable(
    'Context Artifact Status',
    [['Artifact', 'artifact'], ['Status', 'status'], ['Notes', 'notes']],
    rowsOf(status.artifacts),
  );

  // An empty array is truthy, so test the length explicitly; a bare string is
  // one bullet rather than one bullet per character.
  const outcome = typeof status.outcome === 'string' ? [status.outcome] : rowsOf(status.outcome);
  const bullets = outcome.length > 0 ? outcome : ['_None this run._'];
  lines.push('## Current Bootstrap Outcome', '');
  for (const entry of bullets) lines.push(`- ${entry}`);
  lines.push('');

  pushTable(
    'Access Limitations',
    [['System', 'system'], ['Status', 'status'], ['Reason', 'reason'], ['Workaround', 'workaround']],
    rowsOf(status.access_limitations),
  );

  return lines.join('\n');
}
|