@event4u/agent-config 2.11.0 → 2.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-src/skills/canvas-design/SKILL.md +132 -0
- package/.agent-src/skills/canvas-design/evals/triggers.json +16 -0
- package/.agent-src/skills/doc-coauthoring/SKILL.md +129 -0
- package/.agent-src/skills/doc-coauthoring/evals/triggers.json +16 -0
- package/.agent-src/skills/skill-writing/SKILL.md +101 -16
- package/.agent-src/skills/sql-writing/SKILL.md +1 -1
- package/.claude-plugin/marketplace.json +3 -1
- package/CHANGELOG.md +31 -0
- package/README.md +2 -2
- package/config/agent-settings.template.yml +9 -0
- package/docs/architecture.md +1 -1
- package/docs/contracts/adr-level-6-productization.md +2 -2
- package/package.json +1 -1
- package/scripts/ai_council/clients.py +17 -4
- package/scripts/ai_council/orchestrator.py +6 -2
- package/scripts/check_references.py +25 -0
- package/scripts/council_cli.py +36 -5
- package/scripts/run_skill_evals.py +185 -0
- package/scripts/schemas/skill.schema.json +4 -0
- package/scripts/skill_linter.py +71 -1
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: canvas-design
|
|
3
|
+
description: "Use when creating static visual art — posters, marketing visuals, brand assets, PDF/PNG design pieces — even if the user just says 'design a poster' or 'mach uns ein Visual'."
|
|
4
|
+
source: package
|
|
5
|
+
domain: product
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# canvas-design
|
|
9
|
+
|
|
10
|
+
## When to use
|
|
11
|
+
|
|
12
|
+
Use when:
|
|
13
|
+
|
|
14
|
+
* User asks for a poster, marketing visual, brand asset, social-media graphic, cover art
|
|
15
|
+
* Output is a static `.pdf` or `.png` design piece (not a UI mockup, not a wireframe)
|
|
16
|
+
* The deliverable is the visual artifact itself
|
|
17
|
+
|
|
18
|
+
Do NOT use when:
|
|
19
|
+
|
|
20
|
+
* Designing a UI component or app screen → `fe-design`, `ui-component-architect`, `react-shadcn-ui`, `blade-ui`, `flux`
|
|
21
|
+
* Tailwind / shadcn / Flux component styling → `tailwind-engineer`
|
|
22
|
+
* Brand voice / tone definition → `voice-and-tone-design`
|
|
23
|
+
* Release announcement copy → `release-comms`
|
|
24
|
+
|
|
25
|
+
## Goal
|
|
26
|
+
|
|
27
|
+
Produce one finished visual artifact (`.pdf` or `.png`) backed by an original design philosophy. Both files — the artifact and its `philosophy.md` — ship together.
|
|
28
|
+
|
|
29
|
+
The work emphasizes: visual expression over text · original direction (no artist mimicry) · composition that looks deliberated, not generated.
|
|
30
|
+
|
|
31
|
+
## Preconditions
|
|
32
|
+
|
|
33
|
+
* Brief from user (theme, intent, occasion, target medium, size constraint)
|
|
34
|
+
* Output directory: `agents/design-assets/{slug}/` — create if missing
|
|
35
|
+
* Image-generation tooling available (Python with Pillow / matplotlib / cairo, SVG → PNG conversion, or whatever the environment ships)
|
|
36
|
+
|
|
37
|
+
## Procedure
|
|
38
|
+
|
|
39
|
+
### 1. Brief intake
|
|
40
|
+
|
|
41
|
+
One numbered-options block surfaces: theme / occasion · target medium + dimensions (web 1200×630? print A3? square 1080×1080?) · color & mood direction · hard constraints (logo required? color to avoid?) · single page or series.
|
|
42
|
+
|
|
43
|
+
If the brief says "in the style of [living artist]", flag the copyright risk and propose an original direction.
|
|
44
|
+
|
|
45
|
+
### 2. Design philosophy
|
|
46
|
+
|
|
47
|
+
Author `agents/design-assets/{slug}/philosophy.md` — 4–6 paragraphs naming:
|
|
48
|
+
|
|
49
|
+
* **Movement name** — 1–2 words ("Chromatic Silence", "Brutalist Joy", "Analog Meditation")
|
|
50
|
+
* **Visual language** — how the philosophy manifests through space, form, color, scale, composition, rhythm
|
|
51
|
+
* **Text role** — sparse, accent only; never paragraphs
|
|
52
|
+
* **Craftsmanship anchor** — visible deliberation, not template polish
|
|
53
|
+
|
|
54
|
+
Stay aesthetically specific but leave interpretive room for the canvas execution.
|
|
55
|
+
|
|
56
|
+
### 3. Subtle conceptual thread
|
|
57
|
+
|
|
58
|
+
Identify a single niche reference embedded in the work — not announced, woven into form / color / composition. Think of a jazz musician quoting another song: those who know catch it, others enjoy the music.
|
|
59
|
+
|
|
60
|
+
Document it in `philosophy.md` under `## Subtle reference`.
|
|
61
|
+
|
|
62
|
+
### 4. Canvas execution
|
|
63
|
+
|
|
64
|
+
Produce `agents/design-assets/{slug}/{slug}.{pdf|png}`:
|
|
65
|
+
|
|
66
|
+
1. Pick the execution tool (Pillow, matplotlib, SVG, or framework-native)
|
|
67
|
+
2. Limited palette — 2–5 colors, intentional and cohesive
|
|
68
|
+
3. Geometric or organic forms per philosophy
|
|
69
|
+
4. Text — sparse, design-forward, integrated as visual element; never overlapping, never falling off canvas
|
|
70
|
+
5. Margins — every element contained, breathing room
|
|
71
|
+
6. Repeating patterns, layered elements, systematic markers as the philosophy permits
|
|
72
|
+
|
|
73
|
+
### 5. Refinement pass
|
|
74
|
+
|
|
75
|
+
After the first render, **do not add more graphics**. Refine what exists:
|
|
76
|
+
|
|
77
|
+
* Tighten composition cohesion
|
|
78
|
+
* Adjust spacing, alignment, color balance
|
|
79
|
+
* Replace fonts if they fight the philosophy
|
|
80
|
+
* Remove any element that doesn't earn its place
|
|
81
|
+
|
|
82
|
+
Render the refined version. Overwrite the artifact.
|
|
83
|
+
|
|
84
|
+
### 6. Multi-page (optional)
|
|
85
|
+
|
|
86
|
+
If the user requests a series, treat each page as a story beat — distinct but philosophically continuous. Bundle as a multi-page PDF or numbered PNGs (`{slug}-01.png`, `{slug}-02.png`, …).
|
|
87
|
+
|
|
88
|
+
### 7. Validation
|
|
89
|
+
|
|
90
|
+
* `philosophy.md` exists with movement name + 4–6 paragraphs + subtle-reference section
|
|
91
|
+
* Artifact file exists at the expected path
|
|
92
|
+
* Open and verify: nothing falls off canvas, no overlapping text, palette ≤ 5 distinct colors, every element has margin
|
|
93
|
+
* Original work — no traceable artist-style copy
|
|
94
|
+
|
|
95
|
+
## Output format
|
|
96
|
+
|
|
97
|
+
1. `agents/design-assets/{slug}/philosophy.md`
|
|
98
|
+
2. `agents/design-assets/{slug}/{slug}.{pdf|png}` (or numbered series for multi-page)
|
|
99
|
+
3. One concluding line stating both file paths
|
|
100
|
+
|
|
101
|
+
## Gotcha
|
|
102
|
+
|
|
103
|
+
* **No artist mimicry** — copying a living artist's signature style is a copyright risk and breaks the original-work mandate. Propose an original direction.
|
|
104
|
+
* **Text discipline** — most pieces fail because text creeps in as paragraphs. Words are visual accents, not explanation.
|
|
105
|
+
* **One canvas** — single page unless multi-page is explicitly requested.
|
|
106
|
+
* **Font availability** — the environment may not ship your target font. Pick a fallback before render time, or download into the working dir first.
|
|
107
|
+
* **Output location** — always `agents/design-assets/{slug}/`. Never write binary artifacts to the repo root or to source-of-truth dirs.
|
|
108
|
+
* **Refinement loop is real** — first render is the draft, not the deliverable.
|
|
109
|
+
|
|
110
|
+
## Frugality Standards
|
|
111
|
+
|
|
112
|
+
Apply the [Frugality Charter](../../contexts/contracts/frugality-charter.md).
|
|
113
|
+
|
|
114
|
+
* Per the default-terse rule, `philosophy.md` opens with the movement name — no "In this document I will describe …" frame.
|
|
115
|
+
* Per the cheap-question check, numbered-options blocks only at brief intake (where consequences differ).
|
|
116
|
+
* Per the post-action summary suppression, ship the files; skip an "## Artist statement" wrapper.
|
|
117
|
+
|
|
118
|
+
**Pre-save self-check:**
|
|
119
|
+
|
|
120
|
+
1. Does `philosophy.md` carry filler superlatives ("absolute pinnacle", "transcendent")?
|
|
121
|
+
2. Does the canvas include explanatory text instead of visual-accent text?
|
|
122
|
+
3. Are more than 5 distinct colors present without justification in the philosophy?
|
|
123
|
+
4. Is the subtle reference announced explicitly in the visual, breaking the "those who know" principle?
|
|
124
|
+
|
|
125
|
+
## Do NOT
|
|
126
|
+
|
|
127
|
+
* Copy a living artist's signature visual style
|
|
128
|
+
* Generate cartoony / amateur / template-store output
|
|
129
|
+
* Add paragraphs of text — visuals communicate, words accent
|
|
130
|
+
* Skip the philosophy file — the artifact without it is just an image
|
|
131
|
+
* Skip the refinement pass
|
|
132
|
+
* Write binary artifacts to the repo root or to source-of-truth dirs
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "canvas-design",
|
|
3
|
+
"description": "5 should-trigger + 5 should-not-trigger queries. Should-trigger asks for static visual art (poster / cover / marketing visual). Should-not-trigger near-misses share design vocabulary but route elsewhere (fe-design, tailwind-engineer, voice-and-tone-design, release-comms).",
|
|
4
|
+
"queries": [
|
|
5
|
+
{"q": "Need a launch poster for next week's release announcement — A3 print, minimal style", "trigger": true},
|
|
6
|
+
{"q": "Design us a social-media visual for the v3.0 release, 1080x1080 for IG", "trigger": true},
|
|
7
|
+
{"q": "Mach mir bitte ein Cover-Bild für den Talk nächste Woche, 1200x630 fürs Web", "trigger": true},
|
|
8
|
+
{"q": "We want a single-page PDF brand visual for the conference booth handout", "trigger": true},
|
|
9
|
+
{"q": "Build a minimalist poster for the team-offsite invitation, square format", "trigger": true},
|
|
10
|
+
{"q": "Design a new component library for our app — buttons, inputs, cards", "trigger": false},
|
|
11
|
+
{"q": "What color palette should we standardize on in tailwind.config.js?", "trigger": false},
|
|
12
|
+
{"q": "Refactor the brand voice doc — make it less corporate and more direct", "trigger": false},
|
|
13
|
+
{"q": "Draft the release announcement blog post for the v3.0 launch", "trigger": false},
|
|
14
|
+
{"q": "Help me wireframe the new onboarding flow — three screens, mobile-first", "trigger": false}
|
|
15
|
+
]
|
|
16
|
+
}
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: doc-coauthoring
|
|
3
|
+
description: "Use when co-authoring a PRD, design doc, RFC, decision doc, or technical spec — 3-stage flow (context → section-by-section → reader-test) — even if the user just says 'help me write this spec'."
|
|
4
|
+
source: package
|
|
5
|
+
domain: process
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# doc-coauthoring
|
|
9
|
+
|
|
10
|
+
## When to use
|
|
11
|
+
|
|
12
|
+
Use this skill when:
|
|
13
|
+
|
|
14
|
+
* User starts a substantial writing task — PRD, RFC, design doc, decision doc, technical spec, proposal
|
|
15
|
+
* User says "help me write this up", "draft a proposal", "we need a doc for X"
|
|
16
|
+
* The output is a structured prose document the user will own and ship
|
|
17
|
+
|
|
18
|
+
Do NOT use when:
|
|
19
|
+
|
|
20
|
+
* Authoring agent docs / module docs / AGENTS.md → `agent-docs-writing`
|
|
21
|
+
* Writing a README → `readme-writing` / `readme-writing-package`
|
|
22
|
+
* Writing an ADR (process is fixed, no co-authoring loop) → `adr-create`
|
|
23
|
+
* Code documentation, inline comments, docstrings
|
|
24
|
+
|
|
25
|
+
## Goal
|
|
26
|
+
|
|
27
|
+
Move from a fuzzy ask to a complete document the user owns, by:
|
|
28
|
+
|
|
29
|
+
1. Closing the context gap before drafting
|
|
30
|
+
2. Building each section through brainstorm → curate → draft → refine
|
|
31
|
+
3. Testing the draft with a fresh-context reader before declaring done
|
|
32
|
+
|
|
33
|
+
## Preconditions
|
|
34
|
+
|
|
35
|
+
* User explicitly wants a document (not a quick answer)
|
|
36
|
+
* `save-file` and `str-replace-editor` available
|
|
37
|
+
* Target path and filename agreed up front
|
|
38
|
+
|
|
39
|
+
## Procedure
|
|
40
|
+
|
|
41
|
+
### 0. Inspect existing material
|
|
42
|
+
|
|
43
|
+
Before any drafting, **inspect** the landscape: search `agents/` and
|
|
44
|
+
the repo for related prior docs (`grep -ril "{topic}" agents/ docs/`),
|
|
45
|
+
check the user's named ticket / thread for context, and confirm no
|
|
46
|
+
in-flight document already covers the ask. If a near-duplicate exists,
|
|
47
|
+
surface it and ask whether to extend or supersede.
|
|
48
|
+
|
|
49
|
+
### 1. Context gathering
|
|
50
|
+
|
|
51
|
+
Close the gap between what the user knows and what you know.
|
|
52
|
+
|
|
53
|
+
1. **Meta-questions** — one numbered-options block (per `user-interaction`): doc type? primary audience? desired impact? template/format constraint? existing related docs / threads / tickets?
|
|
54
|
+
2. **Info dump** — invite stream-of-consciousness context: plain text, paths to existing docs, ticket links, thread paste.
|
|
55
|
+
3. **Clarifying questions** — 5–10 numbered questions to fill remaining gaps. User answers shorthand (`1: yes`, `2: see #channel`, `3: backwards-compat reason`).
|
|
56
|
+
4. **Exit gate** — ask "ready to draft, or more context?" — wait for confirmation. Do not start scaffolding the file until the user confirms.
|
|
57
|
+
|
|
58
|
+
### 2. Refinement & structure
|
|
59
|
+
|
|
60
|
+
Build the document section by section.
|
|
61
|
+
|
|
62
|
+
1. **Agree on structure** — propose 3–5 sections based on doc type and context. Ask user to confirm or adjust.
|
|
63
|
+
2. **Scaffold the file** — use `save-file` to create the doc with placeholder text per section (`[To be written]`). One commit-equivalent action; review with the user before populating.
|
|
64
|
+
3. **Pick the starting section** — suggest the one with the most unknowns (usually the core decision / proposal). Never start with the summary.
|
|
65
|
+
4. **Per-section loop** — repeat for each section:
|
|
66
|
+
- **Clarifying questions** — 5–10 numbered questions about what this section covers
|
|
67
|
+
- **Brainstorm** — 5–20 numbered options of what could go in. Offer "more options?" at the end.
|
|
68
|
+
- **Curation** — user picks: `keep 1,4,7,9` / `remove 3 (dupes 1)` / `combine 11+12`. Parse freeform feedback if the user gives `"looks good but ..."`.
|
|
69
|
+
- **Gap check** — "anything missing for this section?"
|
|
70
|
+
- **Draft** — `str-replace-editor` to replace the placeholder. Never reprint the whole doc.
|
|
71
|
+
- **Iterate** — user feedback in, surgical edits out. After 3 iterations with no substantial change, ask "anything to remove without losing value?"
|
|
72
|
+
- **Section exit gate** — "section done — move to next?"
|
|
73
|
+
5. **Whole-doc review at 80% complete** — re-read the full file. Surface contradictions, redundancy, generic filler. Apply final edits.
|
|
74
|
+
|
|
75
|
+
### 3. Reader test
|
|
76
|
+
|
|
77
|
+
Verify the doc works for someone without your context.
|
|
78
|
+
|
|
79
|
+
1. **Predict reader questions** — generate 5–10 questions a real reader would ask after reading.
|
|
80
|
+
2. **Run the test** — pick one:
|
|
81
|
+
- **`ai-council` available** → invoke with the doc + predicted questions; treat each council member as a fresh reader.
|
|
82
|
+
- **Otherwise** → instruct the user to open a fresh Claude / ChatGPT session, paste the doc, and ask the questions one by one. Capture answers.
|
|
83
|
+
3. **Additional fresh-reader checks** (always): "what is ambiguous?" · "what context does this doc assume readers have?" · "internal contradictions?"
|
|
84
|
+
4. **Report** — surface where the fresh reader got it wrong, where assumptions break.
|
|
85
|
+
5. **Loop back to Stage 2** for problematic sections until the fresh reader answers cleanly and surfaces no new gaps.
|
|
86
|
+
|
|
87
|
+
### 4. Handover
|
|
88
|
+
|
|
89
|
+
1. Final read-through by the user (they own the doc).
|
|
90
|
+
2. Verify facts, links, technical details.
|
|
91
|
+
3. Confirm intended impact achieved.
|
|
92
|
+
4. Surface the final file path. Done.
|
|
93
|
+
|
|
94
|
+
## Output format
|
|
95
|
+
|
|
96
|
+
1. Target document file at the agreed path (e.g. `agents/proposals/{slug}.md`)
|
|
97
|
+
2. One concluding line stating "Doc complete at {path} — ready for owner review"
|
|
98
|
+
|
|
99
|
+
## Gotcha
|
|
100
|
+
|
|
101
|
+
* **One question per turn** (Iron Law from `ask-when-uncertain`) — never bundle clarifying + brainstorm + curate prompts in one message.
|
|
102
|
+
* **Never reprint the full doc** during iteration — always use `str-replace-editor`. Reprinting wastes tokens and creates merge drift.
|
|
103
|
+
* **Reader test is not optional** — without it, you ship the version that makes sense to you, not to readers. Skip only on explicit user override.
|
|
104
|
+
* **Sub-agent absence** — `ai-council` may not be configured. Have the manual fresh-Claude fallback ready (Stage 3 step 2).
|
|
105
|
+
* **Image alt-text** — if the doc embeds images, add alt-text inline; without it, fresh-reader tools can't see them.
|
|
106
|
+
* **Language discipline** — keep the doc body in English (per `language-and-tone`). For verbatim German user phrases or interview quotes, use `DE: … · EN: …` anchor blocks.
|
|
107
|
+
|
|
108
|
+
## Frugality Standards
|
|
109
|
+
|
|
110
|
+
Apply the [Frugality Charter](../../contexts/contracts/frugality-charter.md).
|
|
111
|
+
|
|
112
|
+
* Per the default-terse rule, each section opens with content, not "In this section …".
|
|
113
|
+
* Per the cheap-question check, numbered-options blocks only when consequences differ — skip "yes / no, continue?" type prompts.
|
|
114
|
+
* Per the post-action summary suppression, the final output is the doc — no wrapping "Summary of what we did" block.
|
|
115
|
+
|
|
116
|
+
**Pre-save self-check:**
|
|
117
|
+
|
|
118
|
+
1. Does any section open with a narrative preamble instead of content?
|
|
119
|
+
2. Are clarifying questions bundled when one-at-a-time would surface user priorities better?
|
|
120
|
+
3. Is the reader-test stage skipped or merged into a "we're done" claim?
|
|
121
|
+
4. Is non-English prose present outside `DE: / EN:` anchor blocks?
|
|
122
|
+
|
|
123
|
+
## Do NOT
|
|
124
|
+
|
|
125
|
+
* Skip Stage 1 — straight-to-drafting produces docs that miss audience and impact
|
|
126
|
+
* Bundle 5+ questions of different kinds (clarifying + brainstorm + curation) into one numbered block — breaks one-question-per-turn
|
|
127
|
+
* Reprint the whole doc on every iteration
|
|
128
|
+
* Declare "done" without the Stage 3 reader test
|
|
129
|
+
* Generate doc content from scratch when the user has existing context — gap-closing is the whole point
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"skill": "doc-coauthoring",
|
|
3
|
+
"description": "5 should-trigger + 5 should-not-trigger queries. Should-trigger phrasings reflect real co-authoring asks for PRDs / specs / decision docs. Should-not-trigger near-misses share doc-writing vocabulary but route elsewhere (agent-docs-writing, readme-writing, adr-create, code docs, translations).",
|
|
4
|
+
"queries": [
|
|
5
|
+
{"q": "Help me draft a PRD for the new analytics feature — I have a lot of context but no structure yet", "trigger": true},
|
|
6
|
+
{"q": "We need a design doc for the OAuth migration before next week's review. Can you walk me through writing it?", "trigger": true},
|
|
7
|
+
{"q": "I have to write up a decision doc about dropping Redis. Where do we start?", "trigger": true},
|
|
8
|
+
{"q": "Need an RFC for the data-export rate-limit change before tomorrow's architecture sync", "trigger": true},
|
|
9
|
+
{"q": "Lass uns einen Spec für das neue Webhook-Verfahren zusammenstellen — du fragst, ich antworte", "trigger": true},
|
|
10
|
+
{"q": "Add a section to AGENTS.md about the new env vars we just introduced", "trigger": false},
|
|
11
|
+
{"q": "Update the README with the new install instructions for the docker variant", "trigger": false},
|
|
12
|
+
{"q": "Write an ADR for the queue-broker switch from Redis to SQS", "trigger": false},
|
|
13
|
+
{"q": "Add docstrings to all public methods on the OrderService class", "trigger": false},
|
|
14
|
+
{"q": "Translate the lang/de strings to French for the new locale rollout", "trigger": false}
|
|
15
|
+
]
|
|
16
|
+
}
|
|
@@ -3,6 +3,7 @@ name: skill-writing
|
|
|
3
3
|
description: "Use when deciding 'should this be a skill or a rule?', creating/improving/reviewing agent skills, SKILL.md frontmatter, or procedure sections — even without saying 'skill-writing'."
|
|
4
4
|
source: project
|
|
5
5
|
domain: process
|
|
6
|
+
meta_skill: true
|
|
6
7
|
---
|
|
7
8
|
|
|
8
9
|
# skill-writing
|
|
@@ -62,22 +63,25 @@ Ask: **"Does the model need this to do its job correctly?"**
|
|
|
62
63
|
|
|
63
64
|
### Skills and commands share the `.claude/skills/` namespace
|
|
64
65
|
|
|
65
|
-
Skills
|
|
66
|
-
|
|
67
|
-
`.claude/skills/` (`scripts/compress.py` →
|
|
68
|
-
`generate_claude_commands`). Claude treats
|
|
69
|
-
skills.
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
*
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
66
|
+
Skills in `.agent-src.uncompressed/skills/{name}/SKILL.md` AND commands in
|
|
67
|
+
`.agent-src.uncompressed/commands/{name}.md` both project into
|
|
68
|
+
`.claude/skills/` (see `scripts/compress.py` →
|
|
69
|
+
`generate_claude_skills` + `generate_claude_commands`). Claude treats
|
|
70
|
+
the whole directory as native skills.
|
|
71
|
+
|
|
72
|
+
Implications for skill authors:
|
|
73
|
+
|
|
74
|
+
* If a same-name command already exists, the skill takes priority and
|
|
75
|
+
the command is skipped (`generate_claude_commands` honors this).
|
|
76
|
+
Don't reuse a command's slug for a skill unless the command should
|
|
77
|
+
retire.
|
|
78
|
+
* Both artifacts compete on `description` for routing. A weak skill
|
|
79
|
+
description is shadowed by a stronger same-domain command — and vice
|
|
80
|
+
versa. Make trigger phrasing precise (§ 1b below).
|
|
81
|
+
* When the workflow has both a "user types `/foo`" path AND a "model
|
|
82
|
+
picks this up from intent" path, author the skill first and let the
|
|
83
|
+
command delegate (`skills:` frontmatter). Two artifacts with the same
|
|
84
|
+
trigger surface fight each other in the router.
|
|
81
85
|
|
|
82
86
|
### When "Nothing" is the right answer
|
|
83
87
|
|
|
@@ -263,6 +267,87 @@ Example:
|
|
|
263
267
|
* K7: Created with analysis (not blind, expected behavior defined)
|
|
264
268
|
* Size: Within limits (see size-and-scope guideline)
|
|
265
269
|
|
|
270
|
+
### 7. Run + iterate evals (quantitative loop)
|
|
271
|
+
|
|
272
|
+
Triggers (`evals/triggers.json`) check **routing**. A separate
|
|
273
|
+
`evals/evals.json` checks **behavior** — does the skill make the agent
|
|
274
|
+
produce a better answer than baseline? Add this layer for any skill
|
|
275
|
+
where the procedure has measurable output (commands, artifacts,
|
|
276
|
+
structured text). Skip for evergreen heuristics with no falsifiable
|
|
277
|
+
output (e.g. `direct-answers`, `language-and-tone`) unless the user
|
|
278
|
+
asks for it.
|
|
279
|
+
|
|
280
|
+
**Workspace layout** (`runs/` is gitignored; the JSON definitions are tracked):
|
|
281
|
+
|
|
282
|
+
```
|
|
283
|
+
.agent-src.uncompressed/skills/{name}/evals/
|
|
284
|
+
triggers.json # tracked — routing eval (§ 1c)
|
|
285
|
+
evals.json # tracked — behavior eval definitions
|
|
286
|
+
runs/ # gitignored — per-iteration outputs
|
|
287
|
+
{timestamp}-baseline/ # sub-agent run without the skill
|
|
288
|
+
{timestamp}-with-skill/ # sub-agent run with the skill
|
|
289
|
+
{timestamp}-benchmark.json
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
**`evals.json` shape** — 3–10 scenarios, each with prompt + grading
|
|
293
|
+
rubric:
|
|
294
|
+
|
|
295
|
+
```json
|
|
296
|
+
{
|
|
297
|
+
"skill": "{name}",
|
|
298
|
+
"scenarios": [
|
|
299
|
+
{
|
|
300
|
+
"id": "happy-path",
|
|
301
|
+
"prompt": "<full user-shaped task that exercises the skill>",
|
|
302
|
+
"assertions": [
|
|
303
|
+
{"kind": "contains", "value": "<expected substring in output>"},
|
|
304
|
+
{"kind": "file_exists", "path": "<artifact path the skill should create>"},
|
|
305
|
+
{"kind": "rubric", "criterion": "<one-line judgement, e.g. 'output includes a numbered procedure'>"}
|
|
306
|
+
]
|
|
307
|
+
}
|
|
308
|
+
]
|
|
309
|
+
}
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
`contains` / `file_exists` grade deterministically. `rubric` items grade
|
|
313
|
+
via a fresh sub-agent reading the output against the criterion — keep
|
|
314
|
+
each criterion to one falsifiable sentence.
|
|
315
|
+
|
|
316
|
+
**Loop** (orchestrated by `scripts/run_skill_evals.py`):
|
|
317
|
+
|
|
318
|
+
1. **Scaffold** — `python3 scripts/run_skill_evals.py scaffold {skill}`
|
|
319
|
+
creates `runs/{timestamp}-{baseline,with-skill}/` and seeds each
|
|
320
|
+
scenario's `meta.json`.
|
|
321
|
+
2. **Baseline run** — spawn one sub-agent per scenario **without** the
|
|
322
|
+
skill loaded. Capture stdout + any artifacts into
|
|
323
|
+
`runs/{timestamp}-baseline/{scenario-id}/`.
|
|
324
|
+
3. **With-skill run** — same scenarios, same sub-agent harness, **with**
|
|
325
|
+
the skill loaded. Capture into `runs/{timestamp}-with-skill/{scenario-id}/`.
|
|
326
|
+
4. **Grade** — for each scenario, write a `grade.json` file with
|
|
327
|
+
per-assertion pass/fail. Deterministic assertions auto-grade;
|
|
328
|
+
rubric assertions need a grader sub-agent.
|
|
329
|
+
5. **Aggregate** — `python3 scripts/run_skill_evals.py aggregate {skill}
|
|
330
|
+
--run {timestamp}` produces `runs/{timestamp}-benchmark.json` with
|
|
331
|
+
pass-rate, timing, token deltas baseline-vs-with-skill.
|
|
332
|
+
6. **Report** — `python3 scripts/run_skill_evals.py report {skill}
|
|
333
|
+
--run {timestamp}` prints the diff. Iterate on the skill body
|
|
334
|
+
until every scenario passes `with-skill` and at least one fails `baseline`.
|
|
335
|
+
|
|
336
|
+
The script ships with sub-agent spawning **stubbed** — the orchestration
|
|
337
|
+
layer is per-environment (Claude Code, Augment, council). Implement
|
|
338
|
+
the spawn function once for your environment; the rest of the loop
|
|
339
|
+
(aggregate / report / scaffold) works out of the box.
|
|
340
|
+
|
|
341
|
+
**Exit criterion** — every scenario passes with-skill, at least one
|
|
342
|
+
fails baseline (proves the skill earns its slot). Commit the
|
|
343
|
+
`evals.json` alongside the skill; never commit `runs/`.
|
|
344
|
+
|
|
345
|
+
Neighbors:
|
|
346
|
+
* `description-assist` — iterate on the trigger phrasing
|
|
347
|
+
* `skill-reviewer` — structural 7-Killers audit
|
|
348
|
+
* `lint-skills` — static checks (frontmatter, sections, size)
|
|
349
|
+
* `skill-improvement-pipeline` — production-learning capture
|
|
350
|
+
|
|
266
351
|
## Output format
|
|
267
352
|
|
|
268
353
|
1. Complete SKILL.md file
|
|
@@ -17,7 +17,7 @@ Do NOT use when:
|
|
|
17
17
|
|
|
18
18
|
## Procedure: Write raw SQL
|
|
19
19
|
|
|
20
|
-
1. **
|
|
20
|
+
1. **Inspect call site & choose approach** — identify every dynamic value flowing into the query, then pick the approach: use the query builder whenever possible; write raw SQL only when the query builder can't express the query.
|
|
21
21
|
2. **Parameterize** — Every variable must use `?` binding or named `:param`. Never interpolate PHP variables into SQL strings.
|
|
22
22
|
3. **Use MariaDB syntax** — Not PostgreSQL or MSSQL. Check `php/sql.md` for MariaDB-specific patterns.
|
|
23
23
|
4. **Verify** — Run EXPLAIN on complex queries. Check that no PHP interpolation (`"$var"`, `'{$var}'`) appears in SQL.
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
},
|
|
7
7
|
"metadata": {
|
|
8
8
|
"description": "Shared agent configuration \u2014 skills for AI coding tools (Claude Code, Augment, Cursor, Cline, Windsurf, Gemini CLI).",
|
|
9
|
-
"version": "2.
|
|
9
|
+
"version": "2.12.0",
|
|
10
10
|
"keywords": [
|
|
11
11
|
"agent-config",
|
|
12
12
|
"skills",
|
|
@@ -71,6 +71,7 @@
|
|
|
71
71
|
"./.claude/skills/bug-fix",
|
|
72
72
|
"./.claude/skills/bug-investigate",
|
|
73
73
|
"./.claude/skills/build-buy-partner",
|
|
74
|
+
"./.claude/skills/canvas-design",
|
|
74
75
|
"./.claude/skills/challenge-me",
|
|
75
76
|
"./.claude/skills/challenge-me-vision",
|
|
76
77
|
"./.claude/skills/challenge-me-with-docs",
|
|
@@ -126,6 +127,7 @@
|
|
|
126
127
|
"./.claude/skills/devcontainer",
|
|
127
128
|
"./.claude/skills/developer-like-execution",
|
|
128
129
|
"./.claude/skills/discovery-interview",
|
|
130
|
+
"./.claude/skills/doc-coauthoring",
|
|
129
131
|
"./.claude/skills/docker",
|
|
130
132
|
"./.claude/skills/dto-creator",
|
|
131
133
|
"./.claude/skills/e2e-heal",
|
package/CHANGELOG.md
CHANGED
|
@@ -429,6 +429,37 @@ our recommendation order, not its support status.
|
|
|
429
429
|
> that forces a new era split (`# Era: 2.8.x`, etc.) — see
|
|
430
430
|
> [`docs/contracts/CHANGELOG-conventions.md § Era splits`](docs/contracts/CHANGELOG-conventions.md).
|
|
431
431
|
|
|
432
|
+
## [2.12.0](https://github.com/event4u-app/agent-config/compare/2.11.0...2.12.0) (2026-05-14)
|
|
433
|
+
|
|
434
|
+
### Features
|
|
435
|
+
|
|
436
|
+
* **linter:** evals.json schema validator + meta_skill exemption ([9568510](https://github.com/event4u-app/agent-config/commit/95685109540c7f2dc2643ec24ba9d996467e0645))
|
|
437
|
+
* **skill-writing:** § 7 quantitative eval loop + run_skill_evals.py ([9eda402](https://github.com/event4u-app/agent-config/commit/9eda402dc43b8e14682787fb1cbbc9872eb16fcc))
|
|
438
|
+
* **skills:** add doc-coauthoring from Anthropic ([161b904](https://github.com/event4u-app/agent-config/commit/161b9044743753f2e54bcae45c36a29daaa8058d))
|
|
439
|
+
* **skills:** add canvas-design from Anthropic ([95c247c](https://github.com/event4u-app/agent-config/commit/95c247c08d3c6710c53bfcd7ba7a00f270e0d8d4))
|
|
440
|
+
* **check-refs:** add file/line opt-out markers ([f381bcb](https://github.com/event4u-app/agent-config/commit/f381bcb5a08818e042af35836dd2c4d8965aa98e))
|
|
441
|
+
* make ai-council max_output_tokens configurable ([5976b46](https://github.com/event4u-app/agent-config/commit/5976b4623b94277f6ba49b0e82bb36ab7d5adb50))
|
|
442
|
+
|
|
443
|
+
### Bug Fixes
|
|
444
|
+
|
|
445
|
+
* **marketplace:** register canvas-design + doc-coauthoring ([9fbfe6a](https://github.com/event4u-app/agent-config/commit/9fbfe6af83589bf45b27b72c1b818be9772ae60c))
|
|
446
|
+
|
|
447
|
+
### Documentation
|
|
448
|
+
|
|
449
|
+
* **audit:** mark forward-refs in north-star bundle as opt-out ([a1d7c21](https://github.com/event4u-app/agent-config/commit/a1d7c21df3d05c27bacf81344893c4e43ae72a06))
|
|
450
|
+
* **roadmap:** expand step-99 with Total Dominance mandate ([c46cffd](https://github.com/event4u-app/agent-config/commit/c46cffd54214a61230be27ddaae3367053be39a5))
|
|
451
|
+
* **roadmap:** add step-99 north-star restructure (meta · out-of-band) ([8dd18f9](https://github.com/event4u-app/agent-config/commit/8dd18f963742d14dd9d006237ddd93881b198a60))
|
|
452
|
+
* **audit:** correct step-3 filename reference ([ee6bd7f](https://github.com/event4u-app/agent-config/commit/ee6bd7ffc6c6cd363b6207b6ff32aa72f2bc317e))
|
|
453
|
+
* **audit:** add 2026-05-14 north-star audit + council synthesis ([589c2fb](https://github.com/event4u-app/agent-config/commit/589c2fbd3e35b57529ab0f934665d71d611012d4))
|
|
454
|
+
* add roadmaps for council, persona, ghostwriter, user-types axis ([471fae3](https://github.com/event4u-app/agent-config/commit/471fae3a46182d930fea21adb4037a41ec99dcb3))
|
|
455
|
+
* add v2 feedback follow-up roadmap ([23d17cb](https://github.com/event4u-app/agent-config/commit/23d17cb24b33e794f7c1e31e76055cc5c8f1ab6c))
|
|
456
|
+
|
|
457
|
+
### Chores
|
|
458
|
+
|
|
459
|
+
* prefix roadmaps with step-N execution sequence ([de87232](https://github.com/event4u-app/agent-config/commit/de87232213404ad104e07c5ca831d64f4a607f8e))
|
|
460
|
+
|
|
461
|
+
Tests: 3718 (+0 since 2.11.0)
|
|
462
|
+
|
|
432
463
|
## [2.11.0](https://github.com/event4u-app/agent-config/compare/2.10.0...2.11.0) (2026-05-14)
|
|
433
464
|
|
|
434
465
|
### Features
|
package/README.md
CHANGED
|
@@ -7,7 +7,7 @@ Give your AI agents an audit-disciplined orchestration contract — testing, Git
|
|
|
7
7
|
> Your agent picks up the project's stack, runs tests, prepares PRs, fixes CI — and follows your team's coding standards while doing it. Stack-aware skill sets ship for PHP (Laravel · Symfony · Zend/Laminas), JavaScript (Next.js · React · Node), and cross-stack concerns (API · testing · security · observability).
|
|
8
8
|
|
|
9
9
|
<p align="center">
|
|
10
|
-
<strong>
|
|
10
|
+
<strong>210 Skills</strong> · <strong>61 Rules</strong> · <strong>106 Commands</strong> · <strong>72 Guidelines</strong> · <strong>8 AI Tools</strong>
|
|
11
11
|
</p>
|
|
12
12
|
|
|
13
13
|
---
|
|
@@ -556,7 +556,7 @@ slash-commands) 📌 = informational marker only (no auto-discovery
|
|
|
556
556
|
or manual wiring required)
|
|
557
557
|
|
|
558
558
|
> **What this means in practice:** Claude Code gets the full project-scoped
|
|
559
|
-
> package (rules +
|
|
559
|
+
> package (rules + 210 skills + 106 native commands); Augment Code gets the
|
|
560
560
|
> same content but only from a single global install at `~/.augment/`.
|
|
561
561
|
> Cursor, Cline, Windsurf, Gemini CLI, GitHub Copilot, Roo Code, Codex CLI,
|
|
562
562
|
> and Continue.dev only get the **rules** natively; skills and commands are
|
|
@@ -298,6 +298,15 @@ ai_council:
|
|
|
298
298
|
# opts in. Set to `min_rounds` to disable the deep tier.
|
|
299
299
|
deep_min_rounds: 3
|
|
300
300
|
|
|
301
|
+
# Per-member output-token budget passed to every API call. The CLI
|
|
302
|
+
# `--max-tokens` flag overrides this on a single invocation; the
|
|
303
|
+
# cost estimator uses the same value as its worst-case ceiling.
|
|
304
|
+
# `0` means "unlimited" — internally widened to the safe provider
|
|
305
|
+
# ceiling (16384) because Anthropic rejects max_tokens=0. Raise
|
|
306
|
+
# explicitly past 16384 only when a model genuinely supports more
|
|
307
|
+
# and you want longer answers.
|
|
308
|
+
max_output_tokens: 2048
|
|
309
|
+
|
|
301
310
|
# Hard cost ceiling per /council invocation. The orchestrator pauses
|
|
302
311
|
# before any member whose projected spend would breach a cap and asks
|
|
303
312
|
# the user to continue. `max_total_usd: 0` disables the USD ceiling
|
package/docs/architecture.md
CHANGED
|
@@ -141,7 +141,7 @@ note, package-internal path-swap, description budget, and the
|
|
|
141
141
|
|
|
142
142
|
| Layer | Count | Purpose |
|
|
143
143
|
|---|---|---|
|
|
144
|
-
| **Skills** |
|
|
144
|
+
| **Skills** | 210 | On-demand expertise — stack analysis (Laravel · Symfony · Zend / Laminas · Next.js · React · Node), testing, Docker, API design, security, observability, … |
|
|
145
145
|
| **Rules** | 61 | Always-active constraints — coding standards, scope control, verification, language-and-tone, agent-authority |
|
|
146
146
|
| **Commands** | 106 | Slash-command workflows — `/commit`, `/create-pr`, `/fix ci`, `/optimize skills`, `/feature plan`, `/work`, `/implement-ticket`, `/compress`, … |
|
|
147
147
|
| **Guidelines** | 72 | Reference material cited by skills — PHP patterns, Eloquent, Playwright, agent-infra, … |
|
|
@@ -73,7 +73,7 @@ stability: stable
|
|
|
73
73
|
§ Beta-review markers; `scripts/check_beta_review_markers.py` wired
|
|
74
74
|
into `task ci`; 39 beta contracts back-filled (P5.4).
|
|
75
75
|
- Test-redundancy audit produced
|
|
76
|
-
[`
|
|
76
|
+
[`step-5-test-cleanup.md`](../../agents/roadmaps/step-5-test-cleanup.md)
|
|
77
77
|
— audit-only, no deletions (P5.5).
|
|
78
78
|
|
|
79
79
|
### Release-trunk discipline (Phase 1)
|
|
@@ -119,7 +119,7 @@ keep-beta-until dates beyond the window.
|
|
|
119
119
|
- **Showcase capture** → future `road-to-showcase-capture.md` when a
|
|
120
120
|
hosted-LLM runner is on the table.
|
|
121
121
|
- **Test-suite deletion** →
|
|
122
|
-
[`
|
|
122
|
+
[`step-5-test-cleanup.md`](../../agents/roadmaps/step-5-test-cleanup.md)
|
|
123
123
|
(audit-only sibling spawned by P5.5; non-destructive by default).
|
|
124
124
|
- **Persona Block B** (Architect / Risk-Officer extension) —
|
|
125
125
|
anti-recommended per the sibling closure decision; not deferred,
|
package/package.json
CHANGED
|
@@ -53,6 +53,19 @@ def _resolve_key_path(filename: str) -> Path:
|
|
|
53
53
|
DEFAULT_ANTHROPIC_MODEL = "claude-sonnet-4-5"
|
|
54
54
|
DEFAULT_OPENAI_MODEL = "gpt-4o"
|
|
55
55
|
|
|
56
|
+
#: Per-call output budget when no caller-supplied value reaches `ask()`.
|
|
57
|
+
#: The CLI resolves the live default from `ai_council.max_output_tokens`
|
|
58
|
+
#: in `.agent-settings.yml`; this constant is only the abstract-base /
|
|
59
|
+
#: direct-API fallback when nothing else is wired up.
|
|
60
|
+
DEFAULT_MAX_TOKENS = 2048
|
|
61
|
+
|
|
62
|
+
#: Expansion target when the user sets `max_output_tokens: 0` ("unlimited")
|
|
63
|
+
#: in settings. Anthropic requires `max_tokens` to be a positive integer,
|
|
64
|
+
#: so 0 is widened to this safe ceiling before the SDK call. Big enough
|
|
65
|
+
#: for current frontier models (Sonnet/GPT-4o headroom ≥ 16k); raise
|
|
66
|
+
#: explicitly in settings if a larger budget is genuinely needed.
|
|
67
|
+
UNLIMITED_TOKENS_FALLBACK = 16384
|
|
68
|
+
|
|
56
69
|
# OpenAI reasoning models (o1, o3, o4 families) reject `max_tokens` and the
|
|
57
70
|
# `system` role; they require `max_completion_tokens` and accept only `user`
|
|
58
71
|
# (and `developer`) messages.
|
|
@@ -128,7 +141,7 @@ class ExternalAIClient(ABC):
|
|
|
128
141
|
self,
|
|
129
142
|
system_prompt: str,
|
|
130
143
|
user_prompt: str,
|
|
131
|
-
max_tokens: int =
|
|
144
|
+
max_tokens: int = DEFAULT_MAX_TOKENS,
|
|
132
145
|
) -> CouncilResponse:
|
|
133
146
|
"""Send one independent query. Must never raise on network/API
|
|
134
147
|
failure — return a `CouncilResponse` with `error` set instead.
|
|
@@ -162,7 +175,7 @@ class AnthropicClient(ExternalAIClient):
|
|
|
162
175
|
) from exc
|
|
163
176
|
self._client = anthropic.Anthropic(api_key=api_key)
|
|
164
177
|
|
|
165
|
-
def ask(self, system_prompt: str, user_prompt: str, max_tokens: int =
|
|
178
|
+
def ask(self, system_prompt: str, user_prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS) -> CouncilResponse:
|
|
166
179
|
t0 = time.monotonic()
|
|
167
180
|
try:
|
|
168
181
|
response = self._client.messages.create(
|
|
@@ -218,7 +231,7 @@ class OpenAIClient(ExternalAIClient):
|
|
|
218
231
|
) from exc
|
|
219
232
|
self._client = openai.OpenAI(api_key=api_key)
|
|
220
233
|
|
|
221
|
-
def ask(self, system_prompt: str, user_prompt: str, max_tokens: int =
|
|
234
|
+
def ask(self, system_prompt: str, user_prompt: str, max_tokens: int = DEFAULT_MAX_TOKENS) -> CouncilResponse:
|
|
222
235
|
t0 = time.monotonic()
|
|
223
236
|
kwargs: dict[str, object] = {"model": self.model}
|
|
224
237
|
if _is_reasoning_model(self.model):
|
|
@@ -316,7 +329,7 @@ class ManualClient(ExternalAIClient):
|
|
|
316
329
|
self,
|
|
317
330
|
system_prompt: str,
|
|
318
331
|
user_prompt: str,
|
|
319
|
-
max_tokens: int =
|
|
332
|
+
max_tokens: int = DEFAULT_MAX_TOKENS, # noqa: ARG002 — accepted for ABC parity
|
|
320
333
|
) -> CouncilResponse:
|
|
321
334
|
t0 = time.monotonic()
|
|
322
335
|
rounds: list[str] = []
|
|
@@ -27,7 +27,11 @@ from scripts.ai_council.budget_guard import (
|
|
|
27
27
|
today_spend_usd as _today_spend_usd,
|
|
28
28
|
would_exceed as _would_exceed_daily,
|
|
29
29
|
)
|
|
30
|
-
from scripts.ai_council.clients import
|
|
30
|
+
from scripts.ai_council.clients import (
|
|
31
|
+
DEFAULT_MAX_TOKENS,
|
|
32
|
+
CouncilResponse,
|
|
33
|
+
ExternalAIClient,
|
|
34
|
+
)
|
|
31
35
|
from scripts.ai_council.pricing import (
|
|
32
36
|
CostEstimate,
|
|
33
37
|
PriceTable,
|
|
@@ -51,7 +55,7 @@ class CostBudget:
|
|
|
51
55
|
class CouncilQuestion:
|
|
52
56
|
mode: str # one of: prompt, roadmap, diff, files
|
|
53
57
|
user_prompt: str # bundled artefact text
|
|
54
|
-
max_tokens: int =
|
|
58
|
+
max_tokens: int = DEFAULT_MAX_TOKENS
|
|
55
59
|
|
|
56
60
|
|
|
57
61
|
@dataclass
|
|
@@ -39,6 +39,17 @@ SKIP_DIRS = [
|
|
|
39
39
|
"agents/council-questions", # design Q&A trail — forward-refs to planned artifacts
|
|
40
40
|
"agents/analysis", # plate-comparison working docs — forward-refs to planned artifacts
|
|
41
41
|
]
|
|
42
|
+
|
|
43
|
+
# Per-file opt-out marker. When present in the first 10 lines of a .md
|
|
44
|
+
# file, the entire file is skipped. Use for working docs that
|
|
45
|
+
# intentionally reference planned-but-not-yet-existing artifacts
|
|
46
|
+
# (audit bundles, design Q&A, in-flight plans).
|
|
47
|
+
FILE_SKIP_MARKER = "<!-- check-refs: skip -->"
|
|
48
|
+
|
|
49
|
+
# Per-line opt-out marker. When present anywhere on a line, that line's
|
|
50
|
+
# refs are skipped. Use for isolated forward-refs inside otherwise
|
|
51
|
+
# fully-checked documents.
|
|
52
|
+
LINE_IGNORE_MARKER = "<!-- ref-ignore -->"
|
|
42
53
|
ROOT = Path(".")
|
|
43
54
|
|
|
44
55
|
# YAML memory files (engineering-memory layer) live under `agents/memory/`.
|
|
@@ -219,6 +230,14 @@ def check_file(filepath: Path, artifacts: dict[str, set[str]], root: Path) -> Li
|
|
|
219
230
|
except Exception:
|
|
220
231
|
return broken
|
|
221
232
|
|
|
233
|
+
# File-level opt-out: working docs that intentionally reference
|
|
234
|
+
# planned-but-not-yet-existing artifacts mark themselves with
|
|
235
|
+
# `<!-- check-refs: skip -->` in the first 10 lines. Marker pairs
|
|
236
|
+
# with the per-line `<!-- ref-ignore -->` below; either suffices.
|
|
237
|
+
header_lines = text.splitlines()[:10]
|
|
238
|
+
if any(FILE_SKIP_MARKER in line for line in header_lines):
|
|
239
|
+
return broken
|
|
240
|
+
|
|
222
241
|
# Validate `personas:` frontmatter entries against known persona ids.
|
|
223
242
|
for line_no, pid in _extract_personas_frontmatter(text):
|
|
224
243
|
if pid not in artifacts["personas"]:
|
|
@@ -241,6 +260,12 @@ def check_file(filepath: Path, artifacts: dict[str, set[str]], root: Path) -> Li
|
|
|
241
260
|
if in_code_block:
|
|
242
261
|
continue
|
|
243
262
|
|
|
263
|
+
# Per-line opt-out: isolated forward-refs in otherwise checked
|
|
264
|
+
# documents (e.g. one ref to a planned skill, surrounded by
|
|
265
|
+
# valid refs). Skip the whole line's path / skill / rule checks.
|
|
266
|
+
if LINE_IGNORE_MARKER in line:
|
|
267
|
+
continue
|
|
268
|
+
|
|
244
269
|
# Unchecked TODO checkboxes document future work — their refs are
|
|
245
270
|
# forward-looking and will not resolve yet. Track multi-line bullets:
|
|
246
271
|
# any `- [ ]` opens a TODO context; a new top-level bullet, heading,
|
package/scripts/council_cli.py
CHANGED
|
@@ -31,6 +31,7 @@ from scripts.ai_council.bundler import ( # noqa: E402
|
|
|
31
31
|
BundleTooLarge, bundle_prompt, bundle_roadmap,
|
|
32
32
|
)
|
|
33
33
|
from scripts.ai_council.clients import ( # noqa: E402
|
|
34
|
+
DEFAULT_MAX_TOKENS, UNLIMITED_TOKENS_FALLBACK,
|
|
34
35
|
AnthropicClient, CouncilResponse, ExternalAIClient, ManualClient,
|
|
35
36
|
OpenAIClient, load_anthropic_key, load_openai_key,
|
|
36
37
|
)
|
|
@@ -236,6 +237,32 @@ def _resolve_rounds(args: argparse.Namespace, ai_cfg: dict[str, Any]) -> int:
|
|
|
236
237
|
return min_rounds
|
|
237
238
|
|
|
238
239
|
|
|
240
|
+
def _resolve_max_tokens(args: argparse.Namespace, ai_cfg: dict[str, Any]) -> int:
    """Pick the per-member output-token budget for this invocation.

    Sources are consulted in priority order:

    1. ``args.max_tokens`` (the ``--max-tokens`` flag) when it is not ``None``.
    2. ``ai_cfg["max_output_tokens"]`` when the key is present; a falsy
       value (``None``, ``0``, empty) is coerced to ``0``.
    3. ``DEFAULT_MAX_TOKENS`` otherwise.

    Any resolved value ``<= 0`` is treated as "unlimited" and replaced by
    ``UNLIMITED_TOKENS_FALLBACK``, because Anthropic rejects
    ``max_tokens=0``. Callers use the returned value for both the SDK call
    and the cost estimate.
    """
    override = getattr(args, "max_tokens", None)
    if override is not None:
        budget = int(override)
    elif "max_output_tokens" not in ai_cfg:
        budget = DEFAULT_MAX_TOKENS
    else:
        # Key present: an empty/None settings value collapses to 0 ("unlimited").
        budget = int(ai_cfg["max_output_tokens"] or 0)
    return budget if budget > 0 else UNLIMITED_TOKENS_FALLBACK
|
|
264
|
+
|
|
265
|
+
|
|
239
266
|
def cmd_estimate(
|
|
240
267
|
args: argparse.Namespace,
|
|
241
268
|
*,
|
|
@@ -255,9 +282,10 @@ def cmd_estimate(
|
|
|
255
282
|
)
|
|
256
283
|
if table is None:
|
|
257
284
|
table = load_prices()
|
|
285
|
+
ai_cfg = (settings.get("ai_council") or {}) if isinstance(settings, dict) else {}
|
|
258
286
|
question, _ = build_question(
|
|
259
287
|
input_path=Path(args.question), input_mode=args.input_mode,
|
|
260
|
-
max_tokens=args
|
|
288
|
+
max_tokens=_resolve_max_tokens(args, ai_cfg),
|
|
261
289
|
)
|
|
262
290
|
project = detect_project_context(REPO_ROOT)
|
|
263
291
|
billable = [m for m in members if getattr(m, "billable", True)]
|
|
@@ -316,9 +344,10 @@ def cmd_run(
|
|
|
316
344
|
)
|
|
317
345
|
if table is None:
|
|
318
346
|
table = load_prices()
|
|
347
|
+
ai_cfg = (settings.get("ai_council") or {}) if isinstance(settings, dict) else {}
|
|
319
348
|
question, artefact = build_question(
|
|
320
349
|
input_path=Path(args.question), input_mode=args.input_mode,
|
|
321
|
-
max_tokens=args
|
|
350
|
+
max_tokens=_resolve_max_tokens(args, ai_cfg),
|
|
322
351
|
)
|
|
323
352
|
project = detect_project_context(REPO_ROOT)
|
|
324
353
|
billable = [m for m in members if getattr(m, "billable", True)]
|
|
@@ -337,7 +366,6 @@ def cmd_run(
|
|
|
337
366
|
)
|
|
338
367
|
return 0
|
|
339
368
|
|
|
340
|
-
ai_cfg = settings.get("ai_council") or {}
|
|
341
369
|
cost_cfg = ai_cfg.get("cost_budget") or {}
|
|
342
370
|
budget = CostBudget(
|
|
343
371
|
max_input_tokens=int(cost_cfg.get("max_input_tokens", 50_000)),
|
|
@@ -451,8 +479,11 @@ def _add_common_input_args(p: argparse.ArgumentParser) -> None:
|
|
|
451
479
|
p.add_argument("--input-mode", choices=["prompt", "roadmap"],
|
|
452
480
|
default="prompt",
|
|
453
481
|
help="How to bundle the file (default: prompt).")
|
|
454
|
-
p.add_argument("--max-tokens", type=int, default=
|
|
455
|
-
help="Per-member output budget
|
|
482
|
+
p.add_argument("--max-tokens", type=int, default=None,
|
|
483
|
+
help="Per-member output budget. Default reads "
|
|
484
|
+
"ai_council.max_output_tokens from .agent-settings.yml "
|
|
485
|
+
"(2048 if unset). 0 = unlimited (widened to the safe "
|
|
486
|
+
"provider ceiling before the SDK call).")
|
|
456
487
|
p.add_argument("--mode-override", choices=["api", "manual"], default=None,
|
|
457
488
|
help="Override every member's transport mode.")
|
|
458
489
|
p.add_argument("--model", action="append", default=None, dest="model",
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Quantitative skill-eval orchestrator (skill-writing § 7).
|
|
3
|
+
|
|
4
|
+
Scaffolds, aggregates, and reports sub-agent eval runs for a skill.
|
|
5
|
+
|
|
6
|
+
Sub-agent SPAWNING is per-environment (Claude Code, Augment Code,
|
|
7
|
+
council) and is left as a stub `_spawn_subagent(...)` that authors
|
|
8
|
+
implement once for their environment. The rest of the loop —
|
|
9
|
+
scaffold / aggregate / report — works out of the box and reads /
|
|
10
|
+
writes JSON files in `runs/`.
|
|
11
|
+
|
|
12
|
+
Layout per skill:
|
|
13
|
+
|
|
14
|
+
.agent-src.uncompressed/skills/{name}/evals/
|
|
15
|
+
evals.json
|
|
16
|
+
runs/ # gitignored
|
|
17
|
+
{timestamp}-baseline/{scenario_id}/output.txt
|
|
18
|
+
{timestamp}-baseline/{scenario_id}/grade.json
|
|
19
|
+
{timestamp}-with-skill/{scenario_id}/output.txt
|
|
20
|
+
{timestamp}-with-skill/{scenario_id}/grade.json
|
|
21
|
+
{timestamp}-benchmark.json
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
import argparse
|
|
27
|
+
import json
|
|
28
|
+
import sys
|
|
29
|
+
from datetime import datetime, timezone
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
from typing import Any
|
|
32
|
+
|
|
33
|
+
REPO_ROOT = Path(__file__).resolve().parent.parent
|
|
34
|
+
SKILLS_ROOT = REPO_ROOT / ".agent-src.uncompressed" / "skills"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _skill_dir(skill: str) -> Path:
    """Return the directory of *skill* under ``SKILLS_ROOT``.

    Terminates the process via ``sys.exit`` with an ``error:`` message when
    that directory does not exist.
    """
    candidate = SKILLS_ROOT / skill
    if candidate.is_dir():
        return candidate
    sys.exit(f"error: skill {skill!r} not found at {candidate}")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _evals_dir(skill: str) -> Path:
    """Return the ``evals`` sub-directory path of *skill* (existence of the
    sub-directory itself is not checked; the skill directory is)."""
    return _skill_dir(skill).joinpath("evals")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _load_evals(skill: str) -> dict[str, Any]:
    """Load and parse ``evals/evals.json`` for *skill*.

    Returns the parsed JSON object. Terminates the process via ``sys.exit``
    with an ``error:`` message when the file is missing, cannot be read or
    decoded, is not valid JSON, or its root is not an object — callers use
    ``.get(...)`` on the result, so a non-object root must not leak through.
    """
    spec_file = _evals_dir(skill) / "evals.json"
    if not spec_file.exists():
        sys.exit(f"error: {spec_file} not found — create it before scaffolding")
    try:
        spec = json.loads(spec_file.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        sys.exit(f"error: {spec_file} could not be parsed: {exc}")
    if not isinstance(spec, dict):
        sys.exit(f"error: {spec_file} root must be an object")
    return spec
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _timestamp() -> str:
|
|
56
|
+
return datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _spawn_subagent(prompt: str, *, load_skill: str | None) -> dict[str, Any]:
|
|
60
|
+
"""STUB — implement per environment.
|
|
61
|
+
|
|
62
|
+
Must return {"output": str, "elapsed_s": float, "tokens_in": int,
|
|
63
|
+
"tokens_out": int}. When load_skill is None, run baseline; when
|
|
64
|
+
set, load that skill into the sub-agent's context.
|
|
65
|
+
"""
|
|
66
|
+
raise NotImplementedError(
|
|
67
|
+
"implement _spawn_subagent for this environment (Claude Code, "
|
|
68
|
+
"Augment, council, ...) — see docstring contract"
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _grade_assertions(output: str, run_dir: Path, assertions: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
|
73
|
+
results: list[dict[str, Any]] = []
|
|
74
|
+
for a in assertions:
|
|
75
|
+
kind = a.get("kind")
|
|
76
|
+
if kind == "contains":
|
|
77
|
+
ok = a["value"] in output
|
|
78
|
+
results.append({"kind": kind, "value": a["value"], "pass": ok})
|
|
79
|
+
elif kind == "file_exists":
|
|
80
|
+
ok = (run_dir / a["path"]).exists() or Path(a["path"]).exists()
|
|
81
|
+
results.append({"kind": kind, "path": a["path"], "pass": ok})
|
|
82
|
+
elif kind == "rubric":
|
|
83
|
+
results.append({"kind": kind, "criterion": a["criterion"], "pass": None,
|
|
84
|
+
"note": "rubric grading requires sub-agent — fill in manually or via grader"})
|
|
85
|
+
else:
|
|
86
|
+
results.append({"kind": kind, "pass": False, "note": f"unknown assertion kind {kind!r}"})
|
|
87
|
+
return results
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def cmd_scaffold(skill: str) -> int:
    """Create the run directory skeleton for a fresh eval run of *skill*.

    For each arm (``baseline``, ``with-skill``) and each scenario in
    ``evals.json`` a directory ``runs/{timestamp}-{arm}/{scenario_id}/`` is
    created and a ``meta.json`` describing the scenario is written into it.
    Exits with an error when ``evals.json`` declares no scenarios. Prints the
    run timestamp (needed by ``aggregate`` / ``report``) and returns ``0``.
    """
    scenarios = _load_evals(skill).get("scenarios", [])
    if not scenarios:
        sys.exit("error: evals.json has no scenarios")
    stamp = _timestamp()
    runs_root = _evals_dir(skill) / "runs"
    for arm in ("baseline", "with-skill"):
        arm_root = runs_root / f"{stamp}-{arm}"
        for scenario in scenarios:
            scenario_dir = arm_root / scenario["id"]
            scenario_dir.mkdir(parents=True, exist_ok=True)
            meta = {
                "skill": skill,
                "arm": arm,
                "scenario_id": scenario["id"],
                "prompt": scenario["prompt"],
                "assertions": scenario.get("assertions", []),
                "timestamp": stamp,
            }
            meta_file = scenario_dir / "meta.json"
            meta_file.write_text(json.dumps(meta, indent=2) + "\n", encoding="utf-8")
    print(f"scaffolded {len(scenarios)} scenarios × 2 arms at runs/{stamp}-{{baseline,with-skill}}/")
    print(f"timestamp: {stamp}")
    return 0
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def cmd_aggregate(skill: str, run: str) -> int:
    """Fold the per-scenario ``grade.json`` files of *run* into a benchmark.

    For every scenario in ``evals.json`` and both arms (``baseline``,
    ``with-skill``) the file ``runs/{run}-{arm}/{scenario_id}/grade.json``
    is read. Each arm is recorded as:

    * ``missing`` — no ``grade.json`` present;
    * ``unreadable`` — the file could not be read/parsed or is not a JSON
      object (the reason is kept in ``note``);
    * ``graded`` — with pass count, total, and the timing / token fields.

    An arm counts towards the totals only when it has at least one result
    and every result has ``pass`` exactly ``True``. The benchmark is written
    to ``runs/{run}-benchmark.json`` and a short summary is printed.

    Exits with an error when neither arm directory of *run* exists (wrong
    timestamp, or ``scaffold`` was never run) rather than failing later on
    the write or emitting a benchmark with every arm missing.
    """
    runs = _evals_dir(skill) / "runs"
    spec = _load_evals(skill)
    # Arm name -> totals counter bumped when that arm fully passes a scenario.
    arm_counters = {"baseline": "baseline_pass", "with-skill": "with_skill_pass"}
    if not any((runs / f"{run}-{arm}").is_dir() for arm in arm_counters):
        sys.exit(f"error: no run directories for {run!r} under {runs} — run scaffold first")

    bench: dict[str, Any] = {"skill": skill, "run": run, "generated_at": _timestamp(), "scenarios": []}
    totals = {"baseline_pass": 0, "with_skill_pass": 0, "scenarios": 0}
    for sc in spec.get("scenarios", []):
        row: dict[str, Any] = {"id": sc["id"], "arms": {}}
        for arm, counter in arm_counters.items():
            grade_f = runs / f"{run}-{arm}" / sc["id"] / "grade.json"
            if not grade_f.exists():
                row["arms"][arm] = {"status": "missing", "pass_count": 0, "total": 0}
                continue
            try:
                g = json.loads(grade_f.read_text(encoding="utf-8"))
                if not isinstance(g, dict):
                    raise ValueError("grade.json root must be an object")
            except (OSError, ValueError) as exc:
                # One broken grade file must not abort the whole aggregate.
                row["arms"][arm] = {"status": "unreadable", "pass_count": 0, "total": 0,
                                    "note": str(exc)}
                continue
            results = g.get("results") or []
            passed = sum(1 for r in results if isinstance(r, dict) and r.get("pass") is True)
            row["arms"][arm] = {"status": "graded", "pass_count": passed, "total": len(results),
                                "elapsed_s": g.get("elapsed_s"), "tokens_in": g.get("tokens_in"),
                                "tokens_out": g.get("tokens_out")}
            if results and passed == len(results):
                totals[counter] += 1
        bench["scenarios"].append(row)
        totals["scenarios"] += 1
    bench["totals"] = totals
    out = runs / f"{run}-benchmark.json"
    out.write_text(json.dumps(bench, indent=2) + "\n", encoding="utf-8")
    print(f"wrote {out.relative_to(REPO_ROOT)}")
    print(f"  baseline pass:   {totals['baseline_pass']}/{totals['scenarios']}")
    print(f"  with-skill pass: {totals['with_skill_pass']}/{totals['scenarios']}")
    return 0
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def cmd_report(skill: str, run: str) -> int:
    """Print a Markdown report for the benchmark file of *run*.

    Reads ``runs/{run}-benchmark.json`` (exits with an error when it is
    absent) and prints one table row per scenario with the pass ratio of
    each arm and the with-skill minus baseline deltas for ``tokens_out`` and
    ``elapsed_s``. Absent or null metrics count as ``0``. Returns ``0``.
    """
    bench_path = _evals_dir(skill) / "runs" / f"{run}-benchmark.json"
    if not bench_path.exists():
        sys.exit(f"error: {bench_path} not found — run aggregate first")
    bench = json.loads(bench_path.read_text(encoding="utf-8"))

    def _ratio(arm_row: dict[str, Any]) -> str:
        return f"{arm_row.get('pass_count', 0)}/{arm_row.get('total', 0)}"

    def _delta(metric: str, with_row: dict[str, Any], base_row: dict[str, Any]) -> Any:
        return (with_row.get(metric) or 0) - (base_row.get(metric) or 0)

    print(f"# Skill eval report — {skill} @ {run}\n")
    print("| Scenario | Baseline | With skill | Δ tokens_out | Δ elapsed_s |")
    print("|---|---|---|---|---|")
    for scenario in bench["scenarios"]:
        arms = scenario["arms"]
        base_row = arms.get("baseline", {})
        with_row = arms.get("with-skill", {})
        base_ratio = _ratio(base_row)
        with_ratio = _ratio(with_row)
        token_delta = _delta("tokens_out", with_row, base_row)
        elapsed_delta = _delta("elapsed_s", with_row, base_row)
        print(f"| {scenario['id']} | {base_ratio} | {with_ratio} | {token_delta:+d} | {elapsed_delta:+.2f} |")
    totals = bench["totals"]
    print(f"\n**Totals:** baseline {totals['baseline_pass']}/{totals['scenarios']} · with-skill {totals['with_skill_pass']}/{totals['scenarios']}")
    return 0
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def main() -> int:
    """Parse the command line and dispatch to the chosen sub-command.

    Sub-commands: ``scaffold <skill>``, ``aggregate <skill> --run TS`` and
    ``report <skill> --run TS``. Returns the sub-command's exit status.
    """
    # `__doc__` is None when docstrings are stripped (python -OO); fall back
    # to no description instead of crashing on `None.splitlines()`.
    summary = (__doc__ or "").strip().partition("\n")[0] or None
    p = argparse.ArgumentParser(description=summary)
    sub = p.add_subparsers(dest="cmd", required=True)
    for name in ("scaffold", "aggregate", "report"):
        sp = sub.add_parser(name)
        sp.add_argument("skill")
        if name != "scaffold":
            sp.add_argument("--run", required=True, help="run timestamp (from scaffold output)")
    args = p.parse_args()
    if args.cmd == "scaffold":
        return cmd_scaffold(args.skill)
    if args.cmd == "aggregate":
        return cmd_aggregate(args.skill, args.run)
    if args.cmd == "report":
        return cmd_report(args.skill, args.run)
    # Not reachable while the sub-command is required; kept as a safe default.
    return 1
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
if __name__ == "__main__":
|
|
185
|
+
sys.exit(main())
|
|
@@ -47,6 +47,10 @@
|
|
|
47
47
|
"enum": ["senior"],
|
|
48
48
|
"description": "Optional tier marker. `senior` opts the skill into the Senior-Tier Required Structure check (Context-First lead, Related Skills, Proactive Triggers, Output Artifacts) per .agent-src.uncompressed/rules/skill-quality.md."
|
|
49
49
|
},
|
|
50
|
+
"meta_skill": {
|
|
51
|
+
"type": "boolean",
|
|
52
|
+
"description": "Opt-out of the linter's `skill_too_large` warn for skills whose purpose IS breadth (skill-writing, agent-docs-writing, skill-reviewer). Meta-skills inherently bundle multiple procedures and inline examples. Use sparingly — every meta_skill: true is a load-on-context trade-off."
|
|
53
|
+
},
|
|
50
54
|
"external_source": {
|
|
51
55
|
"type": "string",
|
|
52
56
|
"format": "uri",
|
package/scripts/skill_linter.py
CHANGED
|
@@ -775,8 +775,14 @@ def lint_skill(path: Path, text: str) -> LintResult:
|
|
|
775
775
|
# is *both* large AND prose-dominant OR ships ≥ 2 independently invocable
|
|
776
776
|
# procedures. Reference catalogues (quality-tools 411 L / density 0.83)
|
|
777
777
|
# pass; multi-procedure skills are flagged for split.
|
|
778
|
+
#
|
|
779
|
+
# Frontmatter opt-out: `meta_skill: true` exempts a skill from the size
|
|
780
|
+
# warn when the skill's purpose *is* breadth (skill-writing, agent-docs-
|
|
781
|
+
# writing, skill-reviewer, etc.). Meta-skills inherently bundle multiple
|
|
782
|
+
# procedures and inline examples.
|
|
778
783
|
total_lines = len(text.splitlines())
|
|
779
|
-
|
|
784
|
+
is_meta_skill = bool(fm) and re.search(r"^meta_skill:\s*true\s*$", fm, re.MULTILINE)
|
|
785
|
+
if total_lines > 400 and not is_meta_skill:
|
|
780
786
|
density = _density_score(text)
|
|
781
787
|
procedures = _count_procedure_sections(text)
|
|
782
788
|
if density < 0.6 or procedures >= 2:
|
|
@@ -832,6 +838,12 @@ def lint_skill(path: Path, text: str) -> LintResult:
|
|
|
832
838
|
f"{meaningful_steps} steps) — may lack its own executable workflow"))
|
|
833
839
|
suggestions.append("Expand the skill so it remains executable without opening a guideline")
|
|
834
840
|
|
|
841
|
+
# --- evals.json schema validator ---
|
|
842
|
+
# When a skill ships sibling `evals/evals.json` (quantitative behavior
|
|
843
|
+
# eval per skill-writing § 7), validate its shape. Triggers.json is a
|
|
844
|
+
# separate concern handled elsewhere. All issues here are WARN.
|
|
845
|
+
issues.extend(validate_evals_json(path))
|
|
846
|
+
|
|
835
847
|
return LintResult(
|
|
836
848
|
file=str(path),
|
|
837
849
|
artifact_type="skill",
|
|
@@ -841,6 +853,64 @@ def lint_skill(path: Path, text: str) -> LintResult:
|
|
|
841
853
|
)
|
|
842
854
|
|
|
843
855
|
|
|
856
|
+
def validate_evals_json(skill_path: Path) -> list[Issue]:
    """Validate the ``evals/evals.json`` that sits next to *skill_path*.

    Checks the shape described in ``skill-writing`` § 7: an object root with
    a string ``skill`` and a non-empty ``scenarios`` array; each scenario an
    object with non-blank string ``id`` and ``prompt`` and a non-empty
    ``assertions`` array; each assertion an object whose ``kind`` is one of
    ``contains`` / ``file_exists`` / ``rubric`` and which carries the
    matching string field (``value`` / ``path`` / ``criterion``).

    Returns warning-level issues only and returns ``[]`` when the file is
    absent. Unreadable, undecodable or malformed content is reported as a
    warning rather than raised.
    """
    evals_path = skill_path.parent / "evals" / "evals.json"
    if not evals_path.is_file():
        return []
    issues: list[Issue] = []
    try:
        data = json.loads(evals_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError
        # (a file that is not valid UTF-8).
        return [Issue("warning", "evals_json_unreadable",
                      f"evals/evals.json could not be parsed: {exc}")]
    if not isinstance(data, dict):
        return [Issue("warning", "evals_json_shape",
                      "evals/evals.json root must be an object")]
    if "skill" not in data or not isinstance(data["skill"], str):
        issues.append(Issue("warning", "evals_json_missing_skill",
                            "evals/evals.json must declare top-level 'skill' (string)"))
    scenarios = data.get("scenarios")
    if not isinstance(scenarios, list) or len(scenarios) < 1:
        issues.append(Issue("warning", "evals_json_no_scenarios",
                            "evals/evals.json must declare 'scenarios' (non-empty array)"))
        return issues
    # Assertion kind -> string field that kind requires.
    required_fields = {"contains": "value", "file_exists": "path", "rubric": "criterion"}
    for idx, scenario in enumerate(scenarios):
        loc = f"scenarios[{idx}]"
        if not isinstance(scenario, dict):
            issues.append(Issue("warning", "evals_json_scenario_shape",
                                f"{loc} must be an object"))
            continue
        for key in ("id", "prompt"):
            if key not in scenario or not isinstance(scenario[key], str) or not scenario[key].strip():
                issues.append(Issue("warning", "evals_json_scenario_missing_field",
                                    f"{loc} missing required string field '{key}'"))
        assertions = scenario.get("assertions")
        if not isinstance(assertions, list) or len(assertions) < 1:
            issues.append(Issue("warning", "evals_json_scenario_no_assertions",
                                f"{loc}.assertions must be a non-empty array"))
            continue
        for a_idx, assertion in enumerate(assertions):
            a_loc = f"{loc}.assertions[{a_idx}]"
            if not isinstance(assertion, dict):
                issues.append(Issue("warning", "evals_json_assertion_shape",
                                    f"{a_loc} must be an object"))
                continue
            kind = assertion.get("kind")
            # Type-check first: a JSON list/object `kind` is unhashable and
            # would raise TypeError on the membership test.
            if not isinstance(kind, str) or kind not in required_fields:
                issues.append(Issue("warning", "evals_json_assertion_kind",
                                    f"{a_loc}.kind must be one of {sorted(required_fields)}, got {kind!r}"))
                continue
            required_field = required_fields[kind]
            if required_field not in assertion or not isinstance(assertion[required_field], str):
                issues.append(Issue("warning", "evals_json_assertion_missing_field",
                                    f"{a_loc} (kind={kind}) missing required string field '{required_field}'"))
    return issues
|
|
912
|
+
|
|
913
|
+
|
|
844
914
|
def extract_frontmatter(text: str) -> Optional[str]:
|
|
845
915
|
match = FRONTMATTER_PATTERN.search(text)
|
|
846
916
|
return match.group(1) if match else None
|