npm - @intentsolutionsio/skill-creator - Versions diffs - 5.0.0 → 5.0.3 - Mend

@intentsolutionsio/skill-creator 5.0.0 → 5.0.3

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@intentsolutionsio/skill-creator",
-  "version": "5.0.0",
+  "version": "5.0.3",
   "description": "Create and validate production-grade agent skills with 100-point marketplace grading",
   "keywords": [
     "skill-creation",

package/scripts/validate-skill.py CHANGED Viewed

@@ -32,19 +32,41 @@ except ImportError:
 # === CONSTANTS ===
 VALID_TOOLS = {
-    "Read", "Write", "Edit", "Bash", "Glob", "Grep",
-    "WebFetch", "WebSearch", "Task", "NotebookEdit",
-    "AskUserQuestion", "Skill",
+    "Read",
+    "Write",
+    "Edit",
+    "Bash",
+    "Glob",
+    "Grep",
+    "WebFetch",
+    "WebSearch",
+    "Task",
+    "NotebookEdit",
+    "AskUserQuestion",
+    "Skill",
 }
 KNOWN_FRONTMATTER_FIELDS = {
     # AgentSkills.io spec
-    "name", "description", "license", "compatibility", "metadata", "allowed-tools",
+    "name",
+    "description",
+    "license",
+    "compatibility",
+    "metadata",
+    "allowed-tools",
     # Top-level identity fields (marketplace standard)
-    "version", "author", "compatible-with", "tags",
+    "version",
+    "author",
+    "compatible-with",
+    "tags",
     # Claude Code extensions
-    "argument-hint", "disable-model-invocation", "user-invocable", "model",
-    "context", "agent", "hooks",
+    "argument-hint",
+    "disable-model-invocation",
+    "user-invocable",
+    "model",
+    "context",
+    "agent",
+    "hooks",
 }
 DEPRECATED_FIELDS = {
@@ -53,7 +75,13 @@ DEPRECATED_FIELDS = {
 }
 VALID_PLATFORMS = {
-    "claude-code", "codex", "openclaw", "aider", "continue", "cursor", "windsurf",
+    "claude-code",
+    "codex",
+    "openclaw",
+    "aider",
+    "continue",
+    "cursor",
+    "windsurf",
 }
 RE_FRONTMATTER = re.compile(r"^---\s*\n(.*?)\n---\s*\n(.*)$", re.DOTALL)
@@ -69,7 +97,10 @@ RE_XML_TAG = re.compile(r"[<>]")
 RE_TIME_SENSITIVE = [
     re.compile(r"\b(20\d{2}[-/]\d{2}[-/]\d{2})\b"),  # dates like 2025-01-01
     re.compile(r"\b(v\d+\.\d+\.\d+)\b", re.IGNORECASE),  # version numbers like v1.2.3
-    re.compile(r"\b(as of|since|after|before) (January|February|March|April|May|June|July|August|September|October|November|December)\b", re.IGNORECASE),
+    re.compile(
+        r"\b(as of|since|after|before) (January|February|March|April|May|June|July|August|September|October|November|December)\b",
+        re.IGNORECASE,
+    ),
 ]
 ABSOLUTE_PATH_PATTERNS = [
@@ -285,9 +316,7 @@ def validate_optional_fields(fm: dict) -> Tuple[List[str], List[str]]:
     if isinstance(metadata, dict):
         for field in ("author", "version", "license", "tags"):
             if field in metadata and field not in fm:
-                warnings.append(
-                    f"'{field}' found in metadata block - move to top-level for marketplace scoring"
-                )
+                warnings.append(f"'{field}' found in metadata block - move to top-level for marketplace scoring")
     # Unknown fields
     all_known = KNOWN_FRONTMATTER_FIELDS | DEPRECATED_FIELDS.keys()
@@ -369,10 +398,7 @@ def validate_body(body: str, path: Path, enterprise: bool) -> Tuple[List[str], L
             )
             if instr_match:
                 instr = instr_match.group(1)
-                has_steps = (
-                    re.search(r"(?m)^\s*\d+\.\s+", instr)
-                    or re.search(r"(?mi)^\s*#{2,6}\s*step\s*\d+", instr)
-                )
+                has_steps = re.search(r"(?m)^\s*\d+\.\s+", instr) or re.search(r"(?mi)^\s*#{2,6}\s*step\s*\d+", instr)
                 if not has_steps:
                     warnings.append("Instructions should have numbered steps or ### Step N headings")
@@ -386,8 +412,7 @@ def validate_body(body: str, path: Path, enterprise: bool) -> Tuple[List[str], L
         refs_dir = skill_dir / "references"
         if line_count > 300 and not refs_dir.exists():
             warnings.append(
-                f"SKILL.md is {line_count} lines with no references/ directory - "
-                "consider splitting heavy content"
+                f"SKILL.md is {line_count} lines with no references/ directory - consider splitting heavy content"
             )
     if "{baseDir}/../" in body:
@@ -1003,7 +1028,7 @@ def print_result(result: Dict[str, Any], path: Path) -> None:
             print(f"  - {i}")
         print()
-    print(f"Stats:")
+    print("Stats:")
     print(f"  Words: {stats['word_count']}")
     print(f"  Lines: {stats['line_count']}")
     print(f"  Tokens (est.): {stats['token_estimate']}")
@@ -1056,9 +1081,7 @@ def print_grade(grade_result: Dict[str, Any], path: Path) -> None:
 def main():
-    parser = argparse.ArgumentParser(
-        description="Validate SKILL.md files (Standard tier by default)"
-    )
+    parser = argparse.ArgumentParser(description="Validate SKILL.md files (Standard tier by default)")
     parser.add_argument("path", help="Path to SKILL.md file")
     parser.add_argument(
         "--standard",

package/skills/agent-creator/SKILL.md CHANGED Viewed

@@ -1,20 +1,33 @@
 ---
 name: agent-creator
-description: |
-  Create production-grade agent .md files aligned with the Anthropic 2026 spec (16-field schema).
-  Also validates existing agents against the marketplace compliance rules. Use when building custom
-  subagents, reviewing agent quality, or creating parallel agent architectures for orchestrator skills.
-  Trigger with "/agent-creator", "create an agent", "build a subagent", or "validate my agent".
-  Make sure to use this skill whenever creating agents/*.md files for plugins or standalone use.
-allowed-tools: "Read,Write,Edit,Glob,Grep,Bash(python:*),AskUserQuestion"
+description: 'Create production-grade agent .md files aligned with the Anthropic 2026
+  spec (16-field schema).
+  Also validates existing agents against the marketplace compliance rules. Use when
+  building custom
+  subagents, reviewing agent quality, or creating parallel agent architectures for
+  orchestrator skills.
+  Trigger with "/agent-creator", "create an agent", "build a subagent", or "validate
+  my agent".
+  Make sure to use this skill whenever creating agents/*.md files for plugins or standalone
+  use.
+  '
+allowed-tools: Read,Write,Edit,Glob,Grep,Bash(python:*),AskUserQuestion
 version: 1.0.0
 author: Jeremy Longshore <jeremy@intentsolutions.io>
 license: MIT
-compatible-with: claude-code, codex, openclaw
-tags: [agent-creation, validation, meta-tooling, subagents]
+tags:
+- agent-creation
+- validation
+- meta-tooling
+- subagents
 model: inherit
+compatibility: Designed for Claude Code, also compatible with Codex and OpenClaw
 ---
 # Agent Creator
 Creates spec-compliant agent .md files following the Anthropic 2026 16-field schema. Supports
@@ -45,6 +58,7 @@ that drives the subagent — it does NOT receive the full Claude Code system pro
 ### Mode Detection
 Determine user intent from their prompt:
 - **Create mode**: "create an agent", "build a subagent", "new agent" -> Step 1
 - **Validate mode**: "validate agent", "check agent", "grade agent" -> Validation Workflow
@@ -53,21 +67,25 @@ Determine user intent from their prompt:
 Ask the user with AskUserQuestion:
 **Agent Identity:**
 - Name (kebab-case, 1-64 chars, e.g., `risk-assessor`, `clause-analyzer`)
 - Specialty description (20-200 chars — shown in agent selection UI)
 **Execution Context:**
 - Plugin agent (`plugins/*/agents/`) or standalone (`~/.claude/agents/`)?
 - Will it be spawned by an orchestrator skill via `Task` tool?
 - Does it need to preload specific skills? (`skills: [skill-name]`)
 **Behavioral Controls:**
 - Model override? (`sonnet` for speed, `opus` for quality, `inherit` for default)
 - Reasoning effort? (`low` for simple, `medium` default, `high` for complex analysis)
 - Max iterations? (`maxTurns` — how many tool-use loops before stopping)
 - Tools to deny? (`disallowedTools` — denylist approach, opposite of skills)
 **Plugin Restrictions (if plugin agent):**
 - `hooks` — NOT supported in plugin agents (use plugin-level hooks)
 - `mcpServers` — NOT supported in plugin agents
 - `permissionMode` — standalone only, NOT plugin agents
@@ -78,6 +96,7 @@ Before writing, determine:
 **Agent Role Clarity:**
 The agent body must make three things unambiguous:
 1. **What it IS responsible for** — its specific domain/methodology
 2. **What it is NOT responsible for** — boundaries with other agents
 3. **How it communicates results** — output format and structure
@@ -99,6 +118,7 @@ All production agents should follow this body structure:
 | `## Examples` | Concrete interaction examples | For complex agents |
 **Output Structure Decision:**
 - If the agent feeds into an orchestrator: use **JSON output** (machine-parseable)
 - If the agent is user-facing: use **markdown output** (human-readable)
 - If the agent produces both: JSON primary with markdown summary
@@ -113,12 +133,14 @@ Generate the agent .md using the template from
 See [Anthropic Agent Spec](references/anthropic-agent-spec.md) for the full official reference.
 Required fields:
 ```yaml
 name: {agent-name}         # Lowercase letters and hyphens, unique identifier
 description: "{specialty}"  # When Claude should delegate to this subagent
 ```
 Optional fields (include only what's needed):
 ```yaml
 tools: "Read, Glob, Grep"  # Allowlist — inherits all tools if omitted
 disallowedTools: "Write"   # Denylist — removed from inherited/specified list
@@ -137,12 +159,14 @@ mcpServers: {}             # Standalone only, NOT plugin agents
 ```
 **Tool access:**
 - `tools` = allowlist (like skills' `allowed-tools`)
 - `disallowedTools` = denylist (remove specific tools)
 - If both set: disallowed applied first, then tools resolved
 - If neither set: inherits all tools from parent conversation
 **Invalid fields (ERROR — never use these):**
 - `capabilities` — looks valid but flagged by validator
 - `expertise_level` — invented, not in Anthropic spec
 - `activation_priority` — invented, not in Anthropic spec
@@ -191,6 +215,7 @@ Run validation against the Anthropic 16-field schema:
 | Body under 300 lines | Offload to references if longer (prevents context bloat) |
 **Automated validation:**
 ```bash
 python3 ${CLAUDE_SKILL_DIR}/../skill-creator/scripts/validate-skill.py --agents-only {plugin-dir}/
 ```
@@ -209,6 +234,7 @@ Test the agent by spawning it via the `Task` tool or the `Agent` tool:
 ### Step 6: Report
 Provide a summary:
 - Agent name and file path
 - Frontmatter field count (of 14 possible)
 - Body line count
@@ -299,7 +325,7 @@ actionable).
 ## Resources
 - [Anthropic Agent Spec](references/anthropic-agent-spec.md) — Official 16-field schema from code.claude.com/docs/en/sub-agents
-- [Agent template](${CLAUDE_SKILL_DIR}/../skill-creator/templates/agent-template.md) — Skeleton with placeholders
-- [Frontmatter spec](${CLAUDE_SKILL_DIR}/../skill-creator/references/frontmatter-spec.md) — Field reference (internal)
-- [Source of truth](${CLAUDE_SKILL_DIR}/../skill-creator/references/source-of-truth.md) — Canonical spec
-- [Validation rules](${CLAUDE_SKILL_DIR}/../skill-creator/references/validation-rules.md) — Agent validation section
+- Agent template — Skeleton with placeholders
+- Frontmatter spec — Field reference (internal)
+- Source of truth — Canonical spec
+- Validation rules — Agent validation section

package/skills/agent-creator/references/anthropic-agent-spec.md CHANGED Viewed

@@ -39,6 +39,7 @@ Total: 16 official fields.
 ## Plugin Agent Restrictions
 Plugin agents (`plugins/*/agents/*.md`) do NOT support:
 - `hooks` — ignored when loading from plugin
 - `mcpServers` — ignored when loading from plugin
 - `permissionMode` — ignored when loading from plugin

package/skills/skill-creator/SKILL.md CHANGED Viewed

@@ -1,20 +1,31 @@
 ---
 name: skill-creator
-description: |
-  Create production-grade agent skills aligned with the 2026 AgentSkills.io spec and Anthropic
-  best practices (2026). Also validates existing skills against the Intent Solutions 100-point rubric.
+description: 'Create production-grade agent skills aligned with the 2026 AgentSkills.io
+  spec and Anthropic
+  best practices (2026). Also validates existing skills against the Intent Solutions
+  100-point rubric.
   Use when building, testing, validating, or optimizing Claude Code skills.
-  Trigger with "/skill-creator", "create a skill", "validate my skill", or "check skill quality".
-  Make sure to use this skill whenever creating a new skill, slash command, or agent capability.
-allowed-tools: "Read,Write,Edit,Glob,Grep,Bash(mkdir:*),Bash(chmod:*),Bash(python:*),Bash(claude:*),Task,AskUserQuestion"
+  Trigger with "/skill-creator", "create a skill", "validate my skill", or "check
+  skill quality".
+  Make sure to use this skill whenever creating a new skill, slash command, or agent
+  capability.
+  '
+allowed-tools: Read,Write,Edit,Glob,Grep,Bash(mkdir:*),Bash(chmod:*),Bash(python:*),Bash(claude:*),Task,AskUserQuestion
 version: 5.1.0
 author: Jeremy Longshore <jeremy@intentsolutions.io>
 license: MIT
-compatible-with: claude-code, codex, openclaw
-tags: [skill-creation, validation, meta-tooling]
+tags:
+- skill-creation
+- validation
+- meta-tooling
 model: inherit
+compatibility: Designed for Claude Code, also compatible with Codex and OpenClaw
 ---
 # Skill Creator
 Creates complete, spec-compliant skill packages following AgentSkills.io and Anthropic standards.
@@ -39,12 +50,14 @@ scratch with full validation, or grade/audit existing skills with actionable fix
 ### Mode Detection
 Determine user intent from their prompt:
 - **Create mode**: "create a skill", "build a skill", "new skill" -> proceed to Step 1
 - **Validate mode**: "validate", "check", "grade", "score", "audit" -> jump to Validation Workflow
 ### Communicating with the User
 Pay attention to context cues to understand the user's technical level. Skill creator is used by people across a wide range of familiarity — from first-time coders to senior engineers. In the default case:
 - "evaluation" and "benchmark" are borderline but OK
 - For "JSON" and "assertion", check for cues the user knows these terms before using them without explanation
 - Briefly explain terms if in doubt
@@ -56,21 +69,25 @@ If the current conversation already contains a workflow the user wants to captur
 Ask the user with AskUserQuestion:
 **Skill Identity:**
 - Name (kebab-case, gerund preferred: `processing-pdfs`, `analyzing-data`)
 - Purpose (1-2 sentences: what it does + when to use it)
 **Execution Model:**
 - User-invocable via `/name`? Or background knowledge only?
 - Accepts arguments? (`$ARGUMENTS` substitution)
 - Needs isolated context? (`context: fork` for subagent execution)
 - Explicit-only invocation? (`disable-model-invocation: true` — prevents auto-activation, requires `/name`)
 **Required Tools:**
 - Read, Write, Edit, Glob, Grep, WebFetch, WebSearch, Task, AskUserQuestion, Skill
 - Bash must be scoped: `Bash(git:*)`, `Bash(npm:*)`, etc.
 - MCP tools: `ServerName:tool_name`
 **Complexity:**
 - Simple (SKILL.md only)
 - With scripts (automation code in `scripts/`)
 - With references (documentation in `references/`)
@@ -78,6 +95,7 @@ Ask the user with AskUserQuestion:
 - Full package (all directories)
 **Location:**
 - Global: `~/.claude/skills/<skill-name>/`
 - Project: `.claude/skills/<skill-name>/`
@@ -86,6 +104,7 @@ Ask the user with AskUserQuestion:
 Before writing, determine:
 **Degrees of Freedom:**
 | Level | When to Use |
 |-------|-------------|
 | High | Creative/open-ended tasks (analysis, writing) |
@@ -95,6 +114,7 @@ Before writing, determine:
 Think of it as **narrow bridge vs open field**: a deployment skill is a narrow bridge (one safe path, guard rails everywhere), while a writing skill is an open field (Claude roams freely within broad boundaries). Match constraint level to the task.
 **Workflow Pattern** (see `${CLAUDE_SKILL_DIR}/references/workflows.md`):
 - Sequential: fixed steps in order
 - Conditional: branch based on input
 - Wizard: interactive multi-step gathering
@@ -104,6 +124,7 @@ Think of it as **narrow bridge vs open field**: a deployment skill is a narrow b
 - Search-Analyze-Report: explore and summarize
 **Output Pattern** (see `${CLAUDE_SKILL_DIR}/references/output-patterns.md`):
 - Strict template (exact format)
 - Flexible template (structure with creative content)
 - Examples-driven (input/output pairs)
@@ -128,6 +149,7 @@ mkdir -p {location}/{skill-name}/evals        # for eval-driven development
 For detailed guidance on writing SKILL.md (frontmatter rules, description scoring, body guidelines, string substitutions, DCI syntax), creating supporting files, validation, testing, iteration, description optimization, and final reporting, see [Creation Guide](references/creation-guide.md).
 Key rules:
 - `version`, `author`, `license`, `tags`, `compatible-with` are TOP-LEVEL fields (not nested under `metadata:`)
 - Scope Bash: `Bash(git:*)` not bare `Bash`
 - Keep under 500 lines; offload to `references/` if longer
@@ -244,15 +266,18 @@ Output:
 ## Resources
 **References:** `${CLAUDE_SKILL_DIR}/references/`
 - `creation-guide.md` — Detailed Steps 4-10 and Validation Workflow (V1-V5)
 - `source-of-truth.md` — Canonical spec ([AgentSkills.io](https://agentskills.io/specification), [Anthropic docs](https://code.claude.com/docs/en/skills), [Lee Han Chung deep dive](https://leehanchung.github.io/blogs/2025/10/26/claude-skills-deep-dive/)) | `frontmatter-spec.md` — Field reference | `validation-rules.md` — 100-point rubric
 - `workflows.md` — Workflow patterns | `output-patterns.md` — Output formats | `schemas.md` — JSON schemas (evals, grading, benchmarks)
 - `anthropic-comparison.md` — Gap analysis | `advanced-eval-workflow.md` — Eval, iteration, optimization, platform notes
 **Agents** (read when spawning subagents): `${CLAUDE_SKILL_DIR}/agents/`
 - `grader.md` — Assertion evaluation | `comparator.md` — Blind A/B comparison | `analyzer.md` — Benchmark analysis
 **Scripts:** `${CLAUDE_SKILL_DIR}/scripts/`
 - `validate-skill.py` — 100-point rubric grading | `quick_validate.py` — Lightweight validation
 - `aggregate_benchmark.py` — Benchmark stats | `run_eval.py` — Trigger accuracy testing
 - `run_loop.py` — Description optimization loop | `improve_description.py` — LLM-powered rewriting

package/skills/skill-creator/agents/analyzer.md CHANGED Viewed

@@ -54,6 +54,7 @@ You receive these parameters in your prompt:
 ### Step 4: Analyze Instruction Following
 For each transcript, evaluate:
 - Did the agent follow the skill's explicit instructions?
 - Did the agent use the skill's provided tools/scripts?
 - Were there missed opportunities to leverage skill content?
@@ -64,6 +65,7 @@ Score instruction following 1-10 and note specific issues.
 ### Step 5: Identify Winner Strengths
 Determine what made the winner better:
 - Clearer instructions that led to better behavior?
 - Better scripts/tools that produced better output?
 - More comprehensive examples that guided edge cases?
@@ -74,6 +76,7 @@ Be specific. Quote from skills/transcripts where relevant.
 ### Step 6: Identify Loser Weaknesses
 Determine what held the loser back:
 - Ambiguous instructions that led to suboptimal choices?
 - Missing tools/scripts that forced workarounds?
 - Gaps in edge case coverage?
@@ -82,6 +85,7 @@ Determine what held the loser back:
 ### Step 7: Generate Improvement Suggestions
 Based on the analysis, produce actionable suggestions for improving the loser skill:
 - Specific instruction changes to make
 - Tools/scripts to add or modify
 - Examples to include
@@ -216,6 +220,7 @@ You receive these parameters in your prompt:
 ### Step 2: Analyze Per-Assertion Patterns
 For each expectation across all runs:
 - Does it **always pass** in both configurations? (may not differentiate skill value)
 - Does it **always fail** in both configurations? (may be broken or beyond capability)
 - Does it **always pass with skill but fail without**? (skill clearly adds value here)
@@ -225,6 +230,7 @@ For each expectation across all runs:
 ### Step 3: Analyze Cross-Eval Patterns
 Look for patterns across evals:
 - Are certain eval types consistently harder/easier?
 - Do some evals show high variance while others are stable?
 - Are there surprising results that contradict expectations?
@@ -232,6 +238,7 @@ Look for patterns across evals:
 ### Step 4: Analyze Metrics Patterns
 Look at time_seconds, tokens, tool_calls:
 - Does the skill significantly increase execution time?
 - Is there high variance in resource usage?
 - Are there outlier runs that skew the aggregates?
@@ -239,11 +246,13 @@ Look at time_seconds, tokens, tool_calls:
 ### Step 5: Generate Notes
 Write freeform observations as a list of strings. Each note should:
 - State a specific observation
 - Be grounded in the data (not speculation)
 - Help the user understand something the aggregate metrics don't show
 Examples:
 - "Assertion 'Output is a PDF file' passes 100% in both configurations - may not differentiate skill value"
 - "Eval 3 shows high variance (50% ± 40%) - run 2 had an unusual failure that may be flaky"
 - "Without-skill runs consistently fail on table extraction expectations (0% pass rate)"
@@ -267,12 +276,14 @@ Save notes to `{output_path}` as a JSON array of strings:
 ## Guidelines
 **DO:**
 - Report what you observe in the data
 - Be specific about which evals, expectations, or runs you're referring to
 - Note patterns that aggregate metrics would hide
 - Provide context that helps interpret the numbers
 **DO NOT:**
 - Suggest improvements to the skill (that's for the improvement step, not benchmarking)
 - Make subjective quality judgments ("the output was good/bad")
 - Speculate about causes without evidence

package/skills/skill-creator/agents/comparator.md CHANGED Viewed

@@ -44,6 +44,7 @@ You receive these parameters in your prompt:
 Based on the task, generate a rubric with two dimensions:
 **Content Rubric** (what the output contains):
 | Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) |
 |-----------|----------|----------------|---------------|
 | Correctness | Major errors | Minor errors | Fully correct |
@@ -51,6 +52,7 @@ Based on the task, generate a rubric with two dimensions:
 | Accuracy | Significant inaccuracies | Minor inaccuracies | Accurate throughout |
 **Structure Rubric** (how the output is organized):
 | Criterion | 1 (Poor) | 3 (Acceptable) | 5 (Excellent) |
 |-----------|----------|----------------|---------------|
 | Organization | Disorganized | Reasonably organized | Clear, logical structure |
@@ -58,6 +60,7 @@ Based on the task, generate a rubric with two dimensions:
 | Usability | Difficult to use | Usable with effort | Easy to use |
 Adapt criteria to the specific task. For example:
 - PDF form → "Field alignment", "Text readability", "Data placement"
 - Document → "Section structure", "Heading hierarchy", "Paragraph flow"
 - Data output → "Schema correctness", "Data types", "Completeness"

package/skills/skill-creator/agents/grader.md CHANGED Viewed

@@ -66,6 +66,7 @@ This catches issues that predefined expectations might miss.
 ### Step 5: Read User Notes
 If `{outputs_dir}/user_notes.md` exists:
 1. Read it and note any uncertainties or issues flagged by the executor
 2. Include relevant concerns in the grading output
 3. These may reveal problems even when expectations pass
@@ -77,6 +78,7 @@ After grading, consider whether the evals themselves could be improved. Only sur
 Good suggestions test meaningful outcomes — assertions that are hard to satisfy without actually doing the work correctly. Think about what makes an assertion *discriminating*: it passes when the skill genuinely succeeds and fails when it doesn't.
 Suggestions worth raising:
 - An assertion that passed but would also pass for a clearly wrong output (e.g., checking filename existence but not file content)
 - An important outcome you observed — good or bad — that no assertion covers at all
 - An assertion that can't actually be verified from the available outputs
@@ -90,11 +92,13 @@ Save results to `{outputs_dir}/../grading.json` (sibling to outputs_dir).
 ## Grading Criteria
 **PASS when**:
 - The transcript or outputs clearly demonstrate the expectation is true
 - Specific evidence can be cited
 - The evidence reflects genuine substance, not just surface compliance (e.g., a file exists AND contains correct content, not just the right filename)
 **FAIL when**:
 - No evidence found for the expectation
 - Evidence contradicts the expectation
 - The expectation cannot be verified from available information