npm - @intentsolutionsio/skill-creator - Versions diffs - 5.0.0 → 5.0.3 - Mend

@intentsolutionsio/skill-creator 5.0.0 → 5.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

package/skills/skill-creator/eval-viewer/generate_review.py CHANGED Viewed

@@ -32,9 +32,32 @@ METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"}
 # Extensions we render as inline text
 TEXT_EXTENSIONS = {
-    ".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx",
-    ".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs",
-    ".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml",
+    ".txt",
+    ".md",
+    ".json",
+    ".csv",
+    ".py",
+    ".js",
+    ".ts",
+    ".tsx",
+    ".jsx",
+    ".yaml",
+    ".yml",
+    ".xml",
+    ".html",
+    ".css",
+    ".sh",
+    ".rb",
+    ".go",
+    ".rs",
+    ".java",
+    ".c",
+    ".cpp",
+    ".h",
+    ".hpp",
+    ".sql",
+    ".r",
+    ".toml",
 }
 # Extensions we render as inline images
@@ -224,9 +247,7 @@ def load_previous_iteration(workspace: Path) -> dict[str, dict]:
         try:
             data = json.loads(feedback_path.read_text())
             feedback_map = {
-                r["run_id"]: r["feedback"]
-                for r in data.get("reviews", [])
-                if r.get("feedback", "").strip()
+                r["run_id"]: r["feedback"] for r in data.get("reviews", []) if r.get("feedback", "").strip()
             }
         except (json.JSONDecodeError, OSError, KeyError):
             pass
@@ -285,12 +306,15 @@ def generate_html(
 # HTTP server (stdlib only, zero dependencies)
 # ---------------------------------------------------------------------------
 def _kill_port(port: int) -> None:
     """Kill any process listening on the given port."""
     try:
         result = subprocess.run(
             ["lsof", "-ti", f":{port}"],
-            capture_output=True, text=True, timeout=5,
+            capture_output=True,
+            text=True,
+            timeout=5,
         )
         for pid_str in result.stdout.strip().split("\n"):
             if pid_str.strip():
@@ -305,6 +329,7 @@ def _kill_port(port: int) -> None:
     except FileNotFoundError:
         print("Note: lsof not found, cannot check if port is in use", file=sys.stderr)
 class ReviewHandler(BaseHTTPRequestHandler):
     """Serves the review HTML and handles feedback saves.
@@ -390,15 +415,22 @@ def main() -> None:
     parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)")
     parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header")
     parser.add_argument(
-        "--previous-workspace", type=Path, default=None,
+        "--previous-workspace",
+        type=Path,
+        default=None,
         help="Path to previous iteration's workspace (shows old outputs and feedback as context)",
     )
     parser.add_argument(
-        "--benchmark", type=Path, default=None,
+        "--benchmark",
+        type=Path,
+        default=None,
         help="Path to benchmark.json to show in the Benchmark tab",
     )
     parser.add_argument(
-        "--static", "-s", type=Path, default=None,
+        "--static",
+        "-s",
+        type=Path,
+        default=None,
         help="Write standalone HTML to this path instead of starting a server",
     )
     args = parser.parse_args()
@@ -447,8 +479,8 @@ def main() -> None:
         port = server.server_address[1]
     url = f"http://localhost:{port}"
-    print(f"\n  Eval Viewer")
-    print(f"  ─────────────────────────────────")
+    print("\n  Eval Viewer")
+    print("  ─────────────────────────────────")
     print(f"  URL:       {url}")
     print(f"  Workspace: {workspace}")
     print(f"  Feedback:  {feedback_path}")
@@ -456,7 +488,7 @@ def main() -> None:
         print(f"  Previous:  {args.previous_workspace} ({len(previous)} runs)")
     if benchmark_path:
         print(f"  Benchmark: {benchmark_path}")
-    print(f"\n  Press Ctrl+C to stop.\n")
+    print("\n  Press Ctrl+C to stop.\n")
     webbrowser.open(url)

package/skills/skill-creator/references/advanced-eval-workflow.md CHANGED Viewed

@@ -24,6 +24,7 @@ For each test case, spawn two subagents in the same turn — one with the skill,
 Launch everything at once so runs finish around the same time.
 **With-skill run:**
 ```
 Execute this task:
 - Skill path: <path-to-skill>
@@ -34,11 +35,13 @@ Execute this task:
 ```
 **Baseline run** (same prompt, no skill):
 - **Creating a new skill**: no skill at all. Save to `without_skill/outputs/`.
 - **Improving an existing skill**: snapshot the old version first (`cp -r`), point baseline
   at the snapshot. Save to `old_skill/outputs/`.
 Write an `eval_metadata.json` for each test case:
 ```json
 {
   "eval_id": 0,
@@ -63,6 +66,7 @@ Update `eval_metadata.json` files and `evals/evals.json` with the assertions. Se
 When each subagent task completes, the notification contains `total_tokens` and
 `duration_ms`. Save immediately to `timing.json` — this is the only opportunity to
 capture this data:
 ```json
 {
   "total_tokens": 84852,
@@ -83,9 +87,11 @@ Once all runs are done:
    eyeballing it.
 2. **Aggregate into benchmark**:
    ```bash
    python -m scripts.aggregate_benchmark <workspace>/iteration-N --skill-name <name>
    ```
    This produces `benchmark.json` and `benchmark.md` with pass_rate, time, and tokens for
    each configuration, with mean +/- stddev and the delta. If generating benchmark.json
    manually, see `${CLAUDE_SKILL_DIR}/references/schemas.md` for the exact schema the
@@ -96,6 +102,7 @@ Once all runs are done:
    what to look for — non-discriminating assertions, high-variance evals, time/token tradeoffs.
 4. **Launch the viewer**:
    ```bash
    nohup python ${CLAUDE_SKILL_DIR}/eval-viewer/generate_review.py \
      <workspace>/iteration-N \
@@ -104,6 +111,7 @@ Once all runs are done:
      > /dev/null 2>&1 &
    VIEWER_PID=$!
    ```
    For iteration 2+, also pass `--previous-workspace <workspace>/iteration-<N-1>`.
    **Headless/Cowork:** Use `--static <output_path>` to write standalone HTML instead of
@@ -115,6 +123,7 @@ Once all runs are done:
 ### What the user sees in the viewer
 The "Outputs" tab shows one test case at a time:
 - **Prompt**: the task that was given
 - **Output**: the files the skill produced, rendered inline where possible
 - **Previous Output** (iteration 2+): collapsed section showing last iteration's output
@@ -131,6 +140,7 @@ all feedback to `feedback.json`.
 ### Step E5: Read the feedback
 When the user is done, read `feedback.json`:
 ```json
 {
   "reviews": [
@@ -140,6 +150,7 @@ When the user is done, read `feedback.json`:
   "status": "complete"
 }
 ```
 Empty feedback means the user thought it was fine. Focus improvements on test cases with
 specific complaints. Kill the viewer server when done: `kill $VIEWER_PID 2>/dev/null`.
@@ -172,6 +183,7 @@ After running test cases and collecting feedback, improve the skill based on wha
 ### The iteration loop
 After improving the skill:
 1. Apply improvements to the skill
 2. Rerun all test cases into a new `iteration-<N+1>/` directory, including baselines
 3. Launch the reviewer with `--previous-workspace` pointing at the previous iteration
@@ -179,6 +191,7 @@ After improving the skill:
 5. Read new feedback, improve again, repeat
 Keep going until:
 - The user says they're happy
 - The feedback is all empty (everything looks good)
 - You're not making meaningful progress
@@ -194,6 +207,7 @@ accuracy.
 ### Step D1: Generate trigger eval queries
 Create 20 eval queries — a mix of should-trigger and should-not-trigger. Save as JSON:
 ```json
 [
   {"query": "the user prompt", "should_trigger": true},
@@ -219,6 +233,7 @@ Good: `"ok so my boss just sent me this xlsx file (its in my downloads, called s
 ### Step D2: Review with user
 Present the eval set using the HTML template:
 1. Read `${CLAUDE_SKILL_DIR}/assets/eval_review.html`
 2. Replace placeholders: `__EVAL_DATA_PLACEHOLDER__` (JSON array, no quotes — it's a JS
    variable assignment), `__SKILL_NAME_PLACEHOLDER__`, `__SKILL_DESCRIPTION_PLACEHOLDER__`
@@ -298,6 +313,7 @@ direct the user to the resulting `.skill` file path so they can install it.
 ### Claude.ai
 The core workflow (draft -> test -> review -> improve) is the same, but without subagents:
 - **Test cases**: Run them yourself one at a time. Skip baseline runs.
 - **Review**: Present results directly in conversation. Save output files and tell the user
   where they are.

package/skills/skill-creator/references/anthropic-comparison.md CHANGED Viewed

@@ -3,6 +3,7 @@
 Sources: [AgentSkills.io spec](https://agentskills.io/specification) · [Anthropic docs](https://code.claude.com/docs/en/skills) · [anthropics/skills repo](https://github.com/anthropics/skills)
 Comparison of our skill-creator implementation against:
 - AgentSkills.io specification (canonical open standard)
 - Anthropic best practices (platform.claude.com)
 - anthropics/skills official skill-creator
@@ -73,6 +74,7 @@ Comparison of our skill-creator implementation against:
 ### For Existing Skills
 Existing skills that pass our old validator will mostly pass the new one because:
 - Enterprise tier (default) still checks `metadata.author`, `metadata.version`, scoped tools
 - The body section checks are warnings, not errors
 - "Use when" / "Trigger with" are now recommended patterns, not hard requirements
@@ -85,6 +87,7 @@ Existing skills that pass our old validator will mostly pass the new one because
 ### New Capabilities
 Skills can now use:
 - `$ARGUMENTS` for dynamic input
 - `context: fork` for subagent execution
 - `hooks` for lifecycle automation

package/skills/skill-creator/references/creation-guide.md CHANGED Viewed

@@ -10,6 +10,7 @@ Generate the SKILL.md using the template from `${CLAUDE_SKILL_DIR}/templates/ski
 **Frontmatter rules** (see `${CLAUDE_SKILL_DIR}/references/frontmatter-spec.md`):
 Required fields:
 ```yaml
 name: {skill-name}          # Must match directory name
 description: |               # Third person, what + when + keywords
@@ -18,10 +19,12 @@ description: |               # Third person, what + when + keywords
 ```
 **Frontmatter constraints (Anthropic spec):**
 - `name`: No XML tags (`<`, `>` characters prohibited). No reserved words (`anthropic`, `claude`) in isolation.
 - `description`: No XML tags. Description is injected into Claude's system prompt — third person prevents discovery issues where Claude speaks as the skill author.
 Identity fields (top-level — marketplace validator scores these here):
 ```yaml
 version: 1.0.0
 author: {name} <{email}>
@@ -32,12 +35,14 @@ license: MIT
 Do NOT nest them under `metadata:`. The marketplace 100-point validator checks them at top-level.
 Recommended fields:
 ```yaml
 allowed-tools: "{scoped tools}"
 model: inherit
 ```
 Optional Claude Code extensions:
 ```yaml
 argument-hint: "[arg]"              # If accepts $ARGUMENTS
 context: fork                       # If needs isolated execution
@@ -71,6 +76,7 @@ Pattern (enterprise): "Use when [scenario]" (+3 pts) + "Trigger with [phrases]"
 **Body content guidelines — section recommendations:**
 Anthropic's spec places no format restrictions on body content. The sections below are enterprise-tier quality recommendations scored by the Intent Solutions marketplace rubric. At standard tier, these are not required but are still good practice:
 ```
 ## Overview       (>50 chars content: +4 pts enterprise)
 ## Prerequisites  (+2 pts enterprise)
@@ -83,10 +89,11 @@ Anthropic's spec places no format restrictions on body content. The sections bel
 ```
 Additional guidelines:
 - Keep under 500 lines (offload to `references/` if longer)
 - Concise — Claude is smart, don't over-explain
 - Concrete examples over abstract descriptions
-- Reference supporting files with relative markdown links: `[details](reference.md)` or `[API](references/api.md)` — Claude reads these on demand
+- Reference supporting files with relative markdown links: `details` or `API` — Claude reads these on demand
 - Use `${CLAUDE_SKILL_DIR}/` in DCI/bash contexts only: exclamation + backtick-wrapped command, e.g. `cat ${CLAUDE_SKILL_DIR}/references/config.md`
 - Sections >20 lines (Output, Error Handling, Examples) → offload to `references/` with relative links
 - If skill has 3+ distinct user operations → split into individual `commands/*.md` files
@@ -100,6 +107,7 @@ Additional guidelines:
 - **No surprise behavior**: Skills must not contain malware, exploit code, or content that could compromise security. A skill's behavior should not surprise the user if described honestly
 **String substitutions available:**
 - `$ARGUMENTS` / `$0`, `$1` - user-provided arguments (pair with `argument-hint` frontmatter)
 - `${CLAUDE_SESSION_ID}` - current session ID
 - `` !`command` `` syntax — dynamic context injection (Anthropic spec feature):
@@ -111,6 +119,7 @@ Additional guidelines:
 ## Step 5: Create Supporting Files
 **Scripts** (`scripts/`):
 - Scripts should solve problems, not punt to Claude
 - Explicit error handling
 - No voodoo constants (document all magic values)
@@ -118,16 +127,19 @@ Additional guidelines:
 - Make executable: `chmod +x scripts/*.py`
 **References** (`references/`):
 - Heavy documentation that doesn't need to load at activation
 - Use clear section headers for navigability
 - For reference files >100 lines, include a TOC at the top so Claude can see full scope even with partial reads
 - One-level-deep references only (no `references/sub/dir/`)
 **Templates** (`templates/`):
 - Boilerplate files used for generation
 - Use clear placeholder syntax (`{{PLACEHOLDER}}`)
 **Assets** (`assets/`):
 - Static resources (images, configs, data files)
 ## Step 6: Validate
@@ -142,6 +154,7 @@ python3 ${CLAUDE_SKILL_DIR}/scripts/validate-skill.py --grade {skill-dir}/SKILL.
 Standard tier is the default (no required fields, broad compatibility). Use `--enterprise` for full 100-point marketplace grading.
 **Validation checks:**
 - Frontmatter: required fields, types, constraints
 - Description: third person, what + when, keywords, length
 - Body: under 500 lines, no absolute paths, has instructions + examples
@@ -151,6 +164,7 @@ Standard tier is the default (no required fields, broad compatibility). Use `--e
 - Progressive disclosure: appropriate use of references/
 **If validation fails:** fix issues and re-run. Common fixes:
 - Scope Bash tools: `Bash(git:*)` not `Bash`
 - Remove absolute paths, use `${CLAUDE_SKILL_DIR}/`
 - Split long SKILL.md into references
@@ -172,6 +186,7 @@ Create `evals/evals.json` with minimum 3 scenarios: happy path, edge case, negat
 Run parallel evaluation: Claude A with skill installed vs Claude B without. Compare outputs against assertions — the skill should produce meaningfully better results for its target use cases.
 **Additional testing practices:**
 - **Team feedback**: If applicable, share the skill with teammates and observe usage patterns
 - **Observe Claude navigation**: Watch how Claude reads and navigates the skill — look for unexpected exploration paths, missed references, or overreliance on certain sections
@@ -197,6 +212,7 @@ Tips: front-load distinctive keywords, include specific file types/tools/domains
 ## Step 10: Report
 Show the user:
 ```
 SKILL CREATED
 ====================================
@@ -235,6 +251,7 @@ When the user wants to validate, grade, or audit an existing skill:
 ### Step V1: Locate the Skill
 Ask for the SKILL.md path or detect from context. Common locations:
 - `~/.claude/skills/{name}/SKILL.md` (global)
 - `.claude/skills/{name}/SKILL.md` (project)
@@ -268,6 +285,7 @@ Present the grade report with specific fix recommendations. Prioritize fixes by
 ### Step V5: Auto-Fix (if requested)
 If the user says "fix it" or "auto-fix", apply the suggested improvements:
 1. Add missing sections (Overview, Prerequisites, Output)
 2. Add "Use when" / "Trigger with" to description
 3. Move author/version from metadata to top-level
@@ -301,5 +319,6 @@ For A/B testing between skill versions, read `${CLAUDE_SKILL_DIR}/agents/compara
 ## Platform-Specific Notes
 See `${CLAUDE_SKILL_DIR}/references/advanced-eval-workflow.md` (section "Platform-Specific Notes").
 - **Claude.ai**: No subagents — run tests yourself, skip benchmarking/description optimization.
 - **Cowork**: Full subagent workflow. Use `--static` for eval viewer. Generate viewer BEFORE self-evaluation.

package/skills/skill-creator/references/errors-template.md CHANGED Viewed

@@ -22,6 +22,7 @@ Standard troubleshooting reference for all marketplace skills. Every references/
 ```
 Categories should match the skill's domain. Examples:
 - For GCP skills: Authentication Errors, API Errors, Deployment Errors, Quota Errors
 - For database skills: Connection Errors, Query Errors, Permission Errors, Data Errors
 - For CI/CD skills: Build Errors, Deploy Errors, Configuration Errors, Permission Errors

package/skills/skill-creator/references/examples-template.md CHANGED Viewed

@@ -34,6 +34,7 @@ Standard usage examples reference for all marketplace skills. Every references/e
 ```
 Rules:
 - Minimum 3 examples, maximum 6
 - Each example has: scenario title, user request, what's produced, code/config
 - Code must be real and runnable — no pseudocode, no "add your logic here"

package/skills/skill-creator/references/frontmatter-spec.md CHANGED Viewed

@@ -94,6 +94,7 @@ allowed-tools: "Read,Write,Bash"  # Warning in Standard, Error in Enterprise
 ```
 **Bash Scoping Patterns**:
 ```yaml
 Bash(git:*)       # All git commands
 Bash(npm:*)       # All npm commands

package/skills/skill-creator/references/implementation-template.md CHANGED Viewed

@@ -37,6 +37,7 @@ Standard implementation guide for all marketplace skills. Every references/imple
 ```
 Rules:
 - Write from the perspective of how this specific skill works, not generic patterns
 - Include real commands, real API calls, real config values
 - Keep under 200 lines — this is a reference, not a textbook

package/skills/skill-creator/references/output-patterns.md CHANGED Viewed

@@ -29,6 +29,7 @@ Generate output using this exact template:
 ```
 Replace placeholders with gathered values. Do not add extra fields.
 ```
 ### Flexible Template (Medium Degrees of Freedom)
@@ -69,20 +70,24 @@ Provide input/output pairs that demonstrate expected behavior.
 **Input**: `/skill-name auth.py`
 **Output**:
 ```
 auth.py: 3 issues found
   Line 15: SQL injection risk in query builder
   Line 42: Hardcoded credential detected
   Line 89: Missing input validation
 ```
 ### Complex case
 **Input**: `/skill-name --deep src/`
 **Output**:
 ```
 Deep scan: 12 files, 7 issues
   CRITICAL (2): sql-injection, hardcoded-secret
   WARNING (3): missing-validation, weak-hash, cors-wildcard
   INFO (2): deprecated-api, unused-import
 ```
 ```
@@ -135,6 +140,7 @@ Example HTML structure:
 </body>
 </html>
 ```
 ```
 ### When to Use
@@ -177,6 +183,7 @@ Results are written as JSON to `{output_path}/results.json`:
 ```
 Additionally, a human-readable summary is printed to the conversation.
 ```
 ---

package/skills/skill-creator/references/schemas.md CHANGED Viewed

@@ -27,6 +27,7 @@ Defines the evals for a skill. Located at `evals/evals.json` within the skill di
 ```
 **Fields:**
 - `skill_name`: Name matching the skill's frontmatter
 - `evals[].id`: Unique integer identifier
 - `evals[].prompt`: The task to execute
@@ -72,6 +73,7 @@ Tracks version progression in Improve mode. Located at workspace root.
 ```
 **Fields:**
 - `started_at`: ISO timestamp of when improvement started
 - `skill_name`: Name of the skill being improved
 - `current_best`: Version identifier of the best performer
@@ -150,6 +152,7 @@ Output from the grader agent. Located at `<run-dir>/grading.json`.
 ```
 **Fields:**
 - `expectations[]`: Graded expectations with evidence
 - `summary`: Aggregate pass/fail counts
 - `execution_metrics`: Tool usage and output size (from executor's metrics.json)
@@ -184,6 +187,7 @@ Output from the executor agent. Located at `<run-dir>/outputs/metrics.json`.
 ```
 **Fields:**
 - `tool_calls`: Count per tool type
 - `total_tool_calls`: Sum of all tool calls
 - `total_steps`: Number of major execution steps
@@ -286,6 +290,7 @@ Output from Benchmark mode. Located at `benchmarks/<timestamp>/benchmark.json`.
 ```
 **Fields:**
 - `metadata`: Information about the benchmark run
   - `skill_name`: Name of the skill
   - `timestamp`: When the benchmark was run