npm - @event4u/agent-config - Versions diffs - 2.10.0 → 2.12.0 - Mend

@event4u/agent-config 2.10.0 → 2.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (94) hide show

package/.agent-src/commands/agents.md +1 -0
package/.agent-src/commands/challenge-me.md +1 -0
package/.agent-src/commands/chat-history.md +1 -0
package/.agent-src/commands/context.md +1 -0
package/.agent-src/commands/council.md +1 -0
package/.agent-src/commands/feature.md +1 -0
package/.agent-src/commands/fix.md +1 -0
package/.agent-src/commands/grill-me.md +1 -0
package/.agent-src/commands/judge.md +1 -0
package/.agent-src/commands/memory.md +1 -0
package/.agent-src/commands/module.md +1 -0
package/.agent-src/commands/onboard.md +32 -4
package/.agent-src/commands/optimize.md +1 -0
package/.agent-src/commands/override.md +1 -0
package/.agent-src/commands/roadmap.md +1 -0
package/.agent-src/commands/tests.md +1 -0
package/.agent-src/skills/canvas-design/SKILL.md +132 -0
package/.agent-src/skills/canvas-design/evals/triggers.json +16 -0
package/.agent-src/skills/doc-coauthoring/SKILL.md +129 -0
package/.agent-src/skills/doc-coauthoring/evals/triggers.json +16 -0
package/.agent-src/skills/nextjs-patterns/SKILL.md +203 -0
package/.agent-src/skills/skill-writing/SKILL.md +101 -16
package/.agent-src/skills/sql-writing/SKILL.md +1 -1
package/.agent-src/skills/symfony-workflow/SKILL.md +173 -0
package/.agent-src/templates/scripts/work_engine/hook_bootstrap.py +4 -0
package/.agent-src/templates/scripts/work_engine/hooks/builtin/__init__.py +3 -0
package/.agent-src/templates/scripts/work_engine/hooks/builtin/decision_gate.py +162 -0
package/.agent-src/templates/scripts/work_engine/hooks/settings.py +24 -6
package/.agent-src/templates/scripts/work_engine/scoring/decision_engine.py +351 -0
package/.claude-plugin/marketplace.json +5 -1
package/CHANGELOG.md +68 -0
package/README.md +37 -8
package/config/agent-settings.template.yml +66 -0
package/docs/architecture.md +1 -1
package/docs/contracts/STABILITY.md +16 -0
package/docs/contracts/adr-chat-history-split.md +1 -0
package/docs/contracts/adr-forecast-construction-shape.md +1 -0
package/docs/contracts/adr-gtm-context-spine.md +1 -0
package/docs/contracts/adr-level-6-productization.md +147 -0
package/docs/contracts/adr-settings-sync-engine.md +1 -0
package/docs/contracts/adr-wing4-context-spine.md +1 -0
package/docs/contracts/agent-memory-contract.md +1 -0
package/docs/contracts/agents-md-tech-stack.md +1 -0
package/docs/contracts/audit-log-v1.md +1 -0
package/docs/contracts/command-clusters.md +1 -0
package/docs/contracts/command-surface-tiers.md +1 -0
package/docs/contracts/context-paths.md +1 -0
package/docs/contracts/cost-profile-defaults.md +105 -0
package/docs/contracts/cross-wing-handoff.md +1 -0
package/docs/contracts/decision-engine-gates.md +115 -0
package/docs/contracts/decision-trace-v1.md +1 -0
package/docs/contracts/file-ownership-matrix.md +1 -0
package/docs/contracts/hook-architecture-v1.md +1 -0
package/docs/contracts/implement-ticket-flow.md +1 -0
package/docs/contracts/installed-tools-lockfile.md +1 -0
package/docs/contracts/kernel-membership.md +1 -0
package/docs/contracts/linear-ai-rules-inclusion.md +1 -0
package/docs/contracts/linear-ai-three-layers.md +1 -0
package/docs/contracts/linter-structural-model.md +1 -0
package/docs/contracts/load-context-budget-model.md +1 -0
package/docs/contracts/load-context-schema.md +1 -0
package/docs/contracts/memory-visibility-v1.md +1 -0
package/docs/contracts/one-off-script-lifecycle.md +1 -0
package/docs/contracts/orchestration-dsl-v1.md +1 -0
package/docs/contracts/package-self-orientation.md +1 -0
package/docs/contracts/persona-schema.md +1 -0
package/docs/contracts/release-trunk-sync.md +104 -0
package/docs/contracts/roadmap-complexity-standard.md +1 -0
package/docs/contracts/rule-classification.md +1 -0
package/docs/contracts/rule-interactions.md +26 -0
package/docs/contracts/rule-priority-hierarchy.md +1 -0
package/docs/contracts/rule-router.md +1 -0
package/docs/contracts/settings-sync-yaml-subset.md +1 -0
package/docs/contracts/skill-domains.md +1 -0
package/docs/contracts/tier-3-contrib-plugin.md +1 -0
package/docs/contracts/ui-stack-extension.md +1 -0
package/docs/contracts/ui-track-flow.md +1 -0
package/docs/customization.md +1 -1
package/docs/getting-started.md +3 -1
package/docs/installation.md +8 -6
package/package.json +1 -1
package/scripts/ai_council/clients.py +17 -4
package/scripts/ai_council/orchestrator.py +6 -2
package/scripts/check_beta_review_markers.py +127 -0
package/scripts/check_references.py +25 -0
package/scripts/check_release_trunk_sync.py +152 -0
package/scripts/council_cli.py +36 -5
package/scripts/install.py +3 -3
package/scripts/run_skill_evals.py +185 -0
package/scripts/schemas/command.schema.json +5 -0
package/scripts/schemas/skill.schema.json +4 -0
package/scripts/skill_linter.py +82 -3
package/scripts/smoke_quickstart.py +134 -0
package/scripts/validate_decision_engine.py +124 -0

package/scripts/check_beta_review_markers.py ADDED Viewed

@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+"""
+Beta-review-marker checker for `docs/contracts/`.
+Every contract whose frontmatter declares `stability: beta` MUST carry
+exactly one of the following frontmatter markers (per
+`docs/contracts/STABILITY.md` § Beta-review markers, ratified in
+`road-to-productization.md` § P5.4):
+  - `promote-to: stable`
+  - `keep-beta-until: YYYY-MM-DD`     (max 90 days from the last review)
+  - `superseded-by: <contract-id>`
+Exit codes: 0 = clean, 1 = violations found, 3 = internal error.
+Usage:
+    python3 scripts/check_beta_review_markers.py
+    python3 scripts/check_beta_review_markers.py --json
+"""
+from __future__ import annotations
+import argparse
+import json
+import re
+import sys
+from dataclasses import asdict, dataclass
+from datetime import date, timedelta
+from pathlib import Path
+# Directory one level above the folder that holds this script (the package
+# root when the script is run as `scripts/check_beta_review_markers.py`).
+ROOT = Path(__file__).resolve().parent.parent
+# Contracts folder, relative to ROOT; main() joins the two before globbing.
+CONTRACTS_DIR = Path("docs/contracts")
+# Leading `---` ... `---` block; group 1 is the text between the fences.
+FRONTMATTER_RE = re.compile(r"^---\s*\n(.*?)\n---\s*\n", re.DOTALL)
+# `stability: <word>` on its own line; group 1 is the declared level.
+STABILITY_RE = re.compile(r"^stability:\s*(\w+)\s*$", re.MULTILINE)
+# The three beta-review markers; check_one() requires exactly one of them.
+PROMOTE_RE = re.compile(r"^promote-to:\s*stable\s*$", re.MULTILINE)
+# Only the digit layout is matched here; group 1 is the raw date text.
+KEEP_RE = re.compile(r"^keep-beta-until:\s*(\d{4}-\d{2}-\d{2})\s*$", re.MULTILINE)
+SUPERSEDED_RE = re.compile(r"^superseded-by:\s*\S+\s*$", re.MULTILINE)
+# Furthest a `keep-beta-until` date may lie past the day the check runs.
+MAX_REVIEW_WINDOW_DAYS = 90
+@dataclass
+class Violation:
+    """One beta-review-marker finding for a single contract file."""
+    # Contract path relative to ROOT, as reported to the user.
+    file: str
+    # Human-readable explanation of what is wrong.
+    reason: str
+    # Only findings with severity "error" make main() return exit code 1.
+    severity: str  # "error" | "warning"
+def read_frontmatter(path: Path) -> str | None:
+    """Return the frontmatter text of `path`, or None when there is none.
+    None covers two cases: the file does not exist, or its content does not
+    open with a `---` fenced frontmatter block.
+    """
+    if path.exists():
+        match = FRONTMATTER_RE.match(path.read_text(encoding="utf-8"))
+        if match:
+            return match.group(1)
+    return None
+def check_one(path: Path, today: date) -> list[Violation]:
+    """Check one contract file for a valid beta-review marker.
+    Files without frontmatter, or whose `stability` is not `beta`, are out
+    of scope and yield no findings. A beta contract must carry exactly one
+    of `promote-to`, `keep-beta-until` or `superseded-by`. A
+    `keep-beta-until` value must be a real calendar date no later than
+    `today` plus `MAX_REVIEW_WINDOW_DAYS`.
+    Args:
+        path: Contract file; must live under `ROOT` so it can be reported
+            as a relative path.
+        today: Reference date the review window is measured from.
+    Returns:
+        A list with at most one `Violation`; empty when the file is clean
+        or out of scope.
+    Raises:
+        ValueError: If `path` is not located under `ROOT`.
+    """
+    fm = read_frontmatter(path)
+    if fm is None:
+        return []
+    sm = STABILITY_RE.search(fm)
+    if not sm or sm.group(1) != "beta":
+        return []
+    # Searched once and reused for the date validation further down.
+    km = KEEP_RE.search(fm)
+    markers = [
+        ("promote-to", PROMOTE_RE.search(fm) is not None),
+        ("keep-beta-until", km is not None),
+        ("superseded-by", SUPERSEDED_RE.search(fm) is not None),
+    ]
+    set_markers = [name for name, present in markers if present]
+    rel = str(path.relative_to(ROOT))
+    if not set_markers:
+        return [Violation(
+            file=rel,
+            reason="stability=beta but no review marker; add one of "
+                   "`promote-to: stable` | `keep-beta-until: <date>` | "
+                   "`superseded-by: <id>` (see STABILITY.md § Beta-review markers)",
+            severity="error",
+        )]
+    if len(set_markers) > 1:
+        return [Violation(
+            file=rel,
+            reason=f"multiple beta-review markers set ({', '.join(set_markers)}); "
+                   "exactly one is allowed",
+            severity="error",
+        )]
+    if km:
+        raw_date = km.group(1)
+        try:
+            review_date = date.fromisoformat(raw_date)
+        except ValueError:
+            # The regex accepts any NNNN-NN-NN text, so impossible dates such
+            # as 2025-02-30 land here. Report them as a contract violation
+            # instead of letting the whole run abort as an internal error.
+            return [Violation(
+                file=rel,
+                reason=f"keep-beta-until={raw_date} is not a valid calendar "
+                       "date (expected YYYY-MM-DD)",
+                severity="error",
+            )]
+        max_date = today + timedelta(days=MAX_REVIEW_WINDOW_DAYS)
+        if review_date > max_date:
+            return [Violation(
+                file=rel,
+                reason=f"keep-beta-until={review_date} exceeds the "
+                       f"{MAX_REVIEW_WINDOW_DAYS}-day window (max: {max_date})",
+                severity="error",
+            )]
+    return []
+def main() -> int:
+    """Scan every `*.md` contract under `ROOT / CONTRACTS_DIR` and report.
+    Output is JSON when `--json` is passed, otherwise one line per finding
+    (or a single success line).
+    Returns:
+        1 when at least one finding has severity "error", else 0.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--json", action="store_true", help="machine-readable output")
+    opts = parser.parse_args()
+    run_date = date.today()
+    # Sorted so the report order is stable from run to run.
+    found: list[Violation] = [
+        finding
+        for contract in sorted((ROOT / CONTRACTS_DIR).glob("*.md"))
+        for finding in check_one(contract, run_date)
+    ]
+    if args_json := opts.json:
+        payload = {"violations": [asdict(v) for v in found]}
+        print(json.dumps(payload, indent=2))
+    elif not found:
+        print("✅  All beta contracts carry a valid review marker.")
+    else:
+        for v in found:
+            icon = "⚠️ " if v.severity != "error" else "❌"
+            print(f"{icon}  {v.file}: {v.reason}")
+        print(f"\n{len(found)} violation(s).")
+    has_error = any(v.severity == "error" for v in found)
+    return 1 if has_error else 0
+# Script entry point. Any unexpected exception is reported on stderr and
+# mapped to exit code 3, keeping it distinct from the 0/1 results of main().
+if __name__ == "__main__":
+    try:
+        sys.exit(main())
+    except Exception as exc:  # pragma: no cover
+        print(f"internal error: {exc}", file=sys.stderr)
+        sys.exit(3)

package/scripts/check_references.py CHANGED Viewed

@@ -39,6 +39,17 @@ SKIP_DIRS = [
     "agents/council-questions",  # design Q&A trail — forward-refs to planned artifacts
     "agents/analysis",           # plate-comparison working docs — forward-refs to planned artifacts
 ]
+# Per-file opt-out marker. When present in the first 10 lines of a .md
+# file, the entire file is skipped. Use for working docs that
+# intentionally reference planned-but-not-yet-existing artifacts
+# (audit bundles, design Q&A, in-flight plans).
+FILE_SKIP_MARKER = "<!-- check-refs: skip -->"
+# Per-line opt-out marker. When present anywhere on a line, that line's
+# refs are skipped. Use for isolated forward-refs inside otherwise
+# fully-checked documents.
+LINE_IGNORE_MARKER = "<!-- ref-ignore -->"
 ROOT = Path(".")
 # YAML memory files (engineering-memory layer) live under `agents/memory/`.
@@ -219,6 +230,14 @@ def check_file(filepath: Path, artifacts: dict[str, set[str]], root: Path) -> Li
     except Exception:
         return broken
+    # File-level opt-out: working docs that intentionally reference
+    # planned-but-not-yet-existing artifacts mark themselves with
+    # `<!-- check-refs: skip -->` in the first 10 lines. Marker pairs
+    # with the per-line `<!-- ref-ignore -->` below; either suffices.
+    header_lines = text.splitlines()[:10]
+    if any(FILE_SKIP_MARKER in line for line in header_lines):
+        return broken
     # Validate `personas:` frontmatter entries against known persona ids.
     for line_no, pid in _extract_personas_frontmatter(text):
         if pid not in artifacts["personas"]:
@@ -241,6 +260,12 @@ def check_file(filepath: Path, artifacts: dict[str, set[str]], root: Path) -> Li
         if in_code_block:
             continue
+        # Per-line opt-out: isolated forward-refs in otherwise checked
+        # documents (e.g. one ref to a planned skill, surrounded by
+        # valid refs). Skip the whole line's path / skill / rule checks.
+        if LINE_IGNORE_MARKER in line:
+            continue
         # Unchecked TODO checkboxes document future work — their refs are
         # forward-looking and will not resolve yet. Track multi-line bullets:
         # any `- [ ]` opens a TODO context; a new top-level bullet, heading,

package/scripts/check_release_trunk_sync.py ADDED Viewed

@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+"""
+Release-trunk-sync CI gate (road-to-productization P1.3).
+Fails if `main` is more than one tagged release behind the current
+release-prep branch's target version. No-ops on every other branch
+class. Owner contract: `docs/contracts/release-trunk-sync.md`.
+Exit codes: 0 = pass / no-op, 1 = main is too far behind, 3 = internal
+error (git unavailable, malformed tag, etc.).
+"""
+from __future__ import annotations
+import os
+import re
+import subprocess
+import sys
+from pathlib import Path
+# Branch names of the form `release/X.Y.Z`; groups 1-3 are the version parts.
+RELEASE_BRANCH_RE = re.compile(r"^release/(\d+)\.(\d+)\.(\d+)$")
+# Bare `X.Y.Z` tags only; a tag with any prefix (e.g. `v1.2.3`) does not match.
+SEMVER_TAG_RE = re.compile(r"^(\d+)\.(\d+)\.(\d+)$")
+# Relative path, so it is resolved against the current working directory.
+BOOTSTRAP_FILE = Path("docs/contracts/release-trunk-sync.bootstrap")
+def _git(*args: str) -> str:
+    """Run `git <args>` and return its stripped stdout.
+    A non-zero exit status is not raised; it is folded into the empty
+    string, so callers treat "" as "no answer".
+    """
+    result = subprocess.run(
+        ["git", *args], capture_output=True, text=True, check=False
+    )
+    return result.stdout.strip() if result.returncode == 0 else ""
+def _current_branch() -> str:
+    """Return the output of `git rev-parse --abbrev-ref HEAD`, or "" if git fails.
+    main() treats the literal value "HEAD" as a detached checkout.
+    """
+    return _git("rev-parse", "--abbrev-ref", "HEAD")
+def _parse_semver(text: str) -> tuple[int, int, int] | None:
+    """Parse a bare `X.Y.Z` string into an integer triple.
+    Returns None when `text` does not match `SEMVER_TAG_RE`.
+    """
+    match = SEMVER_TAG_RE.match(text)
+    if match is None:
+        return None
+    major, minor, patch = (int(part) for part in match.groups())
+    return major, minor, patch
+def _all_tags() -> list[tuple[int, int, int]]:
+    """Return every semver-shaped tag in the repository, sorted ascending.
+    Tags that do not parse as `X.Y.Z` are dropped.
+    """
+    listing = _git("tag", "--list").splitlines()
+    parsed = (_parse_semver(entry.strip()) for entry in listing)
+    return sorted(version for version in parsed if version is not None)
+def _main_tag() -> tuple[int, int, int] | None:
+    """Return the highest semver tag reachable from main, if any.
+    Local `main` is tried first and `origin/main` second. None is returned
+    when neither ref resolves or no reachable tag parses as semver.
+    """
+    head = ""
+    for candidate in ("refs/heads/main", "refs/remotes/origin/main"):
+        head = _git("rev-parse", "--verify", candidate)
+        if head:
+            break
+    if not head:
+        return None
+    # `git tag --merged <commit>` limits the listing to tags reachable from it.
+    listing = _git("tag", "--merged", head).splitlines()
+    parsed = (_parse_semver(entry.strip()) for entry in listing)
+    versions = [version for version in parsed if version is not None]
+    return max(versions) if versions else None
+def _prior_release(
+    target: tuple[int, int, int], tags: list[tuple[int, int, int]]
+) -> tuple[int, int, int] | None:
+    """Return the greatest tag strictly below `target`, or None if there is none."""
+    return max((tag for tag in tags if tag < target), default=None)
+def _bootstrap_ok(target: tuple[int, int, int]) -> bool:
+    """Tell whether `target` is listed in the bootstrap file.
+    The file holds one `X.Y.Z` entry per line; blank lines and lines
+    starting with `#` are ignored. A missing file means "not listed".
+    """
+    if not BOOTSTRAP_FILE.exists():
+        return False
+    wanted = "{0}.{1}.{2}".format(*target)
+    entries = (raw.strip() for raw in BOOTSTRAP_FILE.read_text().splitlines())
+    return any(
+        entry == wanted
+        for entry in entries
+        if entry and not entry.startswith("#")
+    )
+def main() -> int:
+    """Run the release-trunk-sync gate and return the process exit code.
+    The branch under test comes from `GITHUB_HEAD_REF` / `GITHUB_REF_NAME`
+    when either is set, otherwise from git. Only `release/X.Y.Z` branches
+    are gated; everything else is a no-op.
+    Returns:
+        0 when the gate passes or does not apply (non-release branch, no
+        usable tags, main within one tagged release of the target, or the
+        target is listed in the bootstrap file); 1 when main is more than
+        one tagged release behind the target.
+    """
+    # Read the CI-provided ref BEFORE the detached-HEAD check: CI jobs can
+    # run on a detached merge ref, where git only reports "HEAD" although
+    # the branch name is available from the environment.
+    ci_ref = os.environ.get("GITHUB_HEAD_REF") or os.environ.get(
+        "GITHUB_REF_NAME"
+    )
+    branch = ci_ref or _current_branch()
+    if branch == "HEAD" or not branch:
+        print("::warning::detached HEAD — release-trunk-sync gate skipped")
+        return 0
+    m = RELEASE_BRANCH_RE.match(branch)
+    if not m:
+        return 0  # non-release branch class — gate is a no-op
+    target = (int(m.group(1)), int(m.group(2)), int(m.group(3)))
+    target_s = "{0}.{1}.{2}".format(*target)
+    tags = _all_tags()
+    if not tags:
+        print(
+            "::warning::no semver tags found — release-trunk-sync gate skipped"
+        )
+        return 0
+    main_tag = _main_tag()
+    if main_tag is None:
+        print(
+            "::warning::no semver tag reachable from main — gate skipped"
+        )
+        return 0
+    if main_tag >= target:
+        return 0  # main already at or ahead of release target
+    prior = _prior_release(target, tags)
+    if prior is not None and main_tag >= prior:
+        return 0  # within the one-release tolerance
+    if _bootstrap_ok(target):
+        print(
+            f"::warning::release-trunk-sync gate suppressed for {target_s} "
+            "via bootstrap file"
+        )
+        return 0
+    main_s = "{0}.{1}.{2}".format(*main_tag)
+    print(
+        f"::error::main is at {main_s}; release-prep branch targets "
+        f"{target_s}. Main must be no more than one tagged release behind. "
+        "See docs/contracts/release-trunk-sync.md."
+    )
+    return 1
+# Script entry point. Any unexpected exception is printed as a `::error::`
+# line and mapped to exit code 3, keeping it distinct from main()'s 0/1.
+if __name__ == "__main__":
+    try:
+        sys.exit(main())
+    except Exception as exc:  # noqa: BLE001
+        print(f"::error::release-trunk-sync gate internal error: {exc}")
+        sys.exit(3)

package/scripts/council_cli.py CHANGED Viewed

@@ -31,6 +31,7 @@ from scripts.ai_council.bundler import (  # noqa: E402
     BundleTooLarge, bundle_prompt, bundle_roadmap,
 )
 from scripts.ai_council.clients import (  # noqa: E402
+    DEFAULT_MAX_TOKENS, UNLIMITED_TOKENS_FALLBACK,
     AnthropicClient, CouncilResponse, ExternalAIClient, ManualClient,
     OpenAIClient, load_anthropic_key, load_openai_key,
 )
@@ -236,6 +237,32 @@ def _resolve_rounds(args: argparse.Namespace, ai_cfg: dict[str, Any]) -> int:
     return min_rounds
+def _resolve_max_tokens(args: argparse.Namespace, ai_cfg: dict[str, Any]) -> int:
+    """Pick the per-call output budget handed to each council member.
+    Sources, highest priority first:
+      1. ``--max-tokens N`` on the command line (``args.max_tokens``).
+      2. ``max_output_tokens`` in the ``ai_council`` settings mapping.
+      3. ``DEFAULT_MAX_TOKENS``, the package fallback.
+    A non-positive result from any source means "unlimited" and is widened
+    to ``UNLIMITED_TOKENS_FALLBACK``. A settings key that is present but
+    empty counts as ``0`` and is therefore treated as unlimited as well.
+    """
+    override = getattr(args, "max_tokens", None)
+    if override is not None:
+        budget = int(override)
+    elif "max_output_tokens" in ai_cfg:
+        # `or 0` turns an empty/None settings value into the unlimited marker.
+        budget = int(ai_cfg.get("max_output_tokens") or 0)
+    else:
+        budget = DEFAULT_MAX_TOKENS
+    return budget if budget > 0 else UNLIMITED_TOKENS_FALLBACK
 def cmd_estimate(
     args: argparse.Namespace,
     *,
@@ -255,9 +282,10 @@ def cmd_estimate(
         )
     if table is None:
         table = load_prices()
+    ai_cfg = (settings.get("ai_council") or {}) if isinstance(settings, dict) else {}
     question, _ = build_question(
         input_path=Path(args.question), input_mode=args.input_mode,
-        max_tokens=args.max_tokens,
+        max_tokens=_resolve_max_tokens(args, ai_cfg),
     )
     project = detect_project_context(REPO_ROOT)
     billable = [m for m in members if getattr(m, "billable", True)]
@@ -316,9 +344,10 @@ def cmd_run(
         )
     if table is None:
         table = load_prices()
+    ai_cfg = (settings.get("ai_council") or {}) if isinstance(settings, dict) else {}
     question, artefact = build_question(
         input_path=Path(args.question), input_mode=args.input_mode,
-        max_tokens=args.max_tokens,
+        max_tokens=_resolve_max_tokens(args, ai_cfg),
     )
     project = detect_project_context(REPO_ROOT)
     billable = [m for m in members if getattr(m, "billable", True)]
@@ -337,7 +366,6 @@ def cmd_run(
         )
         return 0
-    ai_cfg = settings.get("ai_council") or {}
     cost_cfg = ai_cfg.get("cost_budget") or {}
     budget = CostBudget(
         max_input_tokens=int(cost_cfg.get("max_input_tokens", 50_000)),
@@ -451,8 +479,11 @@ def _add_common_input_args(p: argparse.ArgumentParser) -> None:
     p.add_argument("--input-mode", choices=["prompt", "roadmap"],
                    default="prompt",
                    help="How to bundle the file (default: prompt).")
-    p.add_argument("--max-tokens", type=int, default=1024,
-                   help="Per-member output budget (default: 1024).")
+    p.add_argument("--max-tokens", type=int, default=None,
+                   help="Per-member output budget. Default reads "
+                        "ai_council.max_output_tokens from .agent-settings.yml "
+                        "(2048 if unset). 0 = unlimited (widened to the safe "
+                        "provider ceiling before the SDK call).")
     p.add_argument("--mode-override", choices=["api", "manual"], default=None,
                    help="Override every member's transport mode.")
     p.add_argument("--model", action="append", default=None, dest="model",

package/scripts/install.py CHANGED Viewed

@@ -12,8 +12,8 @@ format in `.agent-settings.yml`, leaves a one-shot backup as
 exactly once; subsequent runs are idempotent.
 Usage:
-  python3 scripts/install.py                     # defaults: cost_profile=minimal
-  python3 scripts/install.py --profile=balanced  # set cost_profile=balanced
+  python3 scripts/install.py                     # defaults: cost_profile=balanced
+  python3 scripts/install.py --profile=minimal   # set cost_profile=minimal (kernel only)
   python3 scripts/install.py --force             # overwrite existing files
   python3 scripts/install.py --skip-bridges      # only create .agent-settings.yml
   python3 scripts/install.py --project <dir>     # override project root
@@ -42,7 +42,7 @@ try:
 except ImportError:  # pragma: no cover — alt sys.path layout
     from _lib.json_pointers import build_merge_entries  # type: ignore[no-redef]  # noqa: PLC0415
-DEFAULT_PROFILE = "minimal"
+DEFAULT_PROFILE = "balanced"
 SUPPORTED_PROFILES = ("minimal", "balanced", "full")
 COST_PROFILE_PLACEHOLDER = "__COST_PROFILE__"

package/scripts/run_skill_evals.py ADDED Viewed

@@ -0,0 +1,185 @@
+#!/usr/bin/env python3
+"""Quantitative skill-eval orchestrator (skill-writing § 7).
+Scaffolds, aggregates, and reports sub-agent eval runs for a skill.
+Sub-agent SPAWNING is per-environment (Claude Code, Augment Code,
+council) and is left as a stub `_spawn_subagent(...)` that authors
+implement once for their environment. The rest of the loop —
+scaffold / aggregate / report — works out of the box and reads /
+writes JSON files in `runs/`.
+Layout per skill:
+    .agent-src.uncompressed/skills/{name}/evals/
+        evals.json
+        runs/                              # gitignored
+            {timestamp}-baseline/{scenario_id}/output.txt
+            {timestamp}-baseline/{scenario_id}/grade.json
+            {timestamp}-with-skill/{scenario_id}/output.txt
+            {timestamp}-with-skill/{scenario_id}/grade.json
+            {timestamp}-benchmark.json
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+# Directory one level above the folder that holds this script.
+REPO_ROOT = Path(__file__).resolve().parent.parent
+# Skills are looked up by name directly under this directory.
+SKILLS_ROOT = REPO_ROOT / ".agent-src.uncompressed" / "skills"
+def _skill_dir(skill: str) -> Path:
+    """Return the directory of `skill` under `SKILLS_ROOT`.
+    Exits the process with an error message when that directory is missing.
+    """
+    candidate = SKILLS_ROOT / skill
+    if candidate.is_dir():
+        return candidate
+    sys.exit(f"error: skill {skill!r} not found at {candidate}")
+def _evals_dir(skill: str) -> Path:
+    """Return the `evals/` path inside the skill's directory.
+    The skill directory must exist (see `_skill_dir`); `evals/` itself is
+    not checked here.
+    """
+    return _skill_dir(skill) / "evals"
+def _load_evals(skill: str) -> dict[str, Any]:
+    """Load and return the parsed `evals/evals.json` of `skill`.
+    Exits the process with an error message when the file is missing.
+    """
+    spec_path = _evals_dir(skill) / "evals.json"
+    if not spec_path.exists():
+        sys.exit(f"error: {spec_path} not found — create it before scaffolding")
+    with spec_path.open(encoding="utf-8") as handle:
+        return json.load(handle)
+def _timestamp() -> str:
+    """Return the current UTC time as a compact `YYYYMMDDTHHMMSSZ` string."""
+    now_utc = datetime.now(timezone.utc)
+    return f"{now_utc:%Y%m%dT%H%M%SZ}"
+def _spawn_subagent(prompt: str, *, load_skill: str | None) -> dict[str, Any]:
+    """STUB — implement per environment.
+    Must return {"output": str, "elapsed_s": float, "tokens_in": int,
+    "tokens_out": int}. When load_skill is None, run baseline; when
+    set, load that skill into the sub-agent's context.
+    Raises:
+        NotImplementedError: Always, until an environment-specific
+            implementation replaces this stub.
+    """
+    raise NotImplementedError(
+        "implement _spawn_subagent for this environment (Claude Code, "
+        "Augment, council, ...) — see docstring contract"
+    )
+def _grade_assertions(output: str, run_dir: Path, assertions: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Grade `output` against each assertion and return one result per entry.
+    Supported kinds:
+      - ``contains``: passes when ``value`` is a substring of `output`.
+      - ``file_exists``: passes when ``path`` exists under `run_dir` or as
+        given (relative to the working directory).
+      - ``rubric``: cannot be graded here; ``pass`` is left as None.
+    Any other kind is recorded as a failure with an explanatory note.
+    """
+    graded: list[dict[str, Any]] = []
+    for spec in assertions:
+        kind = spec.get("kind")
+        if kind == "contains":
+            needle = spec["value"]
+            entry = {"kind": kind, "value": needle, "pass": needle in output}
+        elif kind == "file_exists":
+            target = spec["path"]
+            found = (run_dir / target).exists() or Path(target).exists()
+            entry = {"kind": kind, "path": target, "pass": found}
+        elif kind == "rubric":
+            entry = {
+                "kind": kind,
+                "criterion": spec["criterion"],
+                "pass": None,
+                "note": "rubric grading requires sub-agent — fill in manually or via grader",
+            }
+        else:
+            entry = {"kind": kind, "pass": False, "note": f"unknown assertion kind {kind!r}"}
+        graded.append(entry)
+    return graded
+def cmd_scaffold(skill: str) -> int:
+    """Create the run tree for a new eval run and print its timestamp.
+    For both arms (`baseline`, `with-skill`) and every scenario in
+    `evals.json`, a directory `runs/{timestamp}-{arm}/{scenario_id}/` is
+    created and a `meta.json` describing the scenario is written into it.
+    Exits the process when `evals.json` lists no scenarios.
+    Returns:
+        0 on success.
+    """
+    scenarios = _load_evals(skill).get("scenarios", [])
+    if not scenarios:
+        sys.exit("error: evals.json has no scenarios")
+    ts = _timestamp()
+    runs_root = _evals_dir(skill) / "runs"
+    for arm in ("baseline", "with-skill"):
+        arm_root = runs_root / f"{ts}-{arm}"
+        for scenario in scenarios:
+            scenario_dir = arm_root / scenario["id"]
+            scenario_dir.mkdir(parents=True, exist_ok=True)
+            meta = {
+                "skill": skill,
+                "arm": arm,
+                "scenario_id": scenario["id"],
+                "prompt": scenario["prompt"],
+                "assertions": scenario.get("assertions", []),
+                "timestamp": ts,
+            }
+            meta_text = json.dumps(meta, indent=2) + "\n"
+            (scenario_dir / "meta.json").write_text(meta_text, encoding="utf-8")
+    print(f"scaffolded {len(scenarios)} scenarios × 2 arms at runs/{ts}-{{baseline,with-skill}}/")
+    print(f"timestamp: {ts}")
+    return 0
+def cmd_aggregate(skill: str, run: str) -> int:
+    """Fold the per-scenario `grade.json` files of a run into one benchmark.
+    For each scenario and arm the pass count, total and timing/token
+    figures are collected; an arm without a `grade.json` is recorded as
+    "missing". A scenario counts towards an arm's total only when it has at
+    least one result and every result passed. The summary is written to
+    `runs/{run}-benchmark.json` and the totals are printed.
+    Returns:
+        0 on success.
+    """
+    runs_root = _evals_dir(skill) / "runs"
+    spec = _load_evals(skill)
+    bench: dict[str, Any] = {"skill": skill, "run": run, "generated_at": _timestamp(), "scenarios": []}
+    totals = {"baseline_pass": 0, "with_skill_pass": 0, "scenarios": 0}
+    # Maps each arm to the totals counter it feeds.
+    counter_for_arm = {"baseline": "baseline_pass", "with-skill": "with_skill_pass"}
+    for scenario in spec.get("scenarios", []):
+        arms: dict[str, Any] = {}
+        row: dict[str, Any] = {"id": scenario["id"], "arms": arms}
+        for arm, counter in counter_for_arm.items():
+            grade_path = runs_root / f"{run}-{arm}" / scenario["id"] / "grade.json"
+            if not grade_path.exists():
+                arms[arm] = {"status": "missing", "pass_count": 0, "total": 0}
+                continue
+            grade = json.loads(grade_path.read_text(encoding="utf-8"))
+            results = grade.get("results", [])
+            passed = sum(1 for result in results if result.get("pass") is True)
+            arms[arm] = {
+                "status": "graded",
+                "pass_count": passed,
+                "total": len(results),
+                "elapsed_s": grade.get("elapsed_s"),
+                "tokens_in": grade.get("tokens_in"),
+                "tokens_out": grade.get("tokens_out"),
+            }
+            # An empty result list never counts as a full pass.
+            if results and passed == len(results):
+                totals[counter] += 1
+        bench["scenarios"].append(row)
+        totals["scenarios"] += 1
+    bench["totals"] = totals
+    out = runs_root / f"{run}-benchmark.json"
+    out.write_text(json.dumps(bench, indent=2) + "\n", encoding="utf-8")
+    print(f"wrote {out.relative_to(REPO_ROOT)}")
+    print(f"  baseline pass: {totals['baseline_pass']}/{totals['scenarios']}")
+    print(f"  with-skill pass: {totals['with_skill_pass']}/{totals['scenarios']}")
+    return 0
+def cmd_report(skill: str, run: str) -> int:
+    """Print a Markdown table summarising `runs/{run}-benchmark.json`.
+    Each row shows the pass ratio of both arms plus the with-skill minus
+    baseline difference in `tokens_out` and `elapsed_s` (missing figures
+    count as 0). Exits the process when the benchmark file does not exist.
+    Returns:
+        0 on success.
+    """
+    bench_path = _evals_dir(skill) / "runs" / f"{run}-benchmark.json"
+    if not bench_path.exists():
+        sys.exit(f"error: {bench_path} not found — run aggregate first")
+    bench = json.loads(bench_path.read_text(encoding="utf-8"))
+    print(f"# Skill eval report — {skill} @ {run}\n")
+    print("| Scenario | Baseline | With skill | Δ tokens_out | Δ elapsed_s |")
+    print("|---|---|---|---|---|")
+    for row in bench["scenarios"]:
+        base = row["arms"].get("baseline", {})
+        skilled = row["arms"].get("with-skill", {})
+        base_ratio = f"{base.get('pass_count', 0)}/{base.get('total', 0)}"
+        skilled_ratio = f"{skilled.get('pass_count', 0)}/{skilled.get('total', 0)}"
+        token_delta = (skilled.get("tokens_out") or 0) - (base.get("tokens_out") or 0)
+        time_delta = (skilled.get("elapsed_s") or 0) - (base.get("elapsed_s") or 0)
+        print(f"| {row['id']} | {base_ratio} | {skilled_ratio} | {token_delta:+d} | {time_delta:+.2f} |")
+    totals = bench["totals"]
+    print(f"\n**Totals:** baseline {totals['baseline_pass']}/{totals['scenarios']} · with-skill {totals['with_skill_pass']}/{totals['scenarios']}")
+    return 0
+def main() -> int:
+    """Parse the command line and dispatch to the chosen sub-command.
+    Sub-commands are `scaffold`, `aggregate` and `report`; each takes a
+    skill name, and the latter two also require `--run`.
+    Returns:
+        The exit code of the sub-command, or 1 for an unrecognised one.
+    """
+    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
+    commands = parser.add_subparsers(dest="cmd", required=True)
+    for name in ("scaffold", "aggregate", "report"):
+        command = commands.add_parser(name)
+        command.add_argument("skill")
+        if name != "scaffold":
+            command.add_argument("--run", required=True, help="run timestamp (from scaffold output)")
+    args = parser.parse_args()
+    if args.cmd == "scaffold":
+        return cmd_scaffold(args.skill)
+    # The remaining sub-commands share the (skill, run) signature.
+    run_handlers = {"aggregate": cmd_aggregate, "report": cmd_report}
+    handler = run_handlers.get(args.cmd)
+    if handler is None:
+        return 1
+    return handler(args.skill, args.run)
+# Script entry point: main()'s return value becomes the process exit code.
+if __name__ == "__main__":
+    sys.exit(main())

package/scripts/schemas/command.schema.json CHANGED Viewed

@@ -39,6 +39,11 @@
       "pattern": "^[a-z][a-z0-9-]*$",
       "description": "Locked verb cluster this command belongs to. See docs/contracts/command-clusters.md."
     },
+    "type": {
+      "type": "string",
+      "enum": ["orchestrator"],
+      "description": "Optional type tag. `orchestrator` marks a command that aggregates other commands / skills (cluster routers, top-level entry points) and exempts it from the `command_missing_skill_references` linter check. Omit the key for ordinary commands. See road-to-productization.md P5.3."
+    },
     "sub": {
       "type": "string",
       "pattern": "^[a-z][a-z0-9-]*$",

package/scripts/schemas/skill.schema.json CHANGED Viewed

@@ -47,6 +47,10 @@
       "enum": ["senior"],
       "description": "Optional tier marker. `senior` opts the skill into the Senior-Tier Required Structure check (Context-First lead, Related Skills, Proactive Triggers, Output Artifacts) per .agent-src.uncompressed/rules/skill-quality.md."
     },
+    "meta_skill": {
+      "type": "boolean",
+      "description": "Opt-out of the linter's `skill_too_large` warn for skills whose purpose IS breadth (skill-writing, agent-docs-writing, skill-reviewer). Meta-skills inherently bundle multiple procedures and inline examples. Use sparingly — every meta_skill: true is a load-on-context trade-off."
+    },
     "external_source": {
       "type": "string",
       "format": "uri",