npm - agim-cli - Versions diffs - 1.2.144 → 1.2.148 - Mend

agim-cli 1.2.144 → 1.2.148

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (611) hide show

package/dist/core/skills/builtin/skill-comply/scripts/report.py ADDED Viewed

@@ -0,0 +1,170 @@
+"""Generate Markdown compliance reports."""
+from __future__ import annotations
+from datetime import datetime, timezone
+from pathlib import Path
+from scripts.grader import ComplianceResult
+from scripts.parser import ComplianceSpec, ObservationEvent
+from scripts.scenario_generator import Scenario
+def generate_report(
+    skill_path: Path,
+    spec: ComplianceSpec,
+    results: list[tuple[str, ComplianceResult, list[ObservationEvent]]],
+    scenarios: list[Scenario] | None = None,
+) -> str:
+    """Generate a Markdown compliance report.
+    Args:
+        skill_path: Path to the skill file that was tested.
+        spec: The compliance spec used for grading.
+        results: List of (scenario_level_name, ComplianceResult, observations) tuples.
+        scenarios: Original scenario definitions with prompts.
+    """
+    now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+    overall = _overall_compliance(results)
+    threshold = spec.threshold_promote_to_hook
+    lines: list[str] = []
+    lines.append(f"# skill-comply Report: {skill_path.name}")
+    lines.append(f"Generated: {now}")
+    lines.append("")
+    # Summary
+    lines.append("## Summary")
+    lines.append("")
+    lines.append(f"| Metric | Value |")
+    lines.append(f"|--------|-------|")
+    lines.append(f"| Skill | `{skill_path}` |")
+    lines.append(f"| Spec | {spec.id} |")
+    lines.append(f"| Scenarios | {len(results)} |")
+    lines.append(f"| Overall Compliance | {overall:.0%} |")
+    lines.append(f"| Threshold | {threshold:.0%} |")
+    promote_steps = _steps_to_promote(spec, results, threshold)
+    if promote_steps:
+        step_names = ", ".join(promote_steps)
+        lines.append(f"| Recommendation | **Promote {step_names} to hooks** |")
+    else:
+        lines.append(f"| Recommendation | All steps above threshold — no hook promotion needed |")
+    lines.append("")
+    # Expected Behavioral Sequence
+    lines.append("## Expected Behavioral Sequence")
+    lines.append("")
+    lines.append("| # | Step | Required | Description |")
+    lines.append("|---|------|----------|-------------|")
+    for i, step in enumerate(spec.steps, 1):
+        req = "Yes" if step.required else "No"
+        lines.append(f"| {i} | {step.id} | {req} | {step.detector.description} |")
+    lines.append("")
+    # Scenario Results
+    lines.append("## Scenario Results")
+    lines.append("")
+    lines.append("| Scenario | Compliance | Failed Steps |")
+    lines.append("|----------|-----------|----------------|")
+    for level_name, result, _obs in results:
+        failed = [s.step_id for s in result.steps if not s.detected
+                  and any(sp.id == s.step_id and sp.required for sp in spec.steps)]
+        failed_str = ", ".join(failed) if failed else "—"
+        lines.append(f"| {level_name} | {result.compliance_rate:.0%} | {failed_str} |")
+    lines.append("")
+    # Scenario Prompts
+    if scenarios:
+        lines.append("## Scenario Prompts")
+        lines.append("")
+        for s in scenarios:
+            lines.append(f"### {s.level_name} (Level {s.level})")
+            lines.append("")
+            for prompt_line in s.prompt.splitlines():
+                lines.append(f"> {prompt_line}")
+            lines.append("")
+    # Hook Promotion Recommendations (optional/advanced)
+    if promote_steps:
+        lines.append("## Advanced: Hook Promotion Recommendations (optional)")
+        lines.append("")
+        for step_id in promote_steps:
+            rate = _step_compliance_rate(step_id, results)
+            step = next(s for s in spec.steps if s.id == step_id)
+            lines.append(
+                f"- **{step_id}** (compliance {rate:.0%}): {step.description}"
+            )
+        lines.append("")
+    # Per-scenario details with timeline
+    lines.append("## Detail")
+    lines.append("")
+    for level_name, result, observations in results:
+        lines.append(f"### {level_name} (Compliance: {result.compliance_rate:.0%})")
+        lines.append("")
+        lines.append("| Step | Required | Detected | Reason |")
+        lines.append("|------|----------|----------|--------|")
+        for sr in result.steps:
+            req = "Yes" if any(
+                sp.id == sr.step_id and sp.required for sp in spec.steps
+            ) else "No"
+            det = "YES" if sr.detected else "NO"
+            reason = sr.failure_reason or "—"
+            lines.append(f"| {sr.step_id} | {req} | {det} | {reason} |")
+        lines.append("")
+        # Timeline: show what the agent actually did
+        if observations:
+            # Build reverse index: event_index → step_id
+            index_to_step: dict[int, str] = {}
+            for step_id, indices in result.classification.items():
+                for idx in indices:
+                    index_to_step[idx] = step_id
+            lines.append(f"**Tool Call Timeline ({len(observations)} calls)**")
+            lines.append("")
+            lines.append("| # | Tool | Input | Output | Classified As |")
+            lines.append("|---|------|-------|--------|------|")
+            for i, obs in enumerate(observations):
+                step_label = index_to_step.get(i, "—")
+                input_summary = obs.input[:100].replace("|", "\\|").replace("\n", " ")
+                output_summary = obs.output[:50].replace("|", "\\|").replace("\n", " ")
+                lines.append(
+                    f"| {i} | {obs.tool} | {input_summary} | {output_summary} | {step_label} |"
+                )
+            lines.append("")
+    return "\n".join(lines)
+def _overall_compliance(results: list[tuple[str, ComplianceResult, list[ObservationEvent]]]) -> float:
+    if not results:
+        return 0.0
+    return sum(r.compliance_rate for _, r, _obs in results) / len(results)
+def _step_compliance_rate(
+    step_id: str,
+    results: list[tuple[str, ComplianceResult, list[ObservationEvent]]],
+) -> float:
+    detected = sum(
+        1 for _, r, _obs in results
+        for s in r.steps if s.step_id == step_id and s.detected
+    )
+    return detected / len(results) if results else 0.0
+def _steps_to_promote(
+    spec: ComplianceSpec,
+    results: list[tuple[str, ComplianceResult, list[ObservationEvent]]],
+    threshold: float,
+) -> list[str]:
+    promote = []
+    for step in spec.steps:
+        if not step.required:
+            continue
+        rate = _step_compliance_rate(step.id, results)
+        if rate < threshold:
+            promote.append(step.id)
+    return promote

package/dist/core/skills/builtin/skill-comply/scripts/run.py ADDED Viewed

@@ -0,0 +1,127 @@
+"""CLI entry point for skill-comply."""
+from __future__ import annotations
+import argparse
+import logging
+import sys
+from pathlib import Path
+from typing import Any
+import yaml
+from scripts.grader import grade
+from scripts.report import generate_report
+from scripts.runner import run_scenario
+from scripts.scenario_generator import generate_scenarios
+from scripts.spec_generator import generate_spec
+logger = logging.getLogger(__name__)
+def main() -> None:
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+    parser = argparse.ArgumentParser(
+        description="skill-comply: Measure skill compliance rates",
+    )
+    parser.add_argument(
+        "skill",
+        type=Path,
+        help="Path to skill/rule file to test",
+    )
+    parser.add_argument(
+        "--model",
+        default="sonnet",
+        help="Model for scenario execution (default: sonnet)",
+    )
+    parser.add_argument(
+        "--gen-model",
+        default="haiku",
+        help="Model for spec/scenario generation (default: haiku)",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Generate spec and scenarios without executing",
+    )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=None,
+        help="Output report path (default: results/<skill-name>.md)",
+    )
+    args = parser.parse_args()
+    if not args.skill.is_file():
+        logger.error("Error: Skill file not found: %s", args.skill)
+        sys.exit(1)
+    results_dir = Path(__file__).parent.parent / "results"
+    results_dir.mkdir(exist_ok=True)
+    # Step 1: Generate compliance spec
+    logger.info("[1/4] Generating compliance spec from %s...", args.skill.name)
+    spec = generate_spec(args.skill, model=args.gen_model)
+    logger.info("       %d steps extracted", len(spec.steps))
+    # Step 2: Generate scenarios
+    spec_yaml = yaml.dump({
+        "steps": [
+            {"id": s.id, "description": s.description, "required": s.required}
+            for s in spec.steps
+        ]
+    })
+    logger.info("[2/4] Generating scenarios (3 prompt strictness levels)...")
+    scenarios = generate_scenarios(args.skill, spec_yaml, model=args.gen_model)
+    logger.info("       %d scenarios generated", len(scenarios))
+    for s in scenarios:
+        logger.info("       - %s: %s", s.level_name, s.description[:60])
+    if args.dry_run:
+        logger.info("\n[dry-run] Spec and scenarios generated. Skipping execution.")
+        logger.info("\nSpec: %s (%d steps)", spec.id, len(spec.steps))
+        for step in spec.steps:
+            marker = "*" if step.required else " "
+            logger.info("  [%s] %s: %s", marker, step.id, step.description)
+        return
+    # Step 3: Execute scenarios
+    logger.info("[3/4] Executing scenarios (model=%s)...", args.model)
+    graded_results: list[tuple[str, Any, list[Any]]] = []
+    for scenario in scenarios:
+        logger.info("       Running %s...", scenario.level_name)
+        run = run_scenario(scenario, model=args.model)
+        result = grade(spec, list(run.observations))
+        graded_results.append((scenario.level_name, result, list(run.observations)))
+        logger.info("       %s: %.0f%%", scenario.level_name, result.compliance_rate * 100)
+    # Step 4: Generate report
+    skill_name = args.skill.parent.name if args.skill.stem == "SKILL" else args.skill.stem
+    output_path = args.output or results_dir / f"{skill_name}.md"
+    logger.info("[4/4] Generating report...")
+    report = generate_report(args.skill, spec, graded_results, scenarios=scenarios)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(report)
+    logger.info("       Report saved to %s", output_path)
+    # Summary
+    if not graded_results:
+        logger.warning("No scenarios were executed.")
+        return
+    overall = sum(r.compliance_rate for _, r, _obs in graded_results) / len(graded_results)
+    logger.info("\n%s", "=" * 50)
+    logger.info("Overall Compliance: %.0f%%", overall * 100)
+    if overall < spec.threshold_promote_to_hook:
+        logger.info(
+            "Recommendation: Some steps have low compliance. "
+            "Consider promoting them to hooks. See the report for details."
+        )
+if __name__ == "__main__":
+    main()

package/dist/core/skills/builtin/skill-comply/scripts/runner.py ADDED Viewed

@@ -0,0 +1,186 @@
+"""Run scenarios via claude -p and parse tool calls from stream-json output."""
+from __future__ import annotations
+import json
+import re
+import shlex
+import shutil
+import subprocess
+from dataclasses import dataclass
+from pathlib import Path
+from scripts.parser import ObservationEvent
+from scripts.scenario_generator import Scenario
+SANDBOX_BASE = Path("/tmp/skill-comply-sandbox")
+ALLOWED_MODELS = frozenset({"haiku", "sonnet", "opus"})
+# Shell builtins cannot be invoked via subprocess.run; cwd is already
+# controlled by the cwd= keyword. Scenarios that include these in
+# setup_commands (a common shell-style convention) must be tolerated.
+SHELL_BUILTINS = frozenset({"cd", "pushd", "popd"})
+@dataclass(frozen=True)
+class ScenarioRun:  # Immutable record of one executed scenario, as returned by run_scenario.
+    scenario: Scenario  # the scenario that was executed
+    observations: tuple[ObservationEvent, ...]  # tool calls parsed from the stream-json output, in call order
+    sandbox_dir: Path  # directory used as the working directory for the run
+def run_scenario(
+    scenario: Scenario,
+    model: str = "sonnet",
+    max_turns: int = 30,
+    timeout: int = 300,
+) -> ScenarioRun:
+    """Execute a scenario and extract tool calls from stream-json output."""
+    if model not in ALLOWED_MODELS:
+        raise ValueError(f"Unknown model: {model!r}. Allowed: {ALLOWED_MODELS}")
+    sandbox_dir = _safe_sandbox_dir(scenario.id)
+    _setup_sandbox(sandbox_dir, scenario)
+    result = subprocess.run(
+        [
+            "claude", "-p", scenario.prompt,
+            "--model", model,
+            "--max-turns", str(max_turns),
+            "--add-dir", str(sandbox_dir),
+            "--allowedTools", "Read,Write,Edit,Bash,Glob,Grep",
+            "--output-format", "stream-json",
+            "--verbose",
+        ],
+        capture_output=True,
+        text=True,
+        timeout=timeout,
+        cwd=sandbox_dir,
+    )
+    # claude -p returns rc=1 when --max-turns is reached, but the stream-json
+    # output is still complete and parseable. Treat this graceful termination
+    # as non-fatal so scenarios that hit the turn cap still produce usable
+    # observations.
+    nonfatal_max_turns = (
+        result.returncode == 1
+        and '"terminal_reason":"max_turns"' in result.stdout
+    )
+    if result.returncode != 0 and not nonfatal_max_turns:
+        # Include both stderr and stdout tails. claude -p often surfaces the
+        # actual failure context (model error JSON, partial stream-json) on
+        # stdout, while stderr carries generic transport / auth messages.
+        # Showing both dramatically reduces "rc=N: <empty>" debugging dead-ends.
+        raise RuntimeError(
+            f"claude -p failed (rc={result.returncode}): "
+            f"stderr={result.stderr[:500]!r} stdout_tail={result.stdout[-500:]!r}"
+        )
+    observations = _parse_stream_json(result.stdout)
+    return ScenarioRun(
+        scenario=scenario,
+        observations=tuple(observations),
+        sandbox_dir=sandbox_dir,
+    )
+def _safe_sandbox_dir(scenario_id: str) -> Path:
+    """Sanitize scenario ID and ensure path stays within sandbox base."""
+    safe_id = re.sub(r"[^a-zA-Z0-9\-_]", "_", scenario_id)
+    path = SANDBOX_BASE / safe_id
+    # Validate path stays within sandbox base (raises ValueError on traversal)
+    path.resolve().relative_to(SANDBOX_BASE.resolve())
+    return path
+def _setup_sandbox(sandbox_dir: Path, scenario: Scenario) -> None:
+    """Create sandbox directory and run setup commands."""
+    if sandbox_dir.exists():
+        shutil.rmtree(sandbox_dir)
+    sandbox_dir.mkdir(parents=True)
+    subprocess.run(["git", "init"], cwd=sandbox_dir, capture_output=True)
+    for cmd in scenario.setup_commands:
+        parts = shlex.split(cmd)
+        if not parts or parts[0] in SHELL_BUILTINS:
+            # Shell builtins (cd/pushd/popd) cannot run as subprocess; skip.
+            continue
+        try:
+            subprocess.run(parts, cwd=sandbox_dir, capture_output=True)
+        except FileNotFoundError:
+            # Setup tool not installed in this environment; skip rather than
+            # crash the whole scenario. The compliance run continues.
+            continue
+def _parse_stream_json(stdout: str) -> list[ObservationEvent]:
+    """Parse claude -p stream-json output into ObservationEvents.
+    Stream-json format:
+    - type=assistant with content[].type=tool_use → tool call (name, input)
+    - type=user with content[].type=tool_result → tool result (output)
+    """
+    events: list[ObservationEvent] = []
+    pending: dict[str, dict] = {}
+    event_counter = 0
+    for line in stdout.strip().splitlines():
+        try:
+            msg = json.loads(line)
+        except json.JSONDecodeError:
+            continue
+        msg_type = msg.get("type")
+        if msg_type == "assistant":
+            content = msg.get("message", {}).get("content", [])
+            for block in content:
+                if block.get("type") == "tool_use":
+                    tool_use_id = block.get("id", "")
+                    tool_input = block.get("input", {})
+                    input_str = (
+                        json.dumps(tool_input)[:5000]
+                        if isinstance(tool_input, dict)
+                        else str(tool_input)[:5000]
+                    )
+                    pending[tool_use_id] = {
+                        "tool": block.get("name", "unknown"),
+                        "input": input_str,
+                        "order": event_counter,
+                    }
+                    event_counter += 1
+        elif msg_type == "user":
+            content = msg.get("message", {}).get("content", [])
+            if isinstance(content, list):
+                for block in content:
+                    tool_use_id = block.get("tool_use_id", "")
+                    if tool_use_id in pending:
+                        info = pending.pop(tool_use_id)
+                        output_content = block.get("content", "")
+                        if isinstance(output_content, list):
+                            output_str = json.dumps(output_content)[:5000]
+                        else:
+                            output_str = str(output_content)[:5000]
+                        events.append(ObservationEvent(
+                            timestamp=f"T{info['order']:04d}",
+                            event="tool_complete",
+                            tool=info["tool"],
+                            session=msg.get("session_id", "unknown"),
+                            input=info["input"],
+                            output=output_str,
+                        ))
+    for _tool_use_id, info in pending.items():
+        events.append(ObservationEvent(
+            timestamp=f"T{info['order']:04d}",
+            event="tool_complete",
+            tool=info["tool"],
+            session="unknown",
+            input=info["input"],
+            output="",
+        ))
+    return sorted(events, key=lambda e: e.timestamp)

package/dist/core/skills/builtin/skill-comply/scripts/scenario_generator.py ADDED Viewed

@@ -0,0 +1,70 @@
+"""Generate pressure scenarios from skill + spec using LLM."""
+from __future__ import annotations
+import subprocess
+from dataclasses import dataclass
+from pathlib import Path
+import yaml
+from scripts.utils import extract_yaml
+PROMPTS_DIR = Path(__file__).parent.parent / "prompts"
+@dataclass(frozen=True)
+class Scenario:  # Immutable description of one generated test scenario.
+    id: str  # identifier as given in the generated YAML
+    level: int  # sort key; generate_scenarios returns scenarios in ascending level order
+    level_name: str  # label for this level
+    description: str  # summary text from the generated YAML
+    prompt: str  # prompt text, stripped of surrounding whitespace
+    setup_commands: tuple[str, ...]  # command strings from the generated YAML; empty when the key is absent
+def generate_scenarios(
+    skill_path: Path,
+    spec_yaml: str,
+    model: str = "haiku",
+) -> list[Scenario]:
+    """Generate 3 scenarios with decreasing prompt strictness.
+    Calls claude -p with the scenario_generator prompt, parses YAML output.
+    """
+    skill_content = skill_path.read_text()
+    prompt_template = (PROMPTS_DIR / "scenario_generator.md").read_text()
+    prompt = (
+        prompt_template
+        .replace("{skill_content}", skill_content)
+        .replace("{spec_yaml}", spec_yaml)
+    )
+    result = subprocess.run(
+        ["claude", "-p", prompt, "--model", model, "--output-format", "text"],
+        capture_output=True,
+        text=True,
+        timeout=120,
+    )
+    if result.returncode != 0:
+        raise RuntimeError(f"claude -p failed: {result.stderr}")
+    if not result.stdout.strip():
+        raise RuntimeError("claude -p returned empty output")
+    raw_yaml = extract_yaml(result.stdout)
+    parsed = yaml.safe_load(raw_yaml)
+    scenarios: list[Scenario] = []
+    for s in parsed["scenarios"]:
+        scenarios.append(Scenario(
+            id=s["id"],
+            level=s["level"],
+            level_name=s["level_name"],
+            description=s["description"],
+            prompt=s["prompt"].strip(),
+            setup_commands=tuple(s.get("setup_commands", [])),
+        ))
+    return sorted(scenarios, key=lambda s: s.level)

package/dist/core/skills/builtin/skill-comply/scripts/spec_generator.py ADDED Viewed

@@ -0,0 +1,72 @@
+"""Generate compliance specs from skill files using LLM."""
+from __future__ import annotations
+import subprocess
+import tempfile
+from pathlib import Path
+import yaml
+from scripts.parser import ComplianceSpec, parse_spec
+from scripts.utils import extract_yaml
+PROMPTS_DIR = Path(__file__).parent.parent / "prompts"
+def generate_spec(
+    skill_path: Path,
+    model: str = "haiku",
+    max_retries: int = 2,
+) -> ComplianceSpec:
+    """Generate a compliance spec from a skill/rule file.
+    Calls claude -p with the spec_generator prompt, parses YAML output.
+    Retries on YAML parse errors with error feedback.
+    """
+    skill_content = skill_path.read_text()
+    prompt_template = (PROMPTS_DIR / "spec_generator.md").read_text()
+    base_prompt = prompt_template.replace("{skill_content}", skill_content)
+    last_error: Exception | None = None
+    for attempt in range(max_retries + 1):
+        prompt = base_prompt
+        if attempt > 0 and last_error is not None:
+            prompt += (
+                f"\n\nPREVIOUS ATTEMPT FAILED with YAML parse error:\n"
+                f"{last_error}\n\n"
+                f"Please fix the YAML. Remember to quote all string values "
+                f"that contain colons, e.g.: description: \"Use type: description format\""
+            )
+        result = subprocess.run(
+            ["claude", "-p", prompt, "--model", model, "--output-format", "text"],
+            capture_output=True,
+            text=True,
+            timeout=120,
+        )
+        if result.returncode != 0:
+            raise RuntimeError(f"claude -p failed: {result.stderr}")
+        raw_yaml = extract_yaml(result.stdout)
+        tmp_path = None
+        with tempfile.NamedTemporaryFile(
+            mode="w", suffix=".yaml", delete=False,
+        ) as f:
+            f.write(raw_yaml)
+            tmp_path = Path(f.name)
+        try:
+            return parse_spec(tmp_path)
+        except (yaml.YAMLError, KeyError, TypeError) as e:
+            last_error = e
+            if attempt == max_retries:
+                raise
+        finally:
+            if tmp_path is not None:
+                tmp_path.unlink(missing_ok=True)
+    raise RuntimeError("unreachable")

package/dist/core/skills/builtin/skill-comply/scripts/utils.py ADDED Viewed

@@ -0,0 +1,13 @@
+"""Shared utilities for skill-comply scripts."""
+from __future__ import annotations
+def extract_yaml(text: str) -> str:
+    """Extract YAML from LLM output, stripping markdown fences if present."""
+    lines = text.strip().splitlines()
+    if lines and lines[0].startswith("```"):
+        lines = lines[1:]
+    if lines and lines[-1].startswith("```"):
+        lines = lines[:-1]
+    return "\n".join(lines)