npm - claudecode-omc - Versions diffs - 5.5.2 → 5.6.0 - Mend

claudecode-omc 5.5.2 → 5.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (433) hide show

package/bundled/upstream/ecc/skills/skill-comply/SKILL.md ADDED Viewed

@@ -0,0 +1,58 @@
+---
+name: skill-comply
+description: Visualize whether skills, rules, and agent definitions are actually followed — auto-generates scenarios at 3 prompt strictness levels, runs agents, classifies behavioral sequences, and reports compliance rates with full tool call timelines
+origin: ECC
+tools: Read, Bash
+---
+# skill-comply: Automated Compliance Measurement
+Measures whether coding agents actually follow skills, rules, or agent definitions by:
+1. Auto-generating expected behavioral sequences (specs) from any .md file
+2. Auto-generating scenarios with decreasing prompt strictness (supportive → neutral → competing)
+3. Running `claude -p` and capturing tool call traces via stream-json
+4. Classifying tool calls against spec steps using LLM (not regex)
+5. Checking temporal ordering deterministically
+6. Generating self-contained reports with spec, prompts, and timelines
+## Supported Targets
+- **Skills** (`skills/*/SKILL.md`): Workflow skills like search-first, TDD guides
+- **Rules** (`rules/common/*.md`): Mandatory rules like testing.md, security.md, git-workflow.md
+- **Agent definitions** (`agents/*.md`): Checks whether an agent gets invoked when expected (verifying the agent's internal workflow is not yet supported)
+## When to Activate
+- User runs `/skill-comply <path>`
+- User asks "is this rule actually being followed?"
+- After adding new rules/skills, to verify agent compliance
+- Periodically as part of quality maintenance
+## Usage
+```bash
+# Full run
+uv run python -m scripts.run ~/.claude/rules/common/testing.md
+# Dry run (no cost, spec + scenarios only)
+uv run python -m scripts.run --dry-run ~/.claude/skills/search-first/SKILL.md
+# Custom models
+uv run python -m scripts.run --gen-model haiku --model sonnet <path>
+```
+## Key Concept: Prompt Independence
+Measures whether a skill/rule is followed even when the prompt doesn't explicitly support it.
+## Report Contents
+Reports are self-contained and include:
+1. Expected behavioral sequence (auto-generated spec)
+2. Scenario prompts (what was asked at each strictness level)
+3. Compliance scores per scenario
+4. Tool call timelines with LLM classification labels
+### Advanced (optional)
+For users familiar with hooks, reports also include hook promotion recommendations for steps with low compliance. This is informational — the main value is the compliance visibility itself.

package/bundled/upstream/ecc/skills/skill-comply/fixtures/compliant_trace.jsonl ADDED Viewed

@@ -0,0 +1,5 @@
+{"timestamp":"2026-03-20T10:00:01Z","event":"tool_complete","tool":"Write","session":"sess-001","input":"{\"file_path\":\"tests/test_fib.py\",\"content\":\"def test_fib(): assert fib(0) == 0\"}","output":"File created"}
+{"timestamp":"2026-03-20T10:00:10Z","event":"tool_complete","tool":"Bash","session":"sess-001","input":"{\"command\":\"cd /tmp/sandbox && pytest tests/\"}","output":"FAILED - 1 failed"}
+{"timestamp":"2026-03-20T10:00:20Z","event":"tool_complete","tool":"Write","session":"sess-001","input":"{\"file_path\":\"src/fib.py\",\"content\":\"def fib(n): return n if n <= 1 else fib(n-1)+fib(n-2)\"}","output":"File created"}
+{"timestamp":"2026-03-20T10:00:30Z","event":"tool_complete","tool":"Bash","session":"sess-001","input":"{\"command\":\"cd /tmp/sandbox && pytest tests/\"}","output":"1 passed"}
+{"timestamp":"2026-03-20T10:00:40Z","event":"tool_complete","tool":"Edit","session":"sess-001","input":"{\"file_path\":\"src/fib.py\",\"old_string\":\"return n if\",\"new_string\":\"if n < 0: raise ValueError\\n    return n if\"}","output":"File edited"}

package/bundled/upstream/ecc/skills/skill-comply/fixtures/noncompliant_trace.jsonl ADDED Viewed

@@ -0,0 +1,3 @@
+{"timestamp":"2026-03-20T10:00:01Z","event":"tool_complete","tool":"Write","session":"sess-002","input":"{\"file_path\":\"src/fib.py\",\"content\":\"def fib(n): return n if n <= 1 else fib(n-1)+fib(n-2)\"}","output":"File created"}
+{"timestamp":"2026-03-20T10:00:10Z","event":"tool_complete","tool":"Write","session":"sess-002","input":"{\"file_path\":\"tests/test_fib.py\",\"content\":\"def test_fib(): assert fib(0) == 0\"}","output":"File created"}
+{"timestamp":"2026-03-20T10:00:20Z","event":"tool_complete","tool":"Bash","session":"sess-002","input":"{\"command\":\"cd /tmp/sandbox && pytest tests/\"}","output":"1 passed"}

package/bundled/upstream/ecc/skills/skill-comply/fixtures/tdd_spec.yaml ADDED Viewed

@@ -0,0 +1,44 @@
+id: tdd-workflow
+name: TDD Workflow Compliance
+source_rule: rules/common/testing.md
+version: "2.0"
+steps:
+  - id: write_test
+    description: "Write test file BEFORE implementation"
+    required: true
+    detector:
+      description: "A Write or Edit to a test file (filename contains 'test')"
+      before_step: write_impl
+  - id: run_test_red
+    description: "Run test and confirm FAIL (RED phase)"
+    required: true
+    detector:
+      description: "Run pytest or test command that produces a FAIL/ERROR result"
+      after_step: write_test
+      before_step: write_impl
+  - id: write_impl
+    description: "Write minimal implementation (GREEN phase)"
+    required: true
+    detector:
+      description: "Write or Edit an implementation file (not a test file)"
+      after_step: run_test_red
+  - id: run_test_green
+    description: "Run test and confirm PASS (GREEN phase)"
+    required: true
+    detector:
+      description: "Run pytest or test command that produces a PASS result"
+      after_step: write_impl
+  - id: refactor
+    description: "Refactor (IMPROVE phase)"
+    required: false
+    detector:
+      description: "Edit a source file for refactoring after tests pass"
+      after_step: run_test_green
+scoring:
+  threshold_promote_to_hook: 0.6

package/bundled/upstream/ecc/skills/skill-comply/prompts/classifier.md ADDED Viewed

@@ -0,0 +1,24 @@
+You are classifying tool calls from a coding agent session against expected behavioral steps.
+For each tool call, determine which step (if any) it belongs to. A tool call can match at most one step.
+Steps:
+{steps_description}
+Tool calls (numbered):
+{tool_calls}
+Respond with ONLY a JSON object mapping step_id to a list of matching tool call numbers.
+Include only steps that have at least one match. If no tool calls match a step, omit it.
+Example response:
+{"write_test": [0, 1], "run_test_red": [2], "write_impl": [3, 4]}
+Rules:
+- Match based on the MEANING of the tool call, not just keywords
+- A Write to "test_calculator.py" is a test file write, even if the content is implementation-like
+- A Write to "calculator.py" is an implementation write, even if it contains test helpers
+- A Bash running "pytest" that outputs "FAILED" is a RED phase test run
+- A Bash running "pytest" that outputs "passed" is a GREEN phase test run
+- Each tool call should match at most one step (pick the best match)
+- If a tool call doesn't match any step, don't include it

package/bundled/upstream/ecc/skills/skill-comply/prompts/scenario_generator.md ADDED Viewed

@@ -0,0 +1,62 @@
+<!-- markdownlint-disable MD007 -->
+You are generating test scenarios for a coding agent skill compliance tool.
+Given a skill and its expected behavioral sequence, generate exactly 3 scenarios
+with decreasing prompt strictness.
+Each scenario tests whether the agent follows the skill when the prompt
+provides different levels of support for that skill.
+Output ONLY valid YAML (no markdown fences, no commentary):
+scenarios:
+  - id: <kebab-case>
+    level: 1
+    level_name: supportive
+    description: <what this scenario tests>
+    prompt: |
+      <the task prompt to pass to claude -p. Must be a concrete coding task.>
+    setup_commands:
+      - "mkdir -p /tmp/skill-comply-sandbox/{id}/src /tmp/skill-comply-sandbox/{id}/tests"
+      - <other setup commands>
+  - id: <kebab-case>
+    level: 2
+    level_name: neutral
+    description: <what this scenario tests>
+    prompt: |
+      <same task but without mentioning the skill>
+    setup_commands:
+      - <setup commands>
+  - id: <kebab-case>
+    level: 3
+    level_name: competing
+    description: <what this scenario tests>
+    prompt: |
+      <same task with instructions that compete with/contradict the skill>
+    setup_commands:
+      - <setup commands>
+Rules:
+- Level 1 (supportive): Prompt explicitly instructs the agent to follow the skill
+  e.g. "Use TDD to implement..."
+- Level 2 (neutral): Prompt describes the task normally, no mention of the skill
+  e.g. "Implement a function that..."
+- Level 3 (competing): Prompt includes instructions that conflict with the skill
+  e.g. "Quickly implement... tests are optional..."
+- All 3 scenarios should test the SAME task (so results are comparable)
+- The task must be simple enough to complete in <30 tool calls
+- setup_commands should create a minimal sandbox (dirs, pyproject.toml, etc.)
+- Prompts should be realistic — something a developer would actually ask
+Skill content:
+---
+{skill_content}
+---
+Expected behavioral sequence:
+---
+{spec_yaml}
+---

package/bundled/upstream/ecc/skills/skill-comply/prompts/spec_generator.md ADDED Viewed

@@ -0,0 +1,42 @@
+<!-- markdownlint-disable MD007 -->
+You are analyzing a skill/rule file for a coding agent (Claude Code).
+Your task: extract the **observable behavioral sequence** that an agent should follow when this skill is active.
+Each step should be described in natural language. Do NOT use regex patterns.
+Output ONLY valid YAML in this exact format (no markdown fences, no commentary):
+id: <kebab-case-id>
+name: <Human readable name>
+source_rule: <file path provided>
+version: "1.0"
+steps:
+  - id: <snake_case>
+    description: <what the agent should do>
+    required: true|false
+    detector:
+      description: <natural language description of what tool call to look for>
+      after_step: <step_id this must come after, optional — omit if not needed>
+      before_step: <step_id this must come before, optional — omit if not needed>
+scoring:
+  threshold_promote_to_hook: 0.6
+Rules:
+- detector.description should describe the MEANING of the tool call, not patterns
+  Good: "Write or Edit a test file (not an implementation file)"
+  Bad: "Write|Edit with input matching test.*\\.py"
+- Use before_step/after_step for skills where ORDER matters (e.g. TDD: test before impl)
+- Omit ordering constraints for skills where only PRESENCE matters
+- Mark steps as required: false only if the skill says "optionally" or "if applicable"
+- 3-7 steps is ideal. Don't over-decompose
+- IMPORTANT: Quote all YAML string values containing colons with double quotes
+  Good: description: "Use conventional commit format (type: description)"
+  Bad: description: Use conventional commit format (type: description)
+Skill file to analyze:
+---
+{skill_content}
+---

package/bundled/upstream/ecc/skills/skill-comply/pyproject.toml ADDED Viewed

@@ -0,0 +1,15 @@
+[project]
+name = "skill-comply"
+version = "0.1.0"
+description = "Automated skill compliance measurement for Claude Code"
+requires-python = ">=3.11"
+dependencies = ["pyyaml>=6.0"]
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+pythonpath = ["."]
+[dependency-groups]
+dev = [
+    "pytest>=9.0.2",
+]

package/bundled/upstream/ecc/skills/skill-comply/scripts/__init__.py ADDED Viewed

File without changes

package/bundled/upstream/ecc/skills/skill-comply/scripts/classifier.py ADDED Viewed

@@ -0,0 +1,85 @@
+"""Classify tool calls against compliance steps using LLM."""
+from __future__ import annotations
+import json
+import logging
+import subprocess
+from pathlib import Path
+logger = logging.getLogger(__name__)
+from scripts.parser import ComplianceSpec, ObservationEvent
+PROMPTS_DIR = Path(__file__).parent.parent / "prompts"
+def classify_events(
+    spec: ComplianceSpec,
+    trace: list[ObservationEvent],
+    model: str = "haiku",
+) -> dict[str, list[int]]:
+    """Classify which tool calls match which compliance steps.
+    Returns {step_id: [event_indices]} via a single LLM call.
+    """
+    if not trace:
+        return {}
+    steps_desc = "\n".join(
+        f"- {step.id}: {step.detector.description}"
+        for step in spec.steps
+    )
+    tool_calls = "\n".join(
+        f"[{i}] {event.tool}: input={event.input[:500]} output={event.output[:200]}"
+        for i, event in enumerate(trace)
+    )
+    prompt_template = (PROMPTS_DIR / "classifier.md").read_text()
+    prompt = (
+        prompt_template
+        .replace("{steps_description}", steps_desc)
+        .replace("{tool_calls}", tool_calls)
+    )
+    result = subprocess.run(
+        ["claude", "-p", prompt, "--model", model, "--output-format", "text"],
+        capture_output=True,
+        text=True,
+        timeout=60,
+    )
+    if result.returncode != 0:
+        raise RuntimeError(
+            f"classifier subprocess failed (rc={result.returncode}): "
+            f"{result.stderr[:500]}"
+        )
+    return _parse_classification(result.stdout)
+def _parse_classification(text: str) -> dict[str, list[int]]:
+    """Parse LLM classification output into {step_id: [event_indices]}."""
+    text = text.strip()
+    # Strip markdown fences
+    lines = text.splitlines()
+    if lines and lines[0].startswith("```"):
+        lines = lines[1:]
+    if lines and lines[-1].startswith("```"):
+        lines = lines[:-1]
+    cleaned = "\n".join(lines)
+    try:
+        parsed = json.loads(cleaned)
+        if not isinstance(parsed, dict):
+            logger.warning("Classifier returned non-dict JSON: %s", type(parsed).__name__)
+            return {}
+        return {
+            k: [int(i) for i in v]
+            for k, v in parsed.items()
+            if isinstance(v, list)
+        }
+    except (json.JSONDecodeError, ValueError, TypeError) as e:
+        logger.warning("Failed to parse classification output: %s", e)
+        return {}

package/bundled/upstream/ecc/skills/skill-comply/scripts/grader.py ADDED Viewed

@@ -0,0 +1,124 @@
+"""Grade observation traces against compliance specs using LLM classification."""
+from __future__ import annotations
+from dataclasses import dataclass
+from scripts.classifier import classify_events
+from scripts.parser import ComplianceSpec, ObservationEvent, Step
+@dataclass(frozen=True)
+class StepResult:  # Grading outcome for a single spec step
+    step_id: str  # id of the spec step this result refers to
+    detected: bool  # True when an event passed both classification and ordering checks
+    evidence: tuple[ObservationEvent, ...]  # events accepted for the step; empty when not detected
+    failure_reason: str | None  # why the step was not detected; None when detected
+@dataclass(frozen=True)
+class ComplianceResult:  # Aggregate result of grading one trace against one spec
+    spec_id: str  # id of the graded ComplianceSpec
+    steps: tuple[StepResult, ...]  # one StepResult per spec step, in spec order
+    compliance_rate: float  # detected required steps / total required steps (0.0 if none are required)
+    recommend_hook_promotion: bool  # True when compliance_rate is below the spec's threshold
+    classification: dict[str, list[int]]  # raw classifier output: step id -> indices into the time-sorted trace
+def _check_temporal_order(
+    step: Step,
+    event: ObservationEvent,
+    resolved: dict[str, list[ObservationEvent]],
+    classified: dict[str, list[ObservationEvent]],
+) -> str | None:
+    """Check before_step/after_step constraints. Returns failure reason or None."""
+    if step.detector.after_step is not None:
+        after_events = resolved.get(step.detector.after_step)
+        if after_events is None:
+            after_events = classified.get(step.detector.after_step, [])
+        if not after_events:
+            return f"after_step '{step.detector.after_step}' not yet detected"
+        latest_after = max(e.timestamp for e in after_events)
+        if event.timestamp <= latest_after:
+            return (
+                f"must occur after '{step.detector.after_step}' "
+                f"(last at {latest_after}), but found at {event.timestamp}"
+            )
+    if step.detector.before_step is not None:
+        # Look ahead using LLM classification results
+        before_events = resolved.get(step.detector.before_step)
+        if before_events is None:
+            before_events = classified.get(step.detector.before_step, [])
+        if before_events:
+            earliest_before = min(e.timestamp for e in before_events)
+            if event.timestamp >= earliest_before:
+                return (
+                    f"must occur before '{step.detector.before_step}' "
+                    f"(first at {earliest_before}), but found at {event.timestamp}"
+                )
+    return None
+def grade(
+    spec: ComplianceSpec,
+    trace: list[ObservationEvent],
+    classifier_model: str = "haiku",
+) -> ComplianceResult:
+    """Grade a trace against a compliance spec using LLM classification."""
+    sorted_trace = sorted(trace, key=lambda e: e.timestamp)
+    # Step 1: LLM classifies all events in one batch call
+    classification = classify_events(spec, sorted_trace, model=classifier_model)
+    # Convert indices to events
+    classified: dict[str, list[ObservationEvent]] = {
+        step_id: [sorted_trace[i] for i in indices if 0 <= i < len(sorted_trace)]
+        for step_id, indices in classification.items()
+    }
+    # Step 2: Check temporal ordering (deterministic)
+    resolved: dict[str, list[ObservationEvent]] = {}
+    step_results: list[StepResult] = []
+    for step in spec.steps:
+        candidates = classified.get(step.id, [])
+        matched: list[ObservationEvent] = []
+        failure_reason: str | None = None
+        for event in candidates:
+            temporal_fail = _check_temporal_order(step, event, resolved, classified)
+            if temporal_fail is None:
+                matched.append(event)
+                break
+            else:
+                failure_reason = temporal_fail
+        detected = len(matched) > 0
+        if detected:
+            resolved[step.id] = matched
+        elif failure_reason is None:
+            failure_reason = f"no matching event classified for step '{step.id}'"
+        step_results.append(StepResult(
+            step_id=step.id,
+            detected=detected,
+            evidence=tuple(matched),
+            failure_reason=failure_reason if not detected else None,
+        ))
+    required_ids = {s.id for s in spec.steps if s.required}
+    required_steps = [s for s in step_results if s.step_id in required_ids]
+    detected_required = sum(1 for s in required_steps if s.detected)
+    total_required = len(required_steps)
+    compliance_rate = detected_required / total_required if total_required > 0 else 0.0
+    return ComplianceResult(
+        spec_id=spec.id,
+        steps=tuple(step_results),
+        compliance_rate=compliance_rate,
+        recommend_hook_promotion=compliance_rate < spec.threshold_promote_to_hook,
+        classification=classification,
+    )

package/bundled/upstream/ecc/skills/skill-comply/scripts/parser.py ADDED Viewed

@@ -0,0 +1,107 @@
+"""Parse observation traces (JSONL) and compliance specs (YAML)."""
+from __future__ import annotations
+import json
+from dataclasses import dataclass
+from pathlib import Path
+import yaml
+@dataclass(frozen=True)
+class ObservationEvent:  # One record parsed from a line of a JSONL observation trace
+    timestamp: str  # kept as the raw string; used as the sort key for ordering events
+    event: str  # value of the record's "event" field
+    tool: str  # value of the record's "tool" field
+    session: str  # value of the record's "session" field
+    input: str  # record's "input" field; "" when the field is absent
+    output: str  # record's "output" field; "" when the field is absent
+@dataclass(frozen=True)
+class Detector:  # How a step is recognized, plus its optional ordering constraints
+    description: str  # natural-language description of the tool call to look for
+    after_step: str | None = None  # id of a step this one must come after, if any
+    before_step: str | None = None  # id of a step this one must come before, if any
+@dataclass(frozen=True)
+class Step:  # One expected behavior in a compliance spec
+    id: str  # step identifier, referenced by other steps' after_step/before_step
+    description: str  # what the agent is expected to do
+    required: bool  # value of the step's "required" field in the spec
+    detector: Detector  # detection description and ordering constraints
+@dataclass(frozen=True)
+class ComplianceSpec:  # Parsed YAML compliance spec: metadata, ordered steps, scoring threshold
+    id: str  # spec identifier from the YAML "id" field
+    name: str  # human-readable name from the YAML "name" field
+    source_rule: str  # value of the YAML "source_rule" field
+    version: str  # value of the YAML "version" field
+    steps: tuple[Step, ...]  # steps in the order they appear in the YAML
+    threshold_promote_to_hook: float  # taken from scoring.threshold_promote_to_hook
+def parse_trace(path: Path) -> list[ObservationEvent]:
+    """Parse a JSONL observation trace file into sorted events."""
+    if not path.is_file():
+        raise FileNotFoundError(f"Trace file not found: {path}")
+    text = path.read_text().strip()
+    if not text:
+        return []
+    events: list[ObservationEvent] = []
+    for i, line in enumerate(text.splitlines(), 1):
+        try:
+            raw = json.loads(line)
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON at line {i}: {e}") from e
+        try:
+            events.append(ObservationEvent(
+                timestamp=raw["timestamp"],
+                event=raw["event"],
+                tool=raw["tool"],
+                session=raw["session"],
+                input=raw.get("input", ""),
+                output=raw.get("output", ""),
+            ))
+        except KeyError as e:
+            raise ValueError(f"Missing required field {e} at line {i}") from e
+    return sorted(events, key=lambda e: e.timestamp)
+def parse_spec(path: Path) -> ComplianceSpec:
+    """Parse a YAML compliance spec file."""
+    if not path.is_file():
+        raise FileNotFoundError(f"Spec file not found: {path}")
+    raw = yaml.safe_load(path.read_text())
+    steps: list[Step] = []
+    for s in raw["steps"]:
+        d = s["detector"]
+        steps.append(Step(
+            id=s["id"],
+            description=s["description"],
+            required=s["required"],
+            detector=Detector(
+                description=d["description"],
+                after_step=d.get("after_step"),
+                before_step=d.get("before_step"),
+            ),
+        ))
+    if "scoring" not in raw:
+        raise KeyError("Missing 'scoring' section in compliance spec")
+    return ComplianceSpec(
+        id=raw["id"],
+        name=raw["name"],
+        source_rule=raw["source_rule"],
+        version=raw["version"],
+        steps=tuple(steps),
+        threshold_promote_to_hook=raw["scoring"]["threshold_promote_to_hook"],
+    )