@cleocode/skills 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dispatch-config.json +404 -0
- package/index.d.ts +178 -0
- package/index.js +405 -0
- package/package.json +14 -0
- package/profiles/core.json +7 -0
- package/profiles/full.json +10 -0
- package/profiles/minimal.json +7 -0
- package/profiles/recommended.json +7 -0
- package/provider-skills-map.json +97 -0
- package/skills/_shared/cleo-style-guide.md +84 -0
- package/skills/_shared/manifest-operations.md +810 -0
- package/skills/_shared/placeholders.json +433 -0
- package/skills/_shared/skill-chaining-patterns.md +237 -0
- package/skills/_shared/subagent-protocol-base.md +223 -0
- package/skills/_shared/task-system-integration.md +232 -0
- package/skills/_shared/testing-framework-config.md +110 -0
- package/skills/ct-cleo/SKILL.md +490 -0
- package/skills/ct-cleo/references/anti-patterns.md +19 -0
- package/skills/ct-cleo/references/loom-lifecycle.md +136 -0
- package/skills/ct-cleo/references/orchestrator-constraints.md +55 -0
- package/skills/ct-cleo/references/session-protocol.md +162 -0
- package/skills/ct-codebase-mapper/SKILL.md +82 -0
- package/skills/ct-contribution/SKILL.md +521 -0
- package/skills/ct-contribution/templates/contribution-init.json +21 -0
- package/skills/ct-dev-workflow/SKILL.md +423 -0
- package/skills/ct-docs-lookup/SKILL.md +66 -0
- package/skills/ct-docs-review/SKILL.md +175 -0
- package/skills/ct-docs-write/SKILL.md +108 -0
- package/skills/ct-documentor/SKILL.md +231 -0
- package/skills/ct-epic-architect/SKILL.md +305 -0
- package/skills/ct-epic-architect/references/bug-epic-example.md +172 -0
- package/skills/ct-epic-architect/references/commands.md +201 -0
- package/skills/ct-epic-architect/references/feature-epic-example.md +210 -0
- package/skills/ct-epic-architect/references/migration-epic-example.md +244 -0
- package/skills/ct-epic-architect/references/output-format.md +92 -0
- package/skills/ct-epic-architect/references/patterns.md +284 -0
- package/skills/ct-epic-architect/references/refactor-epic-example.md +412 -0
- package/skills/ct-epic-architect/references/research-epic-example.md +226 -0
- package/skills/ct-epic-architect/references/shell-escaping.md +86 -0
- package/skills/ct-epic-architect/references/skill-aware-execution.md +195 -0
- package/skills/ct-grade/SKILL.md +230 -0
- package/skills/ct-grade/agents/analysis-reporter.md +203 -0
- package/skills/ct-grade/agents/blind-comparator.md +157 -0
- package/skills/ct-grade/agents/scenario-runner.md +134 -0
- package/skills/ct-grade/eval-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
- package/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
- package/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
- package/skills/ct-grade/eval-viewer/generate_review.py +283 -0
- package/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
- package/skills/ct-grade/eval-viewer/viewer.html +219 -0
- package/skills/ct-grade/evals/evals.json +94 -0
- package/skills/ct-grade/references/ab-test-methodology.md +150 -0
- package/skills/ct-grade/references/domains.md +137 -0
- package/skills/ct-grade/references/grade-spec.md +236 -0
- package/skills/ct-grade/references/scenario-playbook.md +234 -0
- package/skills/ct-grade/references/token-tracking.md +120 -0
- package/skills/ct-grade/scripts/__pycache__/audit_analyzer.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/run_ab_test.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/run_all.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/token_tracker.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/audit_analyzer.py +279 -0
- package/skills/ct-grade/scripts/generate_report.py +283 -0
- package/skills/ct-grade/scripts/run_ab_test.py +504 -0
- package/skills/ct-grade/scripts/run_all.py +287 -0
- package/skills/ct-grade/scripts/setup_run.py +183 -0
- package/skills/ct-grade/scripts/token_tracker.py +630 -0
- package/skills/ct-grade-v2-1/SKILL.md +237 -0
- package/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
- package/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
- package/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
- package/skills/ct-grade-v2-1/evals/evals.json +74 -0
- package/skills/ct-grade-v2-1/grade-viewer/__pycache__/build_op_stats.cpython-314.pyc +0 -0
- package/skills/ct-grade-v2-1/grade-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
- package/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
- package/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
- package/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
- package/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
- package/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
- package/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
- package/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
- package/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
- package/skills/ct-grade-v2-1/manifest-entry.json +31 -0
- package/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
- package/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
- package/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
- package/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
- package/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
- package/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
- package/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
- package/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
- package/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
- package/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
- package/skills/ct-memory/SKILL.md +84 -0
- package/skills/ct-orchestrator/INSTALL.md +61 -0
- package/skills/ct-orchestrator/README.md +69 -0
- package/skills/ct-orchestrator/SKILL.md +380 -0
- package/skills/ct-orchestrator/manifest-entry.json +19 -0
- package/skills/ct-orchestrator/orchestrator-prompt.txt +17 -0
- package/skills/ct-orchestrator/references/SUBAGENT-PROTOCOL-BLOCK.md +66 -0
- package/skills/ct-orchestrator/references/autonomous-operation.md +167 -0
- package/skills/ct-orchestrator/references/lifecycle-gates.md +98 -0
- package/skills/ct-orchestrator/references/orchestrator-compliance.md +271 -0
- package/skills/ct-orchestrator/references/orchestrator-handoffs.md +85 -0
- package/skills/ct-orchestrator/references/orchestrator-patterns.md +164 -0
- package/skills/ct-orchestrator/references/orchestrator-recovery.md +113 -0
- package/skills/ct-orchestrator/references/orchestrator-spawning.md +271 -0
- package/skills/ct-orchestrator/references/orchestrator-tokens.md +180 -0
- package/skills/ct-research-agent/SKILL.md +226 -0
- package/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
- package/skills/ct-skill-creator/.cleo/logs/cleo.2026-03-07.1.log +24 -0
- package/skills/ct-skill-creator/.cleo/tasks.db +0 -0
- package/skills/ct-skill-creator/SKILL.md +356 -0
- package/skills/ct-skill-creator/agents/analyzer.md +276 -0
- package/skills/ct-skill-creator/agents/comparator.md +204 -0
- package/skills/ct-skill-creator/agents/grader.md +225 -0
- package/skills/ct-skill-creator/assets/eval_review.html +146 -0
- package/skills/ct-skill-creator/eval-viewer/__pycache__/generate_review.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
- package/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
- package/skills/ct-skill-creator/manifest-entry.json +17 -0
- package/skills/ct-skill-creator/references/dynamic-context.md +228 -0
- package/skills/ct-skill-creator/references/frontmatter.md +83 -0
- package/skills/ct-skill-creator/references/invocation-control.md +165 -0
- package/skills/ct-skill-creator/references/output-patterns.md +86 -0
- package/skills/ct-skill-creator/references/provider-deployment.md +175 -0
- package/skills/ct-skill-creator/references/schemas.md +430 -0
- package/skills/ct-skill-creator/references/workflows.md +28 -0
- package/skills/ct-skill-creator/scripts/__init__.py +1 -0
- package/skills/ct-skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/init_skill.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/quick_validate.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/skills/ct-skill-creator/scripts/generate_report.py +326 -0
- package/skills/ct-skill-creator/scripts/improve_description.py +247 -0
- package/skills/ct-skill-creator/scripts/init_skill.py +306 -0
- package/skills/ct-skill-creator/scripts/package_skill.py +110 -0
- package/skills/ct-skill-creator/scripts/quick_validate.py +97 -0
- package/skills/ct-skill-creator/scripts/run_eval.py +310 -0
- package/skills/ct-skill-creator/scripts/run_loop.py +328 -0
- package/skills/ct-skill-creator/scripts/utils.py +47 -0
- package/skills/ct-skill-validator/SKILL.md +178 -0
- package/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
- package/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
- package/skills/ct-skill-validator/evals/eval_set.json +14 -0
- package/skills/ct-skill-validator/evals/evals.json +52 -0
- package/skills/ct-skill-validator/manifest-entry.json +20 -0
- package/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
- package/skills/ct-skill-validator/references/validation-rules.md +168 -0
- package/skills/ct-skill-validator/scripts/__init__.py +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/audit_body.py +242 -0
- package/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
- package/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
- package/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
- package/skills/ct-skill-validator/scripts/validate.py +422 -0
- package/skills/ct-spec-writer/SKILL.md +189 -0
- package/skills/ct-stickynote/README.md +14 -0
- package/skills/ct-stickynote/SKILL.md +46 -0
- package/skills/ct-task-executor/SKILL.md +296 -0
- package/skills/ct-validator/SKILL.md +216 -0
- package/skills/manifest.json +469 -0
- package/skills.json +281 -0
package/skills/ct-grade/eval-viewer/viewer.html
@@ -0,0 +1,219 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>ct-grade Eval Review</title>
  <style>
    :root {
      --bg: #0f1117; --surface: #1a1d27; --surface2: #21263a; --border: #2a2f45;
      --text: #e8eaf0; --muted: #6b7280; --accent: #6366f1; --accent-dim: #3730a3;
      --green: #22c55e; --green-bg: rgba(34,197,94,.12);
      --red: #ef4444; --red-bg: rgba(239,68,68,.12);
      --yellow: #eab308; --yellow-bg: rgba(234,179,8,.12);
      --radius: 8px;
    }
    * { box-sizing: border-box; margin: 0; padding: 0; }
    body { font-family: system-ui, sans-serif; background: var(--bg); color: var(--text); height: 100vh; display: flex; flex-direction: column; font-size: 14px; }
    .header { background: var(--surface); border-bottom: 1px solid var(--border); padding: 12px 20px; display: flex; align-items: center; justify-content: space-between; flex-shrink: 0; }
    .header h1 { font-size: 1rem; font-weight: 600; }
    .badge { background: var(--accent-dim); color: var(--accent); font-size: 11px; padding: 2px 8px; border-radius: 20px; font-weight: 600; margin-left: 8px; }
    .layout { flex: 1; display: flex; overflow: hidden; }
    .sidebar { width: 260px; background: var(--surface); border-right: 1px solid var(--border); overflow-y: auto; flex-shrink: 0; padding: 8px; }
    .main { flex: 1; overflow-y: auto; padding: 20px; }
    .run-item { padding: 10px 12px; border-radius: var(--radius); cursor: pointer; margin-bottom: 4px; border: 1px solid transparent; }
    .run-item:hover { background: var(--surface2); border-color: var(--border); }
    .run-item.active { background: var(--accent-dim); border-color: var(--accent); }
    .run-id { font-size: 11px; color: var(--muted); font-family: monospace; margin-bottom: 4px; }
    .run-prompt { font-size: 12px; line-height: 1.4; }
    .run-pass { font-size: 11px; margin-top: 5px; font-weight: 600; }
    .pass-good { color: var(--green); } .pass-mid { color: var(--yellow); } .pass-bad { color: var(--red); }
    .card { background: var(--surface); border: 1px solid var(--border); border-radius: var(--radius); margin-bottom: 16px; overflow: hidden; }
    .card-header { padding: 12px 16px; border-bottom: 1px solid var(--border); font-size: 13px; font-weight: 600; }
    .card-body { padding: 16px; }
    .prompt-box { background: var(--surface2); border-left: 3px solid var(--accent); border-radius: 4px; padding: 12px; font-size: 13px; line-height: 1.6; }
    .exp-row { display: flex; gap: 10px; align-items: flex-start; padding: 8px 0; border-bottom: 1px solid var(--border); }
    .exp-row:last-child { border-bottom: none; }
    .exp-icon { font-size: 14px; flex-shrink: 0; margin-top: 2px; }
    .exp-text { font-size: 13px; line-height: 1.5; }
    .exp-evidence { font-size: 11px; color: var(--muted); margin-top: 3px; font-style: italic; }
    .pill-row { display: flex; gap: 8px; flex-wrap: wrap; margin-bottom: 12px; }
    .pill { padding: 3px 10px; border-radius: 20px; font-size: 12px; font-weight: 600; }
    .pill-green { background: var(--green-bg); color: var(--green); } .pill-red { background: var(--red-bg); color: var(--red); } .pill-yellow { background: var(--yellow-bg); color: var(--yellow); }
    .file-tabs { display: flex; gap: 4px; padding: 8px 16px; background: var(--surface2); border-bottom: 1px solid var(--border); flex-wrap: wrap; }
    .ftab { padding: 3px 8px; border-radius: 4px; cursor: pointer; font-size: 11px; font-family: monospace; border: 1px solid transparent; color: var(--muted); background: none; }
    .ftab:hover { border-color: var(--border); color: var(--text); }
    .ftab.active { background: var(--accent-dim); border-color: var(--accent); color: var(--text); }
    .file-body { padding: 16px; max-height: 400px; overflow-y: auto; }
    .file-pre { font-family: monospace; font-size: 12px; line-height: 1.6; white-space: pre-wrap; word-break: break-all; }
    .feedback-ta { width: 100%; min-height: 80px; background: var(--surface2); border: 1px solid var(--border); border-radius: var(--radius); color: var(--text); padding: 10px; font-size: 13px; font-family: inherit; resize: vertical; }
    .feedback-ta:focus { outline: none; border-color: var(--accent); }
    .btn { padding: 6px 14px; border-radius: var(--radius); border: none; cursor: pointer; font-size: 13px; font-weight: 600; background: var(--accent); color: white; }
    .btn:hover { background: #4f46e5; }
    .save-msg { font-size: 12px; color: var(--muted); margin-left: 10px; }
    .empty-msg { padding: 60px 20px; text-align: center; color: var(--muted); }
  </style>
</head>
<body>
  <div class="header">
    <h1>ct-grade <span class="badge">Eval Review</span></h1>
    <span id="hdr-stats" style="font-size:12px;color:var(--muted)"></span>
  </div>
  <div class="layout">
    <nav class="sidebar" id="sidebar"></nav>
    <main class="main" id="main"><div class="empty-msg"><p>Select an eval run from the sidebar.</p></div></main>
  </div>
  <script>
/*__EMBEDDED_DATA__*/
const {runs, benchmark} = EMBEDDED_DATA;
let feedback = {reviews: []};
let curRun = null, curFileIdx = {};

function el(tag, props, ...children) {
  const e = document.createElement(tag);
  if (props) for (const [k, v] of Object.entries(props)) {
    if (k === 'cls') e.className = v;
    else if (k === 'text') e.textContent = v;
    else if (k === 'onclick') e.onclick = v;
    else e.setAttribute(k, v);
  }
  for (const c of children) { if (c) e.appendChild(typeof c === 'string' ? document.createTextNode(c) : c); }
  return e;
}

function passClass(r) { return r >= 0.8 ? 'pass-good' : r >= 0.5 ? 'pass-mid' : 'pass-bad'; }
function pillClass(r) { return r >= 0.8 ? 'pill-green' : r >= 0.5 ? 'pill-yellow' : 'pill-red'; }

fetch('/api/feedback').then(r => r.json()).then(d => { feedback = d; }).catch(() => {});

function buildSidebar() {
  const sb = document.getElementById('sidebar');
  sb.textContent = '';
  if (!runs || !runs.length) { sb.appendChild(el('div', {style: 'padding:16px;color:var(--muted);font-size:12px', text: 'No eval runs found.'})); return; }
  let tot = 0, pass = 0;
  runs.forEach((run, i) => {
    const item = el('div', {cls: 'run-item', onclick: () => selectRun(i)});
    item.dataset.idx = i;
    item.appendChild(el('div', {cls: 'run-id', text: `eval-${run.eval_id || i+1}`}));
    item.appendChild(el('div', {cls: 'run-prompt', text: run.prompt || '(no prompt)'}));
    const g = run.grading;
    if (g && g.summary) {
      pass += g.summary.passed; tot += g.summary.total;
      const rate = g.summary.pass_rate;
      const rp = el('div', {cls: 'run-pass ' + passClass(rate), text: `${g.summary.passed}/${g.summary.total} passed (${Math.round(rate*100)}%)`});
      item.appendChild(rp);
    }
    sb.appendChild(item);
  });
  if (tot > 0) {
    const hdr = document.getElementById('hdr-stats');
    hdr.textContent = `${runs.length} evals · ${pass}/${tot} expectations · ${Math.round(pass/tot*100)}% pass`;
  }
}

function selectRun(idx) {
  curRun = idx;
  document.querySelectorAll('.run-item').forEach(e => e.classList.remove('active'));
  const item = document.querySelector(`.run-item[data-idx="${idx}"]`);
  if (item) item.classList.add('active');
  renderMain(runs[idx]);
}

function renderMain(run) {
  const main = document.getElementById('main');
  main.textContent = '';

  // Prompt card
  const promptCard = el('div', {cls: 'card'});
  promptCard.appendChild(el('div', {cls: 'card-header', text: 'Prompt'}));
  const pb = el('div', {cls: 'card-body'});
  pb.appendChild(el('div', {cls: 'prompt-box', text: run.prompt || '(no prompt)'}));
  promptCard.appendChild(pb);
  main.appendChild(promptCard);

  // Grading card
  const g = run.grading;
  if (g && g.summary) {
    const gradeCard = el('div', {cls: 'card'});
    gradeCard.appendChild(el('div', {cls: 'card-header', text: 'Grade Result'}));
    const gb = el('div', {cls: 'card-body'});
    const rate = g.summary.pass_rate;
    const pr = el('div', {cls: 'pill-row'});
    pr.appendChild(el('span', {cls: 'pill ' + pillClass(rate), text: `${g.summary.passed}/${g.summary.total} passed`}));
    pr.appendChild(el('span', {cls: 'pill ' + pillClass(rate), text: `${Math.round(rate*100)}%`}));
    gb.appendChild(pr);
    if (g.expectations) {
      for (const exp of g.expectations) {
        const row = el('div', {cls: 'exp-row'});
        row.appendChild(el('span', {cls: 'exp-icon', text: exp.passed ? '✓' : '✗', style: exp.passed ? 'color:var(--green)' : 'color:var(--red)'}));
        const textWrap = el('div');
        textWrap.appendChild(el('div', {cls: 'exp-text', text: exp.text || ''}));
        if (exp.evidence) textWrap.appendChild(el('div', {cls: 'exp-evidence', text: exp.evidence}));
        row.appendChild(textWrap);
        gb.appendChild(row);
      }
    }
    gradeCard.appendChild(gb);
    main.appendChild(gradeCard);
  }

  // Files card
  const runKey = run.id;
  if (!curFileIdx[runKey]) curFileIdx[runKey] = 0;
  if (run.outputs && run.outputs.length > 0) {
    const filesCard = el('div', {cls: 'card'});
    filesCard.appendChild(el('div', {cls: 'card-header', text: 'Output Files'}));
    const ftabs = el('div', {cls: 'file-tabs'});
    run.outputs.forEach((f, i) => {
      const tab = el('button', {cls: 'ftab' + (i === curFileIdx[runKey] ? ' active' : ''), text: f.name, onclick: () => { curFileIdx[runKey] = i; renderMain(run); }});
      ftabs.appendChild(tab);
    });
    filesCard.appendChild(ftabs);
    const body = el('div', {cls: 'file-body'});
    const active = run.outputs[curFileIdx[runKey]];
    if (active && active.type === 'text') {
      body.appendChild(el('pre', {cls: 'file-pre', text: active.content || ''}));
    } else if (active) {
      body.appendChild(el('div', {style: 'color:var(--muted);font-size:12px', text: `[Binary file: ${active.name}]`}));
    }
    filesCard.appendChild(body);
    main.appendChild(filesCard);
  }

  // Feedback card
  const fbCard = el('div', {cls: 'card'});
  fbCard.appendChild(el('div', {cls: 'card-header', text: 'Feedback'}));
  const fbb = el('div', {cls: 'card-body'});
  const saved = (feedback.reviews || []).find(r => r.run_id === run.id)?.feedback || '';
  const ta = el('textarea', {cls: 'feedback-ta', placeholder: 'Notes on this eval run...'});
  ta.value = saved;
  ta.id = `fb-${run.id}`;
  fbb.appendChild(ta);
  const btnRow = el('div', {style: 'display:flex;align-items:center;margin-top:8px'});
  const saveMsg = el('span', {cls: 'save-msg'});
  const btn = el('button', {cls: 'btn', text: 'Save', onclick: () => saveFeedback(run.id, saveMsg)});
  btnRow.appendChild(btn);
  btnRow.appendChild(saveMsg);
  fbb.appendChild(btnRow);
  fbCard.appendChild(fbb);
  main.appendChild(fbCard);
}

function saveFeedback(runId, msgEl) {
  const ta = document.getElementById(`fb-${runId}`);
  if (!ta) return;
  const text = ta.value.trim();
  const reviews = feedback.reviews || [];
  const idx = reviews.findIndex(r => r.run_id === runId);
  if (idx >= 0) reviews[idx].feedback = text; else reviews.push({run_id: runId, feedback: text});
  feedback.reviews = reviews;
  fetch('/api/feedback', {method: 'POST', headers: {'Content-Type': 'application/json'}, body: JSON.stringify(feedback)})
    .then(r => r.json()).then(() => { if (msgEl) { msgEl.textContent = 'Saved'; setTimeout(() => { msgEl.textContent = ''; }, 2000); } })
    .catch(() => { if (msgEl) msgEl.textContent = 'Save failed'; });
}

buildSidebar();
if (runs && runs.length > 0) selectRun(0);
  </script>
</body>
</html>
package/skills/ct-grade/evals/evals.json
@@ -0,0 +1,94 @@
{
  "skill_name": "ct-grade",
  "evals": [
    {
      "id": 1,
      "prompt": "Grade session S-abc123 and tell me the total score, grade letter, and any flags.",
      "expected_output": "Calls `query check grade { sessionId }` (preferred canonical op, or legacy `admin.grade`) and reports totalScore/100, grade letter (A-F), and any flags from the result.",
      "expectations": [
        "Uses `query check grade` MCP operation with the session ID",
        "Reports the totalScore out of 100",
        "Reports the grade letter (A, B, C, D, or F)",
        "Lists any flags from the grade result",
        "Shows per-dimension scores"
      ]
    },
    {
      "id": 2,
      "prompt": "Run grade scenario S4 (Full Lifecycle) using the MCP interface and grade it.",
      "expected_output": "Runs a graded session executing the S4 scenario operations via MCP gateway, then calls `check.grade` and reports the result.",
      "expectations": [
        "Starts a graded session with `mutate session start { grade: true }`",
        "Executes session.list as first operation (S1)",
        "Calls `query admin help` for progressive disclosure (S5)",
        "Uses `tasks.find` not `tasks.list` for discovery (S2)",
        "Calls `mutate session end` at conclusion (S1)",
        "Grades the session via `query check grade { sessionId }` after ending",
        "Reports score close to 100/100 for correct execution"
      ]
    },
    {
      "id": 3,
      "prompt": "A/B compare MCP vs CLI for the tasks domain using 3 runs. Set up the run and tell me the execution plan.",
      "expected_output": "Runs setup_run.py with mode=ab, domains=tasks, interface=both, runs=3 and outputs the step-by-step execution plan.",
      "expectations": [
        "Calls or references `scripts/setup_run.py --mode ab --domains tasks --interface both --runs 3`",
        "Creates output directory structure",
        "Outputs a step-by-step plan with arm-A (MCP) and arm-B (CLI) spawn instructions",
        "Notes the requirement to capture total_tokens from task notifications",
        "Includes blind-comparator step after both arms complete"
      ]
    },
    {
      "id": 4,
      "prompt": "Run a blind A/B test for scenario S1 and S4. Compare MCP vs CLI. Use 2 runs per configuration.",
      "expected_output": "Sets up and orchestrates blind A/B comparison for both scenarios, spawning parallel agents, capturing tokens, running comparator, and producing a report.",
      "expectations": [
        "Sets up run directory for scenarios s1 and s4",
        "Spawns arm-A (MCP) and arm-B (CLI) agents in the same turn for each scenario",
        "Captures total_tokens from each task notification into timing.json",
        "Spawns blind-comparator agent after both arms complete for each run",
        "Runs token_tracker.py to aggregate stats",
        "Generates report.md with comparative results",
        "Identifies that CLI arm will score 0 on S5 Progressive Disclosure"
      ]
    },
    {
      "id": 5,
      "prompt": "List all past grade results and summarize the average score.",
      "expected_output": "Calls `query check grade.list` and summarizes the results.",
      "expectations": [
        "Uses `query check grade.list` MCP operation",
        "Reports number of grade results found",
        "Computes or estimates the average total score",
        "Identifies any sessions with failing grades (F)"
      ]
    },
    {
      "id": 6,
      "prompt": "Run all 5 grade scenarios (S1-S5) via MCP and produce a comparative analysis.",
      "expected_output": "Runs all 5 scenarios as graded sessions, grades each one, and produces an analysis comparing performance across scenarios.",
      "expectations": [
        "Starts 5 separate graded sessions (one per scenario)",
        "Executes each scenario's operation sequence via MCP",
        "Grades each session via `check.grade`",
        "Compares scores across all 5 scenarios",
        "Identifies which scenarios scored highest and lowest",
        "Suggests improvements for low-scoring dimensions"
      ]
    },
    {
      "id": 7,
      "prompt": "What is the expected token cost difference between MCP and CLI for grade scenario S4?",
      "expected_output": "Explains the token tracking methodology, describes that MCP operations produce richer responses (more tokens) but higher scores, and estimates the delta based on typical operation counts.",
      "expectations": [
        "Explains that total_tokens is captured from task notifications",
        "Notes that MCP responses include _meta envelope with gateway, requestId, etc.",
        "Notes that CLI responses are typically more compact",
        "Estimates MCP is likely to use more tokens per operation",
        "Mentions score-per-token as the key efficiency metric",
        "References timing.json as where token data is stored"
      ]
    }
  ]
}
package/skills/ct-grade/references/ab-test-methodology.md
@@ -0,0 +1,150 @@
# A/B Test Methodology

## Overview

The ct-grade A/B framework compares two configurations (arms) of CLEO agent behavior, using the 5-dimension behavioral rubric as the scoring criterion. The framework is blind: the comparator agent does not know which arm corresponds to which configuration.

---

## Core Concepts

### Arms

An "arm" is a specific test configuration. In CLEO A/B tests, the two most common arms are:

| Arm | Typical Config | Example |
|-----|----------------|---------|
| A | MCP gateway | Uses `query`/`mutate` for all operations |
| B | CLI fallback | Uses the `cleo-dev` CLI for equivalent operations |

Arms can also differ by:

- Session scope (`global` vs `epic:T500`)
- Tier escalation (with/without `admin.help`)
- Agent persona (orchestrator vs task-executor)
- Prompt configuration (with/without the ct-cleo skill)
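As a concrete illustration, an arm pair for one slot might be described like this. The structure and field names are hypothetical; the real run layout is produced by `setup_run.py` and is not reproduced here.

```python
# Hypothetical arm-pair description for one A/B slot.
# "interface" is the hidden configuration; "label" is all the comparator may see.
arms = {
    "A": {"interface": "mcp", "label": "A"},  # MCP gateway arm
    "B": {"interface": "cli", "label": "B"},  # CLI fallback arm
}

def blind_label(arm_key):
    """Expose only the blind label, never the interface, to downstream agents."""
    return arms[arm_key]["label"]
```

The point of the split is that `interface` stays on the orchestrator's side while only `label` travels with the arm's outputs.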
|
|
25
|
+
|
|
26
|
+
### Slots
|
|
27
|
+
|
|
28
|
+
A "slot" is one test unit — either a grade scenario (s1-s5) or a domain (tasks, session, etc.) depending on mode. Each slot produces one comparison.
|
|
29
|
+
|
|
30
|
+
### Runs
|
|
31
|
+
|
|
32
|
+
The number of times each arm executes each slot. Multiple runs increase statistical confidence. Minimum recommended: 3 runs.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Blind Protocol
|
|
37
|
+
|
|
38
|
+
1. **Orchestrator spawns both arms in the same turn** — this is critical for parallel execution and prevents the orchestrator's context from being polluted by one arm's output before spawning the other.
|
|
39
|
+
|
|
40
|
+
2. **Arm outputs are labeled A and B only** — the arm label (A/B) is used, not the configuration label (mcp/cli). The comparator never sees "MCP" or "CLI" in the output it receives.
|
|
41
|
+
|
|
42
|
+
3. **Comparator reads only grade.json and operations.jsonl** — not timing.json (which contains the `interface` field). This enforces blindness.
|
|
43
|
+
|
|
44
|
+
4. **Analysis-reporter de-blinds** — after all comparisons are done, the reporter reveals which arm was which configuration and synthesizes patterns.
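Under these rules, the arm assignment and the comparator's restricted view can be sketched in Python (the function and key names here are illustrative, not part of the framework):

```python
import random

def assign_arms(config_x: str, config_y: str, rng: random.Random) -> dict:
    """Randomly map two configurations to the blind labels A and B."""
    configs = [config_x, config_y]
    rng.shuffle(configs)
    return {"A": configs[0], "B": configs[1]}

def blind_view(arm_output: dict) -> dict:
    """Keep only what the comparator may see: grade.json and
    operations.jsonl contents. timing.json (which carries the
    interface field) is withheld."""
    return {k: v for k, v in arm_output.items() if k in ("grade", "operations")}

# The analysis-reporter de-blinds by consulting the assignment afterwards.
assignment = assign_arms("mcp", "cli", random.Random())
comparator_input = blind_view(
    {"grade": {"total": 90}, "operations": [], "timing": {"interface": "mcp"}}
)
```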

---

## Token Tracking Protocol

Token data comes from Claude Code task notifications. You MUST capture it immediately: it is ephemeral.

### Capture Point

When an Agent task completes, the notification includes:
- `total_tokens`: Total tokens consumed by the subagent task
- `duration_ms`: Wall-clock time for the task

### Storage

Immediately on task completion, update the arm's `timing.json`:

```python
# Capture happens in the main context; this is a runnable sketch.
import json
from datetime import datetime, timezone
from pathlib import Path

def record_timing(arm_dir: str, total_tokens: int, duration_ms: int) -> None:
    path = Path(arm_dir) / "timing.json"
    timing = json.loads(path.read_text())
    timing["total_tokens"] = total_tokens      # from the notification
    timing["duration_ms"] = duration_ms        # from the notification
    timing["executor_end"] = datetime.now(timezone.utc).isoformat()
    timing["executor_duration_seconds"] = duration_ms / 1000
    path.write_text(json.dumps(timing, indent=2))
```

### Why This Matters

Token cost is the primary economic metric for comparing interfaces:
- MCP operations may use more tokens (richer responses, metadata)
- CLI operations may use fewer tokens but score lower on S5
- Score-per-token tells you which interface is more efficient for protocol work
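As a sketch, score-per-token can be computed from grade.json's total and the captured `total_tokens` (the helper name and the sample numbers are hypothetical):

```python
def score_per_token(total_score: float, total_tokens: int) -> float:
    """Rubric points earned per 1,000 tokens consumed."""
    return total_score / (total_tokens / 1000)

# Hypothetical numbers: MCP scores higher but also spends more tokens.
mcp_efficiency = score_per_token(100, 50_000)  # 2.0 points per 1k tokens
cli_efficiency = score_per_token(80, 30_000)   # ~2.67 points per 1k tokens
```

A higher raw score and a lower score-per-token can coexist; the metric makes that trade-off explicit.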

### Missing Token Data

If you forgot to capture tokens, you cannot recover them. Mark `total_tokens: null` in timing.json. The token_tracker.py script will warn about missing data, and run statistics will exclude null values.
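A minimal sketch of null-tolerant aggregation, assuming each run record carries a `total_tokens` key (the real token_tracker.py may differ in detail):

```python
def mean_tokens(runs):
    """Average total_tokens across runs, excluding missing (null) values."""
    values = [r["total_tokens"] for r in runs
              if r.get("total_tokens") is not None]
    return sum(values) / len(values) if values else None

runs = [
    {"total_tokens": 42_000},
    {"total_tokens": None},    # forgotten capture: excluded, not treated as 0
    {"total_tokens": 38_000},
]
```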

---

## Statistical Interpretation

### Minimum Confidence

- **1 run**: No statistical confidence; anecdotal
- **3 runs**: Low confidence; sufficient for a directional signal
- **5+ runs**: Moderate confidence; suitable for decisions
- **10+ runs**: High confidence; publication-grade

### Score Interpretation

| Score Delta | Interpretation |
|-------------|----------------|
| 0-5 pts | Noise level: likely equivalent |
| 5-15 pts | Meaningful difference: investigate flags |
| 15-25 pts | Significant: one interface clearly better |
| 25+ pts | Extreme: likely an S5 differential (MCP vs CLI) |
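The bands above can be applied mechanically; this helper is a hypothetical convenience (the half-open boundaries are an arbitrary choice where the table is ambiguous):

```python
def interpret_delta(delta: float) -> str:
    """Map an absolute score delta to the interpretation bands above."""
    d = abs(delta)
    if d < 5:
        return "noise level: likely equivalent"
    if d < 15:
        return "meaningful difference: investigate flags"
    if d < 25:
        return "significant: one interface clearly better"
    return "extreme: likely S5 differential"
```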

### Expected MCP vs CLI Delta

Based on the rubric implementation:
- S5 Progressive Disclosure: +20 for MCP when `admin.help` is called, +10 for MCP without a help call, 0 for CLI
- S1-S4: approximately equal when both agents follow the same protocol steps
- Total expected delta: **+10 to +20 points** in favor of MCP for equivalent protocols

If the delta exceeds 20 points, investigate whether the CLI agent is also skipping other protocol steps (session.list, descriptions, etc.) for lack of guidance.

---

## Git Tree Comparison

The "git tree" metaphor: each A/B run produces a branch in the results tree, and multiple runs of the same configuration are like commits on the same branch.

```
ab_results/
  run-001/                 ← first full A/B run
    s4/
      run-01/arm-A/        ← first run, MCP arm
      run-01/arm-B/        ← first run, CLI arm
      run-01/comparison.json
      run-02/arm-A/
      ...
    token-summary.json
    report.md
  run-002/                 ← second full A/B run (compare against run-001)
    ...
```

To compare **across A/B runs** (e.g., after making a protocol change):
1. Generate the report for run-001
2. Make the protocol change
3. Execute run-002
4. Compare the report.md files

---

## Anti-patterns in A/B Testing

| Anti-pattern | Problem | Fix |
|---|---|---|
| Sequential arms | Arm B's spawn can be influenced by A's output | Spawn both arms in the same message |
| Comparator sees config | Breaks blindness | Pass only grade.json + operations.jsonl |
| Single run | No variance data | Run a minimum of 3 runs |
| Same session scope | Arms share audit data | Each arm starts a fresh `session.start { grade: true }` |
| Forgetting to capture tokens | Cannot be reconstructed | Write timing.json IMMEDIATELY on task completion |
| Comparing different scenarios | Apples vs oranges | Fix the scenario parameter; vary only the interface |

@@ -0,0 +1,137 @@
# CLEO Domain Operation Reference for A/B Testing

**Source**: `docs/specs/CLEO-OPERATION-CONSTITUTION.md`
**Purpose**: Lists the key operations to test in MCP vs CLI A/B comparisons.

---

## MCP vs CLI Equivalents

For each domain, these are the canonical operations to test in A/B mode.
MCP gateway: the audit entry's metadata.gateway is `'query'` or `'mutate'` (set by the MCP adapter).
CLI: operations routed through the CLI do NOT set metadata.gateway.
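Given those conventions, interface attribution over an audit log can be sketched as follows (the entry shape is assumed for illustration, not a documented schema):

```python
def interface_of(entry: dict) -> str:
    """Classify an audit entry by its gateway marker: MCP-routed
    operations carry metadata.gateway ('query' or 'mutate');
    CLI-routed operations leave it unset."""
    gateway = entry.get("metadata", {}).get("gateway")
    return "mcp" if gateway in ("query", "mutate") else "cli"
```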

### tasks (32 operations)

| Test Op | MCP | CLI |
|---------|-----|-----|
| Discovery | `query tasks find { "status": "active" }` | `cleo-dev find --status active` |
| Show detail | `query tasks show { "taskId": "T123" }` | `cleo-dev show T123` |
| List children | `query tasks list { "parent": "T100" }` | `cleo-dev list --parent T100` |
| Create | `mutate tasks add { "title": "...", "description": "..." }` | `cleo-dev add --title "..." --description "..."` |
| Update | `mutate tasks update { "taskId": "T123", "status": "active" }` | `cleo-dev update T123 --status active` |
| Complete | `mutate tasks complete { "taskId": "T123" }` | `cleo-dev complete T123` |
| Exists check | `query tasks exists { "taskId": "T123" }` | `cleo-dev exists T123` |

**Key S2 insight**: `tasks.find` (MCP) and `cleo-dev find` (CLI) both count toward the find:list ratio in the audit log. The MCP find is logged with gateway='query'; the CLI find is also logged, but without gateway metadata.
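The find:list ratio can be computed from audit entries like this (a sketch; the grader's actual implementation may differ):

```python
def find_list_ratio(entries):
    """Ratio of tasks.find calls to tasks.list calls; counts both
    MCP- and CLI-routed entries, since both reach the audit log."""
    finds = sum(1 for e in entries
                if e.get("domain") == "tasks" and e.get("operation") == "find")
    lists = sum(1 for e in entries
                if e.get("domain") == "tasks" and e.get("operation") == "list")
    return finds / lists if lists else None

log = [
    {"domain": "tasks", "operation": "find"},   # MCP or CLI: both count
    {"domain": "tasks", "operation": "find"},
    {"domain": "tasks", "operation": "list"},
]
```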

### session (19 operations)

| Test Op | MCP | CLI |
|---------|-----|-----|
| Check existing | `query session list` | `cleo-dev session list` |
| Start | `mutate session start { "grade": true, "scope": "global" }` | `cleo-dev session start --grade --scope global` |
| End | `mutate session end` | `cleo-dev session end` |
| Status | `query session status` | `cleo-dev session status` |
| Record decision | `mutate session record.decision { "decision": "...", "rationale": "..." }` | `cleo-dev session record-decision ...` |

**Critical**: `session.list` (MCP) is what the rubric checks for S1. A CLI `cleo-dev session list` still appears as `domain='session', operation='list'` in the audit log, so S1 counts it.

### memory (18 operations) — Tier 1

| Test Op | MCP | CLI |
|---------|-----|-----|
| Search | `query memory find { "query": "authentication" }` | `cleo-dev memory find "authentication"` |
| Store observation | `mutate memory observe { "text": "..." }` | `cleo-dev memory observe "..."` |
| Timeline | `query memory timeline { "anchor": "<id>" }` | N/A (MCP-preferred) |

### admin (44 operations)

| Test Op | MCP | CLI |
|---------|-----|-----|
| Dashboard | `query admin dash` | `cleo-dev dash` |
| Help (S5 key) | `query admin help` | `cleo-dev help` |
| Grade session | `query admin grade { "sessionId": "<id>" }` | `cleo-dev grade <id>` |
| Health check | `query admin health` | `cleo-dev health` |

**Critical for S5**: Only `query admin help` (MCP) reliably satisfies the `helpCalls` filter in S5. Whether CLI `cleo-dev help` sets `metadata.gateway='query'` or matches `domain='admin', operation='help'` depends on how the CLI routes internally.
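That filter can be sketched as follows, assuming audit entries are dicts with `domain`, `operation`, and `metadata` fields (the authoritative logic lives in the rubric implementation):

```python
def help_calls(entries):
    """Entries satisfying the S5 helpCalls filter: admin.help routed
    through the MCP query gateway."""
    return [e for e in entries
            if e.get("domain") == "admin"
            and e.get("operation") == "help"
            and e.get("metadata", {}).get("gateway") == "query"]

log = [
    {"domain": "admin", "operation": "help",
     "metadata": {"gateway": "query"}},   # MCP: counts toward S5
    {"domain": "admin", "operation": "help",
     "metadata": {}},                     # CLI routing: may not count
]
```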

### pipeline (42 operations) — LOOM system

| Test Op | MCP | CLI |
|---------|-----|-----|
| Stage status | `query pipeline stage.status` | `cleo-dev pipeline status` |
| Stage validate | `query pipeline stage.validate` | `cleo-dev pipeline validate` |
| Manifest list | `query pipeline manifest.list` | `cleo-dev pipeline manifest list` |

### check (19 operations)

| Test Op | MCP | CLI |
|---------|-----|-----|
| Test status | `query check test.status` | `cleo-dev check test-status` |
| Protocol check | `query check protocol` | `cleo-dev check protocol` |
| Compliance | `query check compliance.summary` | `cleo-dev check compliance` |

### orchestrate (19 operations)

| Test Op | MCP | CLI |
|---------|-----|-----|
| Status | `query orchestrate status` | `cleo-dev orchestrate status` |
| Waves | `query orchestrate waves` | `cleo-dev orchestrate waves` |

### tools (32 operations)

| Test Op | MCP | CLI |
|---------|-----|-----|
| Skill list (S5 key) | `query tools skill.list` | `cleo-dev tools skill list` |
| Skill show (S5 key) | `query tools skill.show { "skillId": "ct-cleo" }` | `cleo-dev tools skill show ct-cleo` |

**S5 note**: `tools.skill.list` and `tools.skill.show` via MCP also count toward the S5 helpCalls filter.

---

## A/B Domain Test Configurations

### Quick A/B: Tasks Domain

**Goal**: Compare MCP vs CLI for core task operations.
**Operations to execute (both interfaces)**:
1. `session list` — S1
2. `tasks find { "status": "active" }` — S2
3. `tasks show { "taskId": "<valid-id>" }` — S2
4. `session end` — S1

**Expected score difference**: MCP ~30/100 vs CLI ~20/100 (S5 is 0 for CLI)

### Standard A/B: Full Protocol (S4)

**Goal**: Run the full lifecycle scenario through both interfaces.
**Operations**: Follow the S4 scenario (10 ops, including admin.help).
**Expected**: MCP 100/100, CLI ~80/100

### Targeted A/B: S5 Isolation

**Goal**: Isolate and measure the S5 (progressive disclosure) gap.
**Operations**: identical, except arm A calls `admin.help` and arm B does not.

Arm A (MCP + help):
```
query session list → query admin help → query tasks find → mutate session end
```

Arm B (CLI, no help call):
```
cleo-dev session list → cleo-dev find → cleo-dev session end
```

**Expected**: Arm A S5 = 20/20, Arm B S5 = 0/20

---

## Tier Notes

- **Tier 0 ops**: Available to all agents without admin.help (tasks, session, check, pipeline, orchestrate, tools, admin, sticky)
- **Tier 1 ops**: Require `admin.help --tier 1` first (memory, manifest, advanced session)
- **Tier 2 ops**: Require `admin.help --tier 2` (nexus, admin advanced, cross-project)

In A/B tests, tier 1+ operations should appear only if the scenario explicitly escalates via admin.help; otherwise the agent should not have discovered them.