@cleocode/skills 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. package/dispatch-config.json +404 -0
  2. package/index.d.ts +178 -0
  3. package/index.js +405 -0
  4. package/package.json +14 -0
  5. package/profiles/core.json +7 -0
  6. package/profiles/full.json +10 -0
  7. package/profiles/minimal.json +7 -0
  8. package/profiles/recommended.json +7 -0
  9. package/provider-skills-map.json +97 -0
  10. package/skills/_shared/cleo-style-guide.md +84 -0
  11. package/skills/_shared/manifest-operations.md +810 -0
  12. package/skills/_shared/placeholders.json +433 -0
  13. package/skills/_shared/skill-chaining-patterns.md +237 -0
  14. package/skills/_shared/subagent-protocol-base.md +223 -0
  15. package/skills/_shared/task-system-integration.md +232 -0
  16. package/skills/_shared/testing-framework-config.md +110 -0
  17. package/skills/ct-cleo/SKILL.md +490 -0
  18. package/skills/ct-cleo/references/anti-patterns.md +19 -0
  19. package/skills/ct-cleo/references/loom-lifecycle.md +136 -0
  20. package/skills/ct-cleo/references/orchestrator-constraints.md +55 -0
  21. package/skills/ct-cleo/references/session-protocol.md +162 -0
  22. package/skills/ct-codebase-mapper/SKILL.md +82 -0
  23. package/skills/ct-contribution/SKILL.md +521 -0
  24. package/skills/ct-contribution/templates/contribution-init.json +21 -0
  25. package/skills/ct-dev-workflow/SKILL.md +423 -0
  26. package/skills/ct-docs-lookup/SKILL.md +66 -0
  27. package/skills/ct-docs-review/SKILL.md +175 -0
  28. package/skills/ct-docs-write/SKILL.md +108 -0
  29. package/skills/ct-documentor/SKILL.md +231 -0
  30. package/skills/ct-epic-architect/SKILL.md +305 -0
  31. package/skills/ct-epic-architect/references/bug-epic-example.md +172 -0
  32. package/skills/ct-epic-architect/references/commands.md +201 -0
  33. package/skills/ct-epic-architect/references/feature-epic-example.md +210 -0
  34. package/skills/ct-epic-architect/references/migration-epic-example.md +244 -0
  35. package/skills/ct-epic-architect/references/output-format.md +92 -0
  36. package/skills/ct-epic-architect/references/patterns.md +284 -0
  37. package/skills/ct-epic-architect/references/refactor-epic-example.md +412 -0
  38. package/skills/ct-epic-architect/references/research-epic-example.md +226 -0
  39. package/skills/ct-epic-architect/references/shell-escaping.md +86 -0
  40. package/skills/ct-epic-architect/references/skill-aware-execution.md +195 -0
  41. package/skills/ct-grade/SKILL.md +230 -0
  42. package/skills/ct-grade/agents/analysis-reporter.md +203 -0
  43. package/skills/ct-grade/agents/blind-comparator.md +157 -0
  44. package/skills/ct-grade/agents/scenario-runner.md +134 -0
  45. package/skills/ct-grade/eval-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  46. package/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
  47. package/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
  48. package/skills/ct-grade/eval-viewer/generate_review.py +283 -0
  49. package/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
  50. package/skills/ct-grade/eval-viewer/viewer.html +219 -0
  51. package/skills/ct-grade/evals/evals.json +94 -0
  52. package/skills/ct-grade/references/ab-test-methodology.md +150 -0
  53. package/skills/ct-grade/references/domains.md +137 -0
  54. package/skills/ct-grade/references/grade-spec.md +236 -0
  55. package/skills/ct-grade/references/scenario-playbook.md +234 -0
  56. package/skills/ct-grade/references/token-tracking.md +120 -0
  57. package/skills/ct-grade/scripts/__pycache__/audit_analyzer.cpython-314.pyc +0 -0
  58. package/skills/ct-grade/scripts/__pycache__/run_ab_test.cpython-314.pyc +0 -0
  59. package/skills/ct-grade/scripts/__pycache__/run_all.cpython-314.pyc +0 -0
  60. package/skills/ct-grade/scripts/__pycache__/token_tracker.cpython-314.pyc +0 -0
  61. package/skills/ct-grade/scripts/audit_analyzer.py +279 -0
  62. package/skills/ct-grade/scripts/generate_report.py +283 -0
  63. package/skills/ct-grade/scripts/run_ab_test.py +504 -0
  64. package/skills/ct-grade/scripts/run_all.py +287 -0
  65. package/skills/ct-grade/scripts/setup_run.py +183 -0
  66. package/skills/ct-grade/scripts/token_tracker.py +630 -0
  67. package/skills/ct-grade-v2-1/SKILL.md +237 -0
  68. package/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
  69. package/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
  70. package/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
  71. package/skills/ct-grade-v2-1/evals/evals.json +74 -0
  72. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/build_op_stats.cpython-314.pyc +0 -0
  73. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  74. package/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
  75. package/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
  76. package/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
  77. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
  78. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
  79. package/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
  80. package/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
  81. package/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
  82. package/skills/ct-grade-v2-1/manifest-entry.json +31 -0
  83. package/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
  84. package/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
  85. package/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
  86. package/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
  87. package/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
  88. package/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
  89. package/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
  90. package/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
  91. package/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
  92. package/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
  93. package/skills/ct-memory/SKILL.md +84 -0
  94. package/skills/ct-orchestrator/INSTALL.md +61 -0
  95. package/skills/ct-orchestrator/README.md +69 -0
  96. package/skills/ct-orchestrator/SKILL.md +380 -0
  97. package/skills/ct-orchestrator/manifest-entry.json +19 -0
  98. package/skills/ct-orchestrator/orchestrator-prompt.txt +17 -0
  99. package/skills/ct-orchestrator/references/SUBAGENT-PROTOCOL-BLOCK.md +66 -0
  100. package/skills/ct-orchestrator/references/autonomous-operation.md +167 -0
  101. package/skills/ct-orchestrator/references/lifecycle-gates.md +98 -0
  102. package/skills/ct-orchestrator/references/orchestrator-compliance.md +271 -0
  103. package/skills/ct-orchestrator/references/orchestrator-handoffs.md +85 -0
  104. package/skills/ct-orchestrator/references/orchestrator-patterns.md +164 -0
  105. package/skills/ct-orchestrator/references/orchestrator-recovery.md +113 -0
  106. package/skills/ct-orchestrator/references/orchestrator-spawning.md +271 -0
  107. package/skills/ct-orchestrator/references/orchestrator-tokens.md +180 -0
  108. package/skills/ct-research-agent/SKILL.md +226 -0
  109. package/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
  110. package/skills/ct-skill-creator/.cleo/logs/cleo.2026-03-07.1.log +24 -0
  111. package/skills/ct-skill-creator/.cleo/tasks.db +0 -0
  112. package/skills/ct-skill-creator/SKILL.md +356 -0
  113. package/skills/ct-skill-creator/agents/analyzer.md +276 -0
  114. package/skills/ct-skill-creator/agents/comparator.md +204 -0
  115. package/skills/ct-skill-creator/agents/grader.md +225 -0
  116. package/skills/ct-skill-creator/assets/eval_review.html +146 -0
  117. package/skills/ct-skill-creator/eval-viewer/__pycache__/generate_review.cpython-314.pyc +0 -0
  118. package/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
  119. package/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
  120. package/skills/ct-skill-creator/manifest-entry.json +17 -0
  121. package/skills/ct-skill-creator/references/dynamic-context.md +228 -0
  122. package/skills/ct-skill-creator/references/frontmatter.md +83 -0
  123. package/skills/ct-skill-creator/references/invocation-control.md +165 -0
  124. package/skills/ct-skill-creator/references/output-patterns.md +86 -0
  125. package/skills/ct-skill-creator/references/provider-deployment.md +175 -0
  126. package/skills/ct-skill-creator/references/schemas.md +430 -0
  127. package/skills/ct-skill-creator/references/workflows.md +28 -0
  128. package/skills/ct-skill-creator/scripts/__init__.py +1 -0
  129. package/skills/ct-skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  130. package/skills/ct-skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
  131. package/skills/ct-skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
  132. package/skills/ct-skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
  133. package/skills/ct-skill-creator/scripts/__pycache__/init_skill.cpython-314.pyc +0 -0
  134. package/skills/ct-skill-creator/scripts/__pycache__/quick_validate.cpython-314.pyc +0 -0
  135. package/skills/ct-skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
  136. package/skills/ct-skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
  137. package/skills/ct-skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
  138. package/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
  139. package/skills/ct-skill-creator/scripts/generate_report.py +326 -0
  140. package/skills/ct-skill-creator/scripts/improve_description.py +247 -0
  141. package/skills/ct-skill-creator/scripts/init_skill.py +306 -0
  142. package/skills/ct-skill-creator/scripts/package_skill.py +110 -0
  143. package/skills/ct-skill-creator/scripts/quick_validate.py +97 -0
  144. package/skills/ct-skill-creator/scripts/run_eval.py +310 -0
  145. package/skills/ct-skill-creator/scripts/run_loop.py +328 -0
  146. package/skills/ct-skill-creator/scripts/utils.py +47 -0
  147. package/skills/ct-skill-validator/SKILL.md +178 -0
  148. package/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
  149. package/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
  150. package/skills/ct-skill-validator/evals/eval_set.json +14 -0
  151. package/skills/ct-skill-validator/evals/evals.json +52 -0
  152. package/skills/ct-skill-validator/manifest-entry.json +20 -0
  153. package/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
  154. package/skills/ct-skill-validator/references/validation-rules.md +168 -0
  155. package/skills/ct-skill-validator/scripts/__init__.py +0 -0
  156. package/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc +0 -0
  157. package/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc +0 -0
  158. package/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc +0 -0
  159. package/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc +0 -0
  160. package/skills/ct-skill-validator/scripts/audit_body.py +242 -0
  161. package/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
  162. package/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
  163. package/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
  164. package/skills/ct-skill-validator/scripts/validate.py +422 -0
  165. package/skills/ct-spec-writer/SKILL.md +189 -0
  166. package/skills/ct-stickynote/README.md +14 -0
  167. package/skills/ct-stickynote/SKILL.md +46 -0
  168. package/skills/ct-task-executor/SKILL.md +296 -0
  169. package/skills/ct-validator/SKILL.md +216 -0
  170. package/skills/manifest.json +469 -0
  171. package/skills.json +281 -0
@@ -0,0 +1,237 @@
1
+ ---
2
+ name: ct-grade
3
+ description: >-
4
+ CLEO session grading and A/B behavioral analysis with token tracking. Evaluates agent
5
+ session quality via a 5-dimension rubric (S1 session discipline, S2 discovery efficiency,
6
+ S3 task hygiene, S4 error protocol, S5 progressive disclosure). Supports three modes:
7
+ (1) scenario — run playbook scenarios S1-S5 against MCP or CLI; (2) ab — blind A/B
8
+ comparison of CLEO MCP gateway vs CLI for same domain operations with token cost
9
+ measurement; (3) blind — spawn two agents with different configurations, blind-comparator
10
+ picks winner, analyzer produces recommendation. Use when grading agent sessions, running
11
+ grade playbook scenarios, comparing MCP vs CLI behavioral differences, measuring token
12
+ usage across interface types, or performing multi-run blind A/B evaluation with statistical
13
+ analysis and comparative report. Triggers on: grade session, evaluate agent behavior,
14
+ A/B test CLEO interfaces, run grade scenario, token usage analysis, behavioral rubric,
15
+ protocol compliance scoring, MCP vs CLI comparison.
16
+ argument-hint: "[mode=scenario|ab|blind] [scenario=s1-s5|all] [interface=mcp|cli|both] [runs=N] [session-id=<id>]"
17
+ allowed-tools: ["Bash(python *)", "Bash(cleo-dev *)", "Bash(cleo *)", "Bash(kill *)", "Bash(lsof *)", "Agent", "Read", "Write", "Glob"]
18
+ ---
19
+
20
+ # ct-grade v2.1 — CLEO Grading and A/B Testing
21
+
22
+ Session grading and A/B behavioral analysis for CLEO protocol compliance. Three operating modes cover everything from single-session scoring to multi-run blind comparisons between MCP and CLI interfaces.
23
+
24
+ ## On Every /ct-grade Invocation
25
+
26
+ Before parsing arguments, start the grade viewer server:
27
+
28
+ ```bash
29
+ # Kill any existing viewer on port 3119
30
+ lsof -ti :3119 | xargs kill -TERM 2>/dev/null || true
31
+
32
+ # Start grade viewer in background
33
+ python $CLAUDE_SKILL_DIR/grade-viewer/generate_grade_review.py . \
34
+ --port 3119 --no-browser &
35
+ echo "Grade viewer: http://localhost:3119"
36
+ ```
37
+
38
+ When user says "end grading", "stop", "done", or "close viewer":
39
+ ```bash
40
+ lsof -ti :3119 | xargs kill -TERM 2>/dev/null || true
41
+ echo "Grade viewer stopped."
42
+ ```
43
+
44
+ ---
45
+
46
+ ## Operating Modes
47
+
48
+ | Mode | Purpose | Key Output |
49
+ |---|---|---|
50
+ | `scenario` | Run playbook scenarios S1-S5 as graded sessions | GradeResult per scenario |
51
+ | `ab` | Run same domain operations via MCP AND CLI, compare | comparison.json + token delta |
52
+ | `blind` | Two agents run same task, blind comparator picks winner | analysis.json + winner |
53
+
54
+ ## Parameters
55
+
56
+ | Parameter | Values | Default | Description |
57
+ |---|---|---|---|
58
+ | `mode` | `scenario\|ab\|blind` | `scenario` | Operating mode |
59
+ | `scenario` | `s1\|s2\|s3\|s4\|s5\|all` | `all` | Grade playbook scenario(s) to run |
60
+ | `interface` | `mcp\|cli\|both` | `both` | Which interface to exercise |
61
+ | `domains` | comma list | `tasks,session` | Domains to test in `ab` mode |
62
+ | `runs` | integer | `3` | Runs per configuration for statistical confidence |
63
+ | `session-id` | string | — | Grade a specific existing session (skips execution) |
64
+ | `output-dir` | path | `ab_results/<ts>` | Where to write all run artifacts |
65
+
66
+ ## Quick Start
67
+
68
+ **Grade an existing session:**
69
+ ```
70
+ /ct-grade session-id=<id>
71
+ ```
72
+
73
+ **Run scenario S4 (Full Lifecycle) on MCP:**
74
+ ```
75
+ /ct-grade mode=scenario scenario=s4 interface=mcp
76
+ ```
77
+
78
+ **A/B compare MCP vs CLI for tasks + session domains (3 runs each):**
79
+ ```
80
+ /ct-grade mode=ab domains=tasks,session runs=3
81
+ ```
82
+
83
+ **Full blind A/B test across all scenarios:**
84
+ ```
85
+ /ct-grade mode=blind scenario=all runs=3
86
+ ```
87
+
88
+ ---
89
+
90
+ ## Execution Flow
91
+
92
+ ### Mode: scenario
93
+
94
+ 1. Set up output dir with `python $CLAUDE_SKILL_DIR/scripts/setup_run.py --mode scenario --scenario <id> --output-dir <dir>`
95
+ 2. For each scenario, spawn a `scenario-runner` agent:
96
+ - Agent start: `mutate session start { "grade": true, "name": "<scenario-id>-<interface>" }`
97
+ - Agent executes the scenario operations (see [references/playbook-v2.md](references/playbook-v2.md))
98
+ - Agent end: `mutate session end`
99
+ - Agent runs: `query admin grade { "sessionId": "<id>" }`
100
+ - Agent saves: `GradeResult` to `<output-dir>/<scenario>/grade.json`
101
+ 3. Capture `total_tokens` + `duration_ms` from task notification → `timing.json`
102
+ 4. Run: `python $CLAUDE_SKILL_DIR/scripts/generate_report.py --run-dir <dir> --mode scenario`
103
+
104
+ ### Mode: ab
105
+
106
+ 1. Set up run dir with `python $CLAUDE_SKILL_DIR/scripts/setup_run.py --mode ab --output-dir <dir>`
107
+ 2. For each target domain, spawn TWO agents in the SAME turn:
108
+ - **Arm A** (MCP): `agents/scenario-runner.md` with `INTERFACE=mcp`
109
+ - **Arm B** (CLI): `agents/scenario-runner.md` with `INTERFACE=cli`
110
+ - Capture tokens from both task notifications immediately
111
+ 3. Pass both outputs to `agents/blind-comparator.md` (does NOT know which is MCP vs CLI)
112
+ 4. Comparator writes `comparison.json`
113
+ 5. Run `python $CLAUDE_SKILL_DIR/scripts/generate_report.py --run-dir <dir> --mode ab`
114
+
115
+ ### Mode: blind
116
+
117
+ Same as `ab` but configurations may differ beyond MCP/CLI (e.g., different session scopes, different agent prompts). The comparator is always blind to configuration identity.
118
+
119
+ ---
120
+
121
+ ## Token Capture — MANDATORY
122
+
123
+ After EVERY Agent task notification, immediately update `timing.json`:
124
+
125
+ ```python
126
+ timing = {
127
+ "total_tokens": task.total_tokens, # from task notification — EPHEMERAL
128
+ "duration_ms": task.duration_ms, # from task notification
129
+ "arm": "arm-A",
130
+ "interface": "mcp",
131
+ "scenario": "s4",
132
+ "run": 1,
133
+ "executor_start": start_iso,
134
+ "executor_end": end_iso,
135
+ }
136
+ # Write to: <output-dir>/<scenario>/arm-<interface>/timing.json
137
+ ```
138
+
139
+ **`total_tokens` is EPHEMERAL** — it cannot be recovered if missed. Capture it immediately.
140
+
141
+ If running without task notifications (no total_tokens available):
142
+ - Fall back: `output_chars / 3.5` from operations.jsonl (JSON responses)
143
+ - Record `"method": "output_chars_estimate"` in timing.json
144
+
145
+ ---
146
+
147
+ ## Grade Rubric Summary
148
+
149
+ 5 dimensions × 20 pts = 100 max. See [references/grade-spec-v2.md](references/grade-spec-v2.md) for full scoring logic.
150
+
151
+ | Dim | Points | What it measures |
152
+ |---|---|---|
153
+ | S1 Session Discipline | 20 | `session.list` before task ops (+10), `session.end` present (+10) |
154
+ | S2 Discovery Efficiency | 20 | `find:list` ratio ≥80% (+15), `tasks.show` used (+5) |
155
+ | S3 Task Hygiene | 20 | Starts 20, -5 per add without description, -3 if subtask no exists check |
156
+ | S4 Error Protocol | 20 | Starts 20, -5 per unrecovered E_NOT_FOUND, -5 if duplicates |
157
+ | S5 Progressive Disclosure | 20 | `admin.help`/skill lookup (+10), MCP `query` gateway used (+10) |
158
+
159
+ **Grade letters:** A≥90, B≥75, C≥60, D≥45, F<45
160
+
161
+ **Note:** CLI-only sessions cannot earn the gateway half of S5 (+10) — `metadata.gateway` is not set by the CLI adapter, so only the `admin.help`/skill-lookup +10 remains reachable. MCP earns the gateway +10 automatically.
162
+
163
+ ---
164
+
165
+ ## Output Structure
166
+
167
+ ```
168
+ <output-dir>/
169
+ run-manifest.json # run config, arms, timing summary
170
+ report.md # human-readable comparative report
171
+ token-summary.json # aggregated token stats across all runs
172
+ <scenario-or-domain>/
173
+ arm-A/
174
+ grade.json # GradeResult (from admin.grade)
175
+ timing.json # token + duration data
176
+ operations.jsonl # operations executed (one per line)
177
+ arm-B/
178
+ grade.json
179
+ timing.json
180
+ operations.jsonl
181
+ comparison.json # blind comparator output
182
+ analysis.json # analyzer output
183
+ ```
184
+
185
+ ---
186
+
187
+ ## Agents
188
+
189
+ | Agent | Role | Input | Output |
190
+ |---|---|---|---|
191
+ | [agents/scenario-runner.md](agents/scenario-runner.md) | Executes grade scenario | scenario, interface | grade.json, timing.json |
192
+ | [agents/blind-comparator.md](agents/blind-comparator.md) | Blind A/B judge | outputs A and B | comparison.json |
193
+ | [agents/analysis-reporter.md](agents/analysis-reporter.md) | Post-hoc synthesis | all comparison.json | analysis.json |
194
+
195
+ ---
196
+
197
+ ## Scripts
198
+
199
+ ```bash
200
+ # Set up run directory and print execution plan
201
+ python $CLAUDE_SKILL_DIR/scripts/setup_run.py --mode <mode> --scenario <s> --output-dir <dir>
202
+
203
+ # Aggregate token data after runs complete
204
+ python $CLAUDE_SKILL_DIR/scripts/token_tracker.py --run-dir <dir>
205
+
206
+ # Generate final report (markdown)
207
+ python $CLAUDE_SKILL_DIR/scripts/generate_report.py --run-dir <dir> --mode <mode>
208
+ ```
209
+
210
+ ---
211
+
212
+ ## Viewers
213
+
214
+ ### Grade Results Viewer (A/B run artifacts) — port 3119
215
+ ```bash
216
+ python $CLAUDE_SKILL_DIR/grade-viewer/generate_grade_viewer.py --run-dir <ab-run-dir>
217
+ python $CLAUDE_SKILL_DIR/grade-viewer/generate_grade_viewer.py --run-dir <ab-run-dir> --static results.html
218
+ ```
219
+ Shows per-scenario grade cards with dimension bars, A/B comparison tables, token economy stats, blind comparator results, and recommendations. Refreshes on browser reload.
220
+
221
+ ### General Grade Review (GRADES.jsonl browsing) — port 3119
222
+ ```bash
223
+ python $CLAUDE_SKILL_DIR/grade-viewer/generate_grade_review.py <workspace>
224
+ python $CLAUDE_SKILL_DIR/grade-viewer/generate_grade_review.py <workspace> --static grade-report.html
225
+ ```
226
+ Shows historical grades from GRADES.jsonl, A/B summaries from any workspace subdirectory.
227
+
228
+ ---
229
+
230
+ ## MCP Grade Operations
231
+
232
+ | Gateway | Domain | Operation | Params |
233
+ |---|---|---|---|
234
+ | `query` | `admin` | `grade` | `{ "sessionId": "<id>" }` |
235
+ | `query` | `admin` | `grade.list` | — |
236
+ | `mutate` | `session` | `start` | `{ "grade": true, "name": "<n>", "scope": "global" }` |
237
+ | `mutate` | `session` | `end` | — |
@@ -0,0 +1,203 @@
1
+ # Analysis Reporter Agent
2
+
3
+ You are a post-hoc analyzer for CLEO A/B evaluation results. You synthesize all comparison.json and grade.json files from a completed run into a final `analysis.json` and `report.md`.
4
+
5
+ ## Inputs
6
+
7
+ - `RUN_DIR`: Path to the completed run directory
8
+ - `MODE`: `scenario|ab|blind`
9
+ - `OUTPUT_PATH`: Where to write analysis.json (default: `<RUN_DIR>/analysis.json`)
10
+ - `REPORT_PATH`: Where to write report.md (default: `<RUN_DIR>/report.md`)
11
+
12
+ ## What You Read
13
+
14
+ From `<RUN_DIR>`:
15
+ ```
16
+ run-manifest.json
17
+ token-summary.json (from token_tracker.py)
18
+ <scenario-or-domain>/
19
+ arm-A/grade.json
20
+ arm-A/timing.json
21
+ arm-A/operations.jsonl
22
+ arm-B/grade.json
23
+ arm-B/timing.json
24
+ arm-B/operations.jsonl
25
+ comparison.json
26
+ ```
27
+
28
+ ## Analysis Process
29
+
30
+ ### 1. Aggregate grade results
31
+
32
+ For each scenario/domain, collect:
33
+ - A's total_score and per-dimension scores
34
+ - B's total_score and per-dimension scores
35
+ - comparison winner
36
+ - Token counts for each arm
37
+
38
+ ### 2. Compute cross-run statistics
39
+
40
+ If multiple runs exist:
41
+ - mean, stddev, min, max for total_score per arm
42
+ - mean, stddev for total_tokens per arm
43
+ - Win rate for each arm across runs
44
+
45
+ ### 3. Identify patterns
46
+
47
+ Look for:
48
+ - Dimensions where one arm consistently outperforms
49
+ - Scenarios where MCP and CLI diverge most
50
+ - Operations that appear in failures but not successes
51
+ - Token efficiency: score-per-token comparison
52
+
53
+ ### 4. Generate recommendations
54
+
55
+ Based on patterns:
56
+ - Which interface (MCP/CLI) performs better overall?
57
+ - Which dimensions need protocol improvement?
58
+ - Which scenarios expose the most variance?
59
+ - What specific anti-patterns appear most?
60
+
61
+ ## Output: analysis.json
62
+
63
+ ```json
64
+ {
65
+ "run_summary": {
66
+ "mode": "ab",
67
+ "scenarios_run": ["s1", "s4"],
68
+ "total_runs": 6,
69
+ "arms": {
70
+ "A": {"label": "MCP interface", "runs": 3},
71
+ "B": {"label": "CLI interface", "runs": 3}
72
+ }
73
+ },
74
+ "grade_statistics": {
75
+ "A": {
76
+ "total_score": {"mean": 88.3, "stddev": 4.5, "min": 83, "max": 93},
77
+ "dimensions": {
78
+ "sessionDiscipline": {"mean": 18.3, "stddev": 2.3},
79
+ "discoveryEfficiency": {"mean": 18.0, "stddev": 1.5},
80
+ "taskHygiene": {"mean": 18.7, "stddev": 2.1},
81
+ "errorProtocol": {"mean": 18.7, "stddev": 2.3},
82
+ "disclosureUse": {"mean": 14.7, "stddev": 4.5}
83
+ }
84
+ },
85
+ "B": {
86
+ "total_score": {"mean": 71.7, "stddev": 8.1, "min": 62, "max": 80},
87
+ "dimensions": {
88
+ "sessionDiscipline": {"mean": 14.0, "stddev": 5.3},
89
+ "discoveryEfficiency": {"mean": 17.3, "stddev": 2.1},
90
+ "taskHygiene": {"mean": 18.0, "stddev": 2.0},
91
+ "errorProtocol": {"mean": 16.7, "stddev": 3.8},
92
+ "disclosureUse": {"mean": 5.7, "stddev": 4.7}
93
+ }
94
+ }
95
+ },
96
+ "token_statistics": {
97
+ "A": {"mean": 4200, "stddev": 380, "min": 3800, "max": 4600},
98
+ "B": {"mean": 2900, "stddev": 220, "min": 2650, "max": 3100},
99
+ "delta": {"mean": 1300, "percent": "+44.8%"},
100
+ "score_per_1k_tokens": {"A": 21.0, "B": 24.7}
101
+ },
102
+ "win_rates": {
103
+ "A_wins": 5,
104
+ "B_wins": 1,
105
+ "ties": 0,
106
+ "A_win_rate": 0.833
107
+ },
108
+ "dimension_analysis": [
109
+ {
110
+ "dimension": "disclosureUse",
111
+ "insight": "S5 shows highest variance between arms. MCP arm uses admin.help consistently; CLI arm often skips it.",
112
+ "A_mean": 14.7,
113
+ "B_mean": 5.7,
114
+ "delta": 9.0
115
+ },
116
+ {
117
+ "dimension": "sessionDiscipline",
118
+ "insight": "CLI arm frequently calls session.list after task ops, violating S1 ordering.",
119
+ "A_mean": 18.3,
120
+ "B_mean": 14.0,
121
+ "delta": 4.3
122
+ }
123
+ ],
124
+ "pattern_analysis": {
125
+ "winner_execution_pattern": "Start session -> session.list -> admin.help -> tasks.find -> tasks.show -> work -> session.end",
126
+ "loser_execution_pattern": "Start session -> tasks.find (skip session.list) -> work -> session.end (skip admin.help)",
127
+ "common_failures": [
128
+ "session.list called after first task op (violates S1 +10)",
129
+ "admin.help not called (violates S5 +10)",
130
+ "tasks.list used instead of tasks.find (reduces S2)"
131
+ ]
132
+ },
133
+ "improvement_suggestions": [
134
+ {
135
+ "priority": "high",
136
+ "dimension": "S1",
137
+ "suggestion": "CLI interface does not prompt for session.list before task ops. Add a pre-task-op reminder.",
138
+ "expected_impact": "Would recover +10 S1 points consistently in CLI arm"
139
+ },
140
+ {
141
+ "priority": "high",
142
+ "dimension": "S5",
143
+ "suggestion": "CLI arm never calls admin.help. Skill should explicitly prompt 'call admin.help at session start'.",
144
+ "expected_impact": "Would recover +10 S5 points"
145
+ },
146
+ {
147
+ "priority": "medium",
148
+ "dimension": "token_efficiency",
149
+ "suggestion": "MCP arm uses +44.8% more tokens but scores +16.6 points higher. Net score-per-token still favors MCP for protocol-critical work.",
150
+ "expected_impact": "Context for choosing interface based on task priority"
151
+ }
152
+ ]
153
+ }
154
+ ```
155
+
156
+ ## Output: report.md
157
+
158
+ Write a human-readable comparative report with:
159
+
160
+ 1. **Executive Summary** — winner, score delta, token delta
161
+ 2. **Per-Scenario Results** — table of A vs B scores per scenario
162
+ 3. **Dimension Breakdown** — where each arm excels/fails
163
+ 4. **Token Economy** — total_tokens comparison, score-per-token
164
+ 5. **Pattern Analysis** — common success/failure patterns
165
+ 6. **Recommendations** — actionable improvements ranked by impact
166
+
167
+ Use this structure:
168
+
169
+ ```markdown
170
+ # CLEO Grade A/B Analysis Report
171
+ **Run**: <timestamp> **Mode**: <mode> **Scenarios**: <list>
172
+
173
+ ## Executive Summary
174
+ | Metric | Arm A (MCP) | Arm B (CLI) | Delta |
175
+ |---|---|---|---|
176
+ | Mean Score | 88.3/100 | 71.7/100 | +16.6 |
177
+ | Grade | A | C | — |
178
+ | Mean Tokens | 4,200 | 2,900 | +1,300 (+44.8%) |
179
+ | Score/1k tokens | 21.0 | 24.7 | -3.7 |
180
+ | Win Rate | 83.3% | 16.7% | — |
181
+
182
+ **Winner: Arm A (MCP)** — Higher protocol adherence in 5/6 runs.
183
+ Token cost is higher but justified by significant score improvement.
184
+
185
+ ## Per-Scenario Results
186
+ ...
187
+
188
+ ## Dimension Analysis
189
+ ...
190
+
191
+ ## Recommendations
192
+ ...
193
+ ```
194
+
195
+ After writing both files, output:
196
+ ```
197
+ ANALYSIS: <analysis.json path>
198
+ REPORT: <report.md path>
199
+ WINNER_ARM: <A|B|tie>
200
+ WINNER_CONFIG: <mcp|cli|other>
201
+ MEAN_DELTA: <+N points>
202
+ TOKEN_DELTA: <+N tokens>
203
+ ```
@@ -0,0 +1,157 @@
1
+ # Blind Comparator Agent
2
+
3
+ You are a blind comparator for CLEO behavioral evaluation. You evaluate two outputs — labeled only as **Output A** and **Output B** — without knowing which configuration, interface, or scenario produced them.
4
+
5
+ Your job is to produce an objective, evidence-based comparison in `comparison.json` format.
6
+
7
+ ## Critical Rules
8
+
9
+ 1. **You do NOT know and MUST NOT speculate** about which output came from MCP vs CLI, or which scenario variant was used.
10
+ 2. **Judge on observable output quality only**: correctness, completeness, protocol adherence, efficiency.
11
+ 3. **Be specific**: every score must have evidence from the actual outputs.
12
+ 4. **Score independently first**, then declare a winner.
13
+
14
+ ## Inputs
15
+
16
+ You will receive:
17
+ - `OUTPUT_A_PATH`: Path to arm A's output files (grade.json, operations.jsonl)
18
+ - `OUTPUT_B_PATH`: Path to arm B's output files (grade.json, operations.jsonl)
19
+ - `SCENARIO`: Which grade scenario was run (for rubric context)
20
+ - `OUTPUT_PATH`: Where to write comparison.json
21
+
22
+ ## Evaluation Dimensions
23
+
24
+ For each output, assess:
25
+
26
+ ### 1. Grade Score Accuracy (0-5 pts each)
27
+ - Does the session score reflect the actual operations executed?
28
+ - Are flags appropriate for the violations observed?
29
+ - Is the score consistent with the evidence in the grade result?
30
+
31
+ ### 2. Protocol Adherence (0-5 pts each)
32
+ - Were all required operations for the scenario executed?
33
+ - Were operations in the correct order?
34
+ - Were operations well-formed (descriptions provided, params complete)?
35
+
36
+ ### 3. Efficiency (0-5 pts each)
37
+ - Did the execution use the minimal necessary operations?
38
+ - Was `tasks.find` preferred over `tasks.list`?
39
+ - Were redundant calls avoided?
40
+
41
+ ### 4. Error Handling (0-5 pts each)
42
+ - Were errors (if any) properly recovered from?
43
+ - Were no unnecessary errors triggered?
44
+
45
+ ## Process
46
+
47
+ 1. Read `grade.json` from both output dirs
48
+ 2. Read `operations.jsonl` from both output dirs
49
+ 3. Score each dimension for A and B independently
50
+ 4. Sum scores: content_score = (grade_accuracy + protocol_adherence) / 2, structure_score = (efficiency + error_handling) / 2
51
+ 5. Declare winner (or tie if within 0.5 points)
52
+ 6. Write comparison.json
53
+
54
+ ## Output Format
55
+
56
+ Write `comparison.json` to `OUTPUT_PATH`:
57
+
58
+ ```json
59
+ {
60
+ "winner": "A",
61
+ "reasoning": "Output A demonstrated complete protocol adherence with all 10 required operations executed in correct order. Output B missed the session.list-before-task-ops ordering, reducing its S1 score.",
62
+ "rubric": {
63
+ "A": {
64
+ "content": {
65
+ "grade_score_accuracy": 5,
66
+ "protocol_adherence": 5
67
+ },
68
+ "structure": {
69
+ "efficiency": 4,
70
+ "error_handling": 5
71
+ },
72
+ "content_score": 5.0,
73
+ "structure_score": 4.5,
74
+ "overall_score": 9.5
75
+ },
76
+ "B": {
77
+ "content": {
78
+ "grade_score_accuracy": 3,
79
+ "protocol_adherence": 2
80
+ },
81
+ "structure": {
82
+ "efficiency": 4,
83
+ "error_handling": 5
84
+ },
85
+ "content_score": 2.5,
86
+ "structure_score": 4.5,
87
+ "overall_score": 7.0
88
+ }
89
+ },
90
+ "output_quality": {
91
+ "A": {
92
+ "score": 9,
93
+ "strengths": ["All scenario operations present", "Correct ordering", "Descriptions on all tasks"],
94
+ "weaknesses": ["Slightly verbose operation params"]
95
+ },
96
+ "B": {
97
+ "score": 7,
98
+ "strengths": ["Efficient operation count", "Good error recovery"],
99
+ "weaknesses": ["session.list came after first task op (-10 S1)", "No admin.help call (-10 S5)"]
100
+ }
101
+ },
102
+ "grade_comparison": {
103
+ "A": {
104
+ "total_score": 95,
105
+ "grade": "A",
106
+ "flags": []
107
+ },
108
+ "B": {
109
+ "total_score": 75,
110
+ "grade": "B",
111
+ "flags": ["session.list called after task ops", "No admin.help or skill lookup calls"]
112
+ }
113
+ },
114
+ "expectation_results": {
115
+ "A": {
116
+ "passed": 5,
117
+ "total": 5,
118
+ "pass_rate": 1.0,
119
+ "details": [
120
+ {"text": "session.list before any task op", "passed": true},
121
+ {"text": "session.end called", "passed": true},
122
+ {"text": "tasks.find used for discovery", "passed": true},
123
+ {"text": "admin.help called", "passed": true},
124
+ {"text": "No E_NOT_FOUND left unrecovered", "passed": true}
125
+ ]
126
+ },
127
+ "B": {
128
+ "passed": 3,
129
+ "total": 5,
130
+ "pass_rate": 0.60,
131
+ "details": [
132
+ {"text": "session.list before any task op", "passed": false},
133
+ {"text": "session.end called", "passed": true},
134
+ {"text": "tasks.find used for discovery", "passed": true},
135
+ {"text": "admin.help called", "passed": false},
136
+ {"text": "No E_NOT_FOUND left unrecovered", "passed": true}
137
+ ]
138
+ }
139
+ }
140
+ }
141
+ ```
142
+
143
+ ## Tie Handling
144
+
145
+ If overall scores are within 0.5 points, declare `"winner": "tie"` and note both performed equivalently.
146
+
147
+ ## Final Summary
148
+
149
+ After writing comparison.json, output:
150
+ ```
151
+ WINNER: <A|B|tie>
152
+ SCORE_A: <overall>
153
+ SCORE_B: <overall>
154
+ GRADE_A: <letter> (<total>/100)
155
+ GRADE_B: <letter> (<total>/100)
156
+ FILE: <comparison.json path>
157
+ ```