@cleocode/skills 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. package/dispatch-config.json +404 -0
  2. package/index.d.ts +178 -0
  3. package/index.js +405 -0
  4. package/package.json +14 -0
  5. package/profiles/core.json +7 -0
  6. package/profiles/full.json +10 -0
  7. package/profiles/minimal.json +7 -0
  8. package/profiles/recommended.json +7 -0
  9. package/provider-skills-map.json +97 -0
  10. package/skills/_shared/cleo-style-guide.md +84 -0
  11. package/skills/_shared/manifest-operations.md +810 -0
  12. package/skills/_shared/placeholders.json +433 -0
  13. package/skills/_shared/skill-chaining-patterns.md +237 -0
  14. package/skills/_shared/subagent-protocol-base.md +223 -0
  15. package/skills/_shared/task-system-integration.md +232 -0
  16. package/skills/_shared/testing-framework-config.md +110 -0
  17. package/skills/ct-cleo/SKILL.md +490 -0
  18. package/skills/ct-cleo/references/anti-patterns.md +19 -0
  19. package/skills/ct-cleo/references/loom-lifecycle.md +136 -0
  20. package/skills/ct-cleo/references/orchestrator-constraints.md +55 -0
  21. package/skills/ct-cleo/references/session-protocol.md +162 -0
  22. package/skills/ct-codebase-mapper/SKILL.md +82 -0
  23. package/skills/ct-contribution/SKILL.md +521 -0
  24. package/skills/ct-contribution/templates/contribution-init.json +21 -0
  25. package/skills/ct-dev-workflow/SKILL.md +423 -0
  26. package/skills/ct-docs-lookup/SKILL.md +66 -0
  27. package/skills/ct-docs-review/SKILL.md +175 -0
  28. package/skills/ct-docs-write/SKILL.md +108 -0
  29. package/skills/ct-documentor/SKILL.md +231 -0
  30. package/skills/ct-epic-architect/SKILL.md +305 -0
  31. package/skills/ct-epic-architect/references/bug-epic-example.md +172 -0
  32. package/skills/ct-epic-architect/references/commands.md +201 -0
  33. package/skills/ct-epic-architect/references/feature-epic-example.md +210 -0
  34. package/skills/ct-epic-architect/references/migration-epic-example.md +244 -0
  35. package/skills/ct-epic-architect/references/output-format.md +92 -0
  36. package/skills/ct-epic-architect/references/patterns.md +284 -0
  37. package/skills/ct-epic-architect/references/refactor-epic-example.md +412 -0
  38. package/skills/ct-epic-architect/references/research-epic-example.md +226 -0
  39. package/skills/ct-epic-architect/references/shell-escaping.md +86 -0
  40. package/skills/ct-epic-architect/references/skill-aware-execution.md +195 -0
  41. package/skills/ct-grade/SKILL.md +230 -0
  42. package/skills/ct-grade/agents/analysis-reporter.md +203 -0
  43. package/skills/ct-grade/agents/blind-comparator.md +157 -0
  44. package/skills/ct-grade/agents/scenario-runner.md +134 -0
  45. package/skills/ct-grade/eval-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  46. package/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
  47. package/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
  48. package/skills/ct-grade/eval-viewer/generate_review.py +283 -0
  49. package/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
  50. package/skills/ct-grade/eval-viewer/viewer.html +219 -0
  51. package/skills/ct-grade/evals/evals.json +94 -0
  52. package/skills/ct-grade/references/ab-test-methodology.md +150 -0
  53. package/skills/ct-grade/references/domains.md +137 -0
  54. package/skills/ct-grade/references/grade-spec.md +236 -0
  55. package/skills/ct-grade/references/scenario-playbook.md +234 -0
  56. package/skills/ct-grade/references/token-tracking.md +120 -0
  57. package/skills/ct-grade/scripts/__pycache__/audit_analyzer.cpython-314.pyc +0 -0
  58. package/skills/ct-grade/scripts/__pycache__/run_ab_test.cpython-314.pyc +0 -0
  59. package/skills/ct-grade/scripts/__pycache__/run_all.cpython-314.pyc +0 -0
  60. package/skills/ct-grade/scripts/__pycache__/token_tracker.cpython-314.pyc +0 -0
  61. package/skills/ct-grade/scripts/audit_analyzer.py +279 -0
  62. package/skills/ct-grade/scripts/generate_report.py +283 -0
  63. package/skills/ct-grade/scripts/run_ab_test.py +504 -0
  64. package/skills/ct-grade/scripts/run_all.py +287 -0
  65. package/skills/ct-grade/scripts/setup_run.py +183 -0
  66. package/skills/ct-grade/scripts/token_tracker.py +630 -0
  67. package/skills/ct-grade-v2-1/SKILL.md +237 -0
  68. package/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
  69. package/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
  70. package/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
  71. package/skills/ct-grade-v2-1/evals/evals.json +74 -0
  72. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/build_op_stats.cpython-314.pyc +0 -0
  73. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  74. package/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
  75. package/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
  76. package/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
  77. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
  78. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
  79. package/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
  80. package/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
  81. package/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
  82. package/skills/ct-grade-v2-1/manifest-entry.json +31 -0
  83. package/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
  84. package/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
  85. package/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
  86. package/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
  87. package/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
  88. package/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
  89. package/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
  90. package/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
  91. package/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
  92. package/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
  93. package/skills/ct-memory/SKILL.md +84 -0
  94. package/skills/ct-orchestrator/INSTALL.md +61 -0
  95. package/skills/ct-orchestrator/README.md +69 -0
  96. package/skills/ct-orchestrator/SKILL.md +380 -0
  97. package/skills/ct-orchestrator/manifest-entry.json +19 -0
  98. package/skills/ct-orchestrator/orchestrator-prompt.txt +17 -0
  99. package/skills/ct-orchestrator/references/SUBAGENT-PROTOCOL-BLOCK.md +66 -0
  100. package/skills/ct-orchestrator/references/autonomous-operation.md +167 -0
  101. package/skills/ct-orchestrator/references/lifecycle-gates.md +98 -0
  102. package/skills/ct-orchestrator/references/orchestrator-compliance.md +271 -0
  103. package/skills/ct-orchestrator/references/orchestrator-handoffs.md +85 -0
  104. package/skills/ct-orchestrator/references/orchestrator-patterns.md +164 -0
  105. package/skills/ct-orchestrator/references/orchestrator-recovery.md +113 -0
  106. package/skills/ct-orchestrator/references/orchestrator-spawning.md +271 -0
  107. package/skills/ct-orchestrator/references/orchestrator-tokens.md +180 -0
  108. package/skills/ct-research-agent/SKILL.md +226 -0
  109. package/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
  110. package/skills/ct-skill-creator/.cleo/logs/cleo.2026-03-07.1.log +24 -0
  111. package/skills/ct-skill-creator/.cleo/tasks.db +0 -0
  112. package/skills/ct-skill-creator/SKILL.md +356 -0
  113. package/skills/ct-skill-creator/agents/analyzer.md +276 -0
  114. package/skills/ct-skill-creator/agents/comparator.md +204 -0
  115. package/skills/ct-skill-creator/agents/grader.md +225 -0
  116. package/skills/ct-skill-creator/assets/eval_review.html +146 -0
  117. package/skills/ct-skill-creator/eval-viewer/__pycache__/generate_review.cpython-314.pyc +0 -0
  118. package/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
  119. package/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
  120. package/skills/ct-skill-creator/manifest-entry.json +17 -0
  121. package/skills/ct-skill-creator/references/dynamic-context.md +228 -0
  122. package/skills/ct-skill-creator/references/frontmatter.md +83 -0
  123. package/skills/ct-skill-creator/references/invocation-control.md +165 -0
  124. package/skills/ct-skill-creator/references/output-patterns.md +86 -0
  125. package/skills/ct-skill-creator/references/provider-deployment.md +175 -0
  126. package/skills/ct-skill-creator/references/schemas.md +430 -0
  127. package/skills/ct-skill-creator/references/workflows.md +28 -0
  128. package/skills/ct-skill-creator/scripts/__init__.py +1 -0
  129. package/skills/ct-skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  130. package/skills/ct-skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
  131. package/skills/ct-skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
  132. package/skills/ct-skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
  133. package/skills/ct-skill-creator/scripts/__pycache__/init_skill.cpython-314.pyc +0 -0
  134. package/skills/ct-skill-creator/scripts/__pycache__/quick_validate.cpython-314.pyc +0 -0
  135. package/skills/ct-skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
  136. package/skills/ct-skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
  137. package/skills/ct-skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
  138. package/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
  139. package/skills/ct-skill-creator/scripts/generate_report.py +326 -0
  140. package/skills/ct-skill-creator/scripts/improve_description.py +247 -0
  141. package/skills/ct-skill-creator/scripts/init_skill.py +306 -0
  142. package/skills/ct-skill-creator/scripts/package_skill.py +110 -0
  143. package/skills/ct-skill-creator/scripts/quick_validate.py +97 -0
  144. package/skills/ct-skill-creator/scripts/run_eval.py +310 -0
  145. package/skills/ct-skill-creator/scripts/run_loop.py +328 -0
  146. package/skills/ct-skill-creator/scripts/utils.py +47 -0
  147. package/skills/ct-skill-validator/SKILL.md +178 -0
  148. package/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
  149. package/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
  150. package/skills/ct-skill-validator/evals/eval_set.json +14 -0
  151. package/skills/ct-skill-validator/evals/evals.json +52 -0
  152. package/skills/ct-skill-validator/manifest-entry.json +20 -0
  153. package/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
  154. package/skills/ct-skill-validator/references/validation-rules.md +168 -0
  155. package/skills/ct-skill-validator/scripts/__init__.py +0 -0
  156. package/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc +0 -0
  157. package/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc +0 -0
  158. package/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc +0 -0
  159. package/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc +0 -0
  160. package/skills/ct-skill-validator/scripts/audit_body.py +242 -0
  161. package/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
  162. package/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
  163. package/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
  164. package/skills/ct-skill-validator/scripts/validate.py +422 -0
  165. package/skills/ct-spec-writer/SKILL.md +189 -0
  166. package/skills/ct-stickynote/README.md +14 -0
  167. package/skills/ct-stickynote/SKILL.md +46 -0
  168. package/skills/ct-task-executor/SKILL.md +296 -0
  169. package/skills/ct-validator/SKILL.md +216 -0
  170. package/skills/manifest.json +469 -0
  171. package/skills.json +281 -0
@@ -0,0 +1,233 @@
1
+ # Blind A/B Testing Protocol
2
+
3
+ Methodology for blind comparison of MCP vs CLI interface usage in CLEO.
4
+
5
+ ---
6
+
7
+ ## Agent-Based Execution (Canonical)
8
+
9
+ The canonical A/B approach uses Claude Code Agents to run scenarios end-to-end via the live MCP/CLI interfaces. This avoids subprocess initialization issues and captures real token data from task notifications.
10
+
11
+ ### Execution Flow
12
+
13
+ 1. Run `python scripts/setup_run.py` to create run structure and print the execution plan
14
+ 2. Follow the plan: spawn scenario-runner agents in parallel (arm-A MCP, arm-B CLI)
15
+ 3. Immediately capture `total_tokens` from each task notification → `timing.json`
16
+ 4. Spawn blind-comparator agent after both arms complete
17
+ 5. Run `python scripts/token_tracker.py --run-dir <dir>` to aggregate tokens
18
+ 6. Run `python scripts/generate_report.py --run-dir <dir>` for final report
19
+
20
+ ### Token Data from Task Notifications
21
+
22
+ ```python
23
+ # After EACH agent task completes, fill timing.json immediately:
24
+ timing = {
25
+ "total_tokens": task.total_tokens, # EPHEMERAL — capture now or lose it
26
+ "duration_ms": task.duration_ms,
27
+ "arm": "arm-A",
28
+ "interface": "mcp",
29
+ "scenario": "s4",
30
+ "run": 1,
31
+ }
32
+ ```
33
+
34
+ Token data priority:
35
+ 1. `total_tokens` from Claude Code Agent task notification (canonical)
36
+ 2. OTel `claude_code.token.usage` (when `CLAUDE_CODE_ENABLE_TELEMETRY=1`)
37
+ 3. `output_chars / 3.5` (JSON response estimate)
38
+ 4. `entryCount × 150` (coarse proxy from GRADES.jsonl)
39
+
40
+ ---
41
+
42
+ ## Subprocess-Based Execution (Fallback)
43
+
44
+ For automated testing without agent delegation, use `run_ab_test.py`. This invokes CLEO via subprocess and requires a migrated `tasks.db`.
45
+
46
+ ---
47
+
48
+ ## What We're Testing
49
+
50
+ | Side | Interface | Mechanism |
51
+ |------|-----------|-----------|
52
+ | **A** (MCP) | JSON-RPC via stdio to CLEO MCP server | `node dist/mcp/index.js` with JSON-RPC messages |
53
+ | **B** (CLI) | Shell commands via subprocess | `cleo-dev <domain> <operation> [params]` |
54
+
55
+ Both sides call the same underlying `src/dispatch/` layer. The A/B test isolates:
56
+ - **Output format differences** — MCP returns structured JSON envelopes; CLI may add ANSI/formatting
57
+ - **Response size** — character counts as token proxy
58
+ - **Latency** — wall-clock time per operation
59
+ - **Data equivalence** — do they return the same logical data?
60
+
61
+ Blind assignment means the comparator does not know which result came from MCP vs CLI when producing the quality verdict.
62
+
63
+ ---
64
+
65
+ ## Test Structure
66
+
67
+ ```
68
+ ab-results/
69
+ <timestamp>/
70
+ meta.json -- test parameters, domain, operations, runs
71
+ run-001/
72
+ side-a/
73
+ request.json -- what was sent
74
+ response.json -- raw response
75
+ metrics.json -- output_chars, duration_ms, success
76
+ side-b/
77
+ request.json
78
+ response.json
79
+ metrics.json
80
+ comparison.json -- blind comparator output (winner: A|B|TIE)
81
+ run-002/
82
+ ...
83
+ summary.json -- aggregated stats across all runs
84
+ report.md -- human-readable comparative analysis
85
+ ```
86
+
87
+ ---
88
+
89
+ ## Blind Assignment
90
+
91
+ The `run_ab_test.py` script randomly shuffles which side gets labeled "A" vs "B" for each run. The comparator agent sees only:
92
+ - Output labeled "A" (could be MCP or CLI)
93
+ - Output labeled "B" (could be MCP or CLI)
94
+ - The original request prompt
95
+
96
+ The `meta.json` records the true identity (`a_is_mcp: true|false`) per run. `generate_report.py` de-blinds after all comparisons are done.
97
+
98
+ ---
99
+
100
+ ## Metrics Captured Per Run
101
+
102
+ | Metric | How captured |
103
+ |--------|-------------|
104
+ | `output_chars` | `len(response_json_str)` |
105
+ | `estimated_tokens` | `output_chars / 4` (coarse approximation; note the agent-path fallback in "Token Data from Task Notifications" uses `output_chars / 3.5`) |
106
+ | `duration_ms` | wall clock from subprocess start to end |
107
+ | `success` | `response.success === true` (MCP) or exit code 0 (CLI) |
108
+ | `data_equivalent` | compare key fields between A and B response |
109
+
110
+ ---
111
+
112
+ ## Data Equivalence Check
113
+
114
+ For each operation, define "equivalent" as the key response fields matching:
115
+
116
+ ```python
117
+ EQUIVALENCE_FIELDS = {
118
+ "tasks.find": ["data.tasks[].id", "data.total"],
119
+ "tasks.show": ["data.id", "data.status", "data.title"],
120
+ "tasks.list": ["data.tasks[].id"],
121
+ "session.list": ["data.sessions[].id"],
122
+ "session.status": ["data.currentSession.id", "data.hasActiveSession"],
123
+ "admin.dash": ["data.stats.total", "data.stats.active"],
124
+ "admin.health": ["data.healthy"],
125
+ "admin.stats": ["data.totalTasks"],
126
+ }
127
+ ```
128
+
129
+ Equivalence is checked before the blind comparison to flag data divergence independently of quality judgment.
130
+
131
+ ---
132
+
133
+ ## Statistical Analysis
134
+
135
+ After N runs, `generate_report.py` computes:
136
+
137
+ ```json
138
+ {
139
+ "wins": { "mcp": 0, "cli": 0, "tie": 0 },
140
+ "win_rate": { "mcp": 0.0, "cli": 0.0 },
141
+ "token_delta": {
142
+ "mean_mcp_chars": 0,
143
+ "mean_cli_chars": 0,
144
+ "delta_chars": 0,
145
+ "delta_pct": "+0%"
146
+ },
147
+ "latency_delta": {
148
+ "mean_mcp_ms": 0,
149
+ "mean_cli_ms": 0,
150
+ "delta_ms": 0
151
+ },
152
+ "data_equivalence_rate": 1.0,
153
+ "per_operation": { ... }
154
+ }
155
+ ```
156
+
157
+ **Recommended minimum runs:** 3 per operation for trend detection, 10+ for statistical confidence.
158
+
159
+ ---
160
+
161
+ ## Comparator Rubric
162
+
163
+ The blind comparator evaluates each side on:
164
+
165
+ | Criterion | Description |
166
+ |-----------|-------------|
167
+ | **Completeness** | Does the response contain all expected fields? |
168
+ | **Structure** | Is the response well-formed JSON? Clean envelope? |
169
+ | **Usability** | Can an agent consume this without post-processing? |
170
+ | **Verbosity** | Lower is better — same data, fewer chars = more efficient |
171
+
172
+ Rubric scores are 1–5 per criterion. Winner is the side with higher weighted total.
173
+
174
+ ---
175
+
176
+ ## MCP Server Invocation Details
177
+
178
+ The `run_ab_test.py` script calls the CLEO MCP server via stdio JSON-RPC:
179
+
180
+ ```python
181
+ # Protocol sequence
182
+ # 1. Send initialize
183
+ # 2. Send tools/call (query or mutate)
184
+ # 3. Read response lines until tool result found
185
+ # 4. Terminate process
186
+
187
+ MCP_INIT = {
188
+ "jsonrpc": "2.0", "id": 0, "method": "initialize",
189
+ "params": {
190
+ "protocolVersion": "2024-11-05",
191
+ "capabilities": {},
192
+ "clientInfo": {"name": "ct-grade-ab-test", "version": "2.1.0"}
193
+ }
194
+ }
195
+
196
+ MCP_CALL = {
197
+ "jsonrpc": "2.0", "id": 1, "method": "tools/call",
198
+ "params": {
199
+ "name": "query", # or "mutate"
200
+ "arguments": {
201
+ "domain": "<domain>",
202
+ "operation": "<operation>",
203
+ "params": {}
204
+ }
205
+ }
206
+ }
207
+ ```
208
+
209
+ **CLI equivalent:**
210
+ ```bash
211
+ cleo-dev <domain> <operation> [args] --json
212
+ ```
213
+
214
+ ---
215
+
216
+ ## Interpreting Results
217
+
218
+ | Outcome | Meaning | Action |
219
+ |---------|---------|--------|
220
+ | MCP wins consistently | MCP output is cleaner/more complete | Recommend MCP-first in agent protocols |
221
+ | CLI wins consistently | CLI output is more complete or parseable | Investigate MCP envelope overhead |
222
+ | Tie | Both equivalent | Focus on latency and token cost |
223
+ | MCP tokens > CLI tokens | MCP envelope adds overhead | Quantify and document in CLEO-GRADE-SPEC |
224
+ | Data divergence detected | MCP and CLI returning different data | File bug — should be dispatch-level consistent |
225
+
226
+ ---
227
+
228
+ ## Parity Scenarios
229
+
230
+ The P1-P3 parity scenarios (see playbook-v2.md) run a curated set of operations specifically chosen to stress:
231
+ - **P1**: tasks domain — high-frequency agent operations
232
+ - **P2**: session domain — lifecycle operations agents use at start/end
233
+ - **P3**: admin domain — help, dash, health (first calls in any session)
@@ -0,0 +1,156 @@
1
+ # CLEO Domains SSoT
2
+
3
+ 10 canonical domains for A/B test construction and grade analysis.
4
+ Source: `docs/specs/CLEO-OPERATION-CONSTITUTION.md` + `src/dispatch/registry.ts`.
5
+
6
+ ---
7
+
8
+ ## Domain Summary
9
+
10
+ | Domain | Gateway | Tier-0 ops | Key purpose |
11
+ |--------|---------|-----------|-------------|
12
+ | `tasks` | query+mutate | show, list, find, exists, tree, add, update, complete, cancel, delete | Task CRUD, hierarchy, deps |
13
+ | `session` | query+mutate | status, list, show, history, decision.log, start, end, resume, gc | Session lifecycle |
14
+ | `memory` | query+mutate | (tier 1+) show, find, timeline, fetch, observe | Cognitive memory (brain.db) |
15
+ | `check` | query+mutate | schema, protocol, task, manifest, test.run | Validation and compliance |
16
+ | `pipeline` | query+mutate | stage.validate, stage.status, manifest.*, release.* | RCASD-IVTR+C lifecycle, releases |
17
+ | `orchestrate` | query+mutate | status, next, ready, waves, spawn, spawn.execute | Multi-agent coordination |
18
+ | `tools` | query+mutate | skill.list, skill.show, skill.find, provider.list, issue.add.bug | Skills, providers |
19
+ | `admin` | query+mutate | version, health, dash, help, stats, grade, grade.list | Config, diagnostics |
20
+ | `nexus` | query+mutate | (tier 2) status, list, show, register, sync | Cross-project coordination |
21
+ | `sticky` | query+mutate | list, show, add, convert, archive, purge | Quick capture notes |
22
+
23
+ ---
24
+
25
+ ## Tier-0 Operations (A/B test defaults)
26
+
27
+ These are available without progressive disclosure. Use as the default test set.
28
+
29
+ ### tasks (17 query + 15 mutate total; tier-0 subset listed below)
30
+
31
+ **Query (tier 0):**
32
+ - `show` — single task details
33
+ - `list` — tasks with filters (HEAVY — test against `find`)
34
+ - `find` — search tasks (LIGHTWEIGHT — preferred)
35
+ - `exists` — check task ID exists
36
+ - `tree` — hierarchy tree
37
+ - `blockers` — blocking deps
38
+ - `depends` — dependency graph
39
+ - `analyze` — task metrics
40
+ - `next` — suggest next task
41
+ - `plan` — composite planning view
42
+ - `relates` — related tasks
43
+ - `current` — currently active task
44
+
45
+ **Mutate (tier 0):**
46
+ - `add` — create task
47
+ - `update` — modify task
48
+ - `complete` — mark done
49
+ - `cancel` — cancel task
50
+ - `delete` — permanent remove
51
+ - `archive` — soft delete
52
+ - `restore` — restore from terminal
53
+ - `start` — begin working
54
+ - `stop` — stop working
55
+
56
+ ### session (11 query + 8 mutate total; tier-0 subset listed below)
57
+
58
+ **Query (tier 0):**
59
+ - `status` — current session status
60
+ - `list` — list sessions
61
+ - `show` — session details
62
+ - `history` — session history
63
+ - `decision.log` — decision log
64
+ - `context.drift` — detect drift
65
+ - `handoff.show` — handoff data
66
+ - `briefing.show` — session-start context
67
+ - `find` — lightweight session discovery
68
+
69
+ **Mutate (tier 0):**
70
+ - `start` — begin new session
71
+ - `end` — end current session
72
+ - `resume` — resume suspended
73
+ - `suspend` — suspend without ending
74
+ - `gc` — garbage-collect stale
75
+ - `record.decision` — record decision
76
+ - `record.assumption` — record assumption
77
+
78
+ ### admin (tier 0 subset)
79
+
80
+ **Query:**
81
+ - `version` — CLEO version
82
+ - `health` — system health
83
+ - `config.show` — configuration
84
+ - `stats` — project statistics
85
+ - `context` — project context
86
+ - `runtime` — runtime info
87
+ - `dash` — dashboard overview
88
+ - `log` — audit log
89
+ - `help` — progressive disclosure entry
90
+ - `doctor` — health check diagnostics
91
+
92
+ **Mutate:**
93
+ - `init` — initialize CLEO
94
+ - `config.set` — set config
95
+ - `backup` — create backup
96
+ - `sync` — synchronize data stores
97
+ - `cleanup` — clean stale data
98
+ - `fix` — auto-fix doctor checks
99
+ - `detect` — refresh project-context.json
100
+
101
+ ### tools (tier 0 subset)
102
+
103
+ **Query:**
104
+ - `skill.list` — list installed skills
105
+ - `skill.show` — skill details
106
+ - `skill.find` — search skills
107
+ - `skill.dispatch` — dispatch execution
108
+ - `skill.verify` — verify skill
109
+ - `provider.list` — list providers
110
+ - `provider.detect` — detect providers
111
+
112
+ **Mutate:**
113
+ - `skill.install` — install skill
114
+ - `skill.enable` / `skill.disable` — toggle
115
+ - `skill.configure` — configure params
116
+ - `skill.refresh` — refresh catalog
117
+ - `provider.inject` — inject provider config
118
+
119
+ ---
120
+
121
+ ## For A/B Testing
122
+
123
+ ### Recommended test operation sets
124
+
125
+ **Fast smoke test (5 ops):**
126
+ ```
127
+ tasks.find, tasks.show, session.status, admin.dash, admin.health
128
+ ```
129
+
130
+ **Standard parity test (15 ops):**
131
+ ```
132
+ tasks.find, tasks.show, tasks.list, tasks.tree, tasks.plan,
133
+ session.status, session.list, session.briefing.show,
134
+ admin.dash, admin.health, admin.help, admin.stats,
135
+ tools.skill.list, tools.provider.list, admin.doctor
136
+ ```
137
+
138
+ **Full tier-0 sweep (all tier-0 query ops across all domains):**
139
+ Use `--tier 0 --gateway query` flag in run_ab_test.py
140
+
141
+ ---
142
+
143
+ ## Known Token Cost Ranking
144
+
145
+ Ordered by typical output size (most expensive first):
146
+
147
+ 1. `tasks.list` (no filter) — AVOID in agents, use `tasks.find`
148
+ 2. `admin.help --tier 2` — large operation catalog
149
+ 3. `memory.find` — FTS5 results
150
+ 4. `tasks.plan` — composite view
151
+ 5. `admin.dash` — multi-domain overview
152
+ 6. `admin.doctor` — comprehensive health
153
+ 7. `tasks.tree` — hierarchy visualization
154
+ 8. `session.history` — session log
155
+ 9. `tasks.find` (10 results) — standard discovery
156
+ 10. `admin.stats` — aggregate counts
@@ -0,0 +1,167 @@
1
+ # CLEO Grade Specification v2
2
+
3
+ Updated for CLEO v2026.3+ with 10 canonical domains and 262 operations.
4
+ Source of truth: `src/core/sessions/session-grade.ts` + `docs/specs/CLEO-GRADE-SPEC.md`.
5
+
6
+ ---
7
+
8
+ ## Rubric: 5 Dimensions (100 pts max)
9
+
10
+ ### S1: Session Discipline (20 pts)
11
+
12
+ Measures whether the agent checks existing sessions before starting work and properly ends sessions.
13
+
14
+ | Points | Condition | Evidence string |
15
+ |--------|-----------|-----------------|
16
+ | +10 | `session.list` called before first `tasks.*` operation | `session.list called before first task op` |
17
+ | +10 | `session.end` called at least once | `session.end called` |
18
+
19
+ **Flags on violation:**
20
+ - `session.list never called (check existing sessions before starting)`
21
+ - `session.list called after task ops (should check sessions first)`
22
+ - `session.end never called (always end sessions when done)`
23
+
24
+ **Scoring:** Starts at 0. Range: 0–20.
25
+
26
+ ---
27
+
28
+ ### S2: Discovery Efficiency (20 pts)
29
+
30
+ Measures whether the agent uses `tasks.find` (lightweight, minimal fields) over `tasks.list` (heavy, full notes arrays).
31
+
32
+ | Points | Condition | Evidence string |
33
+ |--------|-----------|-----------------|
34
+ | +15 | `find / (find + list)` ratio >= 80% | `find:list ratio N% >= 80%` |
35
+ | partial | Proportional if ratio < 80%: `round(15 * ratio)` | — |
36
+ | +10 | Zero discovery calls (benefit of doubt) | `No discovery calls needed` |
37
+ | +5 | `tasks.show` used at least once | `tasks.show used Nx for detail` |
38
+
39
+ **Flags:** `tasks.list used Nx (prefer tasks.find for discovery)`
40
+
41
+ **Scoring:** Capped at 20. Range: 0–20.
42
+
43
+ ---
44
+
45
+ ### S3: Task Hygiene (20 pts)
46
+
47
+ Measures whether tasks are created with proper descriptions and subtask parent verification.
48
+
49
+ | Points | Condition | Evidence string |
50
+ |--------|-----------|-----------------|
51
+ | -5 each | `tasks.add` succeeded without a description | flag per violation |
52
+ | -3 | Subtasks created (with `parent` param) but no preceding `tasks.exists` | `Subtasks created without tasks.exists parent check` |
53
+ | (none) | All adds have descriptions | `All N tasks.add calls had descriptions` |
54
+ | (none) | Subtasks preceded by `tasks.exists` | `Parent existence verified before subtask creation` |
55
+
56
+ **Flags:**
57
+ - `tasks.add without description (taskId: <id>)`
58
+ - `Subtasks created without tasks.exists parent check`
59
+
60
+ **Scoring:** Starts at 20, deducts penalties. Floor: 0.
61
+
62
+ ---
63
+
64
+ ### S4: Error Protocol (20 pts)
65
+
66
+ Measures whether the agent recovers from `E_NOT_FOUND` (exit code 4) and avoids duplicate creates.
67
+
68
+ | Points | Condition | Evidence string |
69
+ |--------|-----------|-----------------|
70
+ | -5 each | `E_NOT_FOUND` not followed by `tasks.find` or `tasks.exists` within next 4 entries | flag per violation |
71
+ | -5 | Duplicate task creates (same title, case-insensitive) in session | `N potentially duplicate task create(s) detected` |
72
+ | (none) | Error followed by recovery | `E_NOT_FOUND followed by recovery lookup` |
73
+ | (none) | No violations | `No error protocol violations` |
74
+
75
+ **Recovery window:** Checks `entries[errIdx+1 : errIdx+5]` for `tasks.find` or `tasks.exists`.
76
+
77
+ **Duplicate detection:** Compares lowercased trimmed titles of all successful `tasks.add` calls.
78
+
79
+ **Scoring:** Starts at 20, deducts penalties. Floor: 0.
80
+
81
+ ---
82
+
83
+ ### S5: Progressive Disclosure Use (20 pts)
84
+
85
+ Measures whether the agent uses CLEO's progressive disclosure system and the MCP query gateway.
86
+
87
+ | Points | Condition | Evidence string |
88
+ |--------|-----------|-----------------|
89
+ | +10 | At least one help/skill call: `admin.help`, `tools.skill.show`, `tools.skill.list`, `tools.skill.find` | `Progressive disclosure used (Nx)` |
90
+ | +10 | At least one MCP query gateway call (`metadata.gateway === "query"`) | `query (MCP) used Nx` |
91
+
92
+ **Flags:**
93
+ - `No admin.help or skill lookup calls (load ct-cleo for guidance)`
94
+ - `No MCP query calls (prefer query over CLI for programmatic access)`
95
+
96
+ **Scoring:** Starts at 0. Range: 0–20.
97
+
98
+ ---
99
+
100
+ ## Grade Letter Mapping
101
+
102
+ | Grade | Threshold | Profile |
103
+ |-------|-----------|---------|
104
+ | A | >= 90% | All dimensions near max, zero or minimal flags |
105
+ | B | >= 75% | Minor violations in one or two dimensions |
106
+ | C | >= 60% | Several protocol gaps |
107
+ | D | >= 45% | Multiple anti-patterns |
108
+ | F | < 45% | Severe protocol violations across most dimensions |
109
+
110
+ ---
111
+
112
+ ## Token Metadata (v2.1 addition)
113
+
114
+ Grade results in v2.1 carry optional token metadata alongside the standard GradeResult — not a scored dimension, but captured for efficiency analysis:
115
+
116
+ ```json
117
+ {
118
+ "_tokenMeta": {
119
+ "estimationMethod": "otel|output_chars",
120
+ "totalEstimatedTokens": 4200,
121
+ "perDomain": {
122
+ "tasks": 1800,
123
+ "session": 600,
124
+ "admin": 400
125
+ },
126
+ "mcpQueryTokens": 2100,
127
+ "cliTokens": 1100,
128
+ "auditEntries": 47
129
+ }
130
+ }
131
+ ```
132
+
133
+ This field is appended by the run_scenario.py and run_ab_test.py scripts. It does NOT affect the 0–100 score.
134
+
135
+ ---
136
+
137
+ ## Edge Cases
138
+
139
+ | Scenario | Handling |
140
+ |----------|----------|
141
+ | No audit entries | All scores 0; flag `No audit entries found for session (use --grade flag when starting session)` |
142
+ | No task operations | S1 session.list check passes vacuously (with no task ops, there is nothing for session.list to precede) |
143
+ | No discovery calls | S2 awards 10 baseline (benefit of doubt) |
144
+ | No adds | S3 starts at 20 with no deductions |
145
+ | No errors | S4 starts at 20 with no deductions |
146
+ | No grade file | `readGrades()` returns `[]` |
147
+
148
+ ---
149
+
150
+ ## Updated Domain Recognition (v2.1)
151
+
152
+ The rubric recognizes all 10 canonical domains in audit entries. Key domain-to-dimension mappings:
153
+
154
+ | Domain | Affects |
155
+ |--------|---------|
156
+ | `session` | S1 (list/end), S5 (gateway) |
157
+ | `tasks` | S1 (first task op timing), S2 (find/list/show), S3 (add/exists), S4 (error recovery) |
158
+ | `admin` | S5 (admin.help progressive disclosure) |
159
+ | `tools` | S5 (skill.show, skill.list, skill.find) |
160
+ | `memory` | S5 (gateway tracking only) |
161
+ | `pipeline` | S5 (gateway tracking only) |
162
+ | `check` | S5 (gateway tracking only) |
163
+ | `orchestrate` | S5 (gateway tracking only) |
164
+ | `nexus` | S5 (gateway tracking only) |
165
+ | `sticky` | S5 (gateway tracking only) |
166
+
167
+ All 10 domains contribute to `mcpQueryCalls` count in S5 — any MCP query gateway call regardless of domain earns the +10.