@cleocode/skills 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. package/dispatch-config.json +404 -0
  2. package/index.d.ts +178 -0
  3. package/index.js +405 -0
  4. package/package.json +14 -0
  5. package/profiles/core.json +7 -0
  6. package/profiles/full.json +10 -0
  7. package/profiles/minimal.json +7 -0
  8. package/profiles/recommended.json +7 -0
  9. package/provider-skills-map.json +97 -0
  10. package/skills/_shared/cleo-style-guide.md +84 -0
  11. package/skills/_shared/manifest-operations.md +810 -0
  12. package/skills/_shared/placeholders.json +433 -0
  13. package/skills/_shared/skill-chaining-patterns.md +237 -0
  14. package/skills/_shared/subagent-protocol-base.md +223 -0
  15. package/skills/_shared/task-system-integration.md +232 -0
  16. package/skills/_shared/testing-framework-config.md +110 -0
  17. package/skills/ct-cleo/SKILL.md +490 -0
  18. package/skills/ct-cleo/references/anti-patterns.md +19 -0
  19. package/skills/ct-cleo/references/loom-lifecycle.md +136 -0
  20. package/skills/ct-cleo/references/orchestrator-constraints.md +55 -0
  21. package/skills/ct-cleo/references/session-protocol.md +162 -0
  22. package/skills/ct-codebase-mapper/SKILL.md +82 -0
  23. package/skills/ct-contribution/SKILL.md +521 -0
  24. package/skills/ct-contribution/templates/contribution-init.json +21 -0
  25. package/skills/ct-dev-workflow/SKILL.md +423 -0
  26. package/skills/ct-docs-lookup/SKILL.md +66 -0
  27. package/skills/ct-docs-review/SKILL.md +175 -0
  28. package/skills/ct-docs-write/SKILL.md +108 -0
  29. package/skills/ct-documentor/SKILL.md +231 -0
  30. package/skills/ct-epic-architect/SKILL.md +305 -0
  31. package/skills/ct-epic-architect/references/bug-epic-example.md +172 -0
  32. package/skills/ct-epic-architect/references/commands.md +201 -0
  33. package/skills/ct-epic-architect/references/feature-epic-example.md +210 -0
  34. package/skills/ct-epic-architect/references/migration-epic-example.md +244 -0
  35. package/skills/ct-epic-architect/references/output-format.md +92 -0
  36. package/skills/ct-epic-architect/references/patterns.md +284 -0
  37. package/skills/ct-epic-architect/references/refactor-epic-example.md +412 -0
  38. package/skills/ct-epic-architect/references/research-epic-example.md +226 -0
  39. package/skills/ct-epic-architect/references/shell-escaping.md +86 -0
  40. package/skills/ct-epic-architect/references/skill-aware-execution.md +195 -0
  41. package/skills/ct-grade/SKILL.md +230 -0
  42. package/skills/ct-grade/agents/analysis-reporter.md +203 -0
  43. package/skills/ct-grade/agents/blind-comparator.md +157 -0
  44. package/skills/ct-grade/agents/scenario-runner.md +134 -0
  45. package/skills/ct-grade/eval-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  46. package/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
  47. package/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
  48. package/skills/ct-grade/eval-viewer/generate_review.py +283 -0
  49. package/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
  50. package/skills/ct-grade/eval-viewer/viewer.html +219 -0
  51. package/skills/ct-grade/evals/evals.json +94 -0
  52. package/skills/ct-grade/references/ab-test-methodology.md +150 -0
  53. package/skills/ct-grade/references/domains.md +137 -0
  54. package/skills/ct-grade/references/grade-spec.md +236 -0
  55. package/skills/ct-grade/references/scenario-playbook.md +234 -0
  56. package/skills/ct-grade/references/token-tracking.md +120 -0
  57. package/skills/ct-grade/scripts/__pycache__/audit_analyzer.cpython-314.pyc +0 -0
  58. package/skills/ct-grade/scripts/__pycache__/run_ab_test.cpython-314.pyc +0 -0
  59. package/skills/ct-grade/scripts/__pycache__/run_all.cpython-314.pyc +0 -0
  60. package/skills/ct-grade/scripts/__pycache__/token_tracker.cpython-314.pyc +0 -0
  61. package/skills/ct-grade/scripts/audit_analyzer.py +279 -0
  62. package/skills/ct-grade/scripts/generate_report.py +283 -0
  63. package/skills/ct-grade/scripts/run_ab_test.py +504 -0
  64. package/skills/ct-grade/scripts/run_all.py +287 -0
  65. package/skills/ct-grade/scripts/setup_run.py +183 -0
  66. package/skills/ct-grade/scripts/token_tracker.py +630 -0
  67. package/skills/ct-grade-v2-1/SKILL.md +237 -0
  68. package/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
  69. package/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
  70. package/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
  71. package/skills/ct-grade-v2-1/evals/evals.json +74 -0
  72. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/build_op_stats.cpython-314.pyc +0 -0
  73. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  74. package/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
  75. package/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
  76. package/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
  77. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
  78. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
  79. package/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
  80. package/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
  81. package/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
  82. package/skills/ct-grade-v2-1/manifest-entry.json +31 -0
  83. package/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
  84. package/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
  85. package/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
  86. package/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
  87. package/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
  88. package/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
  89. package/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
  90. package/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
  91. package/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
  92. package/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
  93. package/skills/ct-memory/SKILL.md +84 -0
  94. package/skills/ct-orchestrator/INSTALL.md +61 -0
  95. package/skills/ct-orchestrator/README.md +69 -0
  96. package/skills/ct-orchestrator/SKILL.md +380 -0
  97. package/skills/ct-orchestrator/manifest-entry.json +19 -0
  98. package/skills/ct-orchestrator/orchestrator-prompt.txt +17 -0
  99. package/skills/ct-orchestrator/references/SUBAGENT-PROTOCOL-BLOCK.md +66 -0
  100. package/skills/ct-orchestrator/references/autonomous-operation.md +167 -0
  101. package/skills/ct-orchestrator/references/lifecycle-gates.md +98 -0
  102. package/skills/ct-orchestrator/references/orchestrator-compliance.md +271 -0
  103. package/skills/ct-orchestrator/references/orchestrator-handoffs.md +85 -0
  104. package/skills/ct-orchestrator/references/orchestrator-patterns.md +164 -0
  105. package/skills/ct-orchestrator/references/orchestrator-recovery.md +113 -0
  106. package/skills/ct-orchestrator/references/orchestrator-spawning.md +271 -0
  107. package/skills/ct-orchestrator/references/orchestrator-tokens.md +180 -0
  108. package/skills/ct-research-agent/SKILL.md +226 -0
  109. package/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
  110. package/skills/ct-skill-creator/.cleo/logs/cleo.2026-03-07.1.log +24 -0
  111. package/skills/ct-skill-creator/.cleo/tasks.db +0 -0
  112. package/skills/ct-skill-creator/SKILL.md +356 -0
  113. package/skills/ct-skill-creator/agents/analyzer.md +276 -0
  114. package/skills/ct-skill-creator/agents/comparator.md +204 -0
  115. package/skills/ct-skill-creator/agents/grader.md +225 -0
  116. package/skills/ct-skill-creator/assets/eval_review.html +146 -0
  117. package/skills/ct-skill-creator/eval-viewer/__pycache__/generate_review.cpython-314.pyc +0 -0
  118. package/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
  119. package/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
  120. package/skills/ct-skill-creator/manifest-entry.json +17 -0
  121. package/skills/ct-skill-creator/references/dynamic-context.md +228 -0
  122. package/skills/ct-skill-creator/references/frontmatter.md +83 -0
  123. package/skills/ct-skill-creator/references/invocation-control.md +165 -0
  124. package/skills/ct-skill-creator/references/output-patterns.md +86 -0
  125. package/skills/ct-skill-creator/references/provider-deployment.md +175 -0
  126. package/skills/ct-skill-creator/references/schemas.md +430 -0
  127. package/skills/ct-skill-creator/references/workflows.md +28 -0
  128. package/skills/ct-skill-creator/scripts/__init__.py +1 -0
  129. package/skills/ct-skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  130. package/skills/ct-skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
  131. package/skills/ct-skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
  132. package/skills/ct-skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
  133. package/skills/ct-skill-creator/scripts/__pycache__/init_skill.cpython-314.pyc +0 -0
  134. package/skills/ct-skill-creator/scripts/__pycache__/quick_validate.cpython-314.pyc +0 -0
  135. package/skills/ct-skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
  136. package/skills/ct-skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
  137. package/skills/ct-skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
  138. package/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
  139. package/skills/ct-skill-creator/scripts/generate_report.py +326 -0
  140. package/skills/ct-skill-creator/scripts/improve_description.py +247 -0
  141. package/skills/ct-skill-creator/scripts/init_skill.py +306 -0
  142. package/skills/ct-skill-creator/scripts/package_skill.py +110 -0
  143. package/skills/ct-skill-creator/scripts/quick_validate.py +97 -0
  144. package/skills/ct-skill-creator/scripts/run_eval.py +310 -0
  145. package/skills/ct-skill-creator/scripts/run_loop.py +328 -0
  146. package/skills/ct-skill-creator/scripts/utils.py +47 -0
  147. package/skills/ct-skill-validator/SKILL.md +178 -0
  148. package/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
  149. package/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
  150. package/skills/ct-skill-validator/evals/eval_set.json +14 -0
  151. package/skills/ct-skill-validator/evals/evals.json +52 -0
  152. package/skills/ct-skill-validator/manifest-entry.json +20 -0
  153. package/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
  154. package/skills/ct-skill-validator/references/validation-rules.md +168 -0
  155. package/skills/ct-skill-validator/scripts/__init__.py +0 -0
  156. package/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc +0 -0
  157. package/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc +0 -0
  158. package/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc +0 -0
  159. package/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc +0 -0
  160. package/skills/ct-skill-validator/scripts/audit_body.py +242 -0
  161. package/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
  162. package/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
  163. package/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
  164. package/skills/ct-skill-validator/scripts/validate.py +422 -0
  165. package/skills/ct-spec-writer/SKILL.md +189 -0
  166. package/skills/ct-stickynote/README.md +14 -0
  167. package/skills/ct-stickynote/SKILL.md +46 -0
  168. package/skills/ct-task-executor/SKILL.md +296 -0
  169. package/skills/ct-validator/SKILL.md +216 -0
  170. package/skills/manifest.json +469 -0
  171. package/skills.json +281 -0
@@ -0,0 +1,179 @@
1
+ # Scenario Runner Agent
2
+
3
+ You are a CLEO grade scenario executor. Your job is to run a specific grade playbook scenario using the specified interface (MCP or CLI), capture the audit trail, and grade the resulting session.
4
+
5
+ ## Inputs
6
+
7
+ You will receive:
8
+ - `SCENARIO`: Which scenario to run (s1|s2|s3|s4|s5|s6|s7|s8|s9|s10)
9
+ - `INTERFACE`: Which interface to use (mcp|cli)
10
+ - `OUTPUT_DIR`: Where to write results
11
+ - `PROJECT_DIR`: Path to the CLEO project (for cleo-dev --cwd)
12
+ - `RUN_NUMBER`: Integer (1, 2, 3...) for repeated runs
13
+
14
+ ## Execution Protocol
15
+
16
+ ### Step 1: Record start time
17
+
18
+ Note the ISO timestamp before any operations.
19
+
20
+ ### Step 2: Start a graded session via MCP (always use MCP for session lifecycle)
21
+
22
+ ```
23
+ mutate session start { "grade": true, "name": "grade-<SCENARIO>-<INTERFACE>-run<RUN>", "scope": "global" }
24
+ ```
25
+
26
+ Save the returned `sessionId`.
27
+
28
+ If this fails (DB migration error, ENOENT, or non-zero exit):
29
+ - Write `grade.json: { "error": "DB_UNAVAILABLE", "totalScore": null }`
30
+ - Write `timing.json: { "error": "DB_UNAVAILABLE", "total_tokens": null, "duration_ms": null, "arm": "<INTERFACE>", "scenario": "<SCENARIO>", "run": <RUN_NUMBER>, "interface": "<INTERFACE>", "executor_start": "<ISO>", "executor_end": "<ISO>" }`
31
+ - Output: `SESSION_START_FAILED: DB_UNAVAILABLE`
32
+ - Stop. Do NOT abort silently.
33
+
34
+ ### Step 3: Execute scenario operations
35
+
36
+ Follow the exact operation sequence from the scenario playbook. Use INTERFACE to determine whether each operation is done via MCP or CLI.
37
+
38
+ **MCP operations** use the query/mutate gateway:
39
+ ```
40
+ query tasks find { "status": "active" }
41
+ ```
42
+
43
+ **CLI operations** use cleo-dev (prefer) or cleo, with PROJECT_DIR as cwd if provided:
44
+ ```bash
45
+ cleo-dev --cwd <PROJECT_DIR> find --status active
46
+ ```
47
+
48
+ Scenario sequences are in [../references/playbook-v2.md](../references/playbook-v2.md). Execute the operations in order. Do NOT skip operations — each one contributes to the grade.
49
+
50
+ ### Step 4: End the session
51
+
52
+ ```
53
+ mutate session end
54
+ ```
55
+
56
+ ### Step 5: Grade the session
57
+
58
+ ```
59
+ query admin grade { "sessionId": "<saved-id>" }
60
+ ```
61
+
62
+ Save the full GradeResult JSON.
63
+
64
+ ### Step 6: Capture operations log
65
+
66
+ Record every operation you executed as a JSONL file. Each line:
67
+ ```json
68
+ {"seq": 1, "gateway": "query", "domain": "tasks", "operation": "find", "params": {}, "success": true, "interface": "mcp", "timestamp": "..."}
69
+ ```
70
+
71
+ ### Step 7: Write output files
72
+
73
+ Write to `<OUTPUT_DIR>/<SCENARIO>/arm-<INTERFACE>/`:
74
+
75
+ **grade.json** — The GradeResult from admin.grade:
76
+ ```json
77
+ {
78
+ "sessionId": "...",
79
+ "totalScore": 85,
80
+ "maxScore": 100,
81
+ "dimensions": {...},
82
+ "flags": [...],
83
+ "entryCount": 12
84
+ }
85
+ ```
86
+
87
+ **operations.jsonl** — One JSON object per line, each operation executed.
88
+
89
+ **timing.json** — Fill in what you can; orchestrator fills `total_tokens` and `duration_ms`:
90
+ ```json
91
+ {
92
+ "arm": "<INTERFACE>",
93
+ "scenario": "<SCENARIO>",
94
+ "run": <RUN_NUMBER>,
95
+ "interface": "<INTERFACE>",
96
+ "session_id": "<session-id>",
97
+ "executor_start": "<ISO>",
98
+ "executor_end": "<ISO>",
99
+ "executor_duration_seconds": 0,
100
+ "token_usage_id": "<id from admin.token.record response>",
101
+ "total_tokens": null,
102
+ "duration_ms": null
103
+ }
104
+ ```
105
+
106
+ Note: `total_tokens` and `duration_ms` are filled by the orchestrator from the task completion notification — you cannot read them yourself.
107
+
108
+ ### Step 8: Record token exchange (mandatory for token_usage table)
109
+
110
+ After receiving the grade result, record the exchange to persist token measurements:
111
+
112
+ ```
113
+ mutate admin token.record {
114
+ "sessionId": "<session-id>",
115
+ "transport": "mcp",
116
+ "domain": "admin",
117
+ "operation": "grade",
118
+ "metadata": {
119
+ "scenario": "<SCENARIO>",
120
+ "interface": "<INTERFACE>",
121
+ "run": <RUN_NUMBER>
122
+ }
123
+ }
124
+ ```
125
+
126
+ Save the returned `id` and update the `token_usage_id` field in the timing.json written in Step 7.
127
+
128
+ ## Quick Reference — Scenarios
129
+
130
+ | Scenario | Name | Key Domains | Target Score |
131
+ |----------|------|-------------|--------------|
132
+ | s1 | Session Discipline | session, tasks | S1=20, S2=15+ |
133
+ | s2 | Task Hygiene | tasks, session | S3=20, S1=20 |
134
+ | s3 | Error Recovery | tasks, session | S4=20 |
135
+ | s4 | Full Lifecycle | tasks, session, admin | All dims 15+ |
136
+ | s5 | Multi-Domain Analysis | tasks, admin, pipeline | S5=15+ |
137
+ | s6 | Memory Observe & Recall | memory, session | S5=15+, S2=15+ |
138
+ | s7 | Decision Continuity | memory, session | S1=20, S5=15+ |
139
+ | s8 | Pattern & Learning | memory, session | S2=15+, S5=15+ |
140
+ | s9 | NEXUS Cross-Project | nexus, session, admin | S5=20, S1=20 |
141
+ | s10 | Full System Throughput | all 8 domains | S2=15+, S5=15+ |
142
+
143
+ ## Scenario Key Operations
144
+
145
+ | Scenario | Key Operations | S1 | S2 | S3 | S4 | S5 |
146
+ |---|---|---|---|---|---|---|
147
+ | s1 | session.list, tasks.find, tasks.show, session.end | ✓ | ✓ | — | — | partial |
148
+ | s2 | session.list, tasks.exists, tasks.add×2, session.end | ✓ | — | ✓ | — | — |
149
+ | s3 | session.list, tasks.show (E_NOT_FOUND), tasks.find (recover), tasks.add, session.end | ✓ | — | ✓ | ✓ | — |
150
+ | s4 | session.list, admin.help, tasks.find, tasks.show, tasks.update, tasks.complete, session.end | ✓ | ✓ | ✓ | ✓ | ✓ |
151
+ | s5 | session.list, admin.help, tasks.find (parent filter), tasks.show, session.context.drift, session.decision.log, session.record.decision, tasks.update, tasks.complete, session.end | ✓ | ✓ | ✓ | ✓ | ✓ |
152
+ | s6 | memory.observe, memory.find, memory.timeline, memory.fetch, session.end | ✓ | ✓ | — | — | ✓ |
153
+ | s7 | memory.decision.store, memory.decision.find, memory.find, memory.stats, session.end | ✓ | — | — | — | ✓ |
154
+ | s8 | memory.pattern.store, memory.learning.store, memory.pattern.find, memory.learning.find, session.end | — | ✓ | — | — | ✓ |
155
+ | s9 | nexus.status, nexus.list, nexus.show, admin.dash, session.end | ✓ | — | — | — | ✓ |
156
+ | s10 | session.list, admin.help, tasks.find, memory.find, nexus.status, pipeline.stage.status, check.health, tools.skill.list, memory.observe, session.end | ✓ | ✓ | — | — | ✓ |
157
+
158
+ ## Anti-patterns to Avoid
159
+
160
+ Do NOT do these during scenario execution — they lower the grade. Perform them only when deliberately running the anti-pattern variant:
161
+ - Calling `tasks.list` instead of `tasks.find` for discovery
162
+ - Skipping `session.list` at the start
163
+ - Creating tasks without descriptions
164
+ - Ignoring `E_NOT_FOUND` errors without recovery lookup
165
+ - Never calling `admin.help`
166
+
167
+ ## Output
168
+
169
+ When complete, summarize:
170
+ ```
171
+ SCENARIO: <id>
172
+ INTERFACE: <interface>
173
+ RUN: <n>
174
+ SESSION_ID: <id>
175
+ TOTAL_SCORE: <n>/100
176
+ GRADE: <letter>
177
+ FLAGS: <count>
178
+ FILES_WRITTEN: <list>
179
+ ```
@@ -0,0 +1,74 @@
1
+ [
2
+ {
3
+ "id": "eval-001",
4
+ "description": "Grade a session — verify grading pipeline returns a valid GradeResult",
5
+ "prompt": "Start a graded session, run query session list and admin dash, end session, then grade it",
6
+ "expectations": [
7
+ "Grade operation returns success: true",
8
+ "totalScore is a number 0-100",
9
+ "dimensions has 5 entries each with score and max",
10
+ "flags is an array"
11
+ ]
12
+ },
13
+ {
14
+ "id": "eval-002",
15
+ "description": "Session discipline — session.list before task ops scores S1=20",
16
+ "prompt": "Run scenario S1 and verify session discipline dimension is 20/20",
17
+ "expectations": [
18
+ "S1 Session Discipline score = 20",
19
+ "session.list was called before any task operation",
20
+ "session.end was called",
21
+ "No protocol flags"
22
+ ]
23
+ },
24
+ {
25
+ "id": "eval-003",
26
+ "description": "Task efficiency — tasks.find used (not tasks.list) scores S2>=15",
27
+ "prompt": "Run tasks.find query and verify efficiency score is 15 or higher",
28
+ "expectations": [
29
+ "S2 Task Efficiency score >= 15",
30
+ "tasks.find was used instead of tasks.list",
31
+ "No TASK_LIST_USED flag"
32
+ ]
33
+ },
34
+ {
35
+ "id": "eval-004",
36
+ "description": "Task hygiene — task add with description scores S3=20",
37
+ "prompt": "Add a task with both title and description, verify hygiene score is 20",
38
+ "expectations": [
39
+ "S3 Task Hygiene score = 20",
40
+ "Task was created with non-empty description",
41
+ "No MISSING_DESCRIPTION flag"
42
+ ]
43
+ },
44
+ {
45
+ "id": "eval-005",
46
+ "description": "Protocol adherence — following CLEO workflow scores S4>=15",
47
+ "prompt": "Follow the complete CLEO session workflow and verify protocol adherence",
48
+ "expectations": [
49
+ "S4 Protocol Adherence score >= 15",
50
+ "Session started before task work",
51
+ "Session ended after task work"
52
+ ]
53
+ },
54
+ {
55
+ "id": "eval-006",
56
+ "description": "MCP gateway — MCP-sourced ops score S5>=15",
57
+ "prompt": "Use MCP interface for all operations and verify gateway score is 15 or higher",
58
+ "expectations": [
59
+ "S5 MCP Gateway score >= 15",
60
+ "Operations sourced from MCP (not CLI)",
61
+ "audit_log shows gateway=query or gateway=mutate with source=mcp"
62
+ ]
63
+ },
64
+ {
65
+ "id": "eval-007",
66
+ "description": "Memory recall — observe then find retrieves the observation",
67
+ "prompt": "Run scenario S6: observe a fact then find it via memory.find",
68
+ "expectations": [
69
+ "memory.observe succeeds and returns an ID",
70
+ "memory.find with matching query returns the observation",
71
+ "Grade total score >= 60"
72
+ ]
73
+ }
74
+ ]
@@ -0,0 +1,174 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ build_op_stats.py — Aggregate operations.jsonl files from grade runs into per-operation statistics.
4
+
5
+ Reads all operations.jsonl files under --grade-runs-dir and computes per-operation stats
6
+ split by interface (mcp/cli). Output is a JSON object keyed by "domain.operation".
7
+
8
+ Usage:
9
+ python build_op_stats.py [options]
10
+
11
+ Options:
12
+ --grade-runs-dir PATH Directory containing grade run subdirectories
13
+ (default: .cleo/metrics/grade-runs relative to cwd)
14
+ --output PATH Output JSON file path
15
+ (default: .cleo/metrics/per_operation_stats.json)
16
+ --pretty Pretty-print JSON output (default: compact)
17
+ --verbose Print progress to stderr
18
+
19
+ Output format (per key "domain.operation"):
20
+ {
21
+ "mcp_calls": 42,
22
+ "cli_calls": 10,
23
+ "total_mcp_ms": 1234.5,
24
+ "total_cli_ms": 456.7,
25
+ "avg_mcp_ms": 29.4,
26
+ "avg_cli_ms": 45.7,
27
+ "runs_seen": 3
28
+ }
29
+
30
+ Also importable as a module:
31
+ from build_op_stats import compute_stats
32
+ stats = compute_stats(grade_runs_dir="/path/to/grade-runs")
33
+ """
34
+
35
+ import argparse
36
+ import json
37
+ import sys
38
+ from pathlib import Path
39
+
40
+
41
+ def compute_stats(grade_runs_dir, verbose=False):
42
+ """
43
+ Aggregate operations.jsonl files under grade_runs_dir.
44
+
45
+ Returns dict keyed by "domain.operation" with accumulated stats.
46
+ """
47
+ runs_dir = Path(grade_runs_dir)
48
+ stats = {}
49
+ files_processed = 0
50
+ lines_processed = 0
51
+
52
+ if not runs_dir.exists():
53
+ if verbose:
54
+ print(f"[build_op_stats] Grade runs dir not found: {runs_dir}", file=sys.stderr)
55
+ return stats
56
+
57
+ for ops_file in sorted(runs_dir.rglob('operations.jsonl')):
58
+ files_processed += 1
59
+ if verbose:
60
+ print(f"[build_op_stats] Processing: {ops_file}", file=sys.stderr)
61
+
62
+ for line in ops_file.read_text(errors='replace').splitlines():
63
+ line = line.strip()
64
+ if not line:
65
+ continue
66
+ try:
67
+ entry = json.loads(line)
68
+ except json.JSONDecodeError:
69
+ continue
70
+
71
+ domain = entry.get('domain', 'unknown')
72
+ operation = entry.get('operation', 'unknown')
73
+ key = f"{domain}.{operation}"
74
+ interface = entry.get('interface', 'mcp')
75
+ duration = float(entry.get('duration_ms', 0) or 0)
76
+
77
+ if key not in stats:
78
+ stats[key] = {
79
+ 'mcp_calls': 0,
80
+ 'cli_calls': 0,
81
+ 'total_mcp_ms': 0.0,
82
+ 'total_cli_ms': 0.0,
83
+ 'avg_mcp_ms': 0.0,
84
+ 'avg_cli_ms': 0.0,
85
+ 'runs_seen': set(),
86
+ }
87
+
88
+ # Track which run directory this came from
89
+ # ops_file is e.g. .../grade-runs/run-20260308/s1/run-01/arm-mcp/operations.jsonl
90
+ # run_id is the first path component relative to runs_dir (e.g. "run-20260308")
91
+ run_id = ops_file.relative_to(runs_dir).parts[0]
92
+ stats[key]['runs_seen'].add(run_id)
93
+
94
+ if interface == 'cli':
95
+ stats[key]['cli_calls'] += 1
96
+ stats[key]['total_cli_ms'] += duration
97
+ else:
98
+ stats[key]['mcp_calls'] += 1
99
+ stats[key]['total_mcp_ms'] += duration
100
+
101
+ lines_processed += 1
102
+
103
+ # Compute averages and convert sets to counts
104
+ for key, v in stats.items():
105
+ mc = v['mcp_calls']
106
+ cc = v['cli_calls']
107
+ v['avg_mcp_ms'] = round(v['total_mcp_ms'] / mc, 2) if mc > 0 else 0.0
108
+ v['avg_cli_ms'] = round(v['total_cli_ms'] / cc, 2) if cc > 0 else 0.0
109
+ v['total_mcp_ms'] = round(v['total_mcp_ms'], 2)
110
+ v['total_cli_ms'] = round(v['total_cli_ms'], 2)
111
+ v['runs_seen'] = len(v['runs_seen'])
112
+
113
+ if verbose:
114
+ print(f"[build_op_stats] Processed {files_processed} files, {lines_processed} lines → {len(stats)} unique operations", file=sys.stderr)
115
+
116
+ return stats
117
+
118
+
119
def find_cleo_dir(start='.'):
    """Walk upward from ``start`` to the directory containing .cleo/tasks.db.

    Returns the resolved ``start`` unchanged when no ancestor (the
    filesystem root excluded) holds a .cleo/tasks.db.
    """
    origin = Path(start).resolve()
    # Candidates: origin plus every ancestor short of the root, mirroring
    # the `p != p.parent` termination of a manual upward walk.
    for candidate in [origin] + list(origin.parents)[:-1]:
        if (candidate / '.cleo' / 'tasks.db').exists():
            return candidate
    return origin
127
+
128
+
129
def main():
    """CLI entry point: parse arguments, resolve defaults against the
    workspace root, aggregate per-operation stats, write the JSON file.

    Returns 0 on completion (used as the process exit code).
    """
    parser = argparse.ArgumentParser(
        description='Aggregate grade run operations.jsonl files into per-operation stats.'
    )
    parser.add_argument('--grade-runs-dir', default=None,
                        help='Directory containing grade run subdirectories (default: .cleo/metrics/grade-runs)')
    parser.add_argument('--output', default=None,
                        help='Output JSON path (default: .cleo/metrics/per_operation_stats.json)')
    parser.add_argument('--pretty', action='store_true',
                        help='Pretty-print JSON output')
    parser.add_argument('--verbose', action='store_true',
                        help='Print progress to stderr')
    args = parser.parse_args()

    # Defaults live under the nearest workspace containing .cleo/tasks.db.
    root = find_cleo_dir('.')
    runs_dir = args.grade_runs_dir or str(root / '.cleo' / 'metrics' / 'grade-runs')
    dest = args.output or str(root / '.cleo' / 'metrics' / 'per_operation_stats.json')

    op_stats = compute_stats(runs_dir, verbose=args.verbose)

    rendered = json.dumps(op_stats, indent=2 if args.pretty else None)
    dest_path = Path(dest)
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    dest_path.write_text(rendered)

    print(f"Wrote {len(op_stats)} operation stats to {dest}")
    return 0


if __name__ == '__main__':
    sys.exit(main())
@@ -0,0 +1,41 @@
1
+ {
2
+ "total_grades": 31,
3
+ "score_distribution": {
4
+ "F (0)": 7,
5
+ "D (45-59)": 10,
6
+ "C (60-74)": 5,
7
+ "B (75-89)": 8,
8
+ "A (90+)": 1
9
+ },
10
+ "score_stats": {
11
+ "mean": 64.6,
12
+ "min": 50,
13
+ "max": 95,
14
+ "grades_with_data": 24,
15
+ "zero_score_count": 7
16
+ },
17
+ "dimension_averages": {
18
+ "sessionDiscipline": 5.8,
19
+ "discoveryEfficiency": 9.0,
20
+ "taskHygiene": 15.4,
21
+ "errorProtocol": 15.3,
22
+ "disclosureUse": 4.5
23
+ },
24
+ "flag_frequency": {
25
+ "No admin.help calls": 21,
26
+ "session.list never called": 18,
27
+ "No MCP query calls": 13,
28
+ "session.end never called": 12,
29
+ "No audit entries": 7,
30
+ "tasks.list used (prefer find)": 5,
31
+ "Subtasks without exists check": 1,
32
+ "Duplicate task creates": 1
33
+ },
34
+ "avg_audit_entries": 9.5,
35
+ "token_estimate": {
36
+ "avg_per_session_chars": 0,
37
+ "avg_per_session_tokens": 1425.0,
38
+ "method": "entry_count * 150 proxy",
39
+ "note": "OTEL not enabled; enable with CLAUDE_CODE_ENABLE_TELEMETRY=1 for real counts"
40
+ }
41
+ }
@@ -0,0 +1,34 @@
1
+ # CLEO Grade v2.1 — Comparative Analysis Report
2
+
3
+ **Generated:** 2026-03-07 23:47 UTC
4
+ **Source:** `/tmp/ct-grade-eval`
5
+
6
+ ---
7
+
8
+ ## MCP vs CLI Blind A/B Results
9
+
10
+ **Overall winner: MCP**
11
+
12
+ | Metric | Value |
13
+ |--------|-------|
14
+ | Total runs | 3 |
15
+ | MCP wins | 3 (100.0%) |
16
+ | CLI wins | 0 (0.0%) |
17
+ | Ties | 0 |
18
+ | Avg token delta (MCP–CLI) | +416.0 tokens |
19
+ | Interpretation | MCP uses more tokens on average |
20
+
21
+ ### Per-Operation Results
22
+
23
+ | Operation | MCP wins | CLI wins | Ties | Token delta | MCP chars | CLI chars | MCP ms | CLI ms |
24
+ |-----------|----------|----------|------|-------------|-----------|-----------|--------|--------|
25
+ | `admin.version` **MCP** | 3 | 0 | 0 | +416t | 1664 | 0 | 930ms | 786ms |
26
+
27
+ ### Recommendations
28
+
29
+ - **MCP adds significant token overhead.** Consider whether MCP envelope verbosity can be reduced for high-frequency operations.
30
+ - **MCP output quality is consistently higher.** Reinforces MCP-first agent protocol recommendation.
31
+
32
+ ---
33
+
34
+ *Report generated by ct-grade v2.1 `generate_report.py`*