@cleocode/skills 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dispatch-config.json +404 -0
- package/index.d.ts +178 -0
- package/index.js +405 -0
- package/package.json +14 -0
- package/profiles/core.json +7 -0
- package/profiles/full.json +10 -0
- package/profiles/minimal.json +7 -0
- package/profiles/recommended.json +7 -0
- package/provider-skills-map.json +97 -0
- package/skills/_shared/cleo-style-guide.md +84 -0
- package/skills/_shared/manifest-operations.md +810 -0
- package/skills/_shared/placeholders.json +433 -0
- package/skills/_shared/skill-chaining-patterns.md +237 -0
- package/skills/_shared/subagent-protocol-base.md +223 -0
- package/skills/_shared/task-system-integration.md +232 -0
- package/skills/_shared/testing-framework-config.md +110 -0
- package/skills/ct-cleo/SKILL.md +490 -0
- package/skills/ct-cleo/references/anti-patterns.md +19 -0
- package/skills/ct-cleo/references/loom-lifecycle.md +136 -0
- package/skills/ct-cleo/references/orchestrator-constraints.md +55 -0
- package/skills/ct-cleo/references/session-protocol.md +162 -0
- package/skills/ct-codebase-mapper/SKILL.md +82 -0
- package/skills/ct-contribution/SKILL.md +521 -0
- package/skills/ct-contribution/templates/contribution-init.json +21 -0
- package/skills/ct-dev-workflow/SKILL.md +423 -0
- package/skills/ct-docs-lookup/SKILL.md +66 -0
- package/skills/ct-docs-review/SKILL.md +175 -0
- package/skills/ct-docs-write/SKILL.md +108 -0
- package/skills/ct-documentor/SKILL.md +231 -0
- package/skills/ct-epic-architect/SKILL.md +305 -0
- package/skills/ct-epic-architect/references/bug-epic-example.md +172 -0
- package/skills/ct-epic-architect/references/commands.md +201 -0
- package/skills/ct-epic-architect/references/feature-epic-example.md +210 -0
- package/skills/ct-epic-architect/references/migration-epic-example.md +244 -0
- package/skills/ct-epic-architect/references/output-format.md +92 -0
- package/skills/ct-epic-architect/references/patterns.md +284 -0
- package/skills/ct-epic-architect/references/refactor-epic-example.md +412 -0
- package/skills/ct-epic-architect/references/research-epic-example.md +226 -0
- package/skills/ct-epic-architect/references/shell-escaping.md +86 -0
- package/skills/ct-epic-architect/references/skill-aware-execution.md +195 -0
- package/skills/ct-grade/SKILL.md +230 -0
- package/skills/ct-grade/agents/analysis-reporter.md +203 -0
- package/skills/ct-grade/agents/blind-comparator.md +157 -0
- package/skills/ct-grade/agents/scenario-runner.md +134 -0
- package/skills/ct-grade/eval-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
- package/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
- package/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
- package/skills/ct-grade/eval-viewer/generate_review.py +283 -0
- package/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
- package/skills/ct-grade/eval-viewer/viewer.html +219 -0
- package/skills/ct-grade/evals/evals.json +94 -0
- package/skills/ct-grade/references/ab-test-methodology.md +150 -0
- package/skills/ct-grade/references/domains.md +137 -0
- package/skills/ct-grade/references/grade-spec.md +236 -0
- package/skills/ct-grade/references/scenario-playbook.md +234 -0
- package/skills/ct-grade/references/token-tracking.md +120 -0
- package/skills/ct-grade/scripts/__pycache__/audit_analyzer.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/run_ab_test.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/run_all.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/token_tracker.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/audit_analyzer.py +279 -0
- package/skills/ct-grade/scripts/generate_report.py +283 -0
- package/skills/ct-grade/scripts/run_ab_test.py +504 -0
- package/skills/ct-grade/scripts/run_all.py +287 -0
- package/skills/ct-grade/scripts/setup_run.py +183 -0
- package/skills/ct-grade/scripts/token_tracker.py +630 -0
- package/skills/ct-grade-v2-1/SKILL.md +237 -0
- package/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
- package/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
- package/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
- package/skills/ct-grade-v2-1/evals/evals.json +74 -0
- package/skills/ct-grade-v2-1/grade-viewer/__pycache__/build_op_stats.cpython-314.pyc +0 -0
- package/skills/ct-grade-v2-1/grade-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
- package/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
- package/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
- package/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
- package/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
- package/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
- package/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
- package/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
- package/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
- package/skills/ct-grade-v2-1/manifest-entry.json +31 -0
- package/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
- package/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
- package/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
- package/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
- package/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
- package/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
- package/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
- package/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
- package/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
- package/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
- package/skills/ct-memory/SKILL.md +84 -0
- package/skills/ct-orchestrator/INSTALL.md +61 -0
- package/skills/ct-orchestrator/README.md +69 -0
- package/skills/ct-orchestrator/SKILL.md +380 -0
- package/skills/ct-orchestrator/manifest-entry.json +19 -0
- package/skills/ct-orchestrator/orchestrator-prompt.txt +17 -0
- package/skills/ct-orchestrator/references/SUBAGENT-PROTOCOL-BLOCK.md +66 -0
- package/skills/ct-orchestrator/references/autonomous-operation.md +167 -0
- package/skills/ct-orchestrator/references/lifecycle-gates.md +98 -0
- package/skills/ct-orchestrator/references/orchestrator-compliance.md +271 -0
- package/skills/ct-orchestrator/references/orchestrator-handoffs.md +85 -0
- package/skills/ct-orchestrator/references/orchestrator-patterns.md +164 -0
- package/skills/ct-orchestrator/references/orchestrator-recovery.md +113 -0
- package/skills/ct-orchestrator/references/orchestrator-spawning.md +271 -0
- package/skills/ct-orchestrator/references/orchestrator-tokens.md +180 -0
- package/skills/ct-research-agent/SKILL.md +226 -0
- package/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
- package/skills/ct-skill-creator/.cleo/logs/cleo.2026-03-07.1.log +24 -0
- package/skills/ct-skill-creator/.cleo/tasks.db +0 -0
- package/skills/ct-skill-creator/SKILL.md +356 -0
- package/skills/ct-skill-creator/agents/analyzer.md +276 -0
- package/skills/ct-skill-creator/agents/comparator.md +204 -0
- package/skills/ct-skill-creator/agents/grader.md +225 -0
- package/skills/ct-skill-creator/assets/eval_review.html +146 -0
- package/skills/ct-skill-creator/eval-viewer/__pycache__/generate_review.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
- package/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
- package/skills/ct-skill-creator/manifest-entry.json +17 -0
- package/skills/ct-skill-creator/references/dynamic-context.md +228 -0
- package/skills/ct-skill-creator/references/frontmatter.md +83 -0
- package/skills/ct-skill-creator/references/invocation-control.md +165 -0
- package/skills/ct-skill-creator/references/output-patterns.md +86 -0
- package/skills/ct-skill-creator/references/provider-deployment.md +175 -0
- package/skills/ct-skill-creator/references/schemas.md +430 -0
- package/skills/ct-skill-creator/references/workflows.md +28 -0
- package/skills/ct-skill-creator/scripts/__init__.py +1 -0
- package/skills/ct-skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/init_skill.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/quick_validate.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/skills/ct-skill-creator/scripts/generate_report.py +326 -0
- package/skills/ct-skill-creator/scripts/improve_description.py +247 -0
- package/skills/ct-skill-creator/scripts/init_skill.py +306 -0
- package/skills/ct-skill-creator/scripts/package_skill.py +110 -0
- package/skills/ct-skill-creator/scripts/quick_validate.py +97 -0
- package/skills/ct-skill-creator/scripts/run_eval.py +310 -0
- package/skills/ct-skill-creator/scripts/run_loop.py +328 -0
- package/skills/ct-skill-creator/scripts/utils.py +47 -0
- package/skills/ct-skill-validator/SKILL.md +178 -0
- package/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
- package/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
- package/skills/ct-skill-validator/evals/eval_set.json +14 -0
- package/skills/ct-skill-validator/evals/evals.json +52 -0
- package/skills/ct-skill-validator/manifest-entry.json +20 -0
- package/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
- package/skills/ct-skill-validator/references/validation-rules.md +168 -0
- package/skills/ct-skill-validator/scripts/__init__.py +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/audit_body.py +242 -0
- package/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
- package/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
- package/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
- package/skills/ct-skill-validator/scripts/validate.py +422 -0
- package/skills/ct-spec-writer/SKILL.md +189 -0
- package/skills/ct-stickynote/README.md +14 -0
- package/skills/ct-stickynote/SKILL.md +46 -0
- package/skills/ct-task-executor/SKILL.md +296 -0
- package/skills/ct-validator/SKILL.md +216 -0
- package/skills/manifest.json +469 -0
- package/skills.json +281 -0
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# Scenario Runner Agent
|
|
2
|
+
|
|
3
|
+
You are a CLEO grade scenario executor. Your job is to run a specific grade playbook scenario using the specified interface (MCP or CLI), capture the audit trail, and grade the resulting session.
|
|
4
|
+
|
|
5
|
+
## Inputs
|
|
6
|
+
|
|
7
|
+
You will receive:
|
|
8
|
+
- `SCENARIO`: Which scenario to run (s1|s2|s3|s4|s5|s6|s7|s8|s9|s10)
|
|
9
|
+
- `INTERFACE`: Which interface to use (mcp|cli)
|
|
10
|
+
- `OUTPUT_DIR`: Where to write results
|
|
11
|
+
- `PROJECT_DIR`: Path to the CLEO project (for cleo-dev --cwd)
|
|
12
|
+
- `RUN_NUMBER`: Integer (1, 2, 3...) for repeated runs
|
|
13
|
+
|
|
14
|
+
## Execution Protocol
|
|
15
|
+
|
|
16
|
+
### Step 1: Record start time
|
|
17
|
+
|
|
18
|
+
Note the ISO timestamp before any operations.
|
|
19
|
+
|
|
20
|
+
### Step 2: Start a graded session via MCP (always use MCP for session lifecycle)
|
|
21
|
+
|
|
22
|
+
```
|
|
23
|
+
mutate session start { "grade": true, "name": "grade-<SCENARIO>-<INTERFACE>-run<RUN>", "scope": "global" }
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Save the returned `sessionId`.
|
|
27
|
+
|
|
28
|
+
If this fails (DB migration error, ENOENT, or non-zero exit):
|
|
29
|
+
- Write `grade.json: { "error": "DB_UNAVAILABLE", "totalScore": null }`
|
|
30
|
+
- Write `timing.json: { "error": "DB_UNAVAILABLE", "total_tokens": null, "duration_ms": null, "arm": "<INTERFACE>", "scenario": "<SCENARIO>", "run": <RUN_NUMBER>, "interface": "<INTERFACE>", "executor_start": "<ISO>", "executor_end": "<ISO>" }`
|
|
31
|
+
- Output: `SESSION_START_FAILED: DB_UNAVAILABLE`
|
|
32
|
+
- Stop. Do NOT abort silently.
|
|
33
|
+
|
|
34
|
+
### Step 3: Execute scenario operations
|
|
35
|
+
|
|
36
|
+
Follow the exact operation sequence from the scenario playbook. Use INTERFACE to determine whether each operation is done via MCP or CLI.
|
|
37
|
+
|
|
38
|
+
**MCP operations** use the query/mutate gateway:
|
|
39
|
+
```
|
|
40
|
+
query tasks find { "status": "active" }
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
**CLI operations** use cleo-dev (preferred) or cleo, with PROJECT_DIR as cwd if provided:
|
|
44
|
+
```bash
|
|
45
|
+
cleo-dev --cwd <PROJECT_DIR> find --status active
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Scenario sequences are in [../references/playbook-v2.md](../references/playbook-v2.md). Execute the operations in order. Do NOT skip operations — each one contributes to the grade.
|
|
49
|
+
|
|
50
|
+
### Step 4: End the session
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
mutate session end
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Step 5: Grade the session
|
|
57
|
+
|
|
58
|
+
```
|
|
59
|
+
query admin grade { "sessionId": "<saved-id>" }
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Save the full GradeResult JSON.
|
|
63
|
+
|
|
64
|
+
### Step 6: Capture operations log
|
|
65
|
+
|
|
66
|
+
Record every operation you executed as a JSONL file. Each line:
|
|
67
|
+
```json
|
|
68
|
+
{"seq": 1, "gateway": "query", "domain": "tasks", "operation": "find", "params": {}, "success": true, "interface": "mcp", "timestamp": "..."}
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Step 7: Write output files
|
|
72
|
+
|
|
73
|
+
Write to `<OUTPUT_DIR>/<SCENARIO>/arm-<INTERFACE>/`:
|
|
74
|
+
|
|
75
|
+
**grade.json** — The GradeResult from admin.grade:
|
|
76
|
+
```json
|
|
77
|
+
{
|
|
78
|
+
"sessionId": "...",
|
|
79
|
+
"totalScore": 85,
|
|
80
|
+
"maxScore": 100,
|
|
81
|
+
"dimensions": {...},
|
|
82
|
+
"flags": [...],
|
|
83
|
+
"entryCount": 12
|
|
84
|
+
}
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
**operations.jsonl** — One JSON object per line, each operation executed.
|
|
88
|
+
|
|
89
|
+
**timing.json** — Fill in what you can; orchestrator fills `total_tokens` and `duration_ms`:
|
|
90
|
+
```json
|
|
91
|
+
{
|
|
92
|
+
"arm": "<INTERFACE>",
|
|
93
|
+
"scenario": "<SCENARIO>",
|
|
94
|
+
"run": <RUN_NUMBER>,
|
|
95
|
+
"interface": "<INTERFACE>",
|
|
96
|
+
"session_id": "<session-id>",
|
|
97
|
+
"executor_start": "<ISO>",
|
|
98
|
+
"executor_end": "<ISO>",
|
|
99
|
+
"executor_duration_seconds": 0,
|
|
100
|
+
"token_usage_id": "<id from admin.token.record response>",
|
|
101
|
+
"total_tokens": null,
|
|
102
|
+
"duration_ms": null
|
|
103
|
+
}
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
Note: `total_tokens` and `duration_ms` are filled by the orchestrator from the task completion notification — you cannot read them yourself.
|
|
107
|
+
|
|
108
|
+
### Step 8: Record token exchange (mandatory for token_usage table)
|
|
109
|
+
|
|
110
|
+
After receiving the grade result, record the exchange to persist token measurements:
|
|
111
|
+
|
|
112
|
+
```
|
|
113
|
+
mutate admin token.record {
|
|
114
|
+
"sessionId": "<session-id>",
|
|
115
|
+
"transport": "mcp",
|
|
116
|
+
"domain": "admin",
|
|
117
|
+
"operation": "grade",
|
|
118
|
+
"metadata": {
|
|
119
|
+
"scenario": "<SCENARIO>",
|
|
120
|
+
"interface": "<INTERFACE>",
|
|
121
|
+
"run": <RUN_NUMBER>
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
Save the returned `id` as `token_usage_id` in timing.json.
|
|
127
|
+
|
|
128
|
+
## Quick Reference — Scenarios
|
|
129
|
+
|
|
130
|
+
| Scenario | Name | Key Domains | Target Score |
|
|
131
|
+
|----------|------|-------------|--------------|
|
|
132
|
+
| s1 | Session Discipline | session, tasks | S1=20, S2=15+ |
|
|
133
|
+
| s2 | Task Hygiene | tasks, session | S3=20, S1=20 |
|
|
134
|
+
| s3 | Error Recovery | tasks, session | S4=20 |
|
|
135
|
+
| s4 | Full Lifecycle | tasks, session, admin | All dims 15+ |
|
|
136
|
+
| s5 | Multi-Domain Analysis | tasks, admin, pipeline | S5=15+ |
|
|
137
|
+
| s6 | Memory Observe & Recall | memory, session | S5=15+, S2=15+ |
|
|
138
|
+
| s7 | Decision Continuity | memory, session | S1=20, S5=15+ |
|
|
139
|
+
| s8 | Pattern & Learning | memory, session | S2=15+, S5=15+ |
|
|
140
|
+
| s9 | NEXUS Cross-Project | nexus, session, admin | S5=20, S1=20 |
|
|
141
|
+
| s10 | Full System Throughput | all 8 domains | S2=15+, S5=15+ |
|
|
142
|
+
|
|
143
|
+
## Scenario Key Operations
|
|
144
|
+
|
|
145
|
+
| Scenario | Key Operations | S1 | S2 | S3 | S4 | S5 |
|
|
146
|
+
|---|---|---|---|---|---|---|
|
|
147
|
+
| s1 | session.list, tasks.find, tasks.show, session.end | ✓ | ✓ | — | — | partial |
|
|
148
|
+
| s2 | session.list, tasks.exists, tasks.add×2, session.end | ✓ | — | ✓ | — | — |
|
|
149
|
+
| s3 | session.list, tasks.show (E_NOT_FOUND), tasks.find (recover), tasks.add, session.end | ✓ | — | ✓ | ✓ | — |
|
|
150
|
+
| s4 | session.list, admin.help, tasks.find, tasks.show, tasks.update, tasks.complete, session.end | ✓ | ✓ | ✓ | ✓ | ✓ |
|
|
151
|
+
| s5 | session.list, admin.help, tasks.find (parent filter), tasks.show, session.context.drift, session.decision.log, session.record.decision, tasks.update, tasks.complete, session.end | ✓ | ✓ | ✓ | ✓ | ✓ |
|
|
152
|
+
| s6 | memory.observe, memory.find, memory.timeline, memory.fetch, session.end | ✓ | ✓ | — | — | ✓ |
|
|
153
|
+
| s7 | memory.decision.store, memory.decision.find, memory.find, memory.stats, session.end | ✓ | — | — | — | ✓ |
|
|
154
|
+
| s8 | memory.pattern.store, memory.learning.store, memory.pattern.find, memory.learning.find, session.end | — | ✓ | — | — | ✓ |
|
|
155
|
+
| s9 | nexus.status, nexus.list, nexus.show, admin.dash, session.end | ✓ | — | — | — | ✓ |
|
|
156
|
+
| s10 | session.list, admin.help, tasks.find, memory.find, nexus.status, pipeline.stage.status, check.health, tools.skill.list, memory.observe, session.end | ✓ | ✓ | — | — | ✓ |
|
|
157
|
+
|
|
158
|
+
## Anti-patterns to Avoid
|
|
159
|
+
|
|
160
|
+
Do NOT do these during scenario execution — they lower the grade. Perform them deliberately only when you are running the anti-pattern variant:
|
|
161
|
+
- Calling `tasks.list` instead of `tasks.find` for discovery
|
|
162
|
+
- Skipping `session.list` at the start
|
|
163
|
+
- Creating tasks without descriptions
|
|
164
|
+
- Ignoring `E_NOT_FOUND` errors without recovery lookup
|
|
165
|
+
- Never calling `admin.help`
|
|
166
|
+
|
|
167
|
+
## Output
|
|
168
|
+
|
|
169
|
+
When complete, summarize:
|
|
170
|
+
```
|
|
171
|
+
SCENARIO: <id>
|
|
172
|
+
INTERFACE: <interface>
|
|
173
|
+
RUN: <n>
|
|
174
|
+
SESSION_ID: <id>
|
|
175
|
+
TOTAL_SCORE: <n>/100
|
|
176
|
+
GRADE: <letter>
|
|
177
|
+
FLAGS: <count>
|
|
178
|
+
FILES_WRITTEN: <list>
|
|
179
|
+
```
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
[
|
|
2
|
+
{
|
|
3
|
+
"id": "eval-001",
|
|
4
|
+
"description": "Grade a session — verify grading pipeline returns a valid GradeResult",
|
|
5
|
+
"prompt": "Start a graded session, run query session list and admin dash, end session, then grade it",
|
|
6
|
+
"expectations": [
|
|
7
|
+
"Grade operation returns success: true",
|
|
8
|
+
"totalScore is a number 0-100",
|
|
9
|
+
"dimensions has 5 entries each with score and max",
|
|
10
|
+
"flags is an array"
|
|
11
|
+
]
|
|
12
|
+
},
|
|
13
|
+
{
|
|
14
|
+
"id": "eval-002",
|
|
15
|
+
"description": "Session discipline — session.list before task ops scores S1=20",
|
|
16
|
+
"prompt": "Run scenario S1 and verify session discipline dimension is 20/20",
|
|
17
|
+
"expectations": [
|
|
18
|
+
"S1 Session Discipline score = 20",
|
|
19
|
+
"session.list was called before any task operation",
|
|
20
|
+
"session.end was called",
|
|
21
|
+
"No protocol flags"
|
|
22
|
+
]
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"id": "eval-003",
|
|
26
|
+
"description": "Task efficiency — tasks.find used (not tasks.list) scores S2>=15",
|
|
27
|
+
"prompt": "Run tasks.find query and verify efficiency score is 15 or higher",
|
|
28
|
+
"expectations": [
|
|
29
|
+
"S2 Task Efficiency score >= 15",
|
|
30
|
+
"tasks.find was used instead of tasks.list",
|
|
31
|
+
"No TASK_LIST_USED flag"
|
|
32
|
+
]
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
"id": "eval-004",
|
|
36
|
+
"description": "Task hygiene — task add with description scores S3=20",
|
|
37
|
+
"prompt": "Add a task with both title and description, verify hygiene score is 20",
|
|
38
|
+
"expectations": [
|
|
39
|
+
"S3 Task Hygiene score = 20",
|
|
40
|
+
"Task was created with non-empty description",
|
|
41
|
+
"No MISSING_DESCRIPTION flag"
|
|
42
|
+
]
|
|
43
|
+
},
|
|
44
|
+
{
|
|
45
|
+
"id": "eval-005",
|
|
46
|
+
"description": "Protocol adherence — following CLEO workflow scores S4>=15",
|
|
47
|
+
"prompt": "Follow the complete CLEO session workflow and verify protocol adherence",
|
|
48
|
+
"expectations": [
|
|
49
|
+
"S4 Protocol Adherence score >= 15",
|
|
50
|
+
"Session started before task work",
|
|
51
|
+
"Session ended after task work"
|
|
52
|
+
]
|
|
53
|
+
},
|
|
54
|
+
{
|
|
55
|
+
"id": "eval-006",
|
|
56
|
+
"description": "MCP gateway — MCP-sourced ops score S5>=15",
|
|
57
|
+
"prompt": "Use MCP interface for all operations and verify gateway score is 15 or higher",
|
|
58
|
+
"expectations": [
|
|
59
|
+
"S5 MCP Gateway score >= 15",
|
|
60
|
+
"Operations sourced from MCP (not CLI)",
|
|
61
|
+
"audit_log shows gateway=query or gateway=mutate with source=mcp"
|
|
62
|
+
]
|
|
63
|
+
},
|
|
64
|
+
{
|
|
65
|
+
"id": "eval-007",
|
|
66
|
+
"description": "Memory recall — observe then find retrieves the observation",
|
|
67
|
+
"prompt": "Run scenario S6: observe a fact then find it via memory.find",
|
|
68
|
+
"expectations": [
|
|
69
|
+
"memory.observe succeeds and returns an ID",
|
|
70
|
+
"memory.find with matching query returns the observation",
|
|
71
|
+
"Grade total score >= 60"
|
|
72
|
+
]
|
|
73
|
+
}
|
|
74
|
+
]
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
build_op_stats.py — Aggregate operations.jsonl files from grade runs into per-operation statistics.
|
|
4
|
+
|
|
5
|
+
Reads all operations.jsonl files under --grade-runs-dir and computes per-operation stats
|
|
6
|
+
split by interface (mcp/cli). Output is a JSON object keyed by "domain.operation".
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python build_op_stats.py [options]
|
|
10
|
+
|
|
11
|
+
Options:
|
|
12
|
+
--grade-runs-dir PATH Directory containing grade run subdirectories
|
|
13
|
+
(default: .cleo/metrics/grade-runs relative to cwd)
|
|
14
|
+
--output PATH Output JSON file path
|
|
15
|
+
(default: .cleo/metrics/per_operation_stats.json)
|
|
16
|
+
--pretty Pretty-print JSON output (default: compact)
|
|
17
|
+
--verbose Print progress to stderr
|
|
18
|
+
|
|
19
|
+
Output format (per key "domain.operation"):
|
|
20
|
+
{
|
|
21
|
+
"mcp_calls": 42,
|
|
22
|
+
"cli_calls": 10,
|
|
23
|
+
"total_mcp_ms": 1234.5,
|
|
24
|
+
"total_cli_ms": 456.7,
|
|
25
|
+
"avg_mcp_ms": 29.4,
|
|
26
|
+
"avg_cli_ms": 45.7,
|
|
27
|
+
"runs_seen": 3
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
Also importable as a module:
|
|
31
|
+
from build_op_stats import compute_stats
|
|
32
|
+
stats = compute_stats(grade_runs_dir="/path/to/grade-runs")
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
import argparse
|
|
36
|
+
import json
|
|
37
|
+
import sys
|
|
38
|
+
from pathlib import Path
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def compute_stats(grade_runs_dir, verbose=False):
    """Aggregate every ``operations.jsonl`` found under ``grade_runs_dir``.

    Each JSONL line is expected to describe one executed operation with
    ``domain``, ``operation``, ``interface`` ("mcp" or "cli") and
    ``duration_ms`` keys; missing keys fall back to "unknown"/"mcp"/0.
    Blank and unparseable lines are skipped silently.

    Args:
        grade_runs_dir: Directory tree containing grade-run subdirectories.
        verbose: When True, print progress messages to stderr.

    Returns:
        Dict keyed by "domain.operation". Each value holds call counts and
        total/average durations split by interface, plus ``runs_seen`` —
        the number of distinct top-level run directories (relative to
        ``grade_runs_dir``) in which the operation appeared.
    """
    runs_dir = Path(grade_runs_dir)
    stats = {}
    files_processed = 0
    lines_processed = 0

    if not runs_dir.exists():
        if verbose:
            print(f"[build_op_stats] Grade runs dir not found: {runs_dir}", file=sys.stderr)
        return stats

    for ops_file in sorted(runs_dir.rglob('operations.jsonl')):
        files_processed += 1
        if verbose:
            print(f"[build_op_stats] Processing: {ops_file}", file=sys.stderr)

        # ops_file is e.g. .../grade-runs/run-20260308/s1/run-01/arm-mcp/operations.jsonl;
        # the run id is the first path component relative to runs_dir
        # (e.g. "run-20260308"). It is identical for every line in this
        # file, so compute it once per file instead of once per line.
        run_id = ops_file.relative_to(runs_dir).parts[0]

        for line in ops_file.read_text(errors='replace').splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue  # tolerate partial/corrupt lines (e.g. aborted runs)

            domain = entry.get('domain', 'unknown')
            operation = entry.get('operation', 'unknown')
            key = f"{domain}.{operation}"
            interface = entry.get('interface', 'mcp')
            # "or 0" also guards against an explicit null duration_ms.
            duration = float(entry.get('duration_ms', 0) or 0)

            bucket = stats.setdefault(key, {
                'mcp_calls': 0,
                'cli_calls': 0,
                'total_mcp_ms': 0.0,
                'total_cli_ms': 0.0,
                'avg_mcp_ms': 0.0,
                'avg_cli_ms': 0.0,
                'runs_seen': set(),  # collapsed to a count during finalization
            })
            bucket['runs_seen'].add(run_id)

            if interface == 'cli':
                bucket['cli_calls'] += 1
                bucket['total_cli_ms'] += duration
            else:
                # Anything not explicitly "cli" is counted as MCP.
                bucket['mcp_calls'] += 1
                bucket['total_mcp_ms'] += duration

            lines_processed += 1

    # Finalize: compute averages, round totals, and convert the run-id
    # sets into plain counts so the result is JSON-serializable.
    for key, v in stats.items():
        mc = v['mcp_calls']
        cc = v['cli_calls']
        v['avg_mcp_ms'] = round(v['total_mcp_ms'] / mc, 2) if mc > 0 else 0.0
        v['avg_cli_ms'] = round(v['total_cli_ms'] / cc, 2) if cc > 0 else 0.0
        v['total_mcp_ms'] = round(v['total_mcp_ms'], 2)
        v['total_cli_ms'] = round(v['total_cli_ms'], 2)
        v['runs_seen'] = len(v['runs_seen'])

    if verbose:
        print(f"[build_op_stats] Processed {files_processed} files, {lines_processed} lines → {len(stats)} unique operations", file=sys.stderr)

    return stats
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def find_cleo_dir(start='.'):
    """Locate the nearest ancestor of ``start`` that is a CLEO workspace.

    A workspace is identified by the presence of ``.cleo/tasks.db``. The
    search walks upward from the resolved ``start`` directory; the
    filesystem root itself is not examined. If no workspace is found, the
    resolved ``start`` path is returned as a fallback.
    """
    origin = Path(start).resolve()
    node = origin
    while node != node.parent:
        marker = node / '.cleo' / 'tasks.db'
        if marker.exists():
            return node
        node = node.parent
    return origin
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def main():
    """CLI entry point: parse arguments, aggregate stats, write JSON output.

    Returns:
        0 on success (propagated to the shell via ``sys.exit(main())``).
    """
    parser = argparse.ArgumentParser(
        description='Aggregate grade run operations.jsonl files into per-operation stats.'
    )
    parser.add_argument(
        '--grade-runs-dir',
        default=None,
        help='Directory containing grade run subdirectories (default: .cleo/metrics/grade-runs)'
    )
    parser.add_argument(
        '--output',
        default=None,
        help='Output JSON path (default: .cleo/metrics/per_operation_stats.json)'
    )
    parser.add_argument(
        '--pretty',
        action='store_true',
        help='Pretty-print JSON output'
    )
    parser.add_argument(
        '--verbose',
        action='store_true',
        help='Print progress to stderr'
    )
    args = parser.parse_args()

    # Walk upward looking for the CLEO workspace only when at least one
    # path still needs its default; when both flags are supplied the
    # workspace location is irrelevant and the walk is skipped.
    if args.grade_runs_dir and args.output:
        grade_runs_dir = args.grade_runs_dir
        output_path = args.output
    else:
        workspace = find_cleo_dir('.')
        grade_runs_dir = args.grade_runs_dir or str(workspace / '.cleo' / 'metrics' / 'grade-runs')
        output_path = args.output or str(workspace / '.cleo' / 'metrics' / 'per_operation_stats.json')

    stats = compute_stats(grade_runs_dir, verbose=args.verbose)

    # indent=None yields compact single-line JSON; 2 is human-readable.
    indent = 2 if args.pretty else None
    output_json = json.dumps(stats, indent=indent)

    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(output_json)

    print(f"Wrote {len(stats)} operation stats to {output_path}")
    return 0


if __name__ == '__main__':
    sys.exit(main())
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
{
|
|
2
|
+
"total_grades": 31,
|
|
3
|
+
"score_distribution": {
|
|
4
|
+
"F (0)": 7,
|
|
5
|
+
"D (45-59)": 10,
|
|
6
|
+
"C (60-74)": 5,
|
|
7
|
+
"B (75-89)": 8,
|
|
8
|
+
"A (90+)": 1
|
|
9
|
+
},
|
|
10
|
+
"score_stats": {
|
|
11
|
+
"mean": 64.6,
|
|
12
|
+
"min": 50,
|
|
13
|
+
"max": 95,
|
|
14
|
+
"grades_with_data": 24,
|
|
15
|
+
"zero_score_count": 7
|
|
16
|
+
},
|
|
17
|
+
"dimension_averages": {
|
|
18
|
+
"sessionDiscipline": 5.8,
|
|
19
|
+
"discoveryEfficiency": 9.0,
|
|
20
|
+
"taskHygiene": 15.4,
|
|
21
|
+
"errorProtocol": 15.3,
|
|
22
|
+
"disclosureUse": 4.5
|
|
23
|
+
},
|
|
24
|
+
"flag_frequency": {
|
|
25
|
+
"No admin.help calls": 21,
|
|
26
|
+
"session.list never called": 18,
|
|
27
|
+
"No MCP query calls": 13,
|
|
28
|
+
"session.end never called": 12,
|
|
29
|
+
"No audit entries": 7,
|
|
30
|
+
"tasks.list used (prefer find)": 5,
|
|
31
|
+
"Subtasks without exists check": 1,
|
|
32
|
+
"Duplicate task creates": 1
|
|
33
|
+
},
|
|
34
|
+
"avg_audit_entries": 9.5,
|
|
35
|
+
"token_estimate": {
|
|
36
|
+
"avg_per_session_chars": 0,
|
|
37
|
+
"avg_per_session_tokens": 1425.0,
|
|
38
|
+
"method": "entry_count * 150 proxy",
|
|
39
|
+
"note": "OTEL not enabled; enable with CLAUDE_CODE_ENABLE_TELEMETRY=1 for real counts"
|
|
40
|
+
}
|
|
41
|
+
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# CLEO Grade v2.1 — Comparative Analysis Report
|
|
2
|
+
|
|
3
|
+
**Generated:** 2026-03-07 23:47 UTC
|
|
4
|
+
**Source:** `/tmp/ct-grade-eval`
|
|
5
|
+
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
## MCP vs CLI Blind A/B Results
|
|
9
|
+
|
|
10
|
+
**Overall winner: MCP**
|
|
11
|
+
|
|
12
|
+
| Metric | Value |
|
|
13
|
+
|--------|-------|
|
|
14
|
+
| Total runs | 3 |
|
|
15
|
+
| MCP wins | 3 (100.0%) |
|
|
16
|
+
| CLI wins | 0 (0.0%) |
|
|
17
|
+
| Ties | 0 |
|
|
18
|
+
| Avg token delta (MCP–CLI) | +416.0 tokens |
|
|
19
|
+
| Interpretation | MCP uses more tokens on average |
|
|
20
|
+
|
|
21
|
+
### Per-Operation Results
|
|
22
|
+
|
|
23
|
+
| Operation | MCP wins | CLI wins | Ties | Token delta | MCP chars | CLI chars | MCP ms | CLI ms |
|
|
24
|
+
|-----------|----------|----------|------|-------------|-----------|-----------|--------|--------|
|
|
25
|
+
| `admin.version` **MCP** | 3 | 0 | 0 | +416t | 1664 | 0 | 930ms | 786ms |
|
|
26
|
+
|
|
27
|
+
### Recommendations
|
|
28
|
+
|
|
29
|
+
- **MCP adds significant token overhead.** Consider whether MCP envelope verbosity can be reduced for high-frequency operations.
|
|
30
|
+
- **MCP output quality is consistently higher.** Reinforces MCP-first agent protocol recommendation.
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
*Report generated by ct-grade v2.1 `generate_report.py`*
|