@cleocode/skills 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dispatch-config.json +404 -0
- package/index.d.ts +178 -0
- package/index.js +405 -0
- package/package.json +14 -0
- package/profiles/core.json +7 -0
- package/profiles/full.json +10 -0
- package/profiles/minimal.json +7 -0
- package/profiles/recommended.json +7 -0
- package/provider-skills-map.json +97 -0
- package/skills/_shared/cleo-style-guide.md +84 -0
- package/skills/_shared/manifest-operations.md +810 -0
- package/skills/_shared/placeholders.json +433 -0
- package/skills/_shared/skill-chaining-patterns.md +237 -0
- package/skills/_shared/subagent-protocol-base.md +223 -0
- package/skills/_shared/task-system-integration.md +232 -0
- package/skills/_shared/testing-framework-config.md +110 -0
- package/skills/ct-cleo/SKILL.md +490 -0
- package/skills/ct-cleo/references/anti-patterns.md +19 -0
- package/skills/ct-cleo/references/loom-lifecycle.md +136 -0
- package/skills/ct-cleo/references/orchestrator-constraints.md +55 -0
- package/skills/ct-cleo/references/session-protocol.md +162 -0
- package/skills/ct-codebase-mapper/SKILL.md +82 -0
- package/skills/ct-contribution/SKILL.md +521 -0
- package/skills/ct-contribution/templates/contribution-init.json +21 -0
- package/skills/ct-dev-workflow/SKILL.md +423 -0
- package/skills/ct-docs-lookup/SKILL.md +66 -0
- package/skills/ct-docs-review/SKILL.md +175 -0
- package/skills/ct-docs-write/SKILL.md +108 -0
- package/skills/ct-documentor/SKILL.md +231 -0
- package/skills/ct-epic-architect/SKILL.md +305 -0
- package/skills/ct-epic-architect/references/bug-epic-example.md +172 -0
- package/skills/ct-epic-architect/references/commands.md +201 -0
- package/skills/ct-epic-architect/references/feature-epic-example.md +210 -0
- package/skills/ct-epic-architect/references/migration-epic-example.md +244 -0
- package/skills/ct-epic-architect/references/output-format.md +92 -0
- package/skills/ct-epic-architect/references/patterns.md +284 -0
- package/skills/ct-epic-architect/references/refactor-epic-example.md +412 -0
- package/skills/ct-epic-architect/references/research-epic-example.md +226 -0
- package/skills/ct-epic-architect/references/shell-escaping.md +86 -0
- package/skills/ct-epic-architect/references/skill-aware-execution.md +195 -0
- package/skills/ct-grade/SKILL.md +230 -0
- package/skills/ct-grade/agents/analysis-reporter.md +203 -0
- package/skills/ct-grade/agents/blind-comparator.md +157 -0
- package/skills/ct-grade/agents/scenario-runner.md +134 -0
- package/skills/ct-grade/eval-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
- package/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
- package/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
- package/skills/ct-grade/eval-viewer/generate_review.py +283 -0
- package/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
- package/skills/ct-grade/eval-viewer/viewer.html +219 -0
- package/skills/ct-grade/evals/evals.json +94 -0
- package/skills/ct-grade/references/ab-test-methodology.md +150 -0
- package/skills/ct-grade/references/domains.md +137 -0
- package/skills/ct-grade/references/grade-spec.md +236 -0
- package/skills/ct-grade/references/scenario-playbook.md +234 -0
- package/skills/ct-grade/references/token-tracking.md +120 -0
- package/skills/ct-grade/scripts/__pycache__/audit_analyzer.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/run_ab_test.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/run_all.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/token_tracker.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/audit_analyzer.py +279 -0
- package/skills/ct-grade/scripts/generate_report.py +283 -0
- package/skills/ct-grade/scripts/run_ab_test.py +504 -0
- package/skills/ct-grade/scripts/run_all.py +287 -0
- package/skills/ct-grade/scripts/setup_run.py +183 -0
- package/skills/ct-grade/scripts/token_tracker.py +630 -0
- package/skills/ct-grade-v2-1/SKILL.md +237 -0
- package/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
- package/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
- package/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
- package/skills/ct-grade-v2-1/evals/evals.json +74 -0
- package/skills/ct-grade-v2-1/grade-viewer/__pycache__/build_op_stats.cpython-314.pyc +0 -0
- package/skills/ct-grade-v2-1/grade-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
- package/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
- package/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
- package/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
- package/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
- package/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
- package/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
- package/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
- package/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
- package/skills/ct-grade-v2-1/manifest-entry.json +31 -0
- package/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
- package/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
- package/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
- package/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
- package/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
- package/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
- package/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
- package/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
- package/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
- package/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
- package/skills/ct-memory/SKILL.md +84 -0
- package/skills/ct-orchestrator/INSTALL.md +61 -0
- package/skills/ct-orchestrator/README.md +69 -0
- package/skills/ct-orchestrator/SKILL.md +380 -0
- package/skills/ct-orchestrator/manifest-entry.json +19 -0
- package/skills/ct-orchestrator/orchestrator-prompt.txt +17 -0
- package/skills/ct-orchestrator/references/SUBAGENT-PROTOCOL-BLOCK.md +66 -0
- package/skills/ct-orchestrator/references/autonomous-operation.md +167 -0
- package/skills/ct-orchestrator/references/lifecycle-gates.md +98 -0
- package/skills/ct-orchestrator/references/orchestrator-compliance.md +271 -0
- package/skills/ct-orchestrator/references/orchestrator-handoffs.md +85 -0
- package/skills/ct-orchestrator/references/orchestrator-patterns.md +164 -0
- package/skills/ct-orchestrator/references/orchestrator-recovery.md +113 -0
- package/skills/ct-orchestrator/references/orchestrator-spawning.md +271 -0
- package/skills/ct-orchestrator/references/orchestrator-tokens.md +180 -0
- package/skills/ct-research-agent/SKILL.md +226 -0
- package/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
- package/skills/ct-skill-creator/.cleo/logs/cleo.2026-03-07.1.log +24 -0
- package/skills/ct-skill-creator/.cleo/tasks.db +0 -0
- package/skills/ct-skill-creator/SKILL.md +356 -0
- package/skills/ct-skill-creator/agents/analyzer.md +276 -0
- package/skills/ct-skill-creator/agents/comparator.md +204 -0
- package/skills/ct-skill-creator/agents/grader.md +225 -0
- package/skills/ct-skill-creator/assets/eval_review.html +146 -0
- package/skills/ct-skill-creator/eval-viewer/__pycache__/generate_review.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
- package/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
- package/skills/ct-skill-creator/manifest-entry.json +17 -0
- package/skills/ct-skill-creator/references/dynamic-context.md +228 -0
- package/skills/ct-skill-creator/references/frontmatter.md +83 -0
- package/skills/ct-skill-creator/references/invocation-control.md +165 -0
- package/skills/ct-skill-creator/references/output-patterns.md +86 -0
- package/skills/ct-skill-creator/references/provider-deployment.md +175 -0
- package/skills/ct-skill-creator/references/schemas.md +430 -0
- package/skills/ct-skill-creator/references/workflows.md +28 -0
- package/skills/ct-skill-creator/scripts/__init__.py +1 -0
- package/skills/ct-skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/init_skill.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/quick_validate.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/skills/ct-skill-creator/scripts/generate_report.py +326 -0
- package/skills/ct-skill-creator/scripts/improve_description.py +247 -0
- package/skills/ct-skill-creator/scripts/init_skill.py +306 -0
- package/skills/ct-skill-creator/scripts/package_skill.py +110 -0
- package/skills/ct-skill-creator/scripts/quick_validate.py +97 -0
- package/skills/ct-skill-creator/scripts/run_eval.py +310 -0
- package/skills/ct-skill-creator/scripts/run_loop.py +328 -0
- package/skills/ct-skill-creator/scripts/utils.py +47 -0
- package/skills/ct-skill-validator/SKILL.md +178 -0
- package/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
- package/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
- package/skills/ct-skill-validator/evals/eval_set.json +14 -0
- package/skills/ct-skill-validator/evals/evals.json +52 -0
- package/skills/ct-skill-validator/manifest-entry.json +20 -0
- package/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
- package/skills/ct-skill-validator/references/validation-rules.md +168 -0
- package/skills/ct-skill-validator/scripts/__init__.py +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/audit_body.py +242 -0
- package/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
- package/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
- package/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
- package/skills/ct-skill-validator/scripts/validate.py +422 -0
- package/skills/ct-spec-writer/SKILL.md +189 -0
- package/skills/ct-stickynote/README.md +14 -0
- package/skills/ct-stickynote/SKILL.md +46 -0
- package/skills/ct-task-executor/SKILL.md +296 -0
- package/skills/ct-validator/SKILL.md +216 -0
- package/skills/manifest.json +469 -0
- package/skills.json +281 -0
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Run a CLEO grade scenario and capture metrics.
|
|
4
|
+
|
|
5
|
+
Executes a predefined grade scenario against a live CLEO project,
|
|
6
|
+
capturing timing and output metrics for later analysis.
|
|
7
|
+
|
|
8
|
+
Usage:
|
|
9
|
+
python run_scenario.py --scenario S1 [options]
|
|
10
|
+
python run_scenario.py --scenario full [options]
|
|
11
|
+
|
|
12
|
+
Options:
|
|
13
|
+
--scenario S1-S5, full, or P1-P3 (default: S1)
|
|
14
|
+
--cleo CLEO binary (default: cleo-dev)
|
|
15
|
+
--output-dir Results directory (default: ./grade-results/<timestamp>)
|
|
16
|
+
--scope Session scope (default: global)
|
|
17
|
+
--parent-task Task ID for subtask scenarios (S2, S5)
|
|
18
|
+
--seed-task Existing task ID for lifecycle scenarios (S3, S4)
|
|
19
|
+
--runs Number of times to repeat (default: 1)
|
|
20
|
+
--json Output results as JSON to stdout
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import argparse
|
|
24
|
+
import json
|
|
25
|
+
import os
|
|
26
|
+
import subprocess
|
|
27
|
+
import sys
|
|
28
|
+
import time
|
|
29
|
+
from datetime import datetime, timezone
|
|
30
|
+
from pathlib import Path
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
# Scenario definitions
|
|
35
|
+
# ---------------------------------------------------------------------------
|
|
36
|
+
|
|
37
|
+
def _build_scenario(name, ops_fn):
|
|
38
|
+
return {"name": name, "build_ops": ops_fn}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def scenario_s1(args):
    """S1: Session Discipline — tests session.list before task ops and session.end."""
    task_id = "T100" if not args.seed_task else args.seed_task
    ops = []
    ops.append((["session", "list"], "Check existing sessions"))
    ops.append((["admin", "dash"], "Project overview"))
    ops.append((["tasks", "find", "--status", "active"], "Discover active tasks"))
    ops.append((["tasks", "show", task_id], "Inspect specific task"))
    # session.end is handled by run_graded_session wrapper
    return ops
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def scenario_s2(args):
    """S2: Task Hygiene — tests task creation with descriptions and parent verification."""
    parent_id = args.parent_task if args.parent_task else args.seed_task
    if not parent_id:
        # No parent supplied at all — warn and fall back to a placeholder ID.
        print("WARNING: --parent-task not set for S2; using T100 as placeholder", file=sys.stderr)
        parent_id = "T100"
    subtask_create = [
        "tasks", "add",
        "--title", "Impl auth",
        "--description", "Add JWT authentication to API endpoints",
        "--parent", parent_id,
    ]
    standalone_create = [
        "tasks", "add",
        "--title", "Write auth tests",
        "--description", "Unit tests for auth module",
    ]
    return [
        (["session", "list"], "Check existing sessions"),
        (["tasks", "exists", parent_id], "Verify parent exists"),
        (subtask_create, "Create subtask with description"),
        (standalone_create, "Create standalone task with description"),
    ]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def scenario_s3(args):
    """S3: Error Recovery — tests E_NOT_FOUND recovery and no duplicate creates."""
    missing_id = "T99999"
    create_once = [
        "tasks", "add",
        "--title", "New feature discovered",
        "--description", "Feature that was not found — creating fresh",
    ]
    return [
        (["session", "list"], "Check existing sessions"),
        (["tasks", "show", missing_id], "Trigger E_NOT_FOUND intentionally"),
        (["tasks", "find", "--query", missing_id], "Recovery lookup after E_NOT_FOUND"),
        (create_once, "Create once"),
    ]
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def scenario_s4(args):
    """S4: Full Lifecycle — all 5 dimensions at 20/20."""
    task_id = args.seed_task if args.seed_task else "T200"
    ops = [(["session", "list"], "Check existing sessions")]
    ops.append((["admin", "help"], "Progressive disclosure — tier 0"))
    ops.append((["admin", "dash"], "Project overview"))
    ops.append((["tasks", "find", "--status", "pending"], "Discover pending tasks"))
    ops.append((["tasks", "show", task_id], "Inspect chosen task"))
    ops.append((["tasks", "update", "--task-id", task_id, "--status", "active"], "Begin work"))
    ops.append((["tasks", "complete", task_id], "Mark done"))
    ops.append((["tasks", "find", "--status", "pending"], "Check for next task"))
    return ops
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def scenario_s5(args):
    """S5: Multi-Domain Analysis — cross-domain with session decisions."""
    epic_id = args.parent_task if args.parent_task else "T500"
    subtask_id = args.seed_task if args.seed_task else "T501"
    record_decision = [
        "session", "record-decision",
        "--task-id", subtask_id,
        "--decision", "Use adapter pattern",
        "--rationale", "Decouples provider logic",
    ]
    return [
        (["session", "list"], "Check existing sessions"),
        (["admin", "help"], "Progressive disclosure"),
        (["tasks", "find", "--parent", epic_id], "Discover epic subtasks"),
        (["tasks", "show", subtask_id], "Inspect specific subtask"),
        (["session", "context-drift"], "Check context drift"),
        (["session", "decision-log", "--task-id", subtask_id], "Review past decisions"),
        (record_decision, "Record decision"),
        (["tasks", "update", "--task-id", subtask_id, "--status", "active"], "Begin work"),
        (["tasks", "complete", subtask_id], "Mark done"),
        (["tasks", "find", "--parent", epic_id, "--status", "pending"], "Find next subtask"),
    ]
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
# Registry mapping scenario ID -> builder function. Each builder takes the
# parsed CLI args and returns a list of (cleo_args, description) tuples that
# run_single_scenario executes in order.
SCENARIOS = {
    "S1": scenario_s1,
    "S2": scenario_s2,
    "S3": scenario_s3,
    "S4": scenario_s4,
    "S5": scenario_s5,
}
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
# ---------------------------------------------------------------------------
|
|
130
|
+
# CLEO runner
|
|
131
|
+
# ---------------------------------------------------------------------------
|
|
132
|
+
|
|
133
|
+
def run_cleo(cleo_bin, args_list, cwd=None, capture=True):
    """Run a cleo command and return (returncode, stdout, stderr, duration_ms).

    Always appends ``--json`` so cleo emits machine-parseable output.

    Args:
        cleo_bin: Path or name of the cleo executable.
        args_list: Subcommand and flags, e.g. ``["session", "list"]``.
        cwd: Working directory for the subprocess (None = inherit).
        capture: When True, capture stdout/stderr; otherwise they pass through
            and the returned stdout/stderr are empty strings.

    Returns:
        Tuple of (returncode, stdout, stderr, duration_ms). On timeout returns
        (-1, "", "TIMEOUT", elapsed); on a missing binary (-1, "", msg, 0).
    """
    cmd = [cleo_bin] + args_list + ["--json"]
    start = time.time()
    try:
        result = subprocess.run(
            cmd,
            capture_output=capture,
            text=True,
            cwd=cwd,
            timeout=30,
        )
        duration_ms = int((time.time() - start) * 1000)
        # stdout/stderr are None when capture=False; normalize to "".
        return result.returncode, result.stdout or "", result.stderr or "", duration_ms
    except subprocess.TimeoutExpired:
        # Fix: report the actual elapsed time instead of a hardcoded 30000 ms
        # (subprocess teardown can make the real elapsed time exceed the limit).
        return -1, "", "TIMEOUT", int((time.time() - start) * 1000)
    except FileNotFoundError:
        return -1, "", f"Command not found: {cleo_bin}", 0
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def start_graded_session(cleo_bin, scope, name, cwd=None):
    """Start a grade-enabled session and return its session ID, or None on failure."""
    start_cmd = ["session", "start", "--scope", scope, "--name", name, "--grade"]
    rc, stdout, stderr, _ = run_cleo(cleo_bin, start_cmd, cwd=cwd)
    if rc != 0:
        print(f"ERROR: session start failed: {stderr}", file=sys.stderr)
        return None
    try:
        # Preferred path: parse the JSON payload and probe the usual key layouts.
        payload = json.loads(stdout)
        return (
            payload.get("data", {}).get("sessionId")
            or payload.get("sessionId")
            or payload.get("id")
        )
    except Exception:
        # Output was not the expected JSON shape — fall back to scanning the
        # raw text for a "session-*" token.
        for line in stdout.splitlines():
            if "session-" not in line:
                continue
            for token in line.split():
                if token.startswith("session-"):
                    return token.strip('",')
        return None
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def end_session(cleo_bin, cwd=None):
    """End the current session; return True when the command succeeded."""
    returncode, _stdout, _stderr, _elapsed = run_cleo(cleo_bin, ["session", "end"], cwd=cwd)
    return returncode == 0
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def grade_session(cleo_bin, session_id, cwd=None):
    """Grade a session. Returns the grade dict, a {"raw": ...} fallback, or None."""
    rc, stdout, stderr, _ = run_cleo(cleo_bin, ["grade", session_id], cwd=cwd)
    if rc != 0:
        print(f"WARNING: grade failed (rc={rc}): {stderr}", file=sys.stderr)
        return None
    try:
        # Prefer the "data" envelope when present; otherwise hand back the
        # whole parsed payload.
        parsed = json.loads(stdout)
        return parsed.get("data") or parsed
    except Exception:
        # Unparseable output — preserve it for manual inspection.
        return {"raw": stdout}
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
# ---------------------------------------------------------------------------
|
|
202
|
+
# Single scenario run
|
|
203
|
+
# ---------------------------------------------------------------------------
|
|
204
|
+
|
|
205
|
+
def run_single_scenario(scenario_name, args, output_dir):
    """Run one scenario end-to-end and return its metrics dict.

    Flow: start a graded session, execute the scenario's operations via the
    cleo binary, end the session, ask cleo to grade it, then write a
    ``metrics.json`` into *output_dir* and return the same dict.

    Args:
        scenario_name: Key into SCENARIOS ("S1".."S5").
        args: Parsed argparse namespace (uses .cleo, .scope, .cleo_cwd, and the
            scenario-specific seed/parent task options).
        output_dir: Directory for this run's metrics.json (created if missing).
    """
    cleo = args.cleo
    scope = args.scope or "global"
    # Timestamp in the name keeps repeated runs of the same scenario distinct.
    session_name = f"grade-{scenario_name.lower()}-{int(time.time())}"

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n=== Scenario {scenario_name} ===")
    print(f" Binary : {cleo}")
    print(f" Scope : {scope}")
    print(f" Output : {output_dir}")

    # Start graded session
    t_start = time.time()
    session_id = start_graded_session(cleo, scope, session_name, cwd=args.cleo_cwd)
    if not session_id:
        # Could not start a session — write a stub metrics file so the run
        # directory still documents the failure, then bail out.
        print("ERROR: Could not start graded session", file=sys.stderr)
        metrics = {
            "scenario": scenario_name,
            "session_id": None,
            "error": "DB_UNAVAILABLE",
            "hint": "Use agent-based /ct-grade scenario instead — agents use live MCP tools",
            "grade": None,
            "token_meta": {"estimation_method": "unavailable", "total_estimated_tokens": None},
        }
        metrics_path = output_dir / "metrics.json"
        metrics_path.write_text(json.dumps(metrics, indent=2))
        return metrics

    print(f" Session: {session_id}")

    # Build operations for this scenario
    scenario_fn = SCENARIOS[scenario_name]
    operations = scenario_fn(args)

    # Execute each operation, recording per-op timing and output size.
    op_results = []
    for op_args, description in operations:
        print(f" -> {' '.join(op_args)}")
        rc, stdout, stderr, dur_ms = run_cleo(cleo, op_args, cwd=args.cleo_cwd)
        output_chars = len(stdout)
        # Rough heuristic: ~4 characters per token.
        estimated_tokens = int(output_chars / 4)
        op_results.append({
            "operation": " ".join(op_args),
            "description": description,
            "returncode": rc,
            "success": rc == 0,
            "output_chars": output_chars,
            "estimated_tokens": estimated_tokens,
            "duration_ms": dur_ms,
            "error": stderr[:200] if rc != 0 else None,
        })
        if rc not in (0, 4):  # 4 = E_NOT_FOUND (expected for S3)
            print(f" WARNING: rc={rc} stderr={stderr[:100]}")

    # End session
    ended = end_session(cleo, cwd=args.cleo_cwd)
    print(f" Session end: {'ok' if ended else 'FAILED'}")

    # Grade session
    grade = grade_session(cleo, session_id, cwd=args.cleo_cwd)
    t_total = time.time() - t_start

    # Compute token metadata across all operations.
    total_output_chars = sum(r["output_chars"] for r in op_results)
    total_estimated_tokens = sum(r["estimated_tokens"] for r in op_results)

    metrics = {
        "scenario": scenario_name,
        "session_id": session_id,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "duration_seconds": round(t_total, 2),
        "operations": op_results,
        "grade": grade,
        "token_meta": {
            "estimation_method": "output_chars",
            "total_output_chars": total_output_chars,
            "total_estimated_tokens": total_estimated_tokens,
            # max(..., 1) guards against division by zero for empty scenarios.
            "avg_tokens_per_op": int(total_estimated_tokens / max(len(op_results), 1)),
        },
    }

    # Save
    metrics_path = output_dir / "metrics.json"
    metrics_path.write_text(json.dumps(metrics, indent=2))
    print(f" Saved : {metrics_path}")

    # Print a short grade summary when grading succeeded.
    if grade:
        score = grade.get("totalScore", "?")
        letter = _score_to_letter(grade.get("totalScore", 0))
        flags = grade.get("flags", [])
        print(f" Grade : {score}/100 ({letter}) — {len(flags)} flag(s)")
        if flags:
            for f in flags:
                print(f" FLAG: {f}")

    return metrics
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
def _score_to_letter(score):
|
|
307
|
+
if score >= 90: return "A"
|
|
308
|
+
if score >= 75: return "B"
|
|
309
|
+
if score >= 60: return "C"
|
|
310
|
+
if score >= 45: return "D"
|
|
311
|
+
return "F"
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
# ---------------------------------------------------------------------------
|
|
315
|
+
# Main
|
|
316
|
+
# ---------------------------------------------------------------------------
|
|
317
|
+
|
|
318
|
+
def main():
    """CLI entry point: parse args, run the requested scenarios, write a summary.

    Runs each target scenario --runs times, collecting per-run metrics under
    <output-dir>/<scenario>/run-NNN/, then writes an aggregate summary.json at
    the output root and prints a one-line grade per run.
    """
    parser = argparse.ArgumentParser(description="Run CLEO grade scenarios")
    parser.add_argument("--scenario", default="S1",
                        help="S1-S5, full, or comma-separated e.g. S1,S3")
    parser.add_argument("--cleo", default="cleo-dev",
                        help="CLEO binary (default: cleo-dev)")
    parser.add_argument("--cleo-cwd", default=None,
                        help="Working directory for CLEO commands")
    parser.add_argument("--output-dir", default=None,
                        help="Output directory (default: ./grade-results/<timestamp>)")
    parser.add_argument("--scope", default="global",
                        help="Session scope (default: global)")
    parser.add_argument("--parent-task", default=None,
                        help="Parent task ID for S2/S5 subtask scenarios")
    parser.add_argument("--seed-task", default=None,
                        help="Existing task ID for S3/S4/S5 lifecycle scenarios")
    parser.add_argument("--runs", type=int, default=1,
                        help="Number of runs per scenario (default: 1)")
    parser.add_argument("--json", action="store_true",
                        help="Output summary as JSON to stdout")
    args = parser.parse_args()

    # Determine which scenarios to run: "full" means every registered
    # scenario; otherwise a comma-separated list validated against SCENARIOS.
    if args.scenario.lower() == "full":
        targets = list(SCENARIOS.keys())
    else:
        targets = [s.strip().upper() for s in args.scenario.split(",")]
        unknown = [s for s in targets if s not in SCENARIOS]
        if unknown:
            print(f"ERROR: Unknown scenarios: {unknown}. Valid: {list(SCENARIOS.keys())}", file=sys.stderr)
            sys.exit(1)

    # Build output directory (timestamped default keeps runs separate).
    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
    base_output = Path(args.output_dir) if args.output_dir else Path(f"./grade-results/{ts}")

    all_results = []

    for scenario_name in targets:
        for run_num in range(1, args.runs + 1):
            run_dir = base_output / scenario_name / f"run-{run_num:03d}"
            metrics = run_single_scenario(scenario_name, args, run_dir)
            all_results.append(metrics)

    # Summary: full per-run metrics plus a compact grade_summary table.
    summary = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "scenarios_run": targets,
        "total_runs": len(all_results),
        "results": all_results,
        "grade_summary": [
            {
                "scenario": r["scenario"],
                "score": r.get("grade", {}).get("totalScore") if r.get("grade") else None,
                "letter": _score_to_letter(r.get("grade", {}).get("totalScore", 0) if r.get("grade") else 0),
                "flags": len(r.get("grade", {}).get("flags", [])) if r.get("grade") else None,
                "estimated_tokens": r.get("token_meta", {}).get("total_estimated_tokens"),
            }
            for r in all_results
        ],
    }

    summary_path = base_output / "summary.json"
    base_output.mkdir(parents=True, exist_ok=True)
    summary_path.write_text(json.dumps(summary, indent=2))

    # Human-readable one-line-per-run report.
    print(f"\n=== Summary ===")
    for gs in summary["grade_summary"]:
        score_str = f"{gs['score']}/100 ({gs['letter']})" if gs['score'] is not None else "N/A"
        tok_str = f"~{gs['estimated_tokens']}t" if gs['estimated_tokens'] else ""
        print(f" {gs['scenario']}: {score_str} flags={gs['flags']} {tok_str}")
    print(f"\nSaved: {base_output}")

    # Optional machine-readable dump for callers that parse stdout.
    if args.json:
        print(json.dumps(summary, indent=2))
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
# Script entry point guard — allows importing this module without side effects.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
setup_run.py — Set up an A/B test run directory and print the execution plan.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
python setup_run.py --mode scenario --scenario s4 --interface both --runs 3 --output-dir ./ab_results/run-001
|
|
7
|
+
|
|
8
|
+
Outputs:
|
|
9
|
+
- Creates run directory structure
|
|
10
|
+
- Writes run-manifest.json
|
|
11
|
+
- Prints step-by-step execution plan for Claude to follow
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import argparse
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
import sys
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Accepted values for --mode.
VALID_MODES = ["scenario", "ab", "blind"]
# Scenario IDs accepted by --scenario; "all" expands to s1-s10.
VALID_SCENARIOS = ["s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "all"]
# Accepted values for --interface; "both" expands to ["mcp", "cli"].
VALID_INTERFACES = ["mcp", "cli", "both"]

# Human-readable label for each scenario ID (also serves as the set of
# known scenario IDs when filtering the --scenario argument).
SCENARIO_LABELS = {
    "s1": "Fresh Discovery",
    "s2": "Task Creation Hygiene",
    "s3": "Error Recovery",
    "s4": "Full Lifecycle",
    "s5": "Multi-Domain Analysis",
    "s6": "Memory Observe & Recall",
    "s7": "Decision Continuity",
    "s8": "Pattern & Learning Storage",
    "s9": "NEXUS Cross-Project",
    "s10": "Full System Throughput",
}

# Default domain slots for ab/blind mode.
# NOTE(review): the --domains argparse default is the string "tasks,session";
# this constant appears unused in the visible code — confirm before removing.
DEFAULT_DOMAINS = ["tasks", "session"]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def find_cleo_dir(start_dir="."):
    """Walk up from *start_dir* to find a directory containing .cleo/tasks.db.

    Returns the first directory — starting with *start_dir* itself, then each
    ancestor up to and including the filesystem root — that holds a
    ``.cleo/tasks.db`` file. Falls back to the resolved *start_dir* when no
    such directory exists.

    Fix: the original ``while p != p.parent`` loop exited before ever testing
    the filesystem root, so a workspace rooted at ``/`` was never found.
    """
    start = Path(start_dir).resolve()
    for candidate in (start, *start.parents):
        if (candidate / ".cleo" / "tasks.db").exists():
            return candidate
    return start
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def expand_scenarios(scenario_arg):
    """Expand a --scenario argument into a list of known scenario IDs.

    "all" yields every scenario; otherwise the comma-separated list is
    stripped and filtered down to IDs present in SCENARIO_LABELS.
    """
    if scenario_arg == "all":
        return ["s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10"]
    requested = (part.strip() for part in scenario_arg.split(","))
    return [sid for sid in requested if sid in SCENARIO_LABELS]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def expand_interfaces(interface_arg):
    """Expand --interface into the list of interfaces to exercise."""
    return ["mcp", "cli"] if interface_arg == "both" else [interface_arg]
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def create_dir(path):
    """Ensure *path* exists as a directory (creating parents) and return it unchanged."""
    Path(path).mkdir(parents=True, exist_ok=True)
    return path
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def main():
    """Set up a ct-grade A/B test run.

    Parses CLI options, builds the on-disk directory skeleton
    (<run_dir>/<slot>/run-NN/arm-{A,B}/ with a placeholder timing.json in
    each leaf), writes run-manifest.json at the run root, and prints a
    step-by-step execution plan for the orchestrating agent to follow.

    Exits with status 1 when --scenario names no valid scenarios.
    """
    parser = argparse.ArgumentParser(description="Set up a ct-grade A/B test run")
    parser.add_argument("--mode", default="scenario", choices=VALID_MODES)
    parser.add_argument("--scenario", default="all")
    parser.add_argument("--interface", default="both", choices=VALID_INTERFACES)
    parser.add_argument("--domains", default="tasks,session")
    parser.add_argument("--runs", type=int, default=3)
    parser.add_argument("--output-dir", required=False, default=None,
                        help="Output directory (default: .cleo/metrics/grade-runs/run-<timestamp>)")
    parser.add_argument("--project-dir", default=".")
    args = parser.parse_args()

    # Default the output dir to a UTC-timestamped folder under the nearest
    # ancestor workspace that contains .cleo/tasks.db.
    if args.output_dir is None:
        workspace = find_cleo_dir(args.project_dir)
        ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
        args.output_dir = str(workspace / '.cleo' / 'metrics' / 'grade-runs' / f"run-{ts}")

    scenarios = expand_scenarios(args.scenario)
    interfaces = expand_interfaces(args.interface)
    # NOTE(review): domains are not validated against any known set,
    # unlike scenarios/interfaces — confirm whether that is intentional.
    domains = [d.strip() for d in args.domains.split(",")]

    if not scenarios:
        print(f"ERROR: No valid scenarios in '{args.scenario}'. Use: {', '.join(VALID_SCENARIOS)}", file=sys.stderr)
        sys.exit(1)

    run_dir = args.output_dir
    create_dir(run_dir)

    # For ab/blind mode, each domain is a "slot"
    slots = scenarios if args.mode == "scenario" else domains

    # Create directory structure
    for slot in slots:
        for iface in interfaces:
            # First interface is arm-A, any other is arm-B. (The plan loop
            # below makes the same assignment by index; equivalent as long
            # as interfaces holds at most two distinct values.)
            arm_label = "arm-A" if iface == interfaces[0] else "arm-B"
            for run in range(1, args.runs + 1):
                slot_dir = os.path.join(run_dir, slot, f"run-{run:02d}", arm_label)
                create_dir(slot_dir)
                # Create placeholder timing.json
                # (all measurement fields start as None; the executor agent
                # fills them in after each arm completes)
                timing = {
                    "arm": arm_label,
                    "interface": iface,
                    "slot": slot,
                    "run": run,
                    "session_id": None,
                    "executor_start": None,
                    "executor_end": None,
                    "executor_duration_seconds": None,
                    "token_usage_id": None,
                    "total_tokens": None,
                    "duration_ms": None,
                }
                timing_path = os.path.join(slot_dir, "timing.json")
                with open(timing_path, "w") as f:
                    json.dump(timing, f, indent=2)

    # Write run-manifest.json
    manifest = {
        "created_at": datetime.now(timezone.utc).isoformat(),
        "mode": args.mode,
        "scenarios": scenarios,
        "interfaces": interfaces,
        "domains": domains,
        "runs_per_configuration": args.runs,
        "project_dir": os.path.abspath(args.project_dir),
        "run_dir": os.path.abspath(run_dir),
        # With a single interface, both arms map to it (A/B degenerate).
        "arms": {
            "A": {"interface": interfaces[0], "label": f"{interfaces[0].upper()} interface"},
            "B": {"interface": interfaces[1] if len(interfaces) > 1 else interfaces[0],
                  "label": f"{interfaces[-1].upper()} interface"},
        },
        "slots": slots,
        "status": "setup_complete",
    }
    manifest_path = os.path.join(run_dir, "run-manifest.json")
    with open(manifest_path, "w") as f:
        json.dump(manifest, f, indent=2)

    # Print execution plan
    print(f"\n{'='*60}")
    print(f"ct-grade A/B Run Setup Complete")
    print(f"{'='*60}")
    print(f"Mode: {args.mode}")
    print(f"Scenarios: {', '.join(scenarios)}")
    print(f"Interfaces: {', '.join(interfaces)}")
    print(f"Runs each: {args.runs}")
    print(f"Output: {os.path.abspath(run_dir)}")
    print(f"{'='*60}\n")

    print("EXECUTION PLAN\n")
    print("Spawn each arm as a parallel Agent task in the same turn.\n")

    # Numbered steps: per slot, per run — one agent per arm, then a
    # blind comparison; finally run-level aggregation and reporting.
    step = 1
    for slot in slots:
        # Fall back to the slot id itself for domain slots (ab/blind mode).
        slot_label = SCENARIO_LABELS.get(slot, slot)
        print(f"## Slot: {slot} — {slot_label}\n")
        for run in range(1, args.runs + 1):
            for idx, iface in enumerate(interfaces):
                arm_label = "arm-A" if idx == 0 else "arm-B"
                arm_dir = os.path.join(os.path.abspath(run_dir), slot, f"run-{run:02d}", arm_label)
                print(f"Step {step}: Spawn Agent — {arm_label} ({iface}) | slot={slot} | run={run}")
                print(f"  Agent file: agents/scenario-runner.md")
                print(f"  SCENARIO: {slot}")
                print(f"  INTERFACE: {iface}")
                print(f"  OUTPUT_DIR: {arm_dir}")
                print(f"  RUN_NUMBER: {run}")
                print(f"  CRITICAL: Capture total_tokens + duration_ms from task notification")
                print(f"  and update {arm_dir}/timing.json immediately.\n")
                step += 1

            # After both arms complete for this run
            comp_dir = os.path.join(os.path.abspath(run_dir), slot, f"run-{run:02d}")
            print(f"Step {step}: Spawn blind-comparator Agent")
            print(f"  Agent file: agents/blind-comparator.md")
            print(f"  OUTPUT_A: {comp_dir}/arm-A/")
            print(f"  OUTPUT_B: {comp_dir}/arm-B/")
            print(f"  SCENARIO: {slot}")
            print(f"  OUTPUT_PATH: {comp_dir}/comparison.json\n")
            step += 1

    print(f"Step {step}: Aggregate token data")
    print(f"  python scripts/token_tracker.py --run-dir {os.path.abspath(run_dir)}\n")
    step += 1

    print(f"Step {step}: Generate final report")
    print(f"  python scripts/generate_report.py --run-dir {os.path.abspath(run_dir)} --mode {args.mode}\n")
    step += 1

    print(f"Step {step}: (Optional) Spawn analysis-reporter Agent for deep synthesis")
    print(f"  Agent file: agents/analysis-reporter.md")
    print(f"  RUN_DIR: {os.path.abspath(run_dir)}\n")

    print(f"{'='*60}")
    print(f"Manifest: {manifest_path}")
    print(f"{'='*60}")
|
|
200
|
+
|
|
201
|
+
print(f"{'='*60}")
|
|
202
|
+
print(f"Manifest: {manifest_path}")
|
|
203
|
+
print(f"{'='*60}")
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|