@cleocode/skills 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. package/dispatch-config.json +404 -0
  2. package/index.d.ts +178 -0
  3. package/index.js +405 -0
  4. package/package.json +14 -0
  5. package/profiles/core.json +7 -0
  6. package/profiles/full.json +10 -0
  7. package/profiles/minimal.json +7 -0
  8. package/profiles/recommended.json +7 -0
  9. package/provider-skills-map.json +97 -0
  10. package/skills/_shared/cleo-style-guide.md +84 -0
  11. package/skills/_shared/manifest-operations.md +810 -0
  12. package/skills/_shared/placeholders.json +433 -0
  13. package/skills/_shared/skill-chaining-patterns.md +237 -0
  14. package/skills/_shared/subagent-protocol-base.md +223 -0
  15. package/skills/_shared/task-system-integration.md +232 -0
  16. package/skills/_shared/testing-framework-config.md +110 -0
  17. package/skills/ct-cleo/SKILL.md +490 -0
  18. package/skills/ct-cleo/references/anti-patterns.md +19 -0
  19. package/skills/ct-cleo/references/loom-lifecycle.md +136 -0
  20. package/skills/ct-cleo/references/orchestrator-constraints.md +55 -0
  21. package/skills/ct-cleo/references/session-protocol.md +162 -0
  22. package/skills/ct-codebase-mapper/SKILL.md +82 -0
  23. package/skills/ct-contribution/SKILL.md +521 -0
  24. package/skills/ct-contribution/templates/contribution-init.json +21 -0
  25. package/skills/ct-dev-workflow/SKILL.md +423 -0
  26. package/skills/ct-docs-lookup/SKILL.md +66 -0
  27. package/skills/ct-docs-review/SKILL.md +175 -0
  28. package/skills/ct-docs-write/SKILL.md +108 -0
  29. package/skills/ct-documentor/SKILL.md +231 -0
  30. package/skills/ct-epic-architect/SKILL.md +305 -0
  31. package/skills/ct-epic-architect/references/bug-epic-example.md +172 -0
  32. package/skills/ct-epic-architect/references/commands.md +201 -0
  33. package/skills/ct-epic-architect/references/feature-epic-example.md +210 -0
  34. package/skills/ct-epic-architect/references/migration-epic-example.md +244 -0
  35. package/skills/ct-epic-architect/references/output-format.md +92 -0
  36. package/skills/ct-epic-architect/references/patterns.md +284 -0
  37. package/skills/ct-epic-architect/references/refactor-epic-example.md +412 -0
  38. package/skills/ct-epic-architect/references/research-epic-example.md +226 -0
  39. package/skills/ct-epic-architect/references/shell-escaping.md +86 -0
  40. package/skills/ct-epic-architect/references/skill-aware-execution.md +195 -0
  41. package/skills/ct-grade/SKILL.md +230 -0
  42. package/skills/ct-grade/agents/analysis-reporter.md +203 -0
  43. package/skills/ct-grade/agents/blind-comparator.md +157 -0
  44. package/skills/ct-grade/agents/scenario-runner.md +134 -0
  45. package/skills/ct-grade/eval-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  46. package/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
  47. package/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
  48. package/skills/ct-grade/eval-viewer/generate_review.py +283 -0
  49. package/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
  50. package/skills/ct-grade/eval-viewer/viewer.html +219 -0
  51. package/skills/ct-grade/evals/evals.json +94 -0
  52. package/skills/ct-grade/references/ab-test-methodology.md +150 -0
  53. package/skills/ct-grade/references/domains.md +137 -0
  54. package/skills/ct-grade/references/grade-spec.md +236 -0
  55. package/skills/ct-grade/references/scenario-playbook.md +234 -0
  56. package/skills/ct-grade/references/token-tracking.md +120 -0
  57. package/skills/ct-grade/scripts/__pycache__/audit_analyzer.cpython-314.pyc +0 -0
  58. package/skills/ct-grade/scripts/__pycache__/run_ab_test.cpython-314.pyc +0 -0
  59. package/skills/ct-grade/scripts/__pycache__/run_all.cpython-314.pyc +0 -0
  60. package/skills/ct-grade/scripts/__pycache__/token_tracker.cpython-314.pyc +0 -0
  61. package/skills/ct-grade/scripts/audit_analyzer.py +279 -0
  62. package/skills/ct-grade/scripts/generate_report.py +283 -0
  63. package/skills/ct-grade/scripts/run_ab_test.py +504 -0
  64. package/skills/ct-grade/scripts/run_all.py +287 -0
  65. package/skills/ct-grade/scripts/setup_run.py +183 -0
  66. package/skills/ct-grade/scripts/token_tracker.py +630 -0
  67. package/skills/ct-grade-v2-1/SKILL.md +237 -0
  68. package/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
  69. package/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
  70. package/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
  71. package/skills/ct-grade-v2-1/evals/evals.json +74 -0
  72. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/build_op_stats.cpython-314.pyc +0 -0
  73. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  74. package/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
  75. package/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
  76. package/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
  77. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
  78. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
  79. package/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
  80. package/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
  81. package/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
  82. package/skills/ct-grade-v2-1/manifest-entry.json +31 -0
  83. package/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
  84. package/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
  85. package/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
  86. package/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
  87. package/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
  88. package/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
  89. package/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
  90. package/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
  91. package/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
  92. package/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
  93. package/skills/ct-memory/SKILL.md +84 -0
  94. package/skills/ct-orchestrator/INSTALL.md +61 -0
  95. package/skills/ct-orchestrator/README.md +69 -0
  96. package/skills/ct-orchestrator/SKILL.md +380 -0
  97. package/skills/ct-orchestrator/manifest-entry.json +19 -0
  98. package/skills/ct-orchestrator/orchestrator-prompt.txt +17 -0
  99. package/skills/ct-orchestrator/references/SUBAGENT-PROTOCOL-BLOCK.md +66 -0
  100. package/skills/ct-orchestrator/references/autonomous-operation.md +167 -0
  101. package/skills/ct-orchestrator/references/lifecycle-gates.md +98 -0
  102. package/skills/ct-orchestrator/references/orchestrator-compliance.md +271 -0
  103. package/skills/ct-orchestrator/references/orchestrator-handoffs.md +85 -0
  104. package/skills/ct-orchestrator/references/orchestrator-patterns.md +164 -0
  105. package/skills/ct-orchestrator/references/orchestrator-recovery.md +113 -0
  106. package/skills/ct-orchestrator/references/orchestrator-spawning.md +271 -0
  107. package/skills/ct-orchestrator/references/orchestrator-tokens.md +180 -0
  108. package/skills/ct-research-agent/SKILL.md +226 -0
  109. package/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
  110. package/skills/ct-skill-creator/.cleo/logs/cleo.2026-03-07.1.log +24 -0
  111. package/skills/ct-skill-creator/.cleo/tasks.db +0 -0
  112. package/skills/ct-skill-creator/SKILL.md +356 -0
  113. package/skills/ct-skill-creator/agents/analyzer.md +276 -0
  114. package/skills/ct-skill-creator/agents/comparator.md +204 -0
  115. package/skills/ct-skill-creator/agents/grader.md +225 -0
  116. package/skills/ct-skill-creator/assets/eval_review.html +146 -0
  117. package/skills/ct-skill-creator/eval-viewer/__pycache__/generate_review.cpython-314.pyc +0 -0
  118. package/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
  119. package/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
  120. package/skills/ct-skill-creator/manifest-entry.json +17 -0
  121. package/skills/ct-skill-creator/references/dynamic-context.md +228 -0
  122. package/skills/ct-skill-creator/references/frontmatter.md +83 -0
  123. package/skills/ct-skill-creator/references/invocation-control.md +165 -0
  124. package/skills/ct-skill-creator/references/output-patterns.md +86 -0
  125. package/skills/ct-skill-creator/references/provider-deployment.md +175 -0
  126. package/skills/ct-skill-creator/references/schemas.md +430 -0
  127. package/skills/ct-skill-creator/references/workflows.md +28 -0
  128. package/skills/ct-skill-creator/scripts/__init__.py +1 -0
  129. package/skills/ct-skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  130. package/skills/ct-skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
  131. package/skills/ct-skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
  132. package/skills/ct-skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
  133. package/skills/ct-skill-creator/scripts/__pycache__/init_skill.cpython-314.pyc +0 -0
  134. package/skills/ct-skill-creator/scripts/__pycache__/quick_validate.cpython-314.pyc +0 -0
  135. package/skills/ct-skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
  136. package/skills/ct-skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
  137. package/skills/ct-skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
  138. package/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
  139. package/skills/ct-skill-creator/scripts/generate_report.py +326 -0
  140. package/skills/ct-skill-creator/scripts/improve_description.py +247 -0
  141. package/skills/ct-skill-creator/scripts/init_skill.py +306 -0
  142. package/skills/ct-skill-creator/scripts/package_skill.py +110 -0
  143. package/skills/ct-skill-creator/scripts/quick_validate.py +97 -0
  144. package/skills/ct-skill-creator/scripts/run_eval.py +310 -0
  145. package/skills/ct-skill-creator/scripts/run_loop.py +328 -0
  146. package/skills/ct-skill-creator/scripts/utils.py +47 -0
  147. package/skills/ct-skill-validator/SKILL.md +178 -0
  148. package/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
  149. package/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
  150. package/skills/ct-skill-validator/evals/eval_set.json +14 -0
  151. package/skills/ct-skill-validator/evals/evals.json +52 -0
  152. package/skills/ct-skill-validator/manifest-entry.json +20 -0
  153. package/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
  154. package/skills/ct-skill-validator/references/validation-rules.md +168 -0
  155. package/skills/ct-skill-validator/scripts/__init__.py +0 -0
  156. package/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc +0 -0
  157. package/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc +0 -0
  158. package/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc +0 -0
  159. package/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc +0 -0
  160. package/skills/ct-skill-validator/scripts/audit_body.py +242 -0
  161. package/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
  162. package/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
  163. package/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
  164. package/skills/ct-skill-validator/scripts/validate.py +422 -0
  165. package/skills/ct-spec-writer/SKILL.md +189 -0
  166. package/skills/ct-stickynote/README.md +14 -0
  167. package/skills/ct-stickynote/SKILL.md +46 -0
  168. package/skills/ct-task-executor/SKILL.md +296 -0
  169. package/skills/ct-validator/SKILL.md +216 -0
  170. package/skills/manifest.json +469 -0
  171. package/skills.json +281 -0
@@ -0,0 +1,396 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Run a CLEO grade scenario and capture metrics.
4
+
5
+ Executes a predefined grade scenario against a live CLEO project,
6
+ capturing timing and output metrics for later analysis.
7
+
8
+ Usage:
9
+ python run_scenario.py --scenario S1 [options]
10
+ python run_scenario.py --scenario full [options]
11
+
12
+ Options:
13
+ --scenario S1-S5, full, or P1-P3 (default: S1)
14
+ --cleo CLEO binary (default: cleo-dev)
15
+ --output-dir Results directory (default: ./grade-results/<timestamp>)
16
+ --scope Session scope (default: global)
17
+ --parent-task Task ID for subtask scenarios (S2, S5)
18
+ --seed-task Existing task ID for lifecycle scenarios (S3, S4)
19
+ --runs Number of times to repeat (default: 1)
20
+ --json Output results as JSON to stdout
21
+ """
22
+
23
+ import argparse
24
+ import json
25
+ import os
26
+ import subprocess
27
+ import sys
28
+ import time
29
+ from datetime import datetime, timezone
30
+ from pathlib import Path
31
+
32
+
33
+ # ---------------------------------------------------------------------------
34
+ # Scenario definitions
35
+ # ---------------------------------------------------------------------------
36
+
37
+ def _build_scenario(name, ops_fn):
38
+ return {"name": name, "build_ops": ops_fn}
39
+
40
+
41
def scenario_s1(args):
    """S1: Session Discipline — session.list must precede task operations."""
    task_id = args.seed_task or "T100"
    ops = [(["session", "list"], "Check existing sessions")]
    ops.append((["admin", "dash"], "Project overview"))
    ops.append((["tasks", "find", "--status", "active"], "Discover active tasks"))
    ops.append((["tasks", "show", task_id], "Inspect specific task"))
    # session.end itself is issued by the graded-session wrapper, not listed here.
    return ops
51
+
52
+
53
def scenario_s2(args):
    """S2: Task Hygiene — verify the parent, then create described tasks."""
    parent = args.parent_task or args.seed_task
    if not parent:
        # Fall back to a placeholder so the scenario can still run end-to-end.
        print("WARNING: --parent-task not set for S2; using T100 as placeholder", file=sys.stderr)
        parent = "T100"
    subtask_cmd = [
        "tasks", "add",
        "--title", "Impl auth",
        "--description", "Add JWT authentication to API endpoints",
        "--parent", parent,
    ]
    standalone_cmd = [
        "tasks", "add",
        "--title", "Write auth tests",
        "--description", "Unit tests for auth module",
    ]
    return [
        (["session", "list"], "Check existing sessions"),
        (["tasks", "exists", parent], "Verify parent exists"),
        (subtask_cmd, "Create subtask with description"),
        (standalone_cmd, "Create standalone task with description"),
    ]
70
+
71
+
72
def scenario_s3(args):
    """S3: Error Recovery — trigger E_NOT_FOUND, recover, create exactly once."""
    # `args` is accepted for signature parity with the other builders; S3 uses no IDs.
    steps = (
        (["session", "list"], "Check existing sessions"),
        (["tasks", "show", "T99999"], "Trigger E_NOT_FOUND intentionally"),
        (["tasks", "find", "--query", "T99999"], "Recovery lookup after E_NOT_FOUND"),
        (["tasks", "add",
          "--title", "New feature discovered",
          "--description", "Feature that was not found — creating fresh"], "Create once"),
    )
    return list(steps)
82
+
83
+
84
def scenario_s4(args):
    """S4: Full Lifecycle — discovery through completion across all dimensions."""
    seed = args.seed_task or "T200"
    discovery = [
        (["session", "list"], "Check existing sessions"),
        (["admin", "help"], "Progressive disclosure — tier 0"),
        (["admin", "dash"], "Project overview"),
        (["tasks", "find", "--status", "pending"], "Discover pending tasks"),
    ]
    lifecycle = [
        (["tasks", "show", seed], "Inspect chosen task"),
        (["tasks", "update", "--task-id", seed, "--status", "active"], "Begin work"),
        (["tasks", "complete", seed], "Mark done"),
        (["tasks", "find", "--status", "pending"], "Check for next task"),
    ]
    return discovery + lifecycle
97
+
98
+
99
def scenario_s5(args):
    """S5: Multi-Domain Analysis — task + session domains with a recorded decision."""
    parent = args.parent_task or "T500"
    seed = args.seed_task or "T501"
    record_cmd = [
        "session", "record-decision",
        "--task-id", seed,
        "--decision", "Use adapter pattern",
        "--rationale", "Decouples provider logic",
    ]
    ops = []
    ops += [(["session", "list"], "Check existing sessions")]
    ops += [(["admin", "help"], "Progressive disclosure")]
    ops += [(["tasks", "find", "--parent", parent], "Discover epic subtasks")]
    ops += [(["tasks", "show", seed], "Inspect specific subtask")]
    ops += [(["session", "context-drift"], "Check context drift")]
    ops += [(["session", "decision-log", "--task-id", seed], "Review past decisions")]
    ops += [(record_cmd, "Record decision")]
    ops += [(["tasks", "update", "--task-id", seed, "--status", "active"], "Begin work")]
    ops += [(["tasks", "complete", seed], "Mark done")]
    ops += [(["tasks", "find", "--parent", parent, "--status", "pending"], "Find next subtask")]
    return ops
118
+
119
+
120
# Registry mapping scenario IDs (the CLI --scenario values, upper-cased by
# main()) to their builder functions; each builder takes the parsed args and
# returns a list of (cleo_args, description) pairs executed in order.
SCENARIOS = {
    "S1": scenario_s1,
    "S2": scenario_s2,
    "S3": scenario_s3,
    "S4": scenario_s4,
    "S5": scenario_s5,
}
127
+
128
+
129
+ # ---------------------------------------------------------------------------
130
+ # CLEO runner
131
+ # ---------------------------------------------------------------------------
132
+
133
def run_cleo(cleo_bin, args_list, cwd=None, capture=True, timeout=30):
    """Run a cleo command and return (returncode, stdout, stderr, duration_ms).

    Args:
        cleo_bin: Name or path of the cleo executable.
        args_list: Subcommand arguments. ``--json`` is always appended so
            callers can parse structured output.
        cwd: Working directory for the subprocess (None = inherit).
        capture: When True, capture stdout/stderr as text.
        timeout: Seconds before the command is killed (default 30, matching
            the previously hard-coded limit; now parameterizable).

    Returns:
        (returncode, stdout, stderr, duration_ms). On timeout: returncode -1,
        stderr "TIMEOUT", duration set to the full timeout budget. On a
        missing binary: returncode -1 with a "Command not found" message.
    """
    cmd = [cleo_bin] + args_list + ["--json"]
    # monotonic clock: duration is immune to wall-clock adjustments.
    start = time.monotonic()
    try:
        result = subprocess.run(
            cmd,
            capture_output=capture,
            text=True,
            cwd=cwd,
            timeout=timeout,
        )
        duration_ms = int((time.monotonic() - start) * 1000)
        return result.returncode, result.stdout or "", result.stderr or "", duration_ms
    except subprocess.TimeoutExpired:
        # Report the full budget rather than a hard-coded 30000 ms so custom
        # timeouts stay consistent with the reported duration.
        return -1, "", "TIMEOUT", int(timeout * 1000)
    except FileNotFoundError:
        return -1, "", f"Command not found: {cleo_bin}", 0
151
+
152
+
153
def start_graded_session(cleo_bin, scope, name, cwd=None):
    """Start a grade-enabled session; return its session ID, or None on failure."""
    start_cmd = ["session", "start", "--scope", scope, "--name", name, "--grade"]
    rc, stdout, stderr, _ = run_cleo(cleo_bin, start_cmd, cwd=cwd)
    if rc != 0:
        print(f"ERROR: session start failed: {stderr}", file=sys.stderr)
        return None
    try:
        payload = json.loads(stdout)
        # The session ID has appeared at several depths across CLI versions;
        # probe the known locations in order.
        sid = payload.get("data", {}).get("sessionId")
        if not sid:
            sid = payload.get("sessionId") or payload.get("id")
        return sid
    except Exception:
        # Output was not JSON (or had an unexpected shape): scan the raw text
        # for a token that looks like a session ID.
        for line in stdout.splitlines():
            if "session-" not in line:
                continue
            for token in line.split():
                if token.startswith("session-"):
                    return token.strip('",')
    return None
180
+
181
+
182
def end_session(cleo_bin, cwd=None):
    """End the current session; return True when the CLI exits cleanly."""
    returncode, _out, _err, _dur = run_cleo(cleo_bin, ["session", "end"], cwd=cwd)
    return returncode == 0
186
+
187
+
188
def grade_session(cleo_bin, session_id, cwd=None):
    """Grade a session. Returns the grade payload dict, or None on CLI failure."""
    rc, stdout, stderr, _elapsed = run_cleo(cleo_bin, ["grade", session_id], cwd=cwd)
    if rc == 0:
        try:
            # Prefer the nested "data" envelope when present.
            parsed = json.loads(stdout)
            return parsed.get("data") or parsed
        except Exception:
            # Non-JSON (or non-dict) output: hand back the raw text.
            return {"raw": stdout}
    print(f"WARNING: grade failed (rc={rc}): {stderr}", file=sys.stderr)
    return None
199
+
200
+
201
+ # ---------------------------------------------------------------------------
202
+ # Single scenario run
203
+ # ---------------------------------------------------------------------------
204
+
205
def run_single_scenario(scenario_name, args, output_dir):
    """Run one scenario end-to-end and persist its metrics.

    Starts a graded session, executes the scenario's operations via the cleo
    CLI, ends the session, fetches the grade, and writes metrics.json into
    output_dir.

    Args:
        scenario_name: Key into SCENARIOS (e.g. "S1").
        args: Parsed CLI namespace; reads .cleo, .scope, .cleo_cwd plus
            whatever the scenario builder reads (.seed_task, .parent_task).
        output_dir: Directory for metrics.json (created if missing).

    Returns:
        The metrics dict that was written to disk. On session-start failure,
        a reduced dict with error "DB_UNAVAILABLE" is written and returned.
    """
    cleo = args.cleo
    scope = args.scope or "global"
    # Timestamp suffix keeps repeated runs of the same scenario distinct.
    session_name = f"grade-{scenario_name.lower()}-{int(time.time())}"

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n=== Scenario {scenario_name} ===")
    print(f" Binary : {cleo}")
    print(f" Scope : {scope}")
    print(f" Output : {output_dir}")

    # Start graded session
    t_start = time.time()
    session_id = start_graded_session(cleo, scope, session_name, cwd=args.cleo_cwd)
    if not session_id:
        # No session means no grading backend; record a stub metrics file so
        # downstream aggregation still sees this run.
        print("ERROR: Could not start graded session", file=sys.stderr)
        metrics = {
            "scenario": scenario_name,
            "session_id": None,
            "error": "DB_UNAVAILABLE",
            "hint": "Use agent-based /ct-grade scenario instead — agents use live MCP tools",
            "grade": None,
            "token_meta": {"estimation_method": "unavailable", "total_estimated_tokens": None},
        }
        metrics_path = output_dir / "metrics.json"
        metrics_path.write_text(json.dumps(metrics, indent=2))
        return metrics

    print(f" Session: {session_id}")

    # Build operations for this scenario
    scenario_fn = SCENARIOS[scenario_name]
    operations = scenario_fn(args)

    # Execute each operation, recording per-op timing and output size.
    op_results = []
    for op_args, description in operations:
        print(f" -> {' '.join(op_args)}")
        rc, stdout, stderr, dur_ms = run_cleo(cleo, op_args, cwd=args.cleo_cwd)
        output_chars = len(stdout)
        # Rough heuristic: ~4 characters per token.
        estimated_tokens = int(output_chars / 4)
        op_results.append({
            "operation": " ".join(op_args),
            "description": description,
            "returncode": rc,
            "success": rc == 0,
            "output_chars": output_chars,
            "estimated_tokens": estimated_tokens,
            "duration_ms": dur_ms,
            "error": stderr[:200] if rc != 0 else None,
        })
        if rc not in (0, 4):  # 4 = E_NOT_FOUND (expected for S3)
            print(f" WARNING: rc={rc} stderr={stderr[:100]}")

    # End session
    ended = end_session(cleo, cwd=args.cleo_cwd)
    print(f" Session end: {'ok' if ended else 'FAILED'}")

    # Grade session
    grade = grade_session(cleo, session_id, cwd=args.cleo_cwd)
    t_total = time.time() - t_start

    # Compute token metadata
    total_output_chars = sum(r["output_chars"] for r in op_results)
    total_estimated_tokens = sum(r["estimated_tokens"] for r in op_results)

    metrics = {
        "scenario": scenario_name,
        "session_id": session_id,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "duration_seconds": round(t_total, 2),
        "operations": op_results,
        "grade": grade,
        "token_meta": {
            "estimation_method": "output_chars",
            "total_output_chars": total_output_chars,
            "total_estimated_tokens": total_estimated_tokens,
            # max(..., 1) guards against division by zero for empty scenarios.
            "avg_tokens_per_op": int(total_estimated_tokens / max(len(op_results), 1)),
        },
    }

    # Save
    metrics_path = output_dir / "metrics.json"
    metrics_path.write_text(json.dumps(metrics, indent=2))
    print(f" Saved : {metrics_path}")

    if grade:
        # NOTE(review): the grade payload shape ("totalScore", "flags") comes
        # from the cleo grade command — confirm against that CLI's schema.
        score = grade.get("totalScore", "?")
        letter = _score_to_letter(grade.get("totalScore", 0))
        flags = grade.get("flags", [])
        print(f" Grade : {score}/100 ({letter}) — {len(flags)} flag(s)")
        if flags:
            for f in flags:
                print(f" FLAG: {f}")

    return metrics
304
+
305
+
306
+ def _score_to_letter(score):
307
+ if score >= 90: return "A"
308
+ if score >= 75: return "B"
309
+ if score >= 60: return "C"
310
+ if score >= 45: return "D"
311
+ return "F"
312
+
313
+
314
+ # ---------------------------------------------------------------------------
315
+ # Main
316
+ # ---------------------------------------------------------------------------
317
+
318
def main():
    """CLI entry point: parse args, run the selected scenarios, write summary.json."""
    parser = argparse.ArgumentParser(description="Run CLEO grade scenarios")
    parser.add_argument("--scenario", default="S1",
                        help="S1-S5, full, or comma-separated e.g. S1,S3")
    parser.add_argument("--cleo", default="cleo-dev",
                        help="CLEO binary (default: cleo-dev)")
    parser.add_argument("--cleo-cwd", default=None,
                        help="Working directory for CLEO commands")
    parser.add_argument("--output-dir", default=None,
                        help="Output directory (default: ./grade-results/<timestamp>)")
    parser.add_argument("--scope", default="global",
                        help="Session scope (default: global)")
    parser.add_argument("--parent-task", default=None,
                        help="Parent task ID for S2/S5 subtask scenarios")
    parser.add_argument("--seed-task", default=None,
                        help="Existing task ID for S3/S4/S5 lifecycle scenarios")
    parser.add_argument("--runs", type=int, default=1,
                        help="Number of runs per scenario (default: 1)")
    parser.add_argument("--json", action="store_true",
                        help="Output summary as JSON to stdout")
    args = parser.parse_args()

    # Determine which scenarios to run: "full" means every registered
    # scenario; otherwise a comma-separated list validated against SCENARIOS.
    if args.scenario.lower() == "full":
        targets = list(SCENARIOS.keys())
    else:
        targets = [s.strip().upper() for s in args.scenario.split(",")]
        unknown = [s for s in targets if s not in SCENARIOS]
        if unknown:
            print(f"ERROR: Unknown scenarios: {unknown}. Valid: {list(SCENARIOS.keys())}", file=sys.stderr)
            sys.exit(1)

    # Build output directory (timestamped unless explicitly given)
    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
    base_output = Path(args.output_dir) if args.output_dir else Path(f"./grade-results/{ts}")

    all_results = []

    # Each scenario is repeated --runs times, each run in its own directory.
    for scenario_name in targets:
        for run_num in range(1, args.runs + 1):
            run_dir = base_output / scenario_name / f"run-{run_num:03d}"
            metrics = run_single_scenario(scenario_name, args, run_dir)
            all_results.append(metrics)

    # Summary: full per-run metrics plus a condensed per-run grade table.
    summary = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "scenarios_run": targets,
        "total_runs": len(all_results),
        "results": all_results,
        "grade_summary": [
            {
                "scenario": r["scenario"],
                # grade may be None for failed runs; guard every lookup.
                "score": r.get("grade", {}).get("totalScore") if r.get("grade") else None,
                "letter": _score_to_letter(r.get("grade", {}).get("totalScore", 0) if r.get("grade") else 0),
                "flags": len(r.get("grade", {}).get("flags", [])) if r.get("grade") else None,
                "estimated_tokens": r.get("token_meta", {}).get("total_estimated_tokens"),
            }
            for r in all_results
        ],
    }

    summary_path = base_output / "summary.json"
    base_output.mkdir(parents=True, exist_ok=True)
    summary_path.write_text(json.dumps(summary, indent=2))

    print(f"\n=== Summary ===")
    for gs in summary["grade_summary"]:
        score_str = f"{gs['score']}/100 ({gs['letter']})" if gs['score'] is not None else "N/A"
        tok_str = f"~{gs['estimated_tokens']}t" if gs['estimated_tokens'] else ""
        print(f" {gs['scenario']}: {score_str} flags={gs['flags']} {tok_str}")
    print(f"\nSaved: {base_output}")

    if args.json:
        print(json.dumps(summary, indent=2))


if __name__ == "__main__":
    main()
@@ -0,0 +1,207 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ setup_run.py — Set up an A/B test run directory and print the execution plan.
4
+
5
+ Usage:
6
+ python setup_run.py --mode scenario --scenario s4 --interface both --runs 3 --output-dir ./ab_results/run-001
7
+
8
+ Outputs:
9
+ - Creates run directory structure
10
+ - Writes run-manifest.json
11
+ - Prints step-by-step execution plan for Claude to follow
12
+ """
13
+
14
+ import argparse
15
+ import json
16
+ import os
17
+ import sys
18
+ from datetime import datetime, timezone
19
+ from pathlib import Path
20
+
21
+
22
# Accepted --mode values: "scenario" runs graded scenarios; "ab"/"blind"
# compare interface arms per domain slot.
VALID_MODES = ["scenario", "ab", "blind"]
# Scenario keys accepted by --scenario ("all" expands to s1..s10).
VALID_SCENARIOS = ["s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10", "all"]
# Interface arms; "both" fans out to one MCP arm and one CLI arm.
VALID_INTERFACES = ["mcp", "cli", "both"]

# Human-readable labels for each scenario key, used in the printed plan.
SCENARIO_LABELS = {
    "s1": "Fresh Discovery",
    "s2": "Task Creation Hygiene",
    "s3": "Error Recovery",
    "s4": "Full Lifecycle",
    "s5": "Multi-Domain Analysis",
    "s6": "Memory Observe & Recall",
    "s7": "Decision Continuity",
    "s8": "Pattern & Learning Storage",
    "s9": "NEXUS Cross-Project",
    "s10": "Full System Throughput",
}

# NOTE(review): main() parses --domains from its own default string; this
# constant appears unused in the visible code — confirm before removing.
DEFAULT_DOMAINS = ["tasks", "session"]
40
+
41
+
42
def find_cleo_dir(start_dir="."):
    """Walk upward from start_dir to a directory containing `.cleo/tasks.db`.

    Falls back to the resolved start_dir when no marker is found.
    """
    origin = Path(start_dir).resolve()
    current = origin
    # NOTE: the filesystem root itself is never checked (loop stops when a
    # directory is its own parent) — preserved from the original behavior.
    while current != current.parent:
        marker = current / '.cleo' / 'tasks.db'
        if marker.exists():
            return current
        current = current.parent
    return origin
50
+
51
+
52
def expand_scenarios(scenario_arg, labels=None):
    """Expand the --scenario argument into a list of known scenario keys.

    Args:
        scenario_arg: Either "all" or a comma-separated list such as "s1,s4".
        labels: Optional mapping whose keys define the valid scenario set;
            defaults to the module-level SCENARIO_LABELS. Parameterized so
            callers/tests can supply a custom scenario registry.

    Returns:
        List of valid scenario keys in input order. Unknown entries are
        silently dropped — main() errors out only when nothing valid remains.
    """
    if scenario_arg == "all":
        return ["s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10"]
    if labels is None:
        labels = SCENARIO_LABELS
    return [s.strip() for s in scenario_arg.split(",") if s.strip() in labels]
56
+
57
+
58
def expand_interfaces(interface_arg):
    """Resolve the --interface flag: "both" fans out to ["mcp", "cli"]."""
    return ["mcp", "cli"] if interface_arg == "both" else [interface_arg]
62
+
63
+
64
def create_dir(path):
    """Create `path` (with parents) if missing; return it unchanged for chaining."""
    Path(path).mkdir(parents=True, exist_ok=True)
    return path
67
+
68
+
69
def main():
    """CLI entry point: build the A/B run directory tree, write the manifest,
    and print a step-by-step execution plan for the orchestrating agent.

    Side effects: creates <run_dir>/<slot>/run-NN/arm-X/timing.json
    placeholders plus <run_dir>/run-manifest.json; prints the plan to stdout.
    """
    parser = argparse.ArgumentParser(description="Set up a ct-grade A/B test run")
    parser.add_argument("--mode", default="scenario", choices=VALID_MODES)
    parser.add_argument("--scenario", default="all")
    parser.add_argument("--interface", default="both", choices=VALID_INTERFACES)
    parser.add_argument("--domains", default="tasks,session")
    parser.add_argument("--runs", type=int, default=3)
    parser.add_argument("--output-dir", required=False, default=None,
                        help="Output directory (default: .cleo/metrics/grade-runs/run-<timestamp>)")
    parser.add_argument("--project-dir", default=".")
    args = parser.parse_args()

    # Default output location: inside the nearest enclosing CLEO workspace.
    if args.output_dir is None:
        workspace = find_cleo_dir(args.project_dir)
        ts = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
        args.output_dir = str(workspace / '.cleo' / 'metrics' / 'grade-runs' / f"run-{ts}")

    scenarios = expand_scenarios(args.scenario)
    interfaces = expand_interfaces(args.interface)
    domains = [d.strip() for d in args.domains.split(",")]

    if not scenarios:
        print(f"ERROR: No valid scenarios in '{args.scenario}'. Use: {', '.join(VALID_SCENARIOS)}", file=sys.stderr)
        sys.exit(1)

    run_dir = args.output_dir
    create_dir(run_dir)

    # For ab/blind mode, each domain is a "slot"
    slots = scenarios if args.mode == "scenario" else domains

    # Create directory structure: <run_dir>/<slot>/run-NN/<arm>/timing.json
    for slot in slots:
        for iface in interfaces:
            # First interface is arm-A, any other arm-B.
            arm_label = "arm-A" if iface == interfaces[0] else "arm-B"
            for run in range(1, args.runs + 1):
                slot_dir = os.path.join(run_dir, slot, f"run-{run:02d}", arm_label)
                create_dir(slot_dir)
                # Create placeholder timing.json — fields are filled in later
                # by the scenario-runner agent for this arm.
                timing = {
                    "arm": arm_label,
                    "interface": iface,
                    "slot": slot,
                    "run": run,
                    "session_id": None,
                    "executor_start": None,
                    "executor_end": None,
                    "executor_duration_seconds": None,
                    "token_usage_id": None,
                    "total_tokens": None,
                    "duration_ms": None,
                }
                timing_path = os.path.join(slot_dir, "timing.json")
                with open(timing_path, "w") as f:
                    json.dump(timing, f, indent=2)

    # Write run-manifest.json describing the whole configuration.
    manifest = {
        "created_at": datetime.now(timezone.utc).isoformat(),
        "mode": args.mode,
        "scenarios": scenarios,
        "interfaces": interfaces,
        "domains": domains,
        "runs_per_configuration": args.runs,
        "project_dir": os.path.abspath(args.project_dir),
        "run_dir": os.path.abspath(run_dir),
        # With a single interface both arms collapse onto it.
        "arms": {
            "A": {"interface": interfaces[0], "label": f"{interfaces[0].upper()} interface"},
            "B": {"interface": interfaces[1] if len(interfaces) > 1 else interfaces[0],
                  "label": f"{interfaces[-1].upper()} interface"},
        },
        "slots": slots,
        "status": "setup_complete",
    }
    manifest_path = os.path.join(run_dir, "run-manifest.json")
    with open(manifest_path, "w") as f:
        json.dump(manifest, f, indent=2)

    # Print execution plan
    print(f"\n{'='*60}")
    print(f"ct-grade A/B Run Setup Complete")
    print(f"{'='*60}")
    print(f"Mode: {args.mode}")
    print(f"Scenarios: {', '.join(scenarios)}")
    print(f"Interfaces: {', '.join(interfaces)}")
    print(f"Runs each: {args.runs}")
    print(f"Output: {os.path.abspath(run_dir)}")
    print(f"{'='*60}\n")

    print("EXECUTION PLAN\n")
    print("Spawn each arm as a parallel Agent task in the same turn.\n")

    step = 1
    for slot in slots:
        slot_label = SCENARIO_LABELS.get(slot, slot)
        print(f"## Slot: {slot} — {slot_label}\n")
        for run in range(1, args.runs + 1):
            for idx, iface in enumerate(interfaces):
                arm_label = "arm-A" if idx == 0 else "arm-B"
                arm_dir = os.path.join(os.path.abspath(run_dir), slot, f"run-{run:02d}", arm_label)
                print(f"Step {step}: Spawn Agent — {arm_label} ({iface}) | slot={slot} | run={run}")
                print(f" Agent file: agents/scenario-runner.md")
                print(f" SCENARIO: {slot}")
                print(f" INTERFACE: {iface}")
                print(f" OUTPUT_DIR: {arm_dir}")
                print(f" RUN_NUMBER: {run}")
                print(f" CRITICAL: Capture total_tokens + duration_ms from task notification")
                print(f" and update {arm_dir}/timing.json immediately.\n")
                step += 1

            # After both arms complete for this run
            comp_dir = os.path.join(os.path.abspath(run_dir), slot, f"run-{run:02d}")
            print(f"Step {step}: Spawn blind-comparator Agent")
            print(f" Agent file: agents/blind-comparator.md")
            print(f" OUTPUT_A: {comp_dir}/arm-A/")
            print(f" OUTPUT_B: {comp_dir}/arm-B/")
            print(f" SCENARIO: {slot}")
            print(f" OUTPUT_PATH: {comp_dir}/comparison.json\n")
            step += 1

    print(f"Step {step}: Aggregate token data")
    print(f" python scripts/token_tracker.py --run-dir {os.path.abspath(run_dir)}\n")
    step += 1

    print(f"Step {step}: Generate final report")
    print(f" python scripts/generate_report.py --run-dir {os.path.abspath(run_dir)} --mode {args.mode}\n")
    step += 1

    print(f"Step {step}: (Optional) Spawn analysis-reporter Agent for deep synthesis")
    print(f" Agent file: agents/analysis-reporter.md")
    print(f" RUN_DIR: {os.path.abspath(run_dir)}\n")

    print(f"{'='*60}")
    print(f"Manifest: {manifest_path}")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()