@cleocode/skills 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. package/dispatch-config.json +404 -0
  2. package/index.d.ts +178 -0
  3. package/index.js +405 -0
  4. package/package.json +14 -0
  5. package/profiles/core.json +7 -0
  6. package/profiles/full.json +10 -0
  7. package/profiles/minimal.json +7 -0
  8. package/profiles/recommended.json +7 -0
  9. package/provider-skills-map.json +97 -0
  10. package/skills/_shared/cleo-style-guide.md +84 -0
  11. package/skills/_shared/manifest-operations.md +810 -0
  12. package/skills/_shared/placeholders.json +433 -0
  13. package/skills/_shared/skill-chaining-patterns.md +237 -0
  14. package/skills/_shared/subagent-protocol-base.md +223 -0
  15. package/skills/_shared/task-system-integration.md +232 -0
  16. package/skills/_shared/testing-framework-config.md +110 -0
  17. package/skills/ct-cleo/SKILL.md +490 -0
  18. package/skills/ct-cleo/references/anti-patterns.md +19 -0
  19. package/skills/ct-cleo/references/loom-lifecycle.md +136 -0
  20. package/skills/ct-cleo/references/orchestrator-constraints.md +55 -0
  21. package/skills/ct-cleo/references/session-protocol.md +162 -0
  22. package/skills/ct-codebase-mapper/SKILL.md +82 -0
  23. package/skills/ct-contribution/SKILL.md +521 -0
  24. package/skills/ct-contribution/templates/contribution-init.json +21 -0
  25. package/skills/ct-dev-workflow/SKILL.md +423 -0
  26. package/skills/ct-docs-lookup/SKILL.md +66 -0
  27. package/skills/ct-docs-review/SKILL.md +175 -0
  28. package/skills/ct-docs-write/SKILL.md +108 -0
  29. package/skills/ct-documentor/SKILL.md +231 -0
  30. package/skills/ct-epic-architect/SKILL.md +305 -0
  31. package/skills/ct-epic-architect/references/bug-epic-example.md +172 -0
  32. package/skills/ct-epic-architect/references/commands.md +201 -0
  33. package/skills/ct-epic-architect/references/feature-epic-example.md +210 -0
  34. package/skills/ct-epic-architect/references/migration-epic-example.md +244 -0
  35. package/skills/ct-epic-architect/references/output-format.md +92 -0
  36. package/skills/ct-epic-architect/references/patterns.md +284 -0
  37. package/skills/ct-epic-architect/references/refactor-epic-example.md +412 -0
  38. package/skills/ct-epic-architect/references/research-epic-example.md +226 -0
  39. package/skills/ct-epic-architect/references/shell-escaping.md +86 -0
  40. package/skills/ct-epic-architect/references/skill-aware-execution.md +195 -0
  41. package/skills/ct-grade/SKILL.md +230 -0
  42. package/skills/ct-grade/agents/analysis-reporter.md +203 -0
  43. package/skills/ct-grade/agents/blind-comparator.md +157 -0
  44. package/skills/ct-grade/agents/scenario-runner.md +134 -0
  45. package/skills/ct-grade/eval-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  46. package/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
  47. package/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
  48. package/skills/ct-grade/eval-viewer/generate_review.py +283 -0
  49. package/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
  50. package/skills/ct-grade/eval-viewer/viewer.html +219 -0
  51. package/skills/ct-grade/evals/evals.json +94 -0
  52. package/skills/ct-grade/references/ab-test-methodology.md +150 -0
  53. package/skills/ct-grade/references/domains.md +137 -0
  54. package/skills/ct-grade/references/grade-spec.md +236 -0
  55. package/skills/ct-grade/references/scenario-playbook.md +234 -0
  56. package/skills/ct-grade/references/token-tracking.md +120 -0
  57. package/skills/ct-grade/scripts/__pycache__/audit_analyzer.cpython-314.pyc +0 -0
  58. package/skills/ct-grade/scripts/__pycache__/run_ab_test.cpython-314.pyc +0 -0
  59. package/skills/ct-grade/scripts/__pycache__/run_all.cpython-314.pyc +0 -0
  60. package/skills/ct-grade/scripts/__pycache__/token_tracker.cpython-314.pyc +0 -0
  61. package/skills/ct-grade/scripts/audit_analyzer.py +279 -0
  62. package/skills/ct-grade/scripts/generate_report.py +283 -0
  63. package/skills/ct-grade/scripts/run_ab_test.py +504 -0
  64. package/skills/ct-grade/scripts/run_all.py +287 -0
  65. package/skills/ct-grade/scripts/setup_run.py +183 -0
  66. package/skills/ct-grade/scripts/token_tracker.py +630 -0
  67. package/skills/ct-grade-v2-1/SKILL.md +237 -0
  68. package/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
  69. package/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
  70. package/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
  71. package/skills/ct-grade-v2-1/evals/evals.json +74 -0
  72. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/build_op_stats.cpython-314.pyc +0 -0
  73. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  74. package/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
  75. package/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
  76. package/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
  77. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
  78. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
  79. package/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
  80. package/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
  81. package/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
  82. package/skills/ct-grade-v2-1/manifest-entry.json +31 -0
  83. package/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
  84. package/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
  85. package/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
  86. package/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
  87. package/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
  88. package/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
  89. package/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
  90. package/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
  91. package/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
  92. package/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
  93. package/skills/ct-memory/SKILL.md +84 -0
  94. package/skills/ct-orchestrator/INSTALL.md +61 -0
  95. package/skills/ct-orchestrator/README.md +69 -0
  96. package/skills/ct-orchestrator/SKILL.md +380 -0
  97. package/skills/ct-orchestrator/manifest-entry.json +19 -0
  98. package/skills/ct-orchestrator/orchestrator-prompt.txt +17 -0
  99. package/skills/ct-orchestrator/references/SUBAGENT-PROTOCOL-BLOCK.md +66 -0
  100. package/skills/ct-orchestrator/references/autonomous-operation.md +167 -0
  101. package/skills/ct-orchestrator/references/lifecycle-gates.md +98 -0
  102. package/skills/ct-orchestrator/references/orchestrator-compliance.md +271 -0
  103. package/skills/ct-orchestrator/references/orchestrator-handoffs.md +85 -0
  104. package/skills/ct-orchestrator/references/orchestrator-patterns.md +164 -0
  105. package/skills/ct-orchestrator/references/orchestrator-recovery.md +113 -0
  106. package/skills/ct-orchestrator/references/orchestrator-spawning.md +271 -0
  107. package/skills/ct-orchestrator/references/orchestrator-tokens.md +180 -0
  108. package/skills/ct-research-agent/SKILL.md +226 -0
  109. package/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
  110. package/skills/ct-skill-creator/.cleo/logs/cleo.2026-03-07.1.log +24 -0
  111. package/skills/ct-skill-creator/.cleo/tasks.db +0 -0
  112. package/skills/ct-skill-creator/SKILL.md +356 -0
  113. package/skills/ct-skill-creator/agents/analyzer.md +276 -0
  114. package/skills/ct-skill-creator/agents/comparator.md +204 -0
  115. package/skills/ct-skill-creator/agents/grader.md +225 -0
  116. package/skills/ct-skill-creator/assets/eval_review.html +146 -0
  117. package/skills/ct-skill-creator/eval-viewer/__pycache__/generate_review.cpython-314.pyc +0 -0
  118. package/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
  119. package/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
  120. package/skills/ct-skill-creator/manifest-entry.json +17 -0
  121. package/skills/ct-skill-creator/references/dynamic-context.md +228 -0
  122. package/skills/ct-skill-creator/references/frontmatter.md +83 -0
  123. package/skills/ct-skill-creator/references/invocation-control.md +165 -0
  124. package/skills/ct-skill-creator/references/output-patterns.md +86 -0
  125. package/skills/ct-skill-creator/references/provider-deployment.md +175 -0
  126. package/skills/ct-skill-creator/references/schemas.md +430 -0
  127. package/skills/ct-skill-creator/references/workflows.md +28 -0
  128. package/skills/ct-skill-creator/scripts/__init__.py +1 -0
  129. package/skills/ct-skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  130. package/skills/ct-skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
  131. package/skills/ct-skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
  132. package/skills/ct-skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
  133. package/skills/ct-skill-creator/scripts/__pycache__/init_skill.cpython-314.pyc +0 -0
  134. package/skills/ct-skill-creator/scripts/__pycache__/quick_validate.cpython-314.pyc +0 -0
  135. package/skills/ct-skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
  136. package/skills/ct-skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
  137. package/skills/ct-skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
  138. package/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
  139. package/skills/ct-skill-creator/scripts/generate_report.py +326 -0
  140. package/skills/ct-skill-creator/scripts/improve_description.py +247 -0
  141. package/skills/ct-skill-creator/scripts/init_skill.py +306 -0
  142. package/skills/ct-skill-creator/scripts/package_skill.py +110 -0
  143. package/skills/ct-skill-creator/scripts/quick_validate.py +97 -0
  144. package/skills/ct-skill-creator/scripts/run_eval.py +310 -0
  145. package/skills/ct-skill-creator/scripts/run_loop.py +328 -0
  146. package/skills/ct-skill-creator/scripts/utils.py +47 -0
  147. package/skills/ct-skill-validator/SKILL.md +178 -0
  148. package/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
  149. package/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
  150. package/skills/ct-skill-validator/evals/eval_set.json +14 -0
  151. package/skills/ct-skill-validator/evals/evals.json +52 -0
  152. package/skills/ct-skill-validator/manifest-entry.json +20 -0
  153. package/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
  154. package/skills/ct-skill-validator/references/validation-rules.md +168 -0
  155. package/skills/ct-skill-validator/scripts/__init__.py +0 -0
  156. package/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc +0 -0
  157. package/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc +0 -0
  158. package/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc +0 -0
  159. package/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc +0 -0
  160. package/skills/ct-skill-validator/scripts/audit_body.py +242 -0
  161. package/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
  162. package/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
  163. package/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
  164. package/skills/ct-skill-validator/scripts/validate.py +422 -0
  165. package/skills/ct-spec-writer/SKILL.md +189 -0
  166. package/skills/ct-stickynote/README.md +14 -0
  167. package/skills/ct-stickynote/SKILL.md +46 -0
  168. package/skills/ct-task-executor/SKILL.md +296 -0
  169. package/skills/ct-validator/SKILL.md +216 -0
  170. package/skills/manifest.json +469 -0
  171. package/skills.json +281 -0
@@ -0,0 +1,287 @@
1
+ """
2
+ ct-grade v3 — Master Pipeline Runner
3
+
4
+ Orchestrates the full ct-grade v3 pipeline:
5
+ 1. Audit log analysis
6
+ 2. Scenario note (agents run separately via SKILL.md)
7
+ 3. A/B test
8
+ 4. Token tracker
9
+ 5. Report generation
10
+ 6. Grade review server
11
+
12
+ Usage:
13
+ python scripts/run_all.py [--full] [--skip-ab] [--port 3118]
14
+ [--project-dir .] [--stop] [--no-browser]
15
+ """
16
+
17
+ import argparse
18
+ import os
19
+ import signal
20
+ import subprocess
21
+ import sys
22
+ import time
23
+ import webbrowser
24
+ from datetime import datetime
25
+ from pathlib import Path
26
+
27
# Directory containing this script (resolved to an absolute path so child
# scripts can be located regardless of the caller's working directory).
SCRIPT_DIR = Path(__file__).parent.resolve()
SKILL_DIR = SCRIPT_DIR.parent  # packages/ct-skills/skills/ct-grade/
29
+
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Server lifecycle
33
+ # ---------------------------------------------------------------------------
34
+
35
def stop_server(project_dir: str) -> None:
    """Stop the background grade-review server recorded in the PID file.

    Reads ``<project_dir>/.ct-grade-server.pid``, sends SIGTERM to that
    process, and removes the PID file. Prints a status message in every
    case; never raises for the expected failure modes (no PID file,
    corrupt PID file, process already gone).
    """
    pid_file = Path(project_dir) / ".ct-grade-server.pid"
    if not pid_file.exists():
        print("No server PID file found. Server may not be running.")
        return
    # Parse defensively: a truncated/corrupt PID file previously raised an
    # unhandled ValueError here. Remove the stale file so the next run is clean.
    try:
        pid = int(pid_file.read_text().strip())
    except ValueError:
        print(f"Invalid PID file at {pid_file}; removing it.")
        pid_file.unlink()
        return
    try:
        os.kill(pid, signal.SIGTERM)
        pid_file.unlink()
        print(f"Server stopped (PID {pid})")
    except ProcessLookupError:
        # Process already exited; just clean up the stale PID file.
        print(f"Process {pid} not found (already stopped)")
        pid_file.unlink()
    except Exception as e:
        print(f"Error stopping server: {e}")
50
+
51
+
52
def start_server(project_dir: str, output_dir: Path, port: int) -> int | None:
    """Step 6: Launch the grade-review viewer as a detached background process.

    Returns the child PID (also recorded in ``.ct-grade-server.pid`` under
    *project_dir*), or None when the viewer script cannot be found.
    """
    viewer_script = SKILL_DIR / "eval-viewer" / "generate_grade_review.py"
    if not viewer_script.exists():
        print(
            f" WARNING: Viewer script not found at {viewer_script}. Skipping server start."
        )
        return None

    print(f"\n[6/6] Starting Grade Review server on port {port}...")
    cmd = [sys.executable, str(viewer_script), str(project_dir)]
    cmd += ["--port", str(port)]
    cmd += ["--no-browser"]
    cmd += ["--ab-dir", str(output_dir)]
    server = subprocess.Popen(
        cmd,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True,  # detach so the server outlives this pipeline run
    )
    (Path(project_dir) / ".ct-grade-server.pid").write_text(str(server.pid))
    return server.pid
80
+
81
+
82
+ # ---------------------------------------------------------------------------
83
+ # Pipeline steps
84
+ # ---------------------------------------------------------------------------
85
+
86
def step_audit_analyze(project_dir: str, output_dir: Path) -> None:
    """Step 1: Extract real per-op stats from tasks.db audit_log."""
    print("\n[1/6] Analyzing audit log...")
    cmd = [
        sys.executable,
        str(SCRIPT_DIR / "audit_analyzer.py"),
        "--project-dir", str(project_dir),
        "--output-dir", str(output_dir),
    ]
    completed = subprocess.run(cmd, capture_output=True, text=True)
    if completed.returncode == 0:
        print(" Done.")
    else:
        # Best-effort step: warn (with truncated stderr) rather than abort.
        print(f" WARNING: audit_analyzer failed: {completed.stderr[:200]}")
105
+
106
+
107
def step_scenario_note(full_mode: bool) -> None:
    """Step 2: Print info about scenario agents (not spawned here)."""
    print("\n[2/6] Scenario runners:")
    if full_mode:
        note = " Full mode: S1–S5 scenarios are delegated to ct-grade scenario-runner agents."
    else:
        note = " Fast mode: S4+S5 scenarios are delegated to ct-grade scenario-runner agents."
    print(note)
    print(
        " Run scenarios separately via SKILL.md orchestration (skill invocation)."
    )
121
+
122
+
123
def step_ab_test(project_dir: str, output_dir: Path, full_mode: bool = False) -> None:
    """Step 3: Run A/B test (smoke in fast mode, parity in full mode)."""
    test_set = "parity" if full_mode else "smoke"
    runs = "3"
    print(f"\n[3/6] Running A/B test (--test-set {test_set}, --runs {runs})...")
    cmd = [
        sys.executable,
        str(SCRIPT_DIR / "run_ab_test.py"),
        "--test-set", test_set,
        "--runs", runs,
        "--project-dir", str(project_dir),
        "--output-dir", str(output_dir / "ab-results"),
    ]
    # capture_output=False streams the child's output live so long runs
    # show progress instead of appearing hung.
    completed = subprocess.run(cmd, capture_output=False, text=True)
    if completed.returncode != 0:
        print(" WARNING: A/B test completed with errors.")
146
+
147
+
148
def step_token_tracker(project_dir: str, output_dir: Path, grades_file: Path) -> None:
    """Step 4: Enrich grade data with token estimates."""
    print("\n[4/6] Running token tracker...")
    cmd = [
        sys.executable,
        str(SCRIPT_DIR / "token_tracker.py"),
        "--project-dir", str(project_dir),
        "--output", str(output_dir / "token-summary.json"),
    ]
    # Only pass the grades file when it actually exists on disk.
    if grades_file.exists():
        cmd.extend(["--grades-file", str(grades_file)])
    completed = subprocess.run(cmd, capture_output=True, text=True)
    if completed.returncode == 0:
        print(" Done.")
    else:
        print(f" WARNING: token_tracker failed: {completed.stderr[:200]}")
166
+
167
+
168
def step_generate_report(output_dir: Path) -> None:
    """Step 5: Generate markdown report."""
    print("\n[5/6] Generating report...")
    cmd = [
        sys.executable,
        str(SCRIPT_DIR / "generate_report.py"),
        "--run-dir", str(output_dir),
        "--mode", "ab",
    ]
    completed = subprocess.run(cmd, capture_output=True, text=True)
    if completed.returncode == 0:
        print(f" Done. Report: {output_dir / 'report.md'}")
    else:
        print(f" WARNING: generate_report failed: {completed.stderr[:200]}")
187
+
188
+
189
+ # ---------------------------------------------------------------------------
190
+ # Entry point
191
+ # ---------------------------------------------------------------------------
192
+
193
def main() -> None:
    """CLI entry point: parse arguments and run the 6-step grading pipeline.

    Steps: audit analysis -> scenario note -> optional A/B test -> token
    tracker -> report generation -> background review server (optionally
    opening a browser). ``--stop`` short-circuits everything and only kills
    a previously started server.
    """
    parser = argparse.ArgumentParser(
        description="ct-grade v3 — Master Pipeline Runner",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--full",
        action="store_true",
        help="Run all 5 scenarios (S1–S5) + parity A/B tests. Default: fast mode (S4+S5 + smoke A/B).",
    )
    parser.add_argument(
        "--skip-ab",
        action="store_true",
        help="Skip the A/B test step.",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=3118,
        help="Port for grade review server (default: 3118).",
    )
    parser.add_argument(
        "--project-dir",
        default=".",
        help="CLEO project root (default: current directory).",
    )
    parser.add_argument(
        "--stop",
        action="store_true",
        help="Kill existing server from .ct-grade-server.pid and exit.",
    )
    parser.add_argument(
        "--no-browser",
        action="store_true",
        help="Don't auto-open browser after starting server.",
    )
    args = parser.parse_args()

    project_dir = str(Path(args.project_dir).resolve())

    # --stop: kill running server and exit
    if args.stop:
        stop_server(project_dir)
        return

    # Set up timestamped output directory
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    output_dir = Path(project_dir) / "ab-results" / timestamp
    output_dir.mkdir(parents=True, exist_ok=True)

    mode_label = "Full (S1–S5 + parity A/B)" if args.full else "Fast (S4+S5 + smoke A/B)"

    # Print header
    print("╔═══════════════════════════════════════╗")
    print("║ ct-grade v3 — Grade Review System ║")
    print("╚═══════════════════════════════════════╝")
    print()
    print(f" Mode : {mode_label}")
    print(f" Output : ab-results/{timestamp}")
    print(f" Project : {project_dir}")

    # Grades produced by earlier grading runs; may not exist yet —
    # step_token_tracker only passes it along when it does.
    grades_file = Path(project_dir) / ".cleo" / "metrics" / "GRADES.jsonl"

    # Step 1: Audit analysis
    step_audit_analyze(project_dir, output_dir)

    # Step 2: Scenario note
    step_scenario_note(args.full)

    # Step 3: A/B test (optional)
    if not args.skip_ab:
        step_ab_test(project_dir, output_dir, full_mode=args.full)
    else:
        print("\n[3/6] Skipping A/B test (--skip-ab).")

    # Step 4: Token tracker
    step_token_tracker(project_dir, output_dir, grades_file)

    # Step 5: Report generation
    step_generate_report(output_dir)

    # Step 6: Start server (returns None when the viewer script is missing)
    pid = start_server(project_dir, output_dir, args.port)

    # Open browser after brief pause for server to bind
    if not args.no_browser and pid is not None:
        time.sleep(0.5)
        webbrowser.open(f"http://localhost:{args.port}")

    print(f"\nGrade Review live at http://localhost:{args.port}")
    print("Stop with: python scripts/run_all.py --stop")


if __name__ == "__main__":
    main()
@@ -0,0 +1,183 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ setup_run.py — Set up an A/B test run directory and print the execution plan.
4
+
5
+ Usage:
6
+ python setup_run.py --mode scenario --scenario s4 --interface both --runs 3 --output-dir ./ab_results/run-001
7
+
8
+ Outputs:
9
+ - Creates run directory structure
10
+ - Writes run-manifest.json
11
+ - Prints step-by-step execution plan for Claude to follow
12
+ """
13
+
14
+ import argparse
15
+ import json
16
+ import os
17
+ import sys
18
+ from datetime import datetime, timezone
19
+
20
+
21
VALID_MODES = ["scenario", "ab", "blind"]  # accepted --mode values
VALID_SCENARIOS = ["s1", "s2", "s3", "s4", "s5", "all"]  # accepted --scenario tokens
VALID_INTERFACES = ["mcp", "cli", "both"]  # accepted --interface values

# Human-readable label per scenario id; its keys also serve as the
# authoritative set of known scenario ids (see expand_scenarios).
SCENARIO_LABELS = {
    "s1": "Fresh Discovery",
    "s2": "Task Creation Hygiene",
    "s3": "Error Recovery",
    "s4": "Full Lifecycle",
    "s5": "Multi-Domain Analysis",
}

# NOTE(review): appears unused — the --domains argparse default is the
# string "tasks,session", not this list. Confirm before removing.
DEFAULT_DOMAINS = ["tasks", "session"]
34
+
35
+
36
def expand_scenarios(scenario_arg):
    """Expand a --scenario argument into a list of known scenario ids.

    "all" yields every scenario; otherwise the comma-separated tokens are
    stripped and unknown ids are silently dropped.
    """
    if scenario_arg == "all":
        return list(SCENARIO_LABELS)
    tokens = (tok.strip() for tok in scenario_arg.split(","))
    return [tok for tok in tokens if tok in SCENARIO_LABELS]
40
+
41
+
42
def expand_interfaces(interface_arg):
    """Map an --interface argument to the list of interfaces to exercise."""
    return ["mcp", "cli"] if interface_arg == "both" else [interface_arg]
46
+
47
+
48
def create_dir(path):
    """Ensure directory *path* (and its parents) exists; return it for chaining."""
    if not os.path.isdir(path):
        os.makedirs(path, exist_ok=True)
    return path
51
+
52
+
53
def main():
    """Create the A/B run directory tree, write run-manifest.json, and print
    a step-by-step execution plan for the orchestrating agent to follow.

    Exits with status 1 (message on stderr) when --scenario names no known
    scenario ids.
    """
    parser = argparse.ArgumentParser(description="Set up a ct-grade A/B test run")
    parser.add_argument("--mode", default="scenario", choices=VALID_MODES)
    parser.add_argument("--scenario", default="all")
    parser.add_argument("--interface", default="both", choices=VALID_INTERFACES)
    parser.add_argument("--domains", default="tasks,session")
    parser.add_argument("--runs", type=int, default=3)
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--project-dir", default=".")
    args = parser.parse_args()

    scenarios = expand_scenarios(args.scenario)
    interfaces = expand_interfaces(args.interface)
    domains = [d.strip() for d in args.domains.split(",")]

    if not scenarios:
        print(f"ERROR: No valid scenarios in '{args.scenario}'. Use: {', '.join(VALID_SCENARIOS)}", file=sys.stderr)
        sys.exit(1)

    run_dir = args.output_dir
    create_dir(run_dir)

    # For ab/blind mode, each domain is a "slot"
    slots = scenarios if args.mode == "scenario" else domains

    # Create directory structure: <run_dir>/<slot>/run-NN/arm-{A,B}/
    for slot in slots:
        for iface in interfaces:
            # First listed interface is always arm-A; any other is arm-B.
            arm_label = "arm-A" if iface == interfaces[0] else "arm-B"
            for run in range(1, args.runs + 1):
                slot_dir = os.path.join(run_dir, slot, f"run-{run:02d}", arm_label)
                create_dir(slot_dir)
                # Create placeholder timing.json — the scenario-runner agent
                # fills in the None fields after each arm completes.
                timing = {
                    "arm": arm_label,
                    "interface": iface,
                    "slot": slot,
                    "run": run,
                    "executor_start": None,
                    "executor_end": None,
                    "executor_duration_seconds": None,
                    "total_tokens": None,
                    "duration_ms": None,
                }
                timing_path = os.path.join(slot_dir, "timing.json")
                with open(timing_path, "w") as f:
                    json.dump(timing, f, indent=2)

    # Write run-manifest.json describing the whole run configuration.
    manifest = {
        "created_at": datetime.now(timezone.utc).isoformat(),
        "mode": args.mode,
        "scenarios": scenarios,
        "interfaces": interfaces,
        "domains": domains,
        "runs_per_configuration": args.runs,
        "project_dir": os.path.abspath(args.project_dir),
        "run_dir": os.path.abspath(run_dir),
        # With a single interface both arms point at it (self-comparison).
        "arms": {
            "A": {"interface": interfaces[0], "label": f"{interfaces[0].upper()} interface"},
            "B": {"interface": interfaces[1] if len(interfaces) > 1 else interfaces[0],
                  "label": f"{interfaces[-1].upper()} interface"},
        },
        "slots": slots,
        "status": "setup_complete",
    }
    manifest_path = os.path.join(run_dir, "run-manifest.json")
    with open(manifest_path, "w") as f:
        json.dump(manifest, f, indent=2)

    # Print execution plan (consumed by the orchestrating agent, so the
    # exact wording/ordering below is part of the protocol).
    print(f"\n{'='*60}")
    print(f"ct-grade A/B Run Setup Complete")
    print(f"{'='*60}")
    print(f"Mode: {args.mode}")
    print(f"Scenarios: {', '.join(scenarios)}")
    print(f"Interfaces: {', '.join(interfaces)}")
    print(f"Runs each: {args.runs}")
    print(f"Output: {os.path.abspath(run_dir)}")
    print(f"{'='*60}\n")

    print("EXECUTION PLAN\n")
    print("Spawn each arm as a parallel Agent task in the same turn.\n")

    step = 1
    for slot in slots:
        # Fall back to the raw slot id for ab/blind-mode domain slots.
        slot_label = SCENARIO_LABELS.get(slot, slot)
        print(f"## Slot: {slot} — {slot_label}\n")
        for run in range(1, args.runs + 1):
            for idx, iface in enumerate(interfaces):
                arm_label = "arm-A" if idx == 0 else "arm-B"
                arm_dir = os.path.join(os.path.abspath(run_dir), slot, f"run-{run:02d}", arm_label)
                print(f"Step {step}: Spawn Agent — {arm_label} ({iface}) | slot={slot} | run={run}")
                print(f" Agent file: agents/scenario-runner.md")
                print(f" SCENARIO: {slot}")
                print(f" INTERFACE: {iface}")
                print(f" OUTPUT_DIR: {arm_dir}")
                print(f" RUN_NUMBER: {run}")
                print(f" CRITICAL: Capture total_tokens + duration_ms from task notification")
                print(f" and update {arm_dir}/timing.json immediately.\n")
                step += 1

            # After both arms complete for this run
            comp_dir = os.path.join(os.path.abspath(run_dir), slot, f"run-{run:02d}")
            print(f"Step {step}: Spawn blind-comparator Agent")
            print(f" Agent file: agents/blind-comparator.md")
            print(f" OUTPUT_A: {comp_dir}/arm-A/")
            print(f" OUTPUT_B: {comp_dir}/arm-B/")
            print(f" SCENARIO: {slot}")
            print(f" OUTPUT_PATH: {comp_dir}/comparison.json\n")
            step += 1

    print(f"Step {step}: Aggregate token data")
    print(f" python scripts/token_tracker.py --run-dir {os.path.abspath(run_dir)}\n")
    step += 1

    print(f"Step {step}: Generate final report")
    print(f" python scripts/generate_report.py --run-dir {os.path.abspath(run_dir)} --mode {args.mode}\n")
    step += 1

    print(f"Step {step}: (Optional) Spawn analysis-reporter Agent for deep synthesis")
    print(f" Agent file: agents/analysis-reporter.md")
    print(f" RUN_DIR: {os.path.abspath(run_dir)}\n")

    print(f"{'='*60}")
    print(f"Manifest: {manifest_path}")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()