@cleocode/skills 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dispatch-config.json +404 -0
- package/index.d.ts +178 -0
- package/index.js +405 -0
- package/package.json +14 -0
- package/profiles/core.json +7 -0
- package/profiles/full.json +10 -0
- package/profiles/minimal.json +7 -0
- package/profiles/recommended.json +7 -0
- package/provider-skills-map.json +97 -0
- package/skills/_shared/cleo-style-guide.md +84 -0
- package/skills/_shared/manifest-operations.md +810 -0
- package/skills/_shared/placeholders.json +433 -0
- package/skills/_shared/skill-chaining-patterns.md +237 -0
- package/skills/_shared/subagent-protocol-base.md +223 -0
- package/skills/_shared/task-system-integration.md +232 -0
- package/skills/_shared/testing-framework-config.md +110 -0
- package/skills/ct-cleo/SKILL.md +490 -0
- package/skills/ct-cleo/references/anti-patterns.md +19 -0
- package/skills/ct-cleo/references/loom-lifecycle.md +136 -0
- package/skills/ct-cleo/references/orchestrator-constraints.md +55 -0
- package/skills/ct-cleo/references/session-protocol.md +162 -0
- package/skills/ct-codebase-mapper/SKILL.md +82 -0
- package/skills/ct-contribution/SKILL.md +521 -0
- package/skills/ct-contribution/templates/contribution-init.json +21 -0
- package/skills/ct-dev-workflow/SKILL.md +423 -0
- package/skills/ct-docs-lookup/SKILL.md +66 -0
- package/skills/ct-docs-review/SKILL.md +175 -0
- package/skills/ct-docs-write/SKILL.md +108 -0
- package/skills/ct-documentor/SKILL.md +231 -0
- package/skills/ct-epic-architect/SKILL.md +305 -0
- package/skills/ct-epic-architect/references/bug-epic-example.md +172 -0
- package/skills/ct-epic-architect/references/commands.md +201 -0
- package/skills/ct-epic-architect/references/feature-epic-example.md +210 -0
- package/skills/ct-epic-architect/references/migration-epic-example.md +244 -0
- package/skills/ct-epic-architect/references/output-format.md +92 -0
- package/skills/ct-epic-architect/references/patterns.md +284 -0
- package/skills/ct-epic-architect/references/refactor-epic-example.md +412 -0
- package/skills/ct-epic-architect/references/research-epic-example.md +226 -0
- package/skills/ct-epic-architect/references/shell-escaping.md +86 -0
- package/skills/ct-epic-architect/references/skill-aware-execution.md +195 -0
- package/skills/ct-grade/SKILL.md +230 -0
- package/skills/ct-grade/agents/analysis-reporter.md +203 -0
- package/skills/ct-grade/agents/blind-comparator.md +157 -0
- package/skills/ct-grade/agents/scenario-runner.md +134 -0
- package/skills/ct-grade/eval-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
- package/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
- package/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
- package/skills/ct-grade/eval-viewer/generate_review.py +283 -0
- package/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
- package/skills/ct-grade/eval-viewer/viewer.html +219 -0
- package/skills/ct-grade/evals/evals.json +94 -0
- package/skills/ct-grade/references/ab-test-methodology.md +150 -0
- package/skills/ct-grade/references/domains.md +137 -0
- package/skills/ct-grade/references/grade-spec.md +236 -0
- package/skills/ct-grade/references/scenario-playbook.md +234 -0
- package/skills/ct-grade/references/token-tracking.md +120 -0
- package/skills/ct-grade/scripts/__pycache__/audit_analyzer.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/run_ab_test.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/run_all.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/token_tracker.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/audit_analyzer.py +279 -0
- package/skills/ct-grade/scripts/generate_report.py +283 -0
- package/skills/ct-grade/scripts/run_ab_test.py +504 -0
- package/skills/ct-grade/scripts/run_all.py +287 -0
- package/skills/ct-grade/scripts/setup_run.py +183 -0
- package/skills/ct-grade/scripts/token_tracker.py +630 -0
- package/skills/ct-grade-v2-1/SKILL.md +237 -0
- package/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
- package/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
- package/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
- package/skills/ct-grade-v2-1/evals/evals.json +74 -0
- package/skills/ct-grade-v2-1/grade-viewer/__pycache__/build_op_stats.cpython-314.pyc +0 -0
- package/skills/ct-grade-v2-1/grade-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
- package/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
- package/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
- package/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
- package/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
- package/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
- package/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
- package/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
- package/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
- package/skills/ct-grade-v2-1/manifest-entry.json +31 -0
- package/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
- package/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
- package/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
- package/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
- package/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
- package/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
- package/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
- package/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
- package/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
- package/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
- package/skills/ct-memory/SKILL.md +84 -0
- package/skills/ct-orchestrator/INSTALL.md +61 -0
- package/skills/ct-orchestrator/README.md +69 -0
- package/skills/ct-orchestrator/SKILL.md +380 -0
- package/skills/ct-orchestrator/manifest-entry.json +19 -0
- package/skills/ct-orchestrator/orchestrator-prompt.txt +17 -0
- package/skills/ct-orchestrator/references/SUBAGENT-PROTOCOL-BLOCK.md +66 -0
- package/skills/ct-orchestrator/references/autonomous-operation.md +167 -0
- package/skills/ct-orchestrator/references/lifecycle-gates.md +98 -0
- package/skills/ct-orchestrator/references/orchestrator-compliance.md +271 -0
- package/skills/ct-orchestrator/references/orchestrator-handoffs.md +85 -0
- package/skills/ct-orchestrator/references/orchestrator-patterns.md +164 -0
- package/skills/ct-orchestrator/references/orchestrator-recovery.md +113 -0
- package/skills/ct-orchestrator/references/orchestrator-spawning.md +271 -0
- package/skills/ct-orchestrator/references/orchestrator-tokens.md +180 -0
- package/skills/ct-research-agent/SKILL.md +226 -0
- package/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
- package/skills/ct-skill-creator/.cleo/logs/cleo.2026-03-07.1.log +24 -0
- package/skills/ct-skill-creator/.cleo/tasks.db +0 -0
- package/skills/ct-skill-creator/SKILL.md +356 -0
- package/skills/ct-skill-creator/agents/analyzer.md +276 -0
- package/skills/ct-skill-creator/agents/comparator.md +204 -0
- package/skills/ct-skill-creator/agents/grader.md +225 -0
- package/skills/ct-skill-creator/assets/eval_review.html +146 -0
- package/skills/ct-skill-creator/eval-viewer/__pycache__/generate_review.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
- package/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
- package/skills/ct-skill-creator/manifest-entry.json +17 -0
- package/skills/ct-skill-creator/references/dynamic-context.md +228 -0
- package/skills/ct-skill-creator/references/frontmatter.md +83 -0
- package/skills/ct-skill-creator/references/invocation-control.md +165 -0
- package/skills/ct-skill-creator/references/output-patterns.md +86 -0
- package/skills/ct-skill-creator/references/provider-deployment.md +175 -0
- package/skills/ct-skill-creator/references/schemas.md +430 -0
- package/skills/ct-skill-creator/references/workflows.md +28 -0
- package/skills/ct-skill-creator/scripts/__init__.py +1 -0
- package/skills/ct-skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/init_skill.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/quick_validate.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/skills/ct-skill-creator/scripts/generate_report.py +326 -0
- package/skills/ct-skill-creator/scripts/improve_description.py +247 -0
- package/skills/ct-skill-creator/scripts/init_skill.py +306 -0
- package/skills/ct-skill-creator/scripts/package_skill.py +110 -0
- package/skills/ct-skill-creator/scripts/quick_validate.py +97 -0
- package/skills/ct-skill-creator/scripts/run_eval.py +310 -0
- package/skills/ct-skill-creator/scripts/run_loop.py +328 -0
- package/skills/ct-skill-creator/scripts/utils.py +47 -0
- package/skills/ct-skill-validator/SKILL.md +178 -0
- package/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
- package/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
- package/skills/ct-skill-validator/evals/eval_set.json +14 -0
- package/skills/ct-skill-validator/evals/evals.json +52 -0
- package/skills/ct-skill-validator/manifest-entry.json +20 -0
- package/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
- package/skills/ct-skill-validator/references/validation-rules.md +168 -0
- package/skills/ct-skill-validator/scripts/__init__.py +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/audit_body.py +242 -0
- package/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
- package/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
- package/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
- package/skills/ct-skill-validator/scripts/validate.py +422 -0
- package/skills/ct-spec-writer/SKILL.md +189 -0
- package/skills/ct-stickynote/README.md +14 -0
- package/skills/ct-stickynote/SKILL.md +46 -0
- package/skills/ct-task-executor/SKILL.md +296 -0
- package/skills/ct-validator/SKILL.md +216 -0
- package/skills/manifest.json +469 -0
- package/skills.json +281 -0
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ct-grade v3 — Master Pipeline Runner
|
|
3
|
+
|
|
4
|
+
Orchestrates the full ct-grade v3 pipeline:
|
|
5
|
+
1. Audit log analysis
|
|
6
|
+
2. Scenario note (agents run separately via SKILL.md)
|
|
7
|
+
3. A/B test
|
|
8
|
+
4. Token tracker
|
|
9
|
+
5. Report generation
|
|
10
|
+
6. Grade review server
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
python scripts/run_all.py [--full] [--skip-ab] [--port 3118]
|
|
14
|
+
[--project-dir .] [--stop] [--no-browser]
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import os
|
|
19
|
+
import signal
|
|
20
|
+
import subprocess
|
|
21
|
+
import sys
|
|
22
|
+
import time
|
|
23
|
+
import webbrowser
|
|
24
|
+
from datetime import datetime
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
|
|
27
|
+
# Resolved once at import time so subprocess invocations of sibling scripts
# work regardless of the caller's current working directory.
SCRIPT_DIR = Path(__file__).parent.resolve()
SKILL_DIR = SCRIPT_DIR.parent  # packages/ct-skills/skills/ct-grade/
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
# Server lifecycle
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
def stop_server(project_dir: str) -> None:
    """Stop a previously started grade-review server, best-effort.

    Reads the PID from ``<project_dir>/.ct-grade-server.pid``, sends SIGTERM,
    and removes the PID file. Prints a status message in every case and never
    raises: a missing file, a corrupt file, an already-dead process, and a
    permission error are all handled in place.
    """
    pid_file = Path(project_dir) / ".ct-grade-server.pid"
    if not pid_file.exists():
        print("No server PID file found. Server may not be running.")
        return
    try:
        pid = int(pid_file.read_text().strip())
    except ValueError:
        # Corrupt PID file (original code crashed here) — remove it so the
        # next pipeline run starts clean.
        print(f"Invalid PID file at {pid_file}; removing it.")
        pid_file.unlink(missing_ok=True)
        return
    try:
        os.kill(pid, signal.SIGTERM)
        pid_file.unlink()
        print(f"Server stopped (PID {pid})")
    except ProcessLookupError:
        # Process already gone; clear the stale PID file.
        print(f"Process {pid} not found (already stopped)")
        pid_file.unlink(missing_ok=True)
    except Exception as e:
        # Best-effort shutdown (e.g. PermissionError): report, don't propagate.
        print(f"Error stopping server: {e}")
51
|
+
|
|
52
|
+
def start_server(project_dir: str, output_dir: Path, port: int) -> int | None:
    """Step 6: Launch the grade-review viewer as a detached background process.

    Writes the child PID to ``<project_dir>/.ct-grade-server.pid`` and returns
    it, or returns None when the viewer script cannot be found.
    """
    viewer_script = SKILL_DIR / "eval-viewer" / "generate_grade_review.py"
    if not viewer_script.exists():
        print(
            f" WARNING: Viewer script not found at {viewer_script}. Skipping server start."
        )
        return None

    print(f"\n[6/6] Starting Grade Review server on port {port}...")
    command = [
        sys.executable,
        str(viewer_script),
        str(project_dir),
        "--port",
        str(port),
        "--no-browser",
        "--ab-dir",
        str(output_dir),
    ]
    proc = subprocess.Popen(
        command,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        start_new_session=True,  # detach from parent
    )
    (Path(project_dir) / ".ct-grade-server.pid").write_text(str(proc.pid))
    return proc.pid
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# ---------------------------------------------------------------------------
|
|
83
|
+
# Pipeline steps
|
|
84
|
+
# ---------------------------------------------------------------------------
|
|
85
|
+
|
|
86
|
+
def step_audit_analyze(project_dir: str, output_dir: Path) -> None:
    """Step 1: Extract real per-op stats from tasks.db audit_log."""
    print("\n[1/6] Analyzing audit log...")
    command = [
        sys.executable,
        str(SCRIPT_DIR / "audit_analyzer.py"),
        "--project-dir",
        str(project_dir),
        "--output-dir",
        str(output_dir),
    ]
    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode == 0:
        print(" Done.")
    else:
        # Non-fatal: the rest of the pipeline can still run without audit stats.
        print(f" WARNING: audit_analyzer failed: {result.stderr[:200]}")
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def step_scenario_note(full_mode: bool) -> None:
    """Step 2: Print info about scenario agents (not spawned here)."""
    print("\n[2/6] Scenario runners:")
    note = (
        " Full mode: S1–S5 scenarios are delegated to ct-grade scenario-runner agents."
        if full_mode
        else " Fast mode: S4+S5 scenarios are delegated to ct-grade scenario-runner agents."
    )
    print(note)
    print(
        " Run scenarios separately via SKILL.md orchestration (skill invocation)."
    )
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def step_ab_test(
    project_dir: str, output_dir: Path, full_mode: bool = False, runs: int = 3
) -> None:
    """Step 3: Run A/B test (smoke in fast mode, parity in full mode).

    Args:
        project_dir: CLEO project root, forwarded to run_ab_test.py.
        output_dir: Run output directory; A/B results land in
            ``<output_dir>/ab-results``.
        full_mode: When True, run the larger "parity" test set; otherwise
            the quick "smoke" set.
        runs: Runs per configuration. Default 3 matches the previously
            hard-coded value, so existing callers are unchanged.
    """
    test_set = "parity" if full_mode else "smoke"
    print(f"\n[3/6] Running A/B test (--test-set {test_set}, --runs {runs})...")
    result = subprocess.run(
        [
            sys.executable,
            str(SCRIPT_DIR / "run_ab_test.py"),
            "--test-set",
            test_set,
            "--runs",
            str(runs),
            "--project-dir",
            str(project_dir),
            "--output-dir",
            str(output_dir / "ab-results"),
        ],
        capture_output=False,  # show live output
        text=True,
    )
    if result.returncode != 0:
        # Non-fatal: partial A/B data can still feed the report step.
        print(" WARNING: A/B test completed with errors.")
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def step_token_tracker(project_dir: str, output_dir: Path, grades_file: Path) -> None:
    """Step 4: Enrich grade data with token estimates."""
    print("\n[4/6] Running token tracker...")
    command = [
        sys.executable,
        str(SCRIPT_DIR / "token_tracker.py"),
        "--project-dir",
        str(project_dir),
        "--output",
        str(output_dir / "token-summary.json"),
    ]
    # Grades are optional input; only pass the flag when the file exists.
    if grades_file.exists():
        command.extend(["--grades-file", str(grades_file)])
    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode == 0:
        print(" Done.")
    else:
        print(f" WARNING: token_tracker failed: {result.stderr[:200]}")
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def step_generate_report(output_dir: Path) -> None:
    """Step 5: Generate markdown report."""
    print("\n[5/6] Generating report...")
    command = [
        sys.executable,
        str(SCRIPT_DIR / "generate_report.py"),
        "--run-dir",
        str(output_dir),
        "--mode",
        "ab",
    ]
    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode == 0:
        print(f" Done. Report: {output_dir / 'report.md'}")
    else:
        print(f" WARNING: generate_report failed: {result.stderr[:200]}")
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# ---------------------------------------------------------------------------
|
|
190
|
+
# Entry point
|
|
191
|
+
# ---------------------------------------------------------------------------
|
|
192
|
+
|
|
193
|
+
def main() -> None:
    """Parse CLI arguments and run the six-step ct-grade pipeline.

    Steps: audit analysis, scenario note, optional A/B test, token tracker,
    report generation, and finally the grade-review server. ``--stop`` is a
    short-circuit that only kills a previously started server.

    Fix over the original: the "Grade Review live at ..." banner (and the
    browser launch) are now emitted only when the server actually started —
    previously they printed even when start_server returned None.
    """
    parser = argparse.ArgumentParser(
        description="ct-grade v3 — Master Pipeline Runner",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--full",
        action="store_true",
        help="Run all 5 scenarios (S1–S5) + parity A/B tests. Default: fast mode (S4+S5 + smoke A/B).",
    )
    parser.add_argument(
        "--skip-ab",
        action="store_true",
        help="Skip the A/B test step.",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=3118,
        help="Port for grade review server (default: 3118).",
    )
    parser.add_argument(
        "--project-dir",
        default=".",
        help="CLEO project root (default: current directory).",
    )
    parser.add_argument(
        "--stop",
        action="store_true",
        help="Kill existing server from .ct-grade-server.pid and exit.",
    )
    parser.add_argument(
        "--no-browser",
        action="store_true",
        help="Don't auto-open browser after starting server.",
    )
    args = parser.parse_args()

    project_dir = str(Path(args.project_dir).resolve())

    # --stop: kill running server and exit
    if args.stop:
        stop_server(project_dir)
        return

    # Set up timestamped output directory
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    output_dir = Path(project_dir) / "ab-results" / timestamp
    output_dir.mkdir(parents=True, exist_ok=True)

    mode_label = "Full (S1–S5 + parity A/B)" if args.full else "Fast (S4+S5 + smoke A/B)"

    # Print header
    print("╔═══════════════════════════════════════╗")
    print("║ ct-grade v3 — Grade Review System ║")
    print("╚═══════════════════════════════════════╝")
    print()
    print(f" Mode : {mode_label}")
    print(f" Output : ab-results/{timestamp}")
    print(f" Project : {project_dir}")

    grades_file = Path(project_dir) / ".cleo" / "metrics" / "GRADES.jsonl"

    # Step 1: Audit analysis
    step_audit_analyze(project_dir, output_dir)

    # Step 2: Scenario note
    step_scenario_note(args.full)

    # Step 3: A/B test (optional)
    if not args.skip_ab:
        step_ab_test(project_dir, output_dir, full_mode=args.full)
    else:
        print("\n[3/6] Skipping A/B test (--skip-ab).")

    # Step 4: Token tracker
    step_token_tracker(project_dir, output_dir, grades_file)

    # Step 5: Report generation
    step_generate_report(output_dir)

    # Step 6: Start server
    pid = start_server(project_dir, output_dir, args.port)

    # Only advertise (and open) the server when it actually started.
    if pid is not None:
        if not args.no_browser:
            time.sleep(0.5)  # brief pause for the server to bind
            webbrowser.open(f"http://localhost:{args.port}")
        print(f"\nGrade Review live at http://localhost:{args.port}")
        print("Stop with: python scripts/run_all.py --stop")
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
# Script entry point: run the full pipeline when executed directly.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
setup_run.py — Set up an A/B test run directory and print the execution plan.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
python setup_run.py --mode scenario --scenario s4 --interface both --runs 3 --output-dir ./ab_results/run-001
|
|
7
|
+
|
|
8
|
+
Outputs:
|
|
9
|
+
- Creates run directory structure
|
|
10
|
+
- Writes run-manifest.json
|
|
11
|
+
- Prints step-by-step execution plan for Claude to follow
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import argparse
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
import sys
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# Accepted values for the corresponding CLI flags (argparse choices / parsing).
VALID_MODES = ["scenario", "ab", "blind"]
VALID_SCENARIOS = ["s1", "s2", "s3", "s4", "s5", "all"]
VALID_INTERFACES = ["mcp", "cli", "both"]

# Human-readable scenario names used in the printed execution plan.
SCENARIO_LABELS = {
    "s1": "Fresh Discovery",
    "s2": "Task Creation Hygiene",
    "s3": "Error Recovery",
    "s4": "Full Lifecycle",
    "s5": "Multi-Domain Analysis",
}

# NOTE(review): appears unused — the --domains argparse default hardcodes
# "tasks,session" instead of referencing this. Confirm before removing.
DEFAULT_DOMAINS = ["tasks", "session"]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def expand_scenarios(scenario_arg):
    """Expand the --scenario argument into a list of known scenario ids.

    "all" expands to every scenario; otherwise the comma-separated tokens
    are stripped and filtered against SCENARIO_LABELS (unknown ids dropped).
    """
    if scenario_arg == "all":
        return ["s1", "s2", "s3", "s4", "s5"]
    tokens = (token.strip() for token in scenario_arg.split(","))
    return [token for token in tokens if token in SCENARIO_LABELS]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def expand_interfaces(interface_arg):
    """Expand --interface into the concrete list of interfaces to run."""
    return ["mcp", "cli"] if interface_arg == "both" else [interface_arg]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def create_dir(path):
    """Create *path* (including parents) if needed and return it unchanged."""
    os.makedirs(path, exist_ok=True)
    return path
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def main():
    """Set up the A/B run directory tree, write run-manifest.json, and print
    a step-by-step execution plan for the orchestrating agent.

    Improvements over the original: arm labelling in the directory-creation
    loop now uses the interface's position (enumerate) instead of a value
    comparison against interfaces[0], matching the execution-plan loop below;
    f-string literals with no placeholders are plain strings.
    """
    parser = argparse.ArgumentParser(description="Set up a ct-grade A/B test run")
    parser.add_argument("--mode", default="scenario", choices=VALID_MODES)
    parser.add_argument("--scenario", default="all")
    parser.add_argument("--interface", default="both", choices=VALID_INTERFACES)
    parser.add_argument("--domains", default="tasks,session")
    parser.add_argument("--runs", type=int, default=3)
    parser.add_argument("--output-dir", required=True)
    parser.add_argument("--project-dir", default=".")
    args = parser.parse_args()

    scenarios = expand_scenarios(args.scenario)
    interfaces = expand_interfaces(args.interface)
    domains = [d.strip() for d in args.domains.split(",")]

    if not scenarios:
        print(f"ERROR: No valid scenarios in '{args.scenario}'. Use: {', '.join(VALID_SCENARIOS)}", file=sys.stderr)
        sys.exit(1)

    run_dir = args.output_dir
    create_dir(run_dir)

    # For ab/blind mode, each domain is a "slot"
    slots = scenarios if args.mode == "scenario" else domains

    # Create directory structure: <run_dir>/<slot>/run-NN/arm-X with a
    # placeholder timing.json the scenario runner fills in later.
    for slot in slots:
        for idx, iface in enumerate(interfaces):
            # Positional labelling (idx == 0 -> arm-A) keeps this loop
            # consistent with the execution-plan loop below.
            arm_label = "arm-A" if idx == 0 else "arm-B"
            for run in range(1, args.runs + 1):
                slot_dir = os.path.join(run_dir, slot, f"run-{run:02d}", arm_label)
                create_dir(slot_dir)
                timing = {
                    "arm": arm_label,
                    "interface": iface,
                    "slot": slot,
                    "run": run,
                    "executor_start": None,
                    "executor_end": None,
                    "executor_duration_seconds": None,
                    "total_tokens": None,
                    "duration_ms": None,
                }
                timing_path = os.path.join(slot_dir, "timing.json")
                with open(timing_path, "w") as f:
                    json.dump(timing, f, indent=2)

    # Write run-manifest.json describing the whole run for downstream tools.
    manifest = {
        "created_at": datetime.now(timezone.utc).isoformat(),
        "mode": args.mode,
        "scenarios": scenarios,
        "interfaces": interfaces,
        "domains": domains,
        "runs_per_configuration": args.runs,
        "project_dir": os.path.abspath(args.project_dir),
        "run_dir": os.path.abspath(run_dir),
        "arms": {
            "A": {"interface": interfaces[0], "label": f"{interfaces[0].upper()} interface"},
            # With a single interface, arm B mirrors arm A by design.
            "B": {"interface": interfaces[1] if len(interfaces) > 1 else interfaces[0],
                  "label": f"{interfaces[-1].upper()} interface"},
        },
        "slots": slots,
        "status": "setup_complete",
    }
    manifest_path = os.path.join(run_dir, "run-manifest.json")
    with open(manifest_path, "w") as f:
        json.dump(manifest, f, indent=2)

    # Print execution plan
    print(f"\n{'='*60}")
    print("ct-grade A/B Run Setup Complete")
    print(f"{'='*60}")
    print(f"Mode: {args.mode}")
    print(f"Scenarios: {', '.join(scenarios)}")
    print(f"Interfaces: {', '.join(interfaces)}")
    print(f"Runs each: {args.runs}")
    print(f"Output: {os.path.abspath(run_dir)}")
    print(f"{'='*60}\n")

    print("EXECUTION PLAN\n")
    print("Spawn each arm as a parallel Agent task in the same turn.\n")

    step = 1
    for slot in slots:
        slot_label = SCENARIO_LABELS.get(slot, slot)
        print(f"## Slot: {slot} — {slot_label}\n")
        for run in range(1, args.runs + 1):
            for idx, iface in enumerate(interfaces):
                arm_label = "arm-A" if idx == 0 else "arm-B"
                arm_dir = os.path.join(os.path.abspath(run_dir), slot, f"run-{run:02d}", arm_label)
                print(f"Step {step}: Spawn Agent — {arm_label} ({iface}) | slot={slot} | run={run}")
                print(" Agent file: agents/scenario-runner.md")
                print(f" SCENARIO: {slot}")
                print(f" INTERFACE: {iface}")
                print(f" OUTPUT_DIR: {arm_dir}")
                print(f" RUN_NUMBER: {run}")
                print(" CRITICAL: Capture total_tokens + duration_ms from task notification")
                print(f" and update {arm_dir}/timing.json immediately.\n")
                step += 1

            # After both arms complete for this run
            comp_dir = os.path.join(os.path.abspath(run_dir), slot, f"run-{run:02d}")
            print(f"Step {step}: Spawn blind-comparator Agent")
            print(" Agent file: agents/blind-comparator.md")
            print(f" OUTPUT_A: {comp_dir}/arm-A/")
            print(f" OUTPUT_B: {comp_dir}/arm-B/")
            print(f" SCENARIO: {slot}")
            print(f" OUTPUT_PATH: {comp_dir}/comparison.json\n")
            step += 1

    print(f"Step {step}: Aggregate token data")
    print(f" python scripts/token_tracker.py --run-dir {os.path.abspath(run_dir)}\n")
    step += 1

    print(f"Step {step}: Generate final report")
    print(f" python scripts/generate_report.py --run-dir {os.path.abspath(run_dir)} --mode {args.mode}\n")
    step += 1

    print(f"Step {step}: (Optional) Spawn analysis-reporter Agent for deep synthesis")
    print(" Agent file: agents/analysis-reporter.md")
    print(f" RUN_DIR: {os.path.abspath(run_dir)}\n")

    print(f"{'='*60}")
    print(f"Manifest: {manifest_path}")
    print(f"{'='*60}")
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
# Script entry point: build the run directory and print the plan.
if __name__ == "__main__":
    main()
|