@cleocode/skills 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. package/dispatch-config.json +404 -0
  2. package/index.d.ts +178 -0
  3. package/index.js +405 -0
  4. package/package.json +14 -0
  5. package/profiles/core.json +7 -0
  6. package/profiles/full.json +10 -0
  7. package/profiles/minimal.json +7 -0
  8. package/profiles/recommended.json +7 -0
  9. package/provider-skills-map.json +97 -0
  10. package/skills/_shared/cleo-style-guide.md +84 -0
  11. package/skills/_shared/manifest-operations.md +810 -0
  12. package/skills/_shared/placeholders.json +433 -0
  13. package/skills/_shared/skill-chaining-patterns.md +237 -0
  14. package/skills/_shared/subagent-protocol-base.md +223 -0
  15. package/skills/_shared/task-system-integration.md +232 -0
  16. package/skills/_shared/testing-framework-config.md +110 -0
  17. package/skills/ct-cleo/SKILL.md +490 -0
  18. package/skills/ct-cleo/references/anti-patterns.md +19 -0
  19. package/skills/ct-cleo/references/loom-lifecycle.md +136 -0
  20. package/skills/ct-cleo/references/orchestrator-constraints.md +55 -0
  21. package/skills/ct-cleo/references/session-protocol.md +162 -0
  22. package/skills/ct-codebase-mapper/SKILL.md +82 -0
  23. package/skills/ct-contribution/SKILL.md +521 -0
  24. package/skills/ct-contribution/templates/contribution-init.json +21 -0
  25. package/skills/ct-dev-workflow/SKILL.md +423 -0
  26. package/skills/ct-docs-lookup/SKILL.md +66 -0
  27. package/skills/ct-docs-review/SKILL.md +175 -0
  28. package/skills/ct-docs-write/SKILL.md +108 -0
  29. package/skills/ct-documentor/SKILL.md +231 -0
  30. package/skills/ct-epic-architect/SKILL.md +305 -0
  31. package/skills/ct-epic-architect/references/bug-epic-example.md +172 -0
  32. package/skills/ct-epic-architect/references/commands.md +201 -0
  33. package/skills/ct-epic-architect/references/feature-epic-example.md +210 -0
  34. package/skills/ct-epic-architect/references/migration-epic-example.md +244 -0
  35. package/skills/ct-epic-architect/references/output-format.md +92 -0
  36. package/skills/ct-epic-architect/references/patterns.md +284 -0
  37. package/skills/ct-epic-architect/references/refactor-epic-example.md +412 -0
  38. package/skills/ct-epic-architect/references/research-epic-example.md +226 -0
  39. package/skills/ct-epic-architect/references/shell-escaping.md +86 -0
  40. package/skills/ct-epic-architect/references/skill-aware-execution.md +195 -0
  41. package/skills/ct-grade/SKILL.md +230 -0
  42. package/skills/ct-grade/agents/analysis-reporter.md +203 -0
  43. package/skills/ct-grade/agents/blind-comparator.md +157 -0
  44. package/skills/ct-grade/agents/scenario-runner.md +134 -0
  45. package/skills/ct-grade/eval-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  46. package/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
  47. package/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
  48. package/skills/ct-grade/eval-viewer/generate_review.py +283 -0
  49. package/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
  50. package/skills/ct-grade/eval-viewer/viewer.html +219 -0
  51. package/skills/ct-grade/evals/evals.json +94 -0
  52. package/skills/ct-grade/references/ab-test-methodology.md +150 -0
  53. package/skills/ct-grade/references/domains.md +137 -0
  54. package/skills/ct-grade/references/grade-spec.md +236 -0
  55. package/skills/ct-grade/references/scenario-playbook.md +234 -0
  56. package/skills/ct-grade/references/token-tracking.md +120 -0
  57. package/skills/ct-grade/scripts/__pycache__/audit_analyzer.cpython-314.pyc +0 -0
  58. package/skills/ct-grade/scripts/__pycache__/run_ab_test.cpython-314.pyc +0 -0
  59. package/skills/ct-grade/scripts/__pycache__/run_all.cpython-314.pyc +0 -0
  60. package/skills/ct-grade/scripts/__pycache__/token_tracker.cpython-314.pyc +0 -0
  61. package/skills/ct-grade/scripts/audit_analyzer.py +279 -0
  62. package/skills/ct-grade/scripts/generate_report.py +283 -0
  63. package/skills/ct-grade/scripts/run_ab_test.py +504 -0
  64. package/skills/ct-grade/scripts/run_all.py +287 -0
  65. package/skills/ct-grade/scripts/setup_run.py +183 -0
  66. package/skills/ct-grade/scripts/token_tracker.py +630 -0
  67. package/skills/ct-grade-v2-1/SKILL.md +237 -0
  68. package/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
  69. package/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
  70. package/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
  71. package/skills/ct-grade-v2-1/evals/evals.json +74 -0
  72. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/build_op_stats.cpython-314.pyc +0 -0
  73. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  74. package/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
  75. package/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
  76. package/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
  77. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
  78. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
  79. package/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
  80. package/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
  81. package/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
  82. package/skills/ct-grade-v2-1/manifest-entry.json +31 -0
  83. package/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
  84. package/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
  85. package/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
  86. package/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
  87. package/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
  88. package/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
  89. package/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
  90. package/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
  91. package/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
  92. package/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
  93. package/skills/ct-memory/SKILL.md +84 -0
  94. package/skills/ct-orchestrator/INSTALL.md +61 -0
  95. package/skills/ct-orchestrator/README.md +69 -0
  96. package/skills/ct-orchestrator/SKILL.md +380 -0
  97. package/skills/ct-orchestrator/manifest-entry.json +19 -0
  98. package/skills/ct-orchestrator/orchestrator-prompt.txt +17 -0
  99. package/skills/ct-orchestrator/references/SUBAGENT-PROTOCOL-BLOCK.md +66 -0
  100. package/skills/ct-orchestrator/references/autonomous-operation.md +167 -0
  101. package/skills/ct-orchestrator/references/lifecycle-gates.md +98 -0
  102. package/skills/ct-orchestrator/references/orchestrator-compliance.md +271 -0
  103. package/skills/ct-orchestrator/references/orchestrator-handoffs.md +85 -0
  104. package/skills/ct-orchestrator/references/orchestrator-patterns.md +164 -0
  105. package/skills/ct-orchestrator/references/orchestrator-recovery.md +113 -0
  106. package/skills/ct-orchestrator/references/orchestrator-spawning.md +271 -0
  107. package/skills/ct-orchestrator/references/orchestrator-tokens.md +180 -0
  108. package/skills/ct-research-agent/SKILL.md +226 -0
  109. package/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
  110. package/skills/ct-skill-creator/.cleo/logs/cleo.2026-03-07.1.log +24 -0
  111. package/skills/ct-skill-creator/.cleo/tasks.db +0 -0
  112. package/skills/ct-skill-creator/SKILL.md +356 -0
  113. package/skills/ct-skill-creator/agents/analyzer.md +276 -0
  114. package/skills/ct-skill-creator/agents/comparator.md +204 -0
  115. package/skills/ct-skill-creator/agents/grader.md +225 -0
  116. package/skills/ct-skill-creator/assets/eval_review.html +146 -0
  117. package/skills/ct-skill-creator/eval-viewer/__pycache__/generate_review.cpython-314.pyc +0 -0
  118. package/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
  119. package/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
  120. package/skills/ct-skill-creator/manifest-entry.json +17 -0
  121. package/skills/ct-skill-creator/references/dynamic-context.md +228 -0
  122. package/skills/ct-skill-creator/references/frontmatter.md +83 -0
  123. package/skills/ct-skill-creator/references/invocation-control.md +165 -0
  124. package/skills/ct-skill-creator/references/output-patterns.md +86 -0
  125. package/skills/ct-skill-creator/references/provider-deployment.md +175 -0
  126. package/skills/ct-skill-creator/references/schemas.md +430 -0
  127. package/skills/ct-skill-creator/references/workflows.md +28 -0
  128. package/skills/ct-skill-creator/scripts/__init__.py +1 -0
  129. package/skills/ct-skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  130. package/skills/ct-skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
  131. package/skills/ct-skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
  132. package/skills/ct-skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
  133. package/skills/ct-skill-creator/scripts/__pycache__/init_skill.cpython-314.pyc +0 -0
  134. package/skills/ct-skill-creator/scripts/__pycache__/quick_validate.cpython-314.pyc +0 -0
  135. package/skills/ct-skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
  136. package/skills/ct-skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
  137. package/skills/ct-skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
  138. package/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
  139. package/skills/ct-skill-creator/scripts/generate_report.py +326 -0
  140. package/skills/ct-skill-creator/scripts/improve_description.py +247 -0
  141. package/skills/ct-skill-creator/scripts/init_skill.py +306 -0
  142. package/skills/ct-skill-creator/scripts/package_skill.py +110 -0
  143. package/skills/ct-skill-creator/scripts/quick_validate.py +97 -0
  144. package/skills/ct-skill-creator/scripts/run_eval.py +310 -0
  145. package/skills/ct-skill-creator/scripts/run_loop.py +328 -0
  146. package/skills/ct-skill-creator/scripts/utils.py +47 -0
  147. package/skills/ct-skill-validator/SKILL.md +178 -0
  148. package/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
  149. package/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
  150. package/skills/ct-skill-validator/evals/eval_set.json +14 -0
  151. package/skills/ct-skill-validator/evals/evals.json +52 -0
  152. package/skills/ct-skill-validator/manifest-entry.json +20 -0
  153. package/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
  154. package/skills/ct-skill-validator/references/validation-rules.md +168 -0
  155. package/skills/ct-skill-validator/scripts/__init__.py +0 -0
  156. package/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc +0 -0
  157. package/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc +0 -0
  158. package/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc +0 -0
  159. package/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc +0 -0
  160. package/skills/ct-skill-validator/scripts/audit_body.py +242 -0
  161. package/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
  162. package/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
  163. package/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
  164. package/skills/ct-skill-validator/scripts/validate.py +422 -0
  165. package/skills/ct-spec-writer/SKILL.md +189 -0
  166. package/skills/ct-stickynote/README.md +14 -0
  167. package/skills/ct-stickynote/SKILL.md +46 -0
  168. package/skills/ct-task-executor/SKILL.md +296 -0
  169. package/skills/ct-validator/SKILL.md +216 -0
  170. package/skills/manifest.json +469 -0
  171. package/skills.json +281 -0
@@ -0,0 +1,504 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ ct-grade v3 — Blind A/B test: CLEO MCP vs CLI for the same operations.
4
+
5
+ Side A = MCP JSON-RPC via stdio (node dist/mcp/index.js)
6
+ Side B = CLI subprocess (cleo-dev <domain> <operation>)
7
+
8
+ Randomly shuffles A/B assignment per run so the comparator is blind.
9
+
10
+ Usage:
11
+ python run_ab_test.py --domain tasks --operations find,show,list [options]
12
+ python run_ab_test.py --test-set parity [options]
13
+ python run_ab_test.py --domain session --tier 0 [options]
14
+
15
+ Options:
16
+ --domain CLEO domain to test (tasks, session, admin, tools, etc.)
17
+ --operations Comma-separated operation names (e.g. find,show,list)
18
+ --test-set Predefined set: smoke, standard, parity-P1, parity-P2, parity-P3, parity
19
+ --tier Filter operations by tier (0, 1, 2)
20
+ --gateway query or mutate (default: query)
21
+ --runs Runs per operation (default: 3)
22
+ --cleo CLI binary (default: cleo-dev)
23
+ --project-dir Path to CLEO project root (for MCP server)
24
+ --output-dir Results directory
25
+ --params-json JSON string of params to pass to each operation
26
+ --seed-task Task ID to use in operations that need one
27
+ --json Print summary JSON to stdout
28
+ """
29
+
30
+ import argparse
31
+ import json
32
+ import os
33
+ import random
34
+ import subprocess
35
+ import sys
36
+ import time
37
+ from datetime import datetime, timezone
38
+ from pathlib import Path
39
+
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # Operation sets
43
+ # ---------------------------------------------------------------------------
44
+
45
# Named operation sets selectable via --test-set.
# Outer keys are set names; inner keys are CLEO domains mapped to the
# operations exercised in that domain.
OPERATION_SETS = {
    "smoke": {
        # Fast default — 6 operations, ~2-3 min
        "tasks": ["find", "show"],
        "session": ["list", "status"],
        "admin": ["dash", "health"],
    },
    "standard": {
        # Broader everyday coverage across four domains.
        "tasks": ["find", "show", "list", "tree", "plan"],
        "session": ["status", "list", "briefing.show"],
        "admin": ["dash", "health", "help", "stats"],
        "tools": ["skill.list", "provider.list"],
    },
    "parity-P1": {
        # P1: tasks domain query ops
        "tasks": ["find", "show", "list", "tree", "plan", "exists"],
    },
    "parity-P2": {
        # P2: session domain query ops
        "session": ["status", "list", "briefing.show", "handoff.show", "context.drift"],
    },
    "parity-P3": {
        # P3: admin domain query ops
        "admin": ["dash", "health", "help", "stats", "doctor"],
    },
    "parity": {
        # Full parity (P1+P2+P3 combined)
        # NOTE(review): omits "context.drift", which parity-P2 includes —
        # confirm whether that exclusion is intentional.
        "tasks": ["find", "show", "list", "tree", "plan", "exists"],
        "session": ["status", "list", "briefing.show", "handoff.show"],
        "admin": ["dash", "health", "help", "stats", "doctor"],
    },
}
77
+
78
# Operations whose invocation requires a task ID in params.
TASK_ID_OPS = {"show", "exists", "complete", "cancel", "archive", "restore",
               "start", "stop", "relates", "complexity.estimate", "history"}


def build_cli_args(domain, operation, seed_task=None):
    """Build the CLI argument list for one domain.operation call.

    Dotted operations (e.g. "briefing.show") become nested sub-commands.
    A seed task ID is appended for operations that require one; "find" and
    "label.show" get fixed default parameters.
    """
    argv = [domain, *operation.split(".")]

    if seed_task and operation in TASK_ID_OPS:
        argv.append(seed_task)
    elif operation == "find":
        argv += ["--query", "test"]
    elif operation == "label.show":
        argv += ["--label", "bug"]
    # All other operations (e.g. "help") take no extra arguments.

    return argv
102
+
103
+
104
def build_mcp_payload(gateway, domain, operation, seed_task=None, extra_params=None):
    """Build an MCP JSON-RPC "tools/call" payload for one operation.

    Args:
        gateway: Gateway tool name ("query" or "mutate").
        domain: CLEO domain (e.g. "tasks").
        operation: Operation name, possibly dotted (e.g. "briefing.show").
        seed_task: Optional task ID injected as "taskId" for operations in
            TASK_ID_OPS.
        extra_params: Optional dict of extra params; copied, never mutated.

    Returns:
        dict: JSON-RPC 2.0 request with id=1 (the initialize message, id=0,
        is sent separately — see MCP_INIT).
    """
    # BUG FIX: copy extra_params instead of aliasing it — the old code did
    # `params = extra_params or {}` and then wrote into it, mutating the
    # caller's dict when injecting "taskId".
    params = dict(extra_params) if extra_params else {}
    if operation in TASK_ID_OPS and seed_task:
        params["taskId"] = seed_task
    elif operation == "find" and not params:
        params["query"] = "test"

    return {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "tools/call",
        "params": {
            "name": gateway,
            "arguments": {
                "domain": domain,
                "operation": operation,
                "params": params,
            }
        }
    }
125
+
126
+
127
# JSON-RPC "initialize" handshake sent before each tool call. Uses id=0 so
# the tool-call response (id=1) can be picked out of the stdout stream.
MCP_INIT = {
    "jsonrpc": "2.0",
    "id": 0,
    "method": "initialize",
    "params": {
        "protocolVersion": "2024-11-05",
        "capabilities": {},
        "clientInfo": {"name": "ct-grade-ab-test", "version": "2.1.0"}
    }
}
137
+
138
+
139
+ # ---------------------------------------------------------------------------
140
+ # Interface callers
141
+ # ---------------------------------------------------------------------------
142
+
143
def call_via_mcp(gateway, domain, operation, cleo_path, seed_task=None, extra_params=None):
    """Call CLEO via MCP stdio JSON-RPC. Returns (success, output_chars, duration_ms, response)."""
    request = build_mcp_payload(gateway, domain, operation, seed_task, extra_params)
    stdin_text = json.dumps(MCP_INIT) + "\n" + json.dumps(request) + "\n"

    server_js = Path(cleo_path) / "dist" / "mcp" / "index.js"
    if not server_js.exists():
        return False, 0, 0, {"error": f"MCP server not found at {server_js}"}

    def _find_tool_reply(stdout_text):
        # The server writes one JSON object per line; the tool reply is id=1.
        for raw in stdout_text.splitlines():
            raw = raw.strip()
            if not raw:
                continue
            try:
                msg = json.loads(raw)
                if msg.get("id") == 1:
                    return msg
            except Exception:
                continue
        return None

    began = time.time()
    try:
        proc = subprocess.run(
            ["node", str(server_js)],
            input=stdin_text,
            capture_output=True,
            text=True,
            timeout=30,
            cwd=str(cleo_path),
        )
        elapsed_ms = int((time.time() - began) * 1000)
        stdout_len = len(proc.stdout)

        reply = _find_tool_reply(proc.stdout)

        # Heuristic: stderr mentioning migrations / missing files / tasks.db
        # means the backing database is unavailable — treat as hard failure.
        err_text = proc.stderr or ""
        lowered = err_text.lower()
        if "migration" in lowered or "ENOENT" in err_text or "tasks.db" in lowered:
            return False, 0, elapsed_ms, {"error": "DB_UNAVAILABLE", "stderr": err_text[:200]}

        if reply is None:
            return False, stdout_len, elapsed_ms, {"error": "no response found", "raw": proc.stdout[:500]}

        ok = "result" in reply and "error" not in reply
        return ok, stdout_len, elapsed_ms, reply

    except subprocess.TimeoutExpired:
        return False, 0, 30000, {"error": "timeout"}
    except Exception as e:
        return False, 0, 0, {"error": str(e)}
193
+
194
+
195
def call_via_cli(domain, operation, cleo_bin, cwd=None, seed_task=None):
    """Call CLEO via CLI subprocess. Returns (success, output_chars, duration_ms, output)."""
    command = [cleo_bin, *build_cli_args(domain, operation, seed_task), "--json"]

    t0 = time.time()
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=30,
            cwd=cwd,
        )
        elapsed_ms = int((time.time() - t0) * 1000)
        stdout_len = len(completed.stdout)

        # Exit code 0 is the success signal; stdout may still be non-JSON,
        # in which case a truncated raw capture is returned instead.
        try:
            body = json.loads(completed.stdout)
        except Exception:
            body = {"raw": completed.stdout[:500]}

        return completed.returncode == 0, stdout_len, elapsed_ms, body

    except subprocess.TimeoutExpired:
        return False, 0, 30000, {"error": "timeout"}
    except Exception as exc:
        return False, 0, 0, {"error": str(exc)}
224
+
225
+
226
+ # ---------------------------------------------------------------------------
227
+ # Blind comparator
228
+ # ---------------------------------------------------------------------------
229
+
230
def blind_compare(output_a, output_b, operation, chars_a, chars_b, dur_a, dur_b):
    """
    Heuristic blind comparator for two anonymized responses.

    In a real run this would be delegated to an LLM comparator agent; here a
    0-10 score rewards completeness (has data, no error), clean JSON
    structure, and token efficiency (fewer output chars). Returns a dict
    with winner ("A"/"B"/"TIE"), reasoning, per-side scores, char counts,
    latencies, and chars/4 token estimates.
    """
    def _grade(payload, size):
        pts = 0
        if isinstance(payload, dict):
            # Completeness: has data?
            if payload.get("result") or payload.get("data") or payload.get("success"):
                pts += 3
            if "error" not in payload:
                pts += 2
            # Structure: is it clean JSON?
            pts += 2
        # Token efficiency: smaller is better (same data)
        pts += max(0, 3 - int(size / 2000))
        return min(10, pts)

    grade_a = _grade(output_a, chars_a)
    grade_b = _grade(output_b, chars_b)

    if grade_a > grade_b:
        verdict = "A"
        why = f"A scored higher ({grade_a} vs {grade_b}). Chars: {chars_a} vs {chars_b}."
    elif grade_b > grade_a:
        verdict = "B"
        why = f"B scored higher ({grade_b} vs {grade_a}). Chars: {chars_b} vs {chars_a}."
    else:
        verdict = "TIE"
        why = f"Equal scores ({grade_a}). Chars: {chars_a} vs {chars_b}. Latency: {dur_a}ms vs {dur_b}ms."

    return {
        "winner": verdict,
        "reasoning": why,
        "scores": {"A": grade_a, "B": grade_b},
        "chars": {"A": chars_a, "B": chars_b},
        "duration_ms": {"A": dur_a, "B": dur_b},
        "estimated_tokens": {"A": int(chars_a / 4), "B": int(chars_b / 4)},
    }
271
+
272
+
273
+ # ---------------------------------------------------------------------------
274
+ # Single operation A/B test
275
+ # ---------------------------------------------------------------------------
276
+
277
def run_ab_operation(domain, operation, gateway, args, num_runs, output_dir):
    """Run num_runs blind A/B tests for a single operation.

    Each run randomly assigns MCP/CLI to blind labels A/B, invokes both
    interfaces, scores them with blind_compare(), de-blinds the verdict, and
    persists per-run artifacts under output_dir/<domain>/<operation>/.

    Args:
        domain: CLEO domain (e.g. "tasks").
        operation: Operation name, possibly dotted.
        gateway: MCP gateway tool name ("query" or "mutate").
        args: Parsed CLI namespace (reads .project_dir, .seed_task, .cleo).
        num_runs: Number of A/B runs for this operation.
        output_dir: Base directory for result artifacts.

    Returns:
        tuple: (run_results, op_summary) — the per-run dicts AND the
        aggregated summary dict (the old docstring claimed only a list).
    """
    # NOTE(review): args.params_json is accepted by the argument parser but is
    # never threaded through to either interface call — confirm whether the
    # --params-json option is supposed to reach build_mcp_payload here.
    op_key = f"{domain}.{operation}"
    op_dir = Path(output_dir) / domain / operation.replace(".", "_")
    op_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n [{op_key}]")
    run_results = []

    for run_num in range(1, num_runs + 1):
        run_dir = op_dir / f"run-{run_num:03d}"
        run_dir.mkdir(parents=True, exist_ok=True)

        # Randomly assign MCP vs CLI to A and B (blind). The interface
        # assigned to side A is invoked first, matching the original order.
        a_is_mcp = random.choice([True, False])

        if a_is_mcp:
            mcp_success, mcp_chars, mcp_dur, mcp_resp = call_via_mcp(
                gateway, domain, operation,
                cleo_path=args.project_dir,
                seed_task=args.seed_task,
            )
            cli_success, cli_chars, cli_dur, cli_resp = call_via_cli(
                domain, operation, args.cleo,
                cwd=args.project_dir,
                seed_task=args.seed_task,
            )
        else:
            cli_success, cli_chars, cli_dur, cli_resp = call_via_cli(
                domain, operation, args.cleo,
                cwd=args.project_dir,
                seed_task=args.seed_task,
            )
            mcp_success, mcp_chars, mcp_dur, mcp_resp = call_via_mcp(
                gateway, domain, operation,
                cleo_path=args.project_dir,
                seed_task=args.seed_task,
            )

        # Map the physical interfaces onto the blind A/B labels.
        if a_is_mcp:
            a_chars, a_dur, a_resp = mcp_chars, mcp_dur, mcp_resp
            b_chars, b_dur, b_resp = cli_chars, cli_dur, cli_resp
        else:
            a_chars, a_dur, a_resp = cli_chars, cli_dur, cli_resp
            b_chars, b_dur, b_resp = mcp_chars, mcp_dur, mcp_resp

        comparison = blind_compare(a_resp, b_resp, operation, a_chars, b_chars, a_dur, b_dur)

        # De-blind: map the blind winner label back to the physical interface.
        # BUG FIX: the previous one-line conditional
        #   "mcp" if (winner == "A") == a_is_mcp else "cli" if winner != "TIE" else "tie"
        # counted a TIE as an MCP win whenever a_is_mcp was False, because
        # (False == False) short-circuited to the "mcp" branch.
        if comparison["winner"] == "TIE":
            winner_interface = "tie"
        elif (comparison["winner"] == "A") == a_is_mcp:
            winner_interface = "mcp"
        else:
            winner_interface = "cli"

        run_result = {
            "run": run_num,
            "operation": op_key,
            "gateway": gateway,
            "a_is_mcp": a_is_mcp,
            "winner_label": comparison["winner"],
            "winner_interface": winner_interface,
            "comparison": comparison,
            "mcp": {
                "success": mcp_success,
                "output_chars": mcp_chars,
                "estimated_tokens": int(mcp_chars / 4),
                "duration_ms": mcp_dur,
            },
            "cli": {
                "success": cli_success,
                "output_chars": cli_chars,
                "estimated_tokens": int(cli_chars / 4),
                "duration_ms": cli_dur,
            },
            "token_delta": int(mcp_chars / 4) - int(cli_chars / 4),
            "token_delta_pct": f"{((mcp_chars - cli_chars) / max(cli_chars, 1)) * 100:+.1f}%",
        }

        # Persist per-run artifacts for later (human or LLM) review.
        (run_dir / "side-a").mkdir(exist_ok=True)
        (run_dir / "side-b").mkdir(exist_ok=True)
        (run_dir / "side-a" / "response.json").write_text(json.dumps(a_resp, indent=2))
        (run_dir / "side-b" / "response.json").write_text(json.dumps(b_resp, indent=2))
        (run_dir / "comparison.json").write_text(json.dumps(comparison, indent=2))
        (run_dir / "meta.json").write_text(json.dumps({
            "a_is_mcp": a_is_mcp,
            "winner_interface": winner_interface,
        }, indent=2))

        status = f"winner={comparison['winner']} ({winner_interface}) mcp={mcp_chars}c cli={cli_chars}c"
        print(f" run {run_num}: {status}")
        run_results.append(run_result)

    # Aggregate the operation-level summary across runs.
    wins = {"mcp": 0, "cli": 0, "tie": 0}
    for r in run_results:
        wins[r["winner_interface"]] = wins.get(r["winner_interface"], 0) + 1

    token_deltas = [r["token_delta"] for r in run_results]
    avg_delta = sum(token_deltas) / len(token_deltas) if token_deltas else 0

    op_summary = {
        "operation": op_key,
        "runs": num_runs,
        "wins": wins,
        "win_rate": {k: v / num_runs for k, v in wins.items()},
        "avg_token_delta_mcp_minus_cli": round(avg_delta, 1),
        "avg_mcp_chars": round(sum(r["mcp"]["output_chars"] for r in run_results) / num_runs, 0),
        "avg_cli_chars": round(sum(r["cli"]["output_chars"] for r in run_results) / num_runs, 0),
        "avg_mcp_ms": round(sum(r["mcp"]["duration_ms"] for r in run_results) / num_runs, 0),
        "avg_cli_ms": round(sum(r["cli"]["duration_ms"] for r in run_results) / num_runs, 0),
    }

    (op_dir / "summary.json").write_text(json.dumps(op_summary, indent=2))
    return run_results, op_summary
391
+
392
+
393
+ # ---------------------------------------------------------------------------
394
+ # Main
395
+ # ---------------------------------------------------------------------------
396
+
397
def main():
    """Parse options, build the domain/operation test matrix, run all A/B tests."""
    parser = argparse.ArgumentParser(description="ct-grade v3 — Blind A/B test: CLEO MCP vs CLI")
    parser.add_argument("--domain", default=None, help="CLEO domain (e.g. tasks, session, admin)")
    parser.add_argument("--operations", default=None, help="Comma-separated operations (e.g. find,show,list)")
    parser.add_argument("--test-set", default=None,
                        choices=["smoke", "standard", "parity-P1", "parity-P2", "parity-P3", "parity"],
                        help="Predefined operation set")
    parser.add_argument("--tier", type=int, default=None, help="Filter by tier (0, 1, 2)")
    parser.add_argument("--gateway", default="query", choices=["query", "mutate"])
    parser.add_argument("--runs", type=int, default=3, help="Runs per operation (default: 3)")
    parser.add_argument("--cleo", default="cleo-dev", help="CLI binary")
    parser.add_argument("--project-dir", default=".", help="CLEO project root (for MCP server)")
    parser.add_argument("--output-dir", default=None, help="Output directory")
    parser.add_argument("--seed-task", default=None, help="Task ID for operations needing one")
    parser.add_argument("--params-json", default=None, help="Extra params as JSON")
    parser.add_argument("--json", action="store_true", help="Print summary JSON to stdout")
    args = parser.parse_args()

    # Resolve which domain -> [operations] pairs to exercise. Precedence:
    # explicit --test-set, then --domain + --operations, then per-domain defaults.
    fallback_ops = {
        "tasks": ["find", "show", "list"],
        "session": ["status", "list"],
        "admin": ["dash", "health", "help"],
        "tools": ["skill.list"],
        "memory": ["find"],
        "check": ["health"],
        "pipeline": ["stage.status"],
        "orchestrate": ["status"],
        "nexus": ["status"],
        "sticky": ["list"],
    }
    if args.test_set:
        matrix = OPERATION_SETS[args.test_set]
    elif args.domain and args.operations:
        matrix = {args.domain: [op.strip() for op in args.operations.split(",")]}
    elif args.domain:
        matrix = {args.domain: fallback_ops.get(args.domain, ["find"])}
    else:
        print("ERROR: Provide --domain, --domain + --operations, or --test-set", file=sys.stderr)
        sys.exit(1)

    # Results land in --output-dir or a fresh timestamped directory.
    stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    out_root = Path(args.output_dir) if args.output_dir else Path(f"./ab-results/{stamp}")
    out_root.mkdir(parents=True, exist_ok=True)

    print(f"=== CLEO MCP vs CLI Blind A/B Test ===")
    print(f" Domains : {list(matrix.keys())}")
    print(f" Runs/op : {args.runs}")
    print(f" Gateway : {args.gateway}")
    print(f" Output : {out_root}")

    op_summaries = []
    run_rows = []
    for dom, ops in matrix.items():
        for op in ops:
            rows, op_summary = run_ab_operation(dom, op, args.gateway, args, args.runs, out_root)
            op_summaries.append(op_summary)
            run_rows.extend(rows)

    # Roll every per-operation summary up into a single global report.
    mcp_wins = sum(s["wins"].get("mcp", 0) for s in op_summaries)
    cli_wins = sum(s["wins"].get("cli", 0) for s in op_summaries)
    tie_count = sum(s["wins"].get("tie", 0) for s in op_summaries)
    n_runs = len(run_rows)
    mean_delta = sum(s["avg_token_delta_mcp_minus_cli"] for s in op_summaries) / max(len(op_summaries), 1)

    summary = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "test_matrix": dict(matrix),
        "total_runs": n_runs,
        "global_wins": {
            "mcp": mcp_wins,
            "cli": cli_wins,
            "tie": tie_count,
        },
        "global_win_rate": {
            "mcp": round(mcp_wins / max(n_runs, 1), 3),
            "cli": round(cli_wins / max(n_runs, 1), 3),
        },
        "avg_token_delta_mcp_minus_cli": round(mean_delta, 1),
        "per_operation": {s["operation"]: s for s in op_summaries},
    }

    (out_root / "summary.json").write_text(json.dumps(summary, indent=2))

    print(f"\n=== Results ===")
    print(f" Total runs : {n_runs}")
    print(f" MCP wins : {mcp_wins} ({summary['global_win_rate']['mcp']*100:.1f}%)")
    print(f" CLI wins : {cli_wins} ({summary['global_win_rate']['cli']*100:.1f}%)")
    print(f" Ties : {tie_count}")
    sign = "+" if mean_delta > 0 else ""
    print(f" Avg token delta (MCP-CLI): {sign}{mean_delta:.1f} tokens")
    print(f"\nSaved: {out_root}")

    if args.json:
        print(json.dumps(summary, indent=2))


if __name__ == "__main__":
    main()