@cleocode/skills 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. package/dispatch-config.json +404 -0
  2. package/index.d.ts +178 -0
  3. package/index.js +405 -0
  4. package/package.json +14 -0
  5. package/profiles/core.json +7 -0
  6. package/profiles/full.json +10 -0
  7. package/profiles/minimal.json +7 -0
  8. package/profiles/recommended.json +7 -0
  9. package/provider-skills-map.json +97 -0
  10. package/skills/_shared/cleo-style-guide.md +84 -0
  11. package/skills/_shared/manifest-operations.md +810 -0
  12. package/skills/_shared/placeholders.json +433 -0
  13. package/skills/_shared/skill-chaining-patterns.md +237 -0
  14. package/skills/_shared/subagent-protocol-base.md +223 -0
  15. package/skills/_shared/task-system-integration.md +232 -0
  16. package/skills/_shared/testing-framework-config.md +110 -0
  17. package/skills/ct-cleo/SKILL.md +490 -0
  18. package/skills/ct-cleo/references/anti-patterns.md +19 -0
  19. package/skills/ct-cleo/references/loom-lifecycle.md +136 -0
  20. package/skills/ct-cleo/references/orchestrator-constraints.md +55 -0
  21. package/skills/ct-cleo/references/session-protocol.md +162 -0
  22. package/skills/ct-codebase-mapper/SKILL.md +82 -0
  23. package/skills/ct-contribution/SKILL.md +521 -0
  24. package/skills/ct-contribution/templates/contribution-init.json +21 -0
  25. package/skills/ct-dev-workflow/SKILL.md +423 -0
  26. package/skills/ct-docs-lookup/SKILL.md +66 -0
  27. package/skills/ct-docs-review/SKILL.md +175 -0
  28. package/skills/ct-docs-write/SKILL.md +108 -0
  29. package/skills/ct-documentor/SKILL.md +231 -0
  30. package/skills/ct-epic-architect/SKILL.md +305 -0
  31. package/skills/ct-epic-architect/references/bug-epic-example.md +172 -0
  32. package/skills/ct-epic-architect/references/commands.md +201 -0
  33. package/skills/ct-epic-architect/references/feature-epic-example.md +210 -0
  34. package/skills/ct-epic-architect/references/migration-epic-example.md +244 -0
  35. package/skills/ct-epic-architect/references/output-format.md +92 -0
  36. package/skills/ct-epic-architect/references/patterns.md +284 -0
  37. package/skills/ct-epic-architect/references/refactor-epic-example.md +412 -0
  38. package/skills/ct-epic-architect/references/research-epic-example.md +226 -0
  39. package/skills/ct-epic-architect/references/shell-escaping.md +86 -0
  40. package/skills/ct-epic-architect/references/skill-aware-execution.md +195 -0
  41. package/skills/ct-grade/SKILL.md +230 -0
  42. package/skills/ct-grade/agents/analysis-reporter.md +203 -0
  43. package/skills/ct-grade/agents/blind-comparator.md +157 -0
  44. package/skills/ct-grade/agents/scenario-runner.md +134 -0
  45. package/skills/ct-grade/eval-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  46. package/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
  47. package/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
  48. package/skills/ct-grade/eval-viewer/generate_review.py +283 -0
  49. package/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
  50. package/skills/ct-grade/eval-viewer/viewer.html +219 -0
  51. package/skills/ct-grade/evals/evals.json +94 -0
  52. package/skills/ct-grade/references/ab-test-methodology.md +150 -0
  53. package/skills/ct-grade/references/domains.md +137 -0
  54. package/skills/ct-grade/references/grade-spec.md +236 -0
  55. package/skills/ct-grade/references/scenario-playbook.md +234 -0
  56. package/skills/ct-grade/references/token-tracking.md +120 -0
  57. package/skills/ct-grade/scripts/__pycache__/audit_analyzer.cpython-314.pyc +0 -0
  58. package/skills/ct-grade/scripts/__pycache__/run_ab_test.cpython-314.pyc +0 -0
  59. package/skills/ct-grade/scripts/__pycache__/run_all.cpython-314.pyc +0 -0
  60. package/skills/ct-grade/scripts/__pycache__/token_tracker.cpython-314.pyc +0 -0
  61. package/skills/ct-grade/scripts/audit_analyzer.py +279 -0
  62. package/skills/ct-grade/scripts/generate_report.py +283 -0
  63. package/skills/ct-grade/scripts/run_ab_test.py +504 -0
  64. package/skills/ct-grade/scripts/run_all.py +287 -0
  65. package/skills/ct-grade/scripts/setup_run.py +183 -0
  66. package/skills/ct-grade/scripts/token_tracker.py +630 -0
  67. package/skills/ct-grade-v2-1/SKILL.md +237 -0
  68. package/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
  69. package/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
  70. package/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
  71. package/skills/ct-grade-v2-1/evals/evals.json +74 -0
  72. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/build_op_stats.cpython-314.pyc +0 -0
  73. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  74. package/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
  75. package/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
  76. package/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
  77. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
  78. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
  79. package/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
  80. package/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
  81. package/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
  82. package/skills/ct-grade-v2-1/manifest-entry.json +31 -0
  83. package/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
  84. package/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
  85. package/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
  86. package/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
  87. package/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
  88. package/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
  89. package/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
  90. package/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
  91. package/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
  92. package/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
  93. package/skills/ct-memory/SKILL.md +84 -0
  94. package/skills/ct-orchestrator/INSTALL.md +61 -0
  95. package/skills/ct-orchestrator/README.md +69 -0
  96. package/skills/ct-orchestrator/SKILL.md +380 -0
  97. package/skills/ct-orchestrator/manifest-entry.json +19 -0
  98. package/skills/ct-orchestrator/orchestrator-prompt.txt +17 -0
  99. package/skills/ct-orchestrator/references/SUBAGENT-PROTOCOL-BLOCK.md +66 -0
  100. package/skills/ct-orchestrator/references/autonomous-operation.md +167 -0
  101. package/skills/ct-orchestrator/references/lifecycle-gates.md +98 -0
  102. package/skills/ct-orchestrator/references/orchestrator-compliance.md +271 -0
  103. package/skills/ct-orchestrator/references/orchestrator-handoffs.md +85 -0
  104. package/skills/ct-orchestrator/references/orchestrator-patterns.md +164 -0
  105. package/skills/ct-orchestrator/references/orchestrator-recovery.md +113 -0
  106. package/skills/ct-orchestrator/references/orchestrator-spawning.md +271 -0
  107. package/skills/ct-orchestrator/references/orchestrator-tokens.md +180 -0
  108. package/skills/ct-research-agent/SKILL.md +226 -0
  109. package/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
  110. package/skills/ct-skill-creator/.cleo/logs/cleo.2026-03-07.1.log +24 -0
  111. package/skills/ct-skill-creator/.cleo/tasks.db +0 -0
  112. package/skills/ct-skill-creator/SKILL.md +356 -0
  113. package/skills/ct-skill-creator/agents/analyzer.md +276 -0
  114. package/skills/ct-skill-creator/agents/comparator.md +204 -0
  115. package/skills/ct-skill-creator/agents/grader.md +225 -0
  116. package/skills/ct-skill-creator/assets/eval_review.html +146 -0
  117. package/skills/ct-skill-creator/eval-viewer/__pycache__/generate_review.cpython-314.pyc +0 -0
  118. package/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
  119. package/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
  120. package/skills/ct-skill-creator/manifest-entry.json +17 -0
  121. package/skills/ct-skill-creator/references/dynamic-context.md +228 -0
  122. package/skills/ct-skill-creator/references/frontmatter.md +83 -0
  123. package/skills/ct-skill-creator/references/invocation-control.md +165 -0
  124. package/skills/ct-skill-creator/references/output-patterns.md +86 -0
  125. package/skills/ct-skill-creator/references/provider-deployment.md +175 -0
  126. package/skills/ct-skill-creator/references/schemas.md +430 -0
  127. package/skills/ct-skill-creator/references/workflows.md +28 -0
  128. package/skills/ct-skill-creator/scripts/__init__.py +1 -0
  129. package/skills/ct-skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  130. package/skills/ct-skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
  131. package/skills/ct-skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
  132. package/skills/ct-skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
  133. package/skills/ct-skill-creator/scripts/__pycache__/init_skill.cpython-314.pyc +0 -0
  134. package/skills/ct-skill-creator/scripts/__pycache__/quick_validate.cpython-314.pyc +0 -0
  135. package/skills/ct-skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
  136. package/skills/ct-skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
  137. package/skills/ct-skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
  138. package/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
  139. package/skills/ct-skill-creator/scripts/generate_report.py +326 -0
  140. package/skills/ct-skill-creator/scripts/improve_description.py +247 -0
  141. package/skills/ct-skill-creator/scripts/init_skill.py +306 -0
  142. package/skills/ct-skill-creator/scripts/package_skill.py +110 -0
  143. package/skills/ct-skill-creator/scripts/quick_validate.py +97 -0
  144. package/skills/ct-skill-creator/scripts/run_eval.py +310 -0
  145. package/skills/ct-skill-creator/scripts/run_loop.py +328 -0
  146. package/skills/ct-skill-creator/scripts/utils.py +47 -0
  147. package/skills/ct-skill-validator/SKILL.md +178 -0
  148. package/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
  149. package/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
  150. package/skills/ct-skill-validator/evals/eval_set.json +14 -0
  151. package/skills/ct-skill-validator/evals/evals.json +52 -0
  152. package/skills/ct-skill-validator/manifest-entry.json +20 -0
  153. package/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
  154. package/skills/ct-skill-validator/references/validation-rules.md +168 -0
  155. package/skills/ct-skill-validator/scripts/__init__.py +0 -0
  156. package/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc +0 -0
  157. package/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc +0 -0
  158. package/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc +0 -0
  159. package/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc +0 -0
  160. package/skills/ct-skill-validator/scripts/audit_body.py +242 -0
  161. package/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
  162. package/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
  163. package/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
  164. package/skills/ct-skill-validator/scripts/validate.py +422 -0
  165. package/skills/ct-spec-writer/SKILL.md +189 -0
  166. package/skills/ct-stickynote/README.md +14 -0
  167. package/skills/ct-stickynote/SKILL.md +46 -0
  168. package/skills/ct-task-executor/SKILL.md +296 -0
  169. package/skills/ct-validator/SKILL.md +216 -0
  170. package/skills/manifest.json +469 -0
  171. package/skills.json +281 -0
@@ -0,0 +1,493 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Blind A/B test: CLEO MCP vs CLI for the same operations.
4
+
5
+ Side A = MCP JSON-RPC via stdio (node dist/mcp/index.js)
6
+ Side B = CLI subprocess (cleo-dev <domain> <operation>)
7
+
8
+ Randomly shuffles A/B assignment per run so the comparator is blind.
9
+
10
+ Usage:
11
+ python run_ab_test.py --domain tasks --operations find,show,list [options]
12
+ python run_ab_test.py --scenario-set parity [options]
13
+ python run_ab_test.py --domain session --tier 0 [options]
14
+
15
+ Options:
16
+ --domain CLEO domain to test (tasks, session, admin, tools, etc.)
17
+ --operations Comma-separated operation names (e.g. find,show,list)
18
+ --scenario-set Predefined set: parity (P1+P2+P3), smoke, standard
19
+ --tier Filter operations by tier (0, 1, 2)
20
+ --gateway query or mutate (default: query)
21
+ --runs Runs per operation (default: 3)
22
+ --cleo CLI binary (default: cleo-dev)
23
+ --cleo-path Path to CLEO project root (for MCP server)
24
+ --output-dir Results directory
25
+ --params-json JSON string of params to pass to each operation
26
+ --seed-task Task ID to use in operations that need one
27
+ --json Print summary JSON to stdout
28
+ """
29
+
30
+ import argparse
31
+ import json
32
+ import os
33
+ import random
34
+ import subprocess
35
+ import sys
36
+ import time
37
+ from datetime import datetime, timezone
38
+ from pathlib import Path
39
+
40
+
41
+ # ---------------------------------------------------------------------------
42
+ # Operation sets
43
+ # ---------------------------------------------------------------------------
44
+
45
# Named operation sets selectable via --scenario-set.  Keys are CLEO domains;
# values are the (read-only) operations exercised for that domain.
OPERATION_SETS = {
    # Quick sanity pass: a handful of cheap operations per core domain.
    "smoke": {
        "tasks": ["find", "show", "plan"],
        "session": ["status", "list"],
        "admin": ["dash", "health"],
    },
    # Broader coverage across four domains.
    "standard": {
        "tasks": ["find", "show", "list", "tree", "plan"],
        "session": ["status", "list", "briefing.show"],
        "admin": ["dash", "health", "help", "stats"],
        "tools": ["skill.list", "provider.list"],
    },
    # MCP/CLI parity suite (P1+P2+P3, as referenced in the module docstring).
    "parity": {
        # P1: tasks domain
        "tasks": ["find", "show", "list", "tree", "plan"],
        # P2: session domain
        "session": ["status", "list", "briefing.show", "handoff.show"],
        # P3: admin domain
        "admin": ["dash", "health", "help", "stats", "doctor"],
    },
}
66
+
67
# Operations whose first extra argument is a task ID.  When --seed-task is
# supplied, both the CLI args and the MCP params carry it for these ops.
TASK_ID_OPS = {"show", "exists", "complete", "cancel", "archive", "restore",
               "start", "stop", "relates", "complexity.estimate", "history"}

# Map from domain.operation to CLI args builder
def build_cli_args(domain, operation, seed_task=None):
    """Build the CLI argument list for a ``domain.operation`` call.

    Args:
        domain: CLEO domain (e.g. ``tasks``, ``session``, ``admin``).
        operation: Operation name; dotted names (``briefing.show``) map to
            nested CLI sub-commands (``briefing show``).
        seed_task: Optional task ID, appended for operations in
            ``TASK_ID_OPS``.

    Returns:
        list[str]: CLI arguments, without the binary name or ``--json``
        (the caller adds those).
    """
    base = [domain]

    # Dotted operations become nested sub-commands:
    # "briefing.show" -> ["briefing", "show"].
    base.extend(operation.split("."))

    # Add the params specific operations require.
    if operation in TASK_ID_OPS and seed_task:
        base.append(seed_task)  # single element: append, not extend([x])
    elif operation == "find":
        base.extend(["--query", "test"])
    elif operation == "label.show":
        base.extend(["--label", "bug"])
    # "help" and every other operation take no extra args (the original
    # had a dead `elif operation == "help": pass` branch here).

    return base
91
+
92
+
93
def build_mcp_payload(gateway, domain, operation, seed_task=None, extra_params=None):
    """Build an MCP JSON-RPC ``tools/call`` payload.

    Args:
        gateway: Gateway tool name (``query`` or ``mutate``).
        domain: CLEO domain.
        operation: Operation name within the domain.
        seed_task: Optional task ID, injected as ``taskId`` for operations
            in ``TASK_ID_OPS``.
        extra_params: Optional dict of extra operation params.  Copied
            before use — the previous implementation mutated the caller's
            dict when adding ``taskId``.

    Returns:
        dict: JSON-RPC request (id=1) ready for serialization.
    """
    # Copy so the caller's dict is never mutated.
    params = dict(extra_params) if extra_params else {}
    if operation in TASK_ID_OPS and seed_task:
        params["taskId"] = seed_task
    elif operation == "find" and not params:
        # Default query only when no explicit params were supplied.
        params["query"] = "test"

    return {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "tools/call",
        "params": {
            "name": gateway,
            "arguments": {
                "domain": domain,
                "operation": operation,
                "params": params,
            },
        },
    }
114
+
115
+
116
# JSON-RPC "initialize" handshake sent ahead of the tool call.  Uses id=0 so
# the tool-call response (id=1) can be picked out of the stdout stream.
MCP_INIT = {
    "jsonrpc": "2.0",
    "id": 0,
    "method": "initialize",
    "params": {
        "protocolVersion": "2024-11-05",
        "capabilities": {},
        "clientInfo": {"name": "ct-grade-ab-test", "version": "2.1.0"}
    }
}
126
+
127
+
128
+ # ---------------------------------------------------------------------------
129
+ # Interface callers
130
+ # ---------------------------------------------------------------------------
131
+
132
def call_via_mcp(gateway, domain, operation, cleo_path, seed_task=None, extra_params=None):
    """Call CLEO via MCP stdio JSON-RPC.

    Spawns ``node dist/mcp/index.js`` under *cleo_path*, writes the
    initialize handshake plus one tools/call request on stdin, and scans
    stdout for the JSON-RPC response with id=1.

    Returns:
        Tuple ``(success, output_chars, duration_ms, response)`` where
        ``response`` is the parsed JSON-RPC reply, or an ``{"error": ...}``
        dict on failure.
    """
    payload = build_mcp_payload(gateway, domain, operation, seed_task, extra_params)
    # Two newline-delimited JSON-RPC messages: initialize (id=0), call (id=1).
    messages = json.dumps(MCP_INIT) + "\n" + json.dumps(payload) + "\n"

    mcp_entry = Path(cleo_path) / "dist" / "mcp" / "index.js"
    if not mcp_entry.exists():
        return False, 0, 0, {"error": f"MCP server not found at {mcp_entry}"}

    start = time.time()
    try:
        proc = subprocess.run(
            ["node", str(mcp_entry)],
            input=messages,
            capture_output=True,
            text=True,
            timeout=30,
            cwd=str(cleo_path),
        )
        duration_ms = int((time.time() - start) * 1000)
        output_chars = len(proc.stdout)

        # Find the tool call response (id=1)
        response = None
        for line in proc.stdout.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                parsed = json.loads(line)
                if parsed.get("id") == 1:
                    response = parsed
                    break
            except Exception:
                # Broad on purpose: skips non-JSON lines AND JSON scalars
                # (where .get would raise AttributeError, not a decode error).
                continue

        # DB problems surface on stderr; treat them as a hard failure even
        # when a response line was parsed.  Checked before the response-None
        # branch so DB_UNAVAILABLE wins over "no response found".
        stderr = proc.stderr or ""
        if "migration" in stderr.lower() or "ENOENT" in stderr or "tasks.db" in stderr.lower():
            return False, 0, duration_ms, {"error": "DB_UNAVAILABLE", "stderr": stderr[:200]}

        if response is None:
            return False, output_chars, duration_ms, {"error": "no response found", "raw": proc.stdout[:500]}

        # Success = JSON-RPC result present with no error member.
        success = "result" in response and "error" not in response
        return success, output_chars, duration_ms, response

    except subprocess.TimeoutExpired:
        # Report the full timeout budget (30s) as the duration.
        return False, 0, 30000, {"error": "timeout"}
    except Exception as e:
        # e.g. node binary missing; duration not meaningful here.
        return False, 0, 0, {"error": str(e)}
182
+
183
+
184
def call_via_cli(domain, operation, cleo_bin, cwd=None, seed_task=None):
    """Invoke CLEO through its CLI binary.

    Runs ``cleo_bin <domain> <operation...> --json`` as a subprocess and
    parses stdout as JSON.

    Returns:
        Tuple ``(success, output_chars, duration_ms, output)``; ``output``
        is the parsed JSON, a ``{"raw": ...}`` dict when stdout is not
        valid JSON, or ``{"error": ...}`` on timeout/exception.
    """
    command = [cleo_bin, *build_cli_args(domain, operation, seed_task), "--json"]

    began = time.time()
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=30,
            cwd=cwd,
        )
    except subprocess.TimeoutExpired:
        # Report the full 30s budget as the duration.
        return False, 0, 30000, {"error": "timeout"}
    except Exception as exc:
        return False, 0, 0, {"error": str(exc)}

    elapsed_ms = int((time.time() - began) * 1000)
    stdout_len = len(completed.stdout)

    try:
        body = json.loads(completed.stdout)
    except Exception:
        # Not JSON — keep a truncated raw sample for the comparator.
        body = {"raw": completed.stdout[:500]}

    return completed.returncode == 0, stdout_len, elapsed_ms, body
213
+
214
+
215
+ # ---------------------------------------------------------------------------
216
+ # Blind comparator
217
+ # ---------------------------------------------------------------------------
218
+
219
def blind_compare(output_a, output_b, operation, chars_a, chars_b, dur_a, dur_b):
    """Heuristic blind comparator for two anonymized responses.

    In a real run this is delegated to an LLM comparator agent.  Here a
    0-10 score rewards completeness (carries result/data/success), absence
    of an error member, clean JSON structure, and token efficiency (fewer
    output chars).

    Returns:
        dict with ``winner`` ("A"/"B"/"TIE"), ``reasoning``, and per-side
        ``scores``, ``chars``, ``duration_ms``, ``estimated_tokens``
        (chars / 4).
    """
    def rate(payload, char_count):
        points = 0
        if isinstance(payload, dict):
            # Completeness: does the response carry data?
            if payload.get("result") or payload.get("data") or payload.get("success"):
                points += 3
            if "error" not in payload:
                points += 2
            # Structure: it parsed as clean JSON.
            points += 2
        # Token efficiency: smaller output (same data) scores higher.
        points += max(0, 3 - int(char_count / 2000))
        return min(10, points)

    score_a = rate(output_a, chars_a)
    score_b = rate(output_b, chars_b)

    if score_a > score_b:
        winner = "A"
        reasoning = f"A scored higher ({score_a} vs {score_b}). Chars: {chars_a} vs {chars_b}."
    elif score_b > score_a:
        winner = "B"
        reasoning = f"B scored higher ({score_b} vs {score_a}). Chars: {chars_b} vs {chars_a}."
    else:
        winner = "TIE"
        reasoning = f"Equal scores ({score_a}). Chars: {chars_a} vs {chars_b}. Latency: {dur_a}ms vs {dur_b}ms."

    return {
        "winner": winner,
        "reasoning": reasoning,
        "scores": {"A": score_a, "B": score_b},
        "chars": {"A": chars_a, "B": chars_b},
        "duration_ms": {"A": dur_a, "B": dur_b},
        "estimated_tokens": {"A": int(chars_a / 4), "B": int(chars_b / 4)},
    }
260
+
261
+
262
+ # ---------------------------------------------------------------------------
263
+ # Single operation A/B test
264
+ # ---------------------------------------------------------------------------
265
+
266
def run_ab_operation(domain, operation, gateway, args, num_runs, output_dir):
    """Run ``num_runs`` blind A/B tests for a single operation.

    Each run randomly assigns MCP/CLI to sides A and B, calls both sides,
    compares them blindly, then de-blinds for reporting.  Per-run artifacts
    and an operation-level ``summary.json`` are written under *output_dir*.

    Args:
        domain: CLEO domain.
        operation: Operation name (dotted names allowed).
        gateway: MCP gateway tool (``query``/``mutate``).
        args: Parsed CLI namespace (uses cleo, cleo_path, seed_task,
            params_json).
        num_runs: Number of A/B runs for this operation.
        output_dir: Base results directory.

    Returns:
        Tuple ``(run_results, op_summary)`` — per-run result dicts and the
        aggregated operation summary (the old docstring claimed only a list
        was returned).
    """
    op_key = f"{domain}.{operation}"
    op_dir = Path(output_dir) / domain / operation.replace(".", "_")
    op_dir.mkdir(parents=True, exist_ok=True)

    # Honor --params-json (previously parsed by main() but never used).
    # Invalid JSON fails fast here with a clear traceback.
    extra_params = json.loads(args.params_json) if args.params_json else None

    print(f"\n [{op_key}]")
    run_results = []

    for run_num in range(1, num_runs + 1):
        run_dir = op_dir / f"run-{run_num:03d}"
        run_dir.mkdir(parents=True, exist_ok=True)

        # Randomly assign MCP vs CLI to A and B (blind).
        a_is_mcp = random.choice([True, False])

        if a_is_mcp:
            # Side A = MCP, Side B = CLI
            a_success, a_chars, a_dur, a_resp = call_via_mcp(
                gateway, domain, operation,
                cleo_path=args.cleo_path or ".",
                seed_task=args.seed_task,
                extra_params=extra_params,
            )
            b_success, b_chars, b_dur, b_resp = call_via_cli(
                domain, operation, args.cleo,
                cwd=args.cleo_path,
                seed_task=args.seed_task,
            )
        else:
            # Side A = CLI, Side B = MCP
            a_success, a_chars, a_dur, a_resp = call_via_cli(
                domain, operation, args.cleo,
                cwd=args.cleo_path,
                seed_task=args.seed_task,
            )
            b_success, b_chars, b_dur, b_resp = call_via_mcp(
                gateway, domain, operation,
                cleo_path=args.cleo_path or ".",
                seed_task=args.seed_task,
                extra_params=extra_params,
            )

        comparison = blind_compare(a_resp, b_resp, operation, a_chars, b_chars, a_dur, b_dur)

        # De-blind: map side metrics back to the physical interface.
        mcp_chars = a_chars if a_is_mcp else b_chars
        cli_chars = b_chars if a_is_mcp else a_chars
        mcp_dur = a_dur if a_is_mcp else b_dur
        cli_dur = b_dur if a_is_mcp else a_dur

        # BUG FIX: the previous one-liner
        #   "mcp" if (winner == "A") == a_is_mcp else "cli" if winner != "TIE" else "tie"
        # classified a TIE as an "mcp" win whenever CLI was side A
        # (winner=="A" is False, and False == False is True), inflating MCP
        # win counts.  Check for TIE first.
        if comparison["winner"] == "TIE":
            winner_interface = "tie"
        elif (comparison["winner"] == "A") == a_is_mcp:
            winner_interface = "mcp"
        else:
            winner_interface = "cli"

        run_result = {
            "run": run_num,
            "operation": op_key,
            "gateway": gateway,
            "a_is_mcp": a_is_mcp,
            "winner_label": comparison["winner"],
            "winner_interface": winner_interface,
            "comparison": comparison,
            "mcp": {
                "success": a_success if a_is_mcp else b_success,
                "output_chars": mcp_chars,
                "estimated_tokens": int(mcp_chars / 4),
                "duration_ms": mcp_dur,
            },
            "cli": {
                "success": b_success if a_is_mcp else a_success,
                "output_chars": cli_chars,
                "estimated_tokens": int(cli_chars / 4),
                "duration_ms": cli_dur,
            },
            "token_delta": int(mcp_chars / 4) - int(cli_chars / 4),
            "token_delta_pct": f"{((mcp_chars - cli_chars) / max(cli_chars, 1)) * 100:+.1f}%",
        }

        # Persist run artifacts: raw sides stay blind; meta.json de-blinds.
        (run_dir / "side-a").mkdir(exist_ok=True)
        (run_dir / "side-b").mkdir(exist_ok=True)
        (run_dir / "side-a" / "response.json").write_text(json.dumps(a_resp, indent=2))
        (run_dir / "side-b" / "response.json").write_text(json.dumps(b_resp, indent=2))
        (run_dir / "comparison.json").write_text(json.dumps(comparison, indent=2))
        (run_dir / "meta.json").write_text(json.dumps({
            "a_is_mcp": a_is_mcp,
            "winner_interface": winner_interface,
        }, indent=2))

        status = f"winner={comparison['winner']} ({winner_interface}) mcp={mcp_chars}c cli={cli_chars}c"
        print(f" run {run_num}: {status}")
        run_results.append(run_result)

    # Aggregate op-level summary.
    wins = {"mcp": 0, "cli": 0, "tie": 0}
    for r in run_results:
        wins[r["winner_interface"]] = wins.get(r["winner_interface"], 0) + 1

    token_deltas = [r["token_delta"] for r in run_results]
    avg_delta = sum(token_deltas) / len(token_deltas) if token_deltas else 0

    op_summary = {
        "operation": op_key,
        "runs": num_runs,
        "wins": wins,
        "win_rate": {k: v / num_runs for k, v in wins.items()},
        "avg_token_delta_mcp_minus_cli": round(avg_delta, 1),
        "avg_mcp_chars": round(sum(r["mcp"]["output_chars"] for r in run_results) / num_runs, 0),
        "avg_cli_chars": round(sum(r["cli"]["output_chars"] for r in run_results) / num_runs, 0),
        "avg_mcp_ms": round(sum(r["mcp"]["duration_ms"] for r in run_results) / num_runs, 0),
        "avg_cli_ms": round(sum(r["cli"]["duration_ms"] for r in run_results) / num_runs, 0),
    }

    (op_dir / "summary.json").write_text(json.dumps(op_summary, indent=2))
    return run_results, op_summary
380
+
381
+
382
+ # ---------------------------------------------------------------------------
383
+ # Main
384
+ # ---------------------------------------------------------------------------
385
+
386
def main():
    """CLI entry point: build the test matrix, run all A/B operations, and
    write/print a global summary."""
    parser = argparse.ArgumentParser(description="Blind A/B test: CLEO MCP vs CLI")
    parser.add_argument("--domain", default=None, help="CLEO domain (e.g. tasks, session, admin)")
    parser.add_argument("--operations", default=None, help="Comma-separated operations (e.g. find,show,list)")
    parser.add_argument("--scenario-set", default=None,
                        choices=list(OPERATION_SETS.keys()),
                        help="Predefined operation set")
    # NOTE(review): --tier is accepted but never applied to the matrix —
    # tier filtering is not implemented anywhere in this script.
    parser.add_argument("--tier", type=int, default=None, help="Filter by tier (0, 1, 2)")
    parser.add_argument("--gateway", default="query", choices=["query", "mutate"])
    parser.add_argument("--runs", type=int, default=3, help="Runs per operation (default: 3)")
    parser.add_argument("--cleo", default="cleo-dev", help="CLI binary")
    parser.add_argument("--cleo-path", default=None, help="CLEO project root (for MCP server)")
    parser.add_argument("--output-dir", default=None, help="Output directory")
    parser.add_argument("--seed-task", default=None, help="Task ID for operations needing one")
    parser.add_argument("--params-json", default=None, help="Extra params as JSON")
    parser.add_argument("--json", action="store_true", help="Print summary JSON to stdout")
    args = parser.parse_args()

    # Build the {domain: [operations]} test matrix.
    if args.scenario_set:
        test_matrix = OPERATION_SETS[args.scenario_set]
    elif args.domain and args.operations:
        test_matrix = {args.domain: [o.strip() for o in args.operations.split(",")]}
    elif args.domain:
        # Sensible default operations per known domain.
        domain_defaults = {
            "tasks": ["find", "show", "list"],
            "session": ["status", "list"],
            "admin": ["dash", "health", "help"],
            "tools": ["skill.list"],
            "memory": ["find"],
            "check": ["health"],
            "pipeline": ["stage.status"],
            "orchestrate": ["status"],
            "nexus": ["status"],
            "sticky": ["list"],
        }
        test_matrix = {args.domain: domain_defaults.get(args.domain, ["find"])}
    else:
        print("ERROR: Provide --domain, --domain + --operations, or --scenario-set", file=sys.stderr)
        sys.exit(1)

    # Output directory: timestamped default unless given explicitly.
    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
    base_output = Path(args.output_dir) if args.output_dir else Path(f"./ab-results/{ts}")
    base_output.mkdir(parents=True, exist_ok=True)

    # (The originals were f-strings with no placeholders.)
    print("=== CLEO MCP vs CLI Blind A/B Test ===")
    print(f" Domains : {list(test_matrix.keys())}")
    print(f" Runs/op : {args.runs}")
    print(f" Gateway : {args.gateway}")
    print(f" Output : {base_output}")

    all_op_summaries = []
    all_run_results = []

    for domain, operations in test_matrix.items():
        for operation in operations:
            run_results, op_summary = run_ab_operation(
                domain, operation, args.gateway, args, args.runs, base_output
            )
            all_op_summaries.append(op_summary)
            all_run_results.extend(run_results)

    # Global aggregation across all operations.
    total_mcp_wins = sum(s["wins"].get("mcp", 0) for s in all_op_summaries)
    total_cli_wins = sum(s["wins"].get("cli", 0) for s in all_op_summaries)
    total_ties = sum(s["wins"].get("tie", 0) for s in all_op_summaries)
    total_runs = len(all_run_results)
    avg_token_delta = sum(s["avg_token_delta_mcp_minus_cli"] for s in all_op_summaries) / max(len(all_op_summaries), 1)

    summary = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        # Direct reference — the original made a redundant shallow copy via
        # a dict comprehension.
        "test_matrix": test_matrix,
        "total_runs": total_runs,
        "global_wins": {
            "mcp": total_mcp_wins,
            "cli": total_cli_wins,
            "tie": total_ties,
        },
        "global_win_rate": {
            "mcp": round(total_mcp_wins / max(total_runs, 1), 3),
            "cli": round(total_cli_wins / max(total_runs, 1), 3),
        },
        "avg_token_delta_mcp_minus_cli": round(avg_token_delta, 1),
        "per_operation": {s["operation"]: s for s in all_op_summaries},
    }

    (base_output / "summary.json").write_text(json.dumps(summary, indent=2))

    print("\n=== Results ===")
    print(f" Total runs : {total_runs}")
    print(f" MCP wins : {total_mcp_wins} ({summary['global_win_rate']['mcp']*100:.1f}%)")
    print(f" CLI wins : {total_cli_wins} ({summary['global_win_rate']['cli']*100:.1f}%)")
    print(f" Ties : {total_ties}")
    # Manual sign: positive deltas get a "+", zero stays unsigned.
    delta_sign = "+" if avg_token_delta > 0 else ""
    print(f" Avg token delta (MCP-CLI): {delta_sign}{avg_token_delta:.1f} tokens")
    print(f"\nSaved: {base_output}")

    if args.json:
        print(json.dumps(summary, indent=2))


if __name__ == "__main__":
    main()