@cleocode/skills 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dispatch-config.json +404 -0
- package/index.d.ts +178 -0
- package/index.js +405 -0
- package/package.json +14 -0
- package/profiles/core.json +7 -0
- package/profiles/full.json +10 -0
- package/profiles/minimal.json +7 -0
- package/profiles/recommended.json +7 -0
- package/provider-skills-map.json +97 -0
- package/skills/_shared/cleo-style-guide.md +84 -0
- package/skills/_shared/manifest-operations.md +810 -0
- package/skills/_shared/placeholders.json +433 -0
- package/skills/_shared/skill-chaining-patterns.md +237 -0
- package/skills/_shared/subagent-protocol-base.md +223 -0
- package/skills/_shared/task-system-integration.md +232 -0
- package/skills/_shared/testing-framework-config.md +110 -0
- package/skills/ct-cleo/SKILL.md +490 -0
- package/skills/ct-cleo/references/anti-patterns.md +19 -0
- package/skills/ct-cleo/references/loom-lifecycle.md +136 -0
- package/skills/ct-cleo/references/orchestrator-constraints.md +55 -0
- package/skills/ct-cleo/references/session-protocol.md +162 -0
- package/skills/ct-codebase-mapper/SKILL.md +82 -0
- package/skills/ct-contribution/SKILL.md +521 -0
- package/skills/ct-contribution/templates/contribution-init.json +21 -0
- package/skills/ct-dev-workflow/SKILL.md +423 -0
- package/skills/ct-docs-lookup/SKILL.md +66 -0
- package/skills/ct-docs-review/SKILL.md +175 -0
- package/skills/ct-docs-write/SKILL.md +108 -0
- package/skills/ct-documentor/SKILL.md +231 -0
- package/skills/ct-epic-architect/SKILL.md +305 -0
- package/skills/ct-epic-architect/references/bug-epic-example.md +172 -0
- package/skills/ct-epic-architect/references/commands.md +201 -0
- package/skills/ct-epic-architect/references/feature-epic-example.md +210 -0
- package/skills/ct-epic-architect/references/migration-epic-example.md +244 -0
- package/skills/ct-epic-architect/references/output-format.md +92 -0
- package/skills/ct-epic-architect/references/patterns.md +284 -0
- package/skills/ct-epic-architect/references/refactor-epic-example.md +412 -0
- package/skills/ct-epic-architect/references/research-epic-example.md +226 -0
- package/skills/ct-epic-architect/references/shell-escaping.md +86 -0
- package/skills/ct-epic-architect/references/skill-aware-execution.md +195 -0
- package/skills/ct-grade/SKILL.md +230 -0
- package/skills/ct-grade/agents/analysis-reporter.md +203 -0
- package/skills/ct-grade/agents/blind-comparator.md +157 -0
- package/skills/ct-grade/agents/scenario-runner.md +134 -0
- package/skills/ct-grade/eval-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
- package/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
- package/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
- package/skills/ct-grade/eval-viewer/generate_review.py +283 -0
- package/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
- package/skills/ct-grade/eval-viewer/viewer.html +219 -0
- package/skills/ct-grade/evals/evals.json +94 -0
- package/skills/ct-grade/references/ab-test-methodology.md +150 -0
- package/skills/ct-grade/references/domains.md +137 -0
- package/skills/ct-grade/references/grade-spec.md +236 -0
- package/skills/ct-grade/references/scenario-playbook.md +234 -0
- package/skills/ct-grade/references/token-tracking.md +120 -0
- package/skills/ct-grade/scripts/__pycache__/audit_analyzer.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/run_ab_test.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/run_all.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/token_tracker.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/audit_analyzer.py +279 -0
- package/skills/ct-grade/scripts/generate_report.py +283 -0
- package/skills/ct-grade/scripts/run_ab_test.py +504 -0
- package/skills/ct-grade/scripts/run_all.py +287 -0
- package/skills/ct-grade/scripts/setup_run.py +183 -0
- package/skills/ct-grade/scripts/token_tracker.py +630 -0
- package/skills/ct-grade-v2-1/SKILL.md +237 -0
- package/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
- package/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
- package/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
- package/skills/ct-grade-v2-1/evals/evals.json +74 -0
- package/skills/ct-grade-v2-1/grade-viewer/__pycache__/build_op_stats.cpython-314.pyc +0 -0
- package/skills/ct-grade-v2-1/grade-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
- package/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
- package/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
- package/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
- package/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
- package/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
- package/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
- package/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
- package/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
- package/skills/ct-grade-v2-1/manifest-entry.json +31 -0
- package/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
- package/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
- package/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
- package/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
- package/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
- package/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
- package/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
- package/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
- package/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
- package/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
- package/skills/ct-memory/SKILL.md +84 -0
- package/skills/ct-orchestrator/INSTALL.md +61 -0
- package/skills/ct-orchestrator/README.md +69 -0
- package/skills/ct-orchestrator/SKILL.md +380 -0
- package/skills/ct-orchestrator/manifest-entry.json +19 -0
- package/skills/ct-orchestrator/orchestrator-prompt.txt +17 -0
- package/skills/ct-orchestrator/references/SUBAGENT-PROTOCOL-BLOCK.md +66 -0
- package/skills/ct-orchestrator/references/autonomous-operation.md +167 -0
- package/skills/ct-orchestrator/references/lifecycle-gates.md +98 -0
- package/skills/ct-orchestrator/references/orchestrator-compliance.md +271 -0
- package/skills/ct-orchestrator/references/orchestrator-handoffs.md +85 -0
- package/skills/ct-orchestrator/references/orchestrator-patterns.md +164 -0
- package/skills/ct-orchestrator/references/orchestrator-recovery.md +113 -0
- package/skills/ct-orchestrator/references/orchestrator-spawning.md +271 -0
- package/skills/ct-orchestrator/references/orchestrator-tokens.md +180 -0
- package/skills/ct-research-agent/SKILL.md +226 -0
- package/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
- package/skills/ct-skill-creator/.cleo/logs/cleo.2026-03-07.1.log +24 -0
- package/skills/ct-skill-creator/.cleo/tasks.db +0 -0
- package/skills/ct-skill-creator/SKILL.md +356 -0
- package/skills/ct-skill-creator/agents/analyzer.md +276 -0
- package/skills/ct-skill-creator/agents/comparator.md +204 -0
- package/skills/ct-skill-creator/agents/grader.md +225 -0
- package/skills/ct-skill-creator/assets/eval_review.html +146 -0
- package/skills/ct-skill-creator/eval-viewer/__pycache__/generate_review.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
- package/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
- package/skills/ct-skill-creator/manifest-entry.json +17 -0
- package/skills/ct-skill-creator/references/dynamic-context.md +228 -0
- package/skills/ct-skill-creator/references/frontmatter.md +83 -0
- package/skills/ct-skill-creator/references/invocation-control.md +165 -0
- package/skills/ct-skill-creator/references/output-patterns.md +86 -0
- package/skills/ct-skill-creator/references/provider-deployment.md +175 -0
- package/skills/ct-skill-creator/references/schemas.md +430 -0
- package/skills/ct-skill-creator/references/workflows.md +28 -0
- package/skills/ct-skill-creator/scripts/__init__.py +1 -0
- package/skills/ct-skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/init_skill.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/quick_validate.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/skills/ct-skill-creator/scripts/generate_report.py +326 -0
- package/skills/ct-skill-creator/scripts/improve_description.py +247 -0
- package/skills/ct-skill-creator/scripts/init_skill.py +306 -0
- package/skills/ct-skill-creator/scripts/package_skill.py +110 -0
- package/skills/ct-skill-creator/scripts/quick_validate.py +97 -0
- package/skills/ct-skill-creator/scripts/run_eval.py +310 -0
- package/skills/ct-skill-creator/scripts/run_loop.py +328 -0
- package/skills/ct-skill-creator/scripts/utils.py +47 -0
- package/skills/ct-skill-validator/SKILL.md +178 -0
- package/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
- package/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
- package/skills/ct-skill-validator/evals/eval_set.json +14 -0
- package/skills/ct-skill-validator/evals/evals.json +52 -0
- package/skills/ct-skill-validator/manifest-entry.json +20 -0
- package/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
- package/skills/ct-skill-validator/references/validation-rules.md +168 -0
- package/skills/ct-skill-validator/scripts/__init__.py +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/audit_body.py +242 -0
- package/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
- package/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
- package/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
- package/skills/ct-skill-validator/scripts/validate.py +422 -0
- package/skills/ct-spec-writer/SKILL.md +189 -0
- package/skills/ct-stickynote/README.md +14 -0
- package/skills/ct-stickynote/SKILL.md +46 -0
- package/skills/ct-task-executor/SKILL.md +296 -0
- package/skills/ct-validator/SKILL.md +216 -0
- package/skills/manifest.json +469 -0
- package/skills.json +281 -0
|
@@ -0,0 +1,493 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Blind A/B test: CLEO MCP vs CLI for the same operations.
|
|
4
|
+
|
|
5
|
+
Side A = MCP JSON-RPC via stdio (node dist/mcp/index.js)
|
|
6
|
+
Side B = CLI subprocess (cleo-dev <domain> <operation>)
|
|
7
|
+
|
|
8
|
+
Randomly shuffles A/B assignment per run so the comparator is blind.
|
|
9
|
+
|
|
10
|
+
Usage:
|
|
11
|
+
python run_ab_test.py --domain tasks --operations find,show,list [options]
|
|
12
|
+
python run_ab_test.py --scenario-set parity [options]
|
|
13
|
+
python run_ab_test.py --domain session --tier 0 [options]
|
|
14
|
+
|
|
15
|
+
Options:
|
|
16
|
+
--domain CLEO domain to test (tasks, session, admin, tools, etc.)
|
|
17
|
+
--operations Comma-separated operation names (e.g. find,show,list)
|
|
18
|
+
--scenario-set Predefined set: parity (P1+P2+P3), smoke, standard
|
|
19
|
+
--tier Filter operations by tier (0, 1, 2)
|
|
20
|
+
--gateway query or mutate (default: query)
|
|
21
|
+
--runs Runs per operation (default: 3)
|
|
22
|
+
--cleo CLI binary (default: cleo-dev)
|
|
23
|
+
--cleo-path Path to CLEO project root (for MCP server)
|
|
24
|
+
--output-dir Results directory
|
|
25
|
+
--params-json JSON string of params to pass to each operation
|
|
26
|
+
--seed-task Task ID to use in operations that need one
|
|
27
|
+
--json Print summary JSON to stdout
|
|
28
|
+
"""
|
|
29
|
+
|
|
30
|
+
import argparse
|
|
31
|
+
import json
|
|
32
|
+
import os
|
|
33
|
+
import random
|
|
34
|
+
import subprocess
|
|
35
|
+
import sys
|
|
36
|
+
import time
|
|
37
|
+
from datetime import datetime, timezone
|
|
38
|
+
from pathlib import Path
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
# Operation sets
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
|
|
45
|
+
# Named operation sets selectable via --scenario-set.
# Keys are set names; values map domain -> list of operations to exercise.
OPERATION_SETS = {
    # Quick sanity pass over the three core domains.
    "smoke": {
        "tasks": ["find", "show", "plan"],
        "session": ["status", "list"],
        "admin": ["dash", "health"],
    },
    # Broader everyday coverage, including the tools domain.
    "standard": {
        "tasks": ["find", "show", "list", "tree", "plan"],
        "session": ["status", "list", "briefing.show"],
        "admin": ["dash", "health", "help", "stats"],
        "tools": ["skill.list", "provider.list"],
    },
    # Full MCP/CLI parity sweep (P1-P3 per the module docstring).
    "parity": {
        # P1: tasks domain
        "tasks": ["find", "show", "list", "tree", "plan"],
        # P2: session domain
        "session": ["status", "list", "briefing.show", "handoff.show"],
        # P3: admin domain
        "admin": ["dash", "health", "help", "stats", "doctor"],
    },
}
|
|
66
|
+
|
|
67
|
+
# Operations that need a task ID in params (CLI: positional arg; MCP: taskId).
TASK_ID_OPS = {"show", "exists", "complete", "cancel", "archive", "restore",
               "start", "stop", "relates", "complexity.estimate", "history"}


# Map from domain.operation to CLI args builder
def build_cli_args(domain, operation, seed_task=None):
    """Build the CLI argument list for a ``domain.operation`` call.

    Dotted operations (e.g. ``briefing.show``) become nested CLI
    sub-commands (``briefing show``).

    Args:
        domain: CLEO domain name (e.g. ``tasks``, ``session``).
        operation: Operation name, possibly dotted.
        seed_task: Task ID appended for operations in ``TASK_ID_OPS``.

    Returns:
        List of CLI arguments, starting with the domain.
    """
    base = [domain]

    # Map dotted operations to CLI sub-commands.
    base.extend(operation.split("."))

    # Add required params for operations that need them.
    if operation in TASK_ID_OPS and seed_task:
        base.append(seed_task)  # idiom fix: append one item, not extend([x])
    elif operation == "find":
        base.extend(["--query", "test"])
    elif operation == "label.show":
        base.extend(["--label", "bug"])
    # "help" and all remaining operations take no extra args.

    return base
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def build_mcp_payload(gateway, domain, operation, seed_task=None, extra_params=None):
    """Build an MCP JSON-RPC ``tools/call`` request payload (id=1).

    Args:
        gateway: Gateway tool name (``query`` or ``mutate``).
        domain: CLEO domain name.
        operation: Operation name within the domain.
        seed_task: Task ID injected as ``taskId`` for ``TASK_ID_OPS`` operations.
        extra_params: Optional base params dict; copied, never mutated.

    Returns:
        JSON-RPC request dict ready to be serialized to the MCP server.
    """
    # Bug fix: the original assigned the caller's dict directly and then
    # mutated it, leaking taskId/query back into the caller's extra_params.
    params = dict(extra_params) if extra_params else {}
    # seed_task tested first so the set membership check short-circuits
    # when no task ID was supplied (truth-value equivalent to the original).
    if seed_task and operation in TASK_ID_OPS:
        params["taskId"] = seed_task
    elif operation == "find" and not params:
        params["query"] = "test"

    return {
        "jsonrpc": "2.0",
        "id": 1,
        "method": "tools/call",
        "params": {
            "name": gateway,
            "arguments": {
                "domain": domain,
                "operation": operation,
                "params": params,
            }
        }
    }
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# Standing JSON-RPC "initialize" handshake message (id=0), sent before the
# tool call on every MCP stdio session.  protocolVersion pins the MCP spec
# revision this client speaks.
MCP_INIT = {
    "jsonrpc": "2.0",
    "id": 0,
    "method": "initialize",
    "params": {
        "protocolVersion": "2024-11-05",
        "capabilities": {},
        "clientInfo": {"name": "ct-grade-ab-test", "version": "2.1.0"}
    }
}
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
# ---------------------------------------------------------------------------
|
|
129
|
+
# Interface callers
|
|
130
|
+
# ---------------------------------------------------------------------------
|
|
131
|
+
|
|
132
|
+
def call_via_mcp(gateway, domain, operation, cleo_path, seed_task=None, extra_params=None):
    """Invoke CLEO through the MCP stdio JSON-RPC server.

    Sends the initialize handshake followed by one tools/call request to
    ``node dist/mcp/index.js`` and scans stdout for the id=1 reply.

    Returns:
        Tuple ``(success, output_chars, duration_ms, response)``.
    """
    request = build_mcp_payload(gateway, domain, operation, seed_task, extra_params)
    stdin_blob = "\n".join([json.dumps(MCP_INIT), json.dumps(request)]) + "\n"

    server_js = Path(cleo_path) / "dist" / "mcp" / "index.js"
    if not server_js.exists():
        return False, 0, 0, {"error": f"MCP server not found at {server_js}"}

    t0 = time.time()
    try:
        completed = subprocess.run(
            ["node", str(server_js)],
            input=stdin_blob,
            capture_output=True,
            text=True,
            timeout=30,
            cwd=str(cleo_path),
        )
    except subprocess.TimeoutExpired:
        return False, 0, 30000, {"error": "timeout"}
    except Exception as exc:
        return False, 0, 0, {"error": str(exc)}

    duration_ms = int((time.time() - t0) * 1000)
    stdout_text = completed.stdout
    output_chars = len(stdout_text)

    # Locate the id=1 tool-call reply among the JSON-RPC lines on stdout;
    # unparseable or non-dict lines are skipped.
    reply = None
    for raw_line in stdout_text.splitlines():
        candidate = raw_line.strip()
        if not candidate:
            continue
        try:
            msg = json.loads(candidate)
            if msg.get("id") == 1:
                reply = msg
                break
        except Exception:
            continue

    # Heuristic DB-availability sniff on stderr (migrations / missing file).
    err_text = completed.stderr or ""
    db_broken = ("migration" in err_text.lower()
                 or "ENOENT" in err_text
                 or "tasks.db" in err_text.lower())
    if db_broken:
        return False, 0, duration_ms, {"error": "DB_UNAVAILABLE", "stderr": err_text[:200]}

    if reply is None:
        return False, output_chars, duration_ms, {"error": "no response found", "raw": stdout_text[:500]}

    ok = "result" in reply and "error" not in reply
    return ok, output_chars, duration_ms, reply
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def call_via_cli(domain, operation, cleo_bin, cwd=None, seed_task=None):
    """Invoke CLEO through its CLI in ``--json`` mode.

    Returns:
        Tuple ``(success, output_chars, duration_ms, parsed_output)``.
    """
    command = [cleo_bin, *build_cli_args(domain, operation, seed_task), "--json"]

    t0 = time.time()
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=30,
            cwd=cwd,
        )
    except subprocess.TimeoutExpired:
        return False, 0, 30000, {"error": "timeout"}
    except Exception as exc:
        return False, 0, 0, {"error": str(exc)}

    duration_ms = int((time.time() - t0) * 1000)
    stdout_text = completed.stdout
    try:
        payload = json.loads(stdout_text)
    except Exception:
        # Non-JSON output: keep a truncated raw sample for the comparator.
        payload = {"raw": stdout_text[:500]}

    return completed.returncode == 0, len(stdout_text), duration_ms, payload
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# ---------------------------------------------------------------------------
|
|
216
|
+
# Blind comparator
|
|
217
|
+
# ---------------------------------------------------------------------------
|
|
218
|
+
|
|
219
|
+
def blind_compare(output_a, output_b, operation, chars_a, chars_b, dur_a, dur_b):
    """
    Heuristic blind comparator for two anonymized responses.

    Scores each side on completeness, structure, and token efficiency —
    a stand-in for the LLM comparator agent used in real runs — and
    declares "A", "B", or "TIE".
    """
    def _score(payload, char_count):
        points = 0
        if isinstance(payload, dict):
            # Completeness: has data?
            if payload.get("result") or payload.get("data") or payload.get("success"):
                points += 3
            if "error" not in payload:
                points += 2
            # Structure: is it clean JSON?
            points += 2
        # Token efficiency: smaller is better (same data)
        points += max(0, 3 - int(char_count / 2000))
        return min(10, points)

    score_a, score_b = _score(output_a, chars_a), _score(output_b, chars_b)

    if score_a == score_b:
        winner = "TIE"
        reasoning = f"Equal scores ({score_a}). Chars: {chars_a} vs {chars_b}. Latency: {dur_a}ms vs {dur_b}ms."
    elif score_a > score_b:
        winner = "A"
        reasoning = f"A scored higher ({score_a} vs {score_b}). Chars: {chars_a} vs {chars_b}."
    else:
        winner = "B"
        reasoning = f"B scored higher ({score_b} vs {score_a}). Chars: {chars_b} vs {chars_a}."

    return {
        "winner": winner,
        "reasoning": reasoning,
        "scores": {"A": score_a, "B": score_b},
        "chars": {"A": chars_a, "B": chars_b},
        "duration_ms": {"A": dur_a, "B": dur_b},
        "estimated_tokens": {"A": int(chars_a / 4), "B": int(chars_b / 4)},
    }
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
# ---------------------------------------------------------------------------
|
|
263
|
+
# Single operation A/B test
|
|
264
|
+
# ---------------------------------------------------------------------------
|
|
265
|
+
|
|
266
|
+
def run_ab_operation(domain, operation, gateway, args, num_runs, output_dir):
    """Run ``num_runs`` blind A/B comparisons for one ``domain.operation``.

    Each run randomly assigns MCP/CLI to sides A and B, calls both sides,
    compares them blindly, then de-blinds to attribute the win to the
    physical interface.  Per-run artifacts and an op-level summary are
    written under ``output_dir/<domain>/<operation>``.

    Args:
        domain: CLEO domain name.
        operation: Operation name (dots become ``_`` in result paths).
        gateway: MCP gateway tool (``query`` or ``mutate``).
        args: Parsed CLI namespace (uses ``cleo``, ``cleo_path``, ``seed_task``).
        num_runs: Number of A/B runs for this operation.
        output_dir: Base results directory.

    Returns:
        Tuple ``(run_results, op_summary)``.
    """
    # NOTE(review): args.params_json is accepted by the CLI parser but never
    # forwarded here — confirm whether extra params should reach the calls.
    op_key = f"{domain}.{operation}"
    op_dir = Path(output_dir) / domain / operation.replace(".", "_")
    op_dir.mkdir(parents=True, exist_ok=True)

    print(f"\n  [{op_key}]")
    run_results = []

    for run_num in range(1, num_runs + 1):
        run_dir = op_dir / f"run-{run_num:03d}"
        run_dir.mkdir(parents=True, exist_ok=True)

        # Randomly assign MCP vs CLI to sides A/B so the comparator is blind.
        # Side A is always executed first, matching the original ordering.
        a_is_mcp = random.choice([True, False])

        if a_is_mcp:
            # Side A = MCP, Side B = CLI
            a_success, a_chars, a_dur, a_resp = call_via_mcp(
                gateway, domain, operation,
                cleo_path=args.cleo_path or ".",
                seed_task=args.seed_task,
            )
            b_success, b_chars, b_dur, b_resp = call_via_cli(
                domain, operation, args.cleo,
                cwd=args.cleo_path,
                seed_task=args.seed_task,
            )
        else:
            # Side A = CLI, Side B = MCP
            a_success, a_chars, a_dur, a_resp = call_via_cli(
                domain, operation, args.cleo,
                cwd=args.cleo_path,
                seed_task=args.seed_task,
            )
            b_success, b_chars, b_dur, b_resp = call_via_mcp(
                gateway, domain, operation,
                cleo_path=args.cleo_path or ".",
                seed_task=args.seed_task,
            )

        comparison = blind_compare(a_resp, b_resp, operation, a_chars, b_chars, a_dur, b_dur)

        # De-blind: attribute stats to the physical interface.
        mcp_chars, cli_chars = (a_chars, b_chars) if a_is_mcp else (b_chars, a_chars)
        mcp_dur, cli_dur = (a_dur, b_dur) if a_is_mcp else (b_dur, a_dur)
        mcp_success = a_success if a_is_mcp else b_success
        cli_success = b_success if a_is_mcp else a_success

        # BUG FIX: the original one-liner classified a TIE as an "mcp" win
        # whenever CLI happened to be side A, because ("TIE" == "A") == False
        # evaluated True when a_is_mcp was False.  Check TIE first.
        if comparison["winner"] == "TIE":
            winner_interface = "tie"
        elif (comparison["winner"] == "A") == a_is_mcp:
            winner_interface = "mcp"
        else:
            winner_interface = "cli"

        run_result = {
            "run": run_num,
            "operation": op_key,
            "gateway": gateway,
            "a_is_mcp": a_is_mcp,
            "winner_label": comparison["winner"],
            "winner_interface": winner_interface,
            "comparison": comparison,
            "mcp": {
                "success": mcp_success,
                "output_chars": mcp_chars,
                "estimated_tokens": int(mcp_chars / 4),
                "duration_ms": mcp_dur,
            },
            "cli": {
                "success": cli_success,
                "output_chars": cli_chars,
                "estimated_tokens": int(cli_chars / 4),
                "duration_ms": cli_dur,
            },
            "token_delta": int(mcp_chars / 4) - int(cli_chars / 4),
            "token_delta_pct": f"{((mcp_chars - cli_chars) / max(cli_chars, 1)) * 100:+.1f}%",
        }

        # Persist per-run artifacts for later (human or agent) review.
        (run_dir / "side-a").mkdir(exist_ok=True)
        (run_dir / "side-b").mkdir(exist_ok=True)
        (run_dir / "side-a" / "response.json").write_text(json.dumps(a_resp, indent=2))
        (run_dir / "side-b" / "response.json").write_text(json.dumps(b_resp, indent=2))
        (run_dir / "comparison.json").write_text(json.dumps(comparison, indent=2))
        (run_dir / "meta.json").write_text(json.dumps({
            "a_is_mcp": a_is_mcp,
            "winner_interface": winner_interface,
        }, indent=2))

        status = f"winner={comparison['winner']} ({winner_interface}) mcp={mcp_chars}c cli={cli_chars}c"
        print(f"    run {run_num}: {status}")
        run_results.append(run_result)

    # Aggregate op-level summary.
    wins = {"mcp": 0, "cli": 0, "tie": 0}
    for r in run_results:
        wins[r["winner_interface"]] = wins.get(r["winner_interface"], 0) + 1

    token_deltas = [r["token_delta"] for r in run_results]
    avg_delta = sum(token_deltas) / len(token_deltas) if token_deltas else 0

    op_summary = {
        "operation": op_key,
        "runs": num_runs,
        "wins": wins,
        "win_rate": {k: v / num_runs for k, v in wins.items()},
        "avg_token_delta_mcp_minus_cli": round(avg_delta, 1),
        "avg_mcp_chars": round(sum(r["mcp"]["output_chars"] for r in run_results) / num_runs, 0),
        "avg_cli_chars": round(sum(r["cli"]["output_chars"] for r in run_results) / num_runs, 0),
        "avg_mcp_ms": round(sum(r["mcp"]["duration_ms"] for r in run_results) / num_runs, 0),
        "avg_cli_ms": round(sum(r["cli"]["duration_ms"] for r in run_results) / num_runs, 0),
    }

    (op_dir / "summary.json").write_text(json.dumps(op_summary, indent=2))
    return run_results, op_summary
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
# ---------------------------------------------------------------------------
|
|
383
|
+
# Main
|
|
384
|
+
# ---------------------------------------------------------------------------
|
|
385
|
+
|
|
386
|
+
def main():
    """CLI entry point: build the test matrix, run every A/B operation,
    and write/print the global summary."""
    parser = argparse.ArgumentParser(description="Blind A/B test: CLEO MCP vs CLI")
    parser.add_argument("--domain", default=None, help="CLEO domain (e.g. tasks, session, admin)")
    parser.add_argument("--operations", default=None, help="Comma-separated operations (e.g. find,show,list)")
    parser.add_argument("--scenario-set", default=None,
                        choices=list(OPERATION_SETS.keys()),
                        help="Predefined operation set")
    parser.add_argument("--tier", type=int, default=None, help="Filter by tier (0, 1, 2)")
    parser.add_argument("--gateway", default="query", choices=["query", "mutate"])
    parser.add_argument("--runs", type=int, default=3, help="Runs per operation (default: 3)")
    parser.add_argument("--cleo", default="cleo-dev", help="CLI binary")
    parser.add_argument("--cleo-path", default=None, help="CLEO project root (for MCP server)")
    parser.add_argument("--output-dir", default=None, help="Output directory")
    parser.add_argument("--seed-task", default=None, help="Task ID for operations needing one")
    parser.add_argument("--params-json", default=None, help="Extra params as JSON")
    parser.add_argument("--json", action="store_true", help="Print summary JSON to stdout")
    args = parser.parse_args()

    # NOTE(review): --tier and --params-json are parsed but never used below;
    # confirm whether tier filtering / extra params were meant to be wired in.

    # Build test matrix: {domain: [operations]}.
    if args.scenario_set:
        test_matrix = OPERATION_SETS[args.scenario_set]
    elif args.domain and args.operations:
        test_matrix = {args.domain: [o.strip() for o in args.operations.split(",")]}
    elif args.domain:
        # Fallback: small default operation list per known domain.
        domain_defaults = {
            "tasks": ["find", "show", "list"],
            "session": ["status", "list"],
            "admin": ["dash", "health", "help"],
            "tools": ["skill.list"],
            "memory": ["find"],
            "check": ["health"],
            "pipeline": ["stage.status"],
            "orchestrate": ["status"],
            "nexus": ["status"],
            "sticky": ["list"],
        }
        test_matrix = {args.domain: domain_defaults.get(args.domain, ["find"])}
    else:
        print("ERROR: Provide --domain, --domain + --operations, or --scenario-set", file=sys.stderr)
        sys.exit(1)

    # Output directory (timestamped unless explicitly given).
    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
    base_output = Path(args.output_dir) if args.output_dir else Path(f"./ab-results/{ts}")
    base_output.mkdir(parents=True, exist_ok=True)

    print("=== CLEO MCP vs CLI Blind A/B Test ===")
    print(f"  Domains : {list(test_matrix.keys())}")
    print(f"  Runs/op : {args.runs}")
    print(f"  Gateway : {args.gateway}")
    print(f"  Output  : {base_output}")

    all_op_summaries = []
    all_run_results = []

    for domain, operations in test_matrix.items():
        for operation in operations:
            run_results, op_summary = run_ab_operation(
                domain, operation, args.gateway, args, args.runs, base_output
            )
            all_op_summaries.append(op_summary)
            all_run_results.extend(run_results)

    # Global summary across all operations.
    total_mcp_wins = sum(s["wins"].get("mcp", 0) for s in all_op_summaries)
    total_cli_wins = sum(s["wins"].get("cli", 0) for s in all_op_summaries)
    total_ties = sum(s["wins"].get("tie", 0) for s in all_op_summaries)
    total_runs = len(all_run_results)
    avg_token_delta = sum(s["avg_token_delta_mcp_minus_cli"] for s in all_op_summaries) / max(len(all_op_summaries), 1)

    summary = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        # Idiom fix: plain shallow copy instead of an identity dict comprehension.
        "test_matrix": dict(test_matrix),
        "total_runs": total_runs,
        "global_wins": {
            "mcp": total_mcp_wins,
            "cli": total_cli_wins,
            "tie": total_ties,
        },
        "global_win_rate": {
            "mcp": round(total_mcp_wins / max(total_runs, 1), 3),
            "cli": round(total_cli_wins / max(total_runs, 1), 3),
        },
        "avg_token_delta_mcp_minus_cli": round(avg_token_delta, 1),
        "per_operation": {s["operation"]: s for s in all_op_summaries},
    }

    (base_output / "summary.json").write_text(json.dumps(summary, indent=2))

    print("\n=== Results ===")
    print(f"  Total runs : {total_runs}")
    print(f"  MCP wins   : {total_mcp_wins} ({summary['global_win_rate']['mcp']*100:.1f}%)")
    print(f"  CLI wins   : {total_cli_wins} ({summary['global_win_rate']['cli']*100:.1f}%)")
    print(f"  Ties       : {total_ties}")
    delta_sign = "+" if avg_token_delta > 0 else ""
    print(f"  Avg token delta (MCP-CLI): {delta_sign}{avg_token_delta:.1f} tokens")
    print(f"\nSaved: {base_output}")

    if args.json:
        print(json.dumps(summary, indent=2))


if __name__ == "__main__":
    main()
|