@cleocode/skills 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dispatch-config.json +404 -0
- package/index.d.ts +178 -0
- package/index.js +405 -0
- package/package.json +14 -0
- package/profiles/core.json +7 -0
- package/profiles/full.json +10 -0
- package/profiles/minimal.json +7 -0
- package/profiles/recommended.json +7 -0
- package/provider-skills-map.json +97 -0
- package/skills/_shared/cleo-style-guide.md +84 -0
- package/skills/_shared/manifest-operations.md +810 -0
- package/skills/_shared/placeholders.json +433 -0
- package/skills/_shared/skill-chaining-patterns.md +237 -0
- package/skills/_shared/subagent-protocol-base.md +223 -0
- package/skills/_shared/task-system-integration.md +232 -0
- package/skills/_shared/testing-framework-config.md +110 -0
- package/skills/ct-cleo/SKILL.md +490 -0
- package/skills/ct-cleo/references/anti-patterns.md +19 -0
- package/skills/ct-cleo/references/loom-lifecycle.md +136 -0
- package/skills/ct-cleo/references/orchestrator-constraints.md +55 -0
- package/skills/ct-cleo/references/session-protocol.md +162 -0
- package/skills/ct-codebase-mapper/SKILL.md +82 -0
- package/skills/ct-contribution/SKILL.md +521 -0
- package/skills/ct-contribution/templates/contribution-init.json +21 -0
- package/skills/ct-dev-workflow/SKILL.md +423 -0
- package/skills/ct-docs-lookup/SKILL.md +66 -0
- package/skills/ct-docs-review/SKILL.md +175 -0
- package/skills/ct-docs-write/SKILL.md +108 -0
- package/skills/ct-documentor/SKILL.md +231 -0
- package/skills/ct-epic-architect/SKILL.md +305 -0
- package/skills/ct-epic-architect/references/bug-epic-example.md +172 -0
- package/skills/ct-epic-architect/references/commands.md +201 -0
- package/skills/ct-epic-architect/references/feature-epic-example.md +210 -0
- package/skills/ct-epic-architect/references/migration-epic-example.md +244 -0
- package/skills/ct-epic-architect/references/output-format.md +92 -0
- package/skills/ct-epic-architect/references/patterns.md +284 -0
- package/skills/ct-epic-architect/references/refactor-epic-example.md +412 -0
- package/skills/ct-epic-architect/references/research-epic-example.md +226 -0
- package/skills/ct-epic-architect/references/shell-escaping.md +86 -0
- package/skills/ct-epic-architect/references/skill-aware-execution.md +195 -0
- package/skills/ct-grade/SKILL.md +230 -0
- package/skills/ct-grade/agents/analysis-reporter.md +203 -0
- package/skills/ct-grade/agents/blind-comparator.md +157 -0
- package/skills/ct-grade/agents/scenario-runner.md +134 -0
- package/skills/ct-grade/eval-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
- package/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
- package/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
- package/skills/ct-grade/eval-viewer/generate_review.py +283 -0
- package/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
- package/skills/ct-grade/eval-viewer/viewer.html +219 -0
- package/skills/ct-grade/evals/evals.json +94 -0
- package/skills/ct-grade/references/ab-test-methodology.md +150 -0
- package/skills/ct-grade/references/domains.md +137 -0
- package/skills/ct-grade/references/grade-spec.md +236 -0
- package/skills/ct-grade/references/scenario-playbook.md +234 -0
- package/skills/ct-grade/references/token-tracking.md +120 -0
- package/skills/ct-grade/scripts/__pycache__/audit_analyzer.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/run_ab_test.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/run_all.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/token_tracker.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/audit_analyzer.py +279 -0
- package/skills/ct-grade/scripts/generate_report.py +283 -0
- package/skills/ct-grade/scripts/run_ab_test.py +504 -0
- package/skills/ct-grade/scripts/run_all.py +287 -0
- package/skills/ct-grade/scripts/setup_run.py +183 -0
- package/skills/ct-grade/scripts/token_tracker.py +630 -0
- package/skills/ct-grade-v2-1/SKILL.md +237 -0
- package/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
- package/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
- package/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
- package/skills/ct-grade-v2-1/evals/evals.json +74 -0
- package/skills/ct-grade-v2-1/grade-viewer/__pycache__/build_op_stats.cpython-314.pyc +0 -0
- package/skills/ct-grade-v2-1/grade-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
- package/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
- package/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
- package/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
- package/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
- package/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
- package/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
- package/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
- package/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
- package/skills/ct-grade-v2-1/manifest-entry.json +31 -0
- package/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
- package/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
- package/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
- package/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
- package/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
- package/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
- package/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
- package/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
- package/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
- package/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
- package/skills/ct-memory/SKILL.md +84 -0
- package/skills/ct-orchestrator/INSTALL.md +61 -0
- package/skills/ct-orchestrator/README.md +69 -0
- package/skills/ct-orchestrator/SKILL.md +380 -0
- package/skills/ct-orchestrator/manifest-entry.json +19 -0
- package/skills/ct-orchestrator/orchestrator-prompt.txt +17 -0
- package/skills/ct-orchestrator/references/SUBAGENT-PROTOCOL-BLOCK.md +66 -0
- package/skills/ct-orchestrator/references/autonomous-operation.md +167 -0
- package/skills/ct-orchestrator/references/lifecycle-gates.md +98 -0
- package/skills/ct-orchestrator/references/orchestrator-compliance.md +271 -0
- package/skills/ct-orchestrator/references/orchestrator-handoffs.md +85 -0
- package/skills/ct-orchestrator/references/orchestrator-patterns.md +164 -0
- package/skills/ct-orchestrator/references/orchestrator-recovery.md +113 -0
- package/skills/ct-orchestrator/references/orchestrator-spawning.md +271 -0
- package/skills/ct-orchestrator/references/orchestrator-tokens.md +180 -0
- package/skills/ct-research-agent/SKILL.md +226 -0
- package/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
- package/skills/ct-skill-creator/.cleo/logs/cleo.2026-03-07.1.log +24 -0
- package/skills/ct-skill-creator/.cleo/tasks.db +0 -0
- package/skills/ct-skill-creator/SKILL.md +356 -0
- package/skills/ct-skill-creator/agents/analyzer.md +276 -0
- package/skills/ct-skill-creator/agents/comparator.md +204 -0
- package/skills/ct-skill-creator/agents/grader.md +225 -0
- package/skills/ct-skill-creator/assets/eval_review.html +146 -0
- package/skills/ct-skill-creator/eval-viewer/__pycache__/generate_review.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
- package/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
- package/skills/ct-skill-creator/manifest-entry.json +17 -0
- package/skills/ct-skill-creator/references/dynamic-context.md +228 -0
- package/skills/ct-skill-creator/references/frontmatter.md +83 -0
- package/skills/ct-skill-creator/references/invocation-control.md +165 -0
- package/skills/ct-skill-creator/references/output-patterns.md +86 -0
- package/skills/ct-skill-creator/references/provider-deployment.md +175 -0
- package/skills/ct-skill-creator/references/schemas.md +430 -0
- package/skills/ct-skill-creator/references/workflows.md +28 -0
- package/skills/ct-skill-creator/scripts/__init__.py +1 -0
- package/skills/ct-skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/init_skill.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/quick_validate.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/skills/ct-skill-creator/scripts/generate_report.py +326 -0
- package/skills/ct-skill-creator/scripts/improve_description.py +247 -0
- package/skills/ct-skill-creator/scripts/init_skill.py +306 -0
- package/skills/ct-skill-creator/scripts/package_skill.py +110 -0
- package/skills/ct-skill-creator/scripts/quick_validate.py +97 -0
- package/skills/ct-skill-creator/scripts/run_eval.py +310 -0
- package/skills/ct-skill-creator/scripts/run_loop.py +328 -0
- package/skills/ct-skill-creator/scripts/utils.py +47 -0
- package/skills/ct-skill-validator/SKILL.md +178 -0
- package/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
- package/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
- package/skills/ct-skill-validator/evals/eval_set.json +14 -0
- package/skills/ct-skill-validator/evals/evals.json +52 -0
- package/skills/ct-skill-validator/manifest-entry.json +20 -0
- package/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
- package/skills/ct-skill-validator/references/validation-rules.md +168 -0
- package/skills/ct-skill-validator/scripts/__init__.py +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/audit_body.py +242 -0
- package/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
- package/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
- package/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
- package/skills/ct-skill-validator/scripts/validate.py +422 -0
- package/skills/ct-spec-writer/SKILL.md +189 -0
- package/skills/ct-stickynote/README.md +14 -0
- package/skills/ct-stickynote/SKILL.md +46 -0
- package/skills/ct-task-executor/SKILL.md +296 -0
- package/skills/ct-validator/SKILL.md +216 -0
- package/skills/manifest.json +469 -0
- package/skills.json +281 -0
|
@@ -0,0 +1,419 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Generate a comparative analysis report from grade scenario and/or A/B test results.
|
|
4
|
+
|
|
5
|
+
Reads output directories from run_scenario.py and/or run_ab_test.py and produces:
|
|
6
|
+
- Markdown report (human-readable)
|
|
7
|
+
- analysis.json (machine-readable)
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python generate_report.py --input-dir ./grade-results --format markdown
|
|
11
|
+
python generate_report.py --input-dir ./ab-results --format markdown --focus token-delta
|
|
12
|
+
python generate_report.py --input-dir ./results --format both --output ./report.md
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import json
|
|
17
|
+
import sys
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from statistics import mean, stdev
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ---------------------------------------------------------------------------
|
|
24
|
+
# Loaders
|
|
25
|
+
# ---------------------------------------------------------------------------
|
|
26
|
+
|
|
27
|
+
def load_json_safe(path):
    """Parse the JSON file at *path*, returning None if it cannot be read or parsed."""
    try:
        raw = Path(path).read_text()
        parsed = json.loads(raw)
    except Exception:
        # Best-effort loader: any failure (missing file, permission error,
        # malformed JSON) is reported as None rather than raised.
        return None
    return parsed
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def find_summary_files(input_dir):
    """Return every ``summary.json`` found recursively beneath *input_dir*."""
    # glob("**/name") is equivalent to rglob("name"); materialize the lazy
    # generator so callers can take len() or iterate more than once.
    return list(Path(input_dir).glob("**/summary.json"))
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def find_metrics_files(input_dir):
    """Return every ``metrics.json`` (per-scenario run output) beneath *input_dir*."""
    # Same recursive search as find_summary_files, targeting metrics.json.
    return list(Path(input_dir).glob("**/metrics.json"))
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def find_ab_summaries(input_dir):
    """Collect (path, data) pairs for summary.json files from A/B test runs.

    A summary counts as an A/B result when its parsed JSON contains a
    "global_wins" key. Unreadable or unparsable files are skipped silently.
    """
    matched = []
    for candidate in Path(input_dir).glob("**/summary.json"):
        try:
            payload = json.loads(candidate.read_text())
        except Exception:
            continue
        if payload and "global_wins" in payload:
            matched.append((candidate, payload))
    return matched
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def find_grade_summaries(input_dir):
    """Collect (path, data) pairs for summary.json files from grade scenario runs.

    A summary counts as a grade result when its parsed JSON contains a
    "grade_summary" key. Unreadable or unparsable files are skipped silently.
    """
    hits = []
    for candidate in Path(input_dir).glob("**/summary.json"):
        try:
            payload = json.loads(candidate.read_text())
        except Exception:
            continue
        if payload and "grade_summary" in payload:
            hits.append((candidate, payload))
    return hits
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# ---------------------------------------------------------------------------
|
|
66
|
+
# Analysis
|
|
67
|
+
# ---------------------------------------------------------------------------
|
|
68
|
+
|
|
69
|
+
def analyze_grade_results(grade_summaries):
    """Aggregate grade scenario results into overall and per-scenario stats.

    *grade_summaries* is a list of (path, data) tuples as produced by
    find_grade_summaries(). Returns None when the list is empty, otherwise a
    dict with "total_runs" (count of scored entries), "overall" statistics,
    and a "by_scenario" breakdown.
    """
    if not grade_summaries:
        return None

    scores_all = []
    flag_total = 0
    token_total = 0
    scenarios = {}

    for _, payload in grade_summaries:
        for entry in payload.get("grade_summary", []):
            name = entry.get("scenario", "?")
            score = entry.get("score")
            # `or 0` coerces both missing keys and explicit nulls to zero.
            flag_count = entry.get("flags") or 0
            token_count = entry.get("estimated_tokens") or 0

            if score is not None:
                scores_all.append(score)
            # Flags/tokens are counted even for unscored entries.
            flag_total += flag_count
            token_total += token_count

            bucket = scenarios.setdefault(
                name, {"scores": [], "flags": [], "tokens": []}
            )
            if score is not None:
                bucket["scores"].append(score)
            bucket["flags"].append(flag_count)
            bucket["tokens"].append(token_count)

    analysis = {
        "total_runs": len(scores_all),
        "overall": {
            "mean_score": round(mean(scores_all), 1) if scores_all else None,
            "min_score": min(scores_all) if scores_all else None,
            "max_score": max(scores_all) if scores_all else None,
            # stdev needs at least two samples.
            "stddev_score": round(stdev(scores_all), 2) if len(scores_all) > 1 else 0,
            "total_flags": flag_total,
            "total_estimated_tokens": token_total,
        },
        "by_scenario": {},
    }

    for name, bucket in scenarios.items():
        scenario_scores = bucket["scores"]
        scenario_flags = bucket["flags"]
        # Zero-token entries are excluded from the per-scenario average.
        nonzero_tokens = [t for t in bucket["tokens"] if t > 0]
        analysis["by_scenario"][name] = {
            "runs": len(scenario_scores),
            "mean_score": round(mean(scenario_scores), 1) if scenario_scores else None,
            "min_score": min(scenario_scores) if scenario_scores else None,
            "max_score": max(scenario_scores) if scenario_scores else None,
            "total_flags": sum(scenario_flags),
            "avg_flags_per_run": round(mean(scenario_flags), 2) if scenario_flags else 0,
            "avg_estimated_tokens": round(mean(nonzero_tokens), 0) if nonzero_tokens else 0,
        }

    return analysis
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def analyze_ab_results(ab_summaries):
    """Aggregate A/B test results into global and per-operation statistics.

    *ab_summaries* is a list of (path, data) tuples as produced by
    find_ab_summaries(). Returns None when the list is empty, otherwise a
    dict with global win counts/rates, the average MCP-minus-CLI token
    delta, a textual interpretation, and per-operation breakdowns.
    """
    if not ab_summaries:
        return None

    wins = {"mcp": 0, "cli": 0, "tie": 0}
    run_count = 0
    deltas = []
    ops = {}

    for _, payload in ab_summaries:
        global_wins = payload.get("global_wins", {})
        wins["mcp"] += global_wins.get("mcp", 0)
        wins["cli"] += global_wins.get("cli", 0)
        wins["tie"] += global_wins.get("tie", 0)
        run_count += payload.get("total_runs", 0)
        summary_delta = payload.get("avg_token_delta_mcp_minus_cli")
        if summary_delta is not None:
            deltas.append(summary_delta)

        for op in payload.get("per_operation", []):
            key = op.get("operation", "?")
            bucket = ops.setdefault(key, {
                "mcp_wins": 0, "cli_wins": 0, "ties": 0,
                "token_deltas": [],
                "mcp_chars": [], "cli_chars": [],
                "mcp_ms": [], "cli_ms": [],
            })
            op_wins = op.get("wins", {})
            bucket["mcp_wins"] += op_wins.get("mcp", 0)
            bucket["cli_wins"] += op_wins.get("cli", 0)
            bucket["ties"] += op_wins.get("tie", 0)
            if op.get("avg_token_delta_mcp_minus_cli") is not None:
                bucket["token_deltas"].append(op["avg_token_delta_mcp_minus_cli"])
            # Truthiness check: zero or missing metric values are skipped.
            for field, dest in (
                ("avg_mcp_chars", "mcp_chars"),
                ("avg_cli_chars", "cli_chars"),
                ("avg_mcp_ms", "mcp_ms"),
                ("avg_cli_ms", "cli_ms"),
            ):
                if op.get(field):
                    bucket[dest].append(op[field])

    if wins["mcp"] > wins["cli"]:
        overall = "mcp"
    elif wins["cli"] > wins["mcp"]:
        overall = "cli"
    else:
        overall = "tie"
    delta_avg = mean(deltas) if deltas else 0

    if delta_avg > 0:
        reading = "MCP uses more tokens on average"
    elif delta_avg < 0:
        reading = "CLI uses more tokens on average"
    else:
        reading = "MCP and CLI have equivalent token costs"

    analysis = {
        "total_runs": run_count,
        "overall_winner": overall,
        "global_wins": {
            "mcp": wins["mcp"],
            "cli": wins["cli"],
            "tie": wins["tie"],
        },
        "global_win_rate": {
            # max(run_count, 1) guards against division by zero.
            "mcp": round(wins["mcp"] / max(run_count, 1), 3),
            "cli": round(wins["cli"] / max(run_count, 1), 3),
        },
        "avg_token_delta_mcp_minus_cli": round(delta_avg, 1),
        "interpretation": reading,
        "per_operation": {},
    }

    for key, bucket in ops.items():
        op_total = bucket["mcp_wins"] + bucket["cli_wins"] + bucket["ties"]
        if bucket["mcp_wins"] > bucket["cli_wins"]:
            op_winner = "mcp"
        elif bucket["cli_wins"] > bucket["mcp_wins"]:
            op_winner = "cli"
        else:
            op_winner = "tie"
        analysis["per_operation"][key] = {
            "total_runs": op_total,
            "mcp_wins": bucket["mcp_wins"],
            "cli_wins": bucket["cli_wins"],
            "ties": bucket["ties"],
            "winner": op_winner,
            "avg_token_delta": round(mean(bucket["token_deltas"]), 1) if bucket["token_deltas"] else 0,
            "avg_mcp_chars": round(mean(bucket["mcp_chars"]), 0) if bucket["mcp_chars"] else 0,
            "avg_cli_chars": round(mean(bucket["cli_chars"]), 0) if bucket["cli_chars"] else 0,
            "avg_mcp_ms": round(mean(bucket["mcp_ms"]), 0) if bucket["mcp_ms"] else 0,
            "avg_cli_ms": round(mean(bucket["cli_ms"]), 0) if bucket["cli_ms"] else 0,
        }

    return analysis
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# ---------------------------------------------------------------------------
|
|
217
|
+
# Markdown report generator
|
|
218
|
+
# ---------------------------------------------------------------------------
|
|
219
|
+
|
|
220
|
+
def generate_markdown(grade_analysis, ab_analysis, input_dir, focus=None):
    """Produce a markdown comparative analysis report.

    Args:
        grade_analysis: dict from analyze_grade_results(), or None.
        ab_analysis: dict from analyze_ab_results(), or None.
        input_dir: directory the results were read from (shown in the header).
        focus: optional report focus. NOTE(review): only "token-delta" is
            consulted here (it suppresses the grade section and forces the
            token-efficiency section); "grade-scores" and "ab-wins" are
            accepted by the CLI but have no effect in this function —
            confirm whether that is intended.

    Returns:
        The complete report as a single markdown string.
    """
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    lines = [
        f"# CLEO Grade v2.1 — Comparative Analysis Report",
        f"",
        f"**Generated:** {ts} ",
        f"**Source:** `{input_dir}`",
        f"",
    ]

    # --- Grade scenario section ---
    if grade_analysis and focus != "token-delta":
        lines += [
            "---",
            "",
            "## Grade Scenario Results",
            "",
        ]
        ov = grade_analysis["overall"]
        lines += [
            f"| Metric | Value |",
            f"|--------|-------|",
            # BUG FIX: analyze_grade_results stores "total_runs" at the top
            # level of the analysis dict, not inside "overall"; the previous
            # ov['total_runs'] raised KeyError whenever grade results existed.
            f"| Total runs | {grade_analysis['total_runs']} |",
            f"| Mean score | {ov['mean_score']}/100 |",
            f"| Score range | {ov['min_score']}–{ov['max_score']} |",
            f"| Score stddev | {ov['stddev_score']} |",
            f"| Total flags | {ov['total_flags']} |",
            f"| Total est. tokens | {ov['total_estimated_tokens']:,} |",
            "",
        ]

        lines += ["### Per-Scenario Breakdown", ""]
        lines += [
            "| Scenario | Runs | Mean Score | Min | Max | Flags/Run | Avg Tokens |",
            "|----------|------|-----------|-----|-----|-----------|------------|",
        ]
        for scenario, data in sorted(grade_analysis["by_scenario"].items()):
            score_str = f"{data['mean_score']}/100" if data["mean_score"] is not None else "N/A"
            lines.append(
                f"| {scenario} | {data['runs']} | {score_str} | "
                f"{data['min_score']} | {data['max_score']} | "
                f"{data['avg_flags_per_run']:.1f} | "
                f"~{int(data['avg_estimated_tokens'])}t |"
            )
        lines.append("")

    # --- A/B test section ---
    if ab_analysis:
        lines += [
            "---",
            "",
            "## MCP vs CLI Blind A/B Results",
            "",
        ]
        ow = ab_analysis["overall_winner"].upper()
        wr = ab_analysis["global_win_rate"]
        gw = ab_analysis["global_wins"]
        delta = ab_analysis["avg_token_delta_mcp_minus_cli"]
        interp = ab_analysis["interpretation"]

        lines += [
            f"**Overall winner: {ow}** ",
            f"",
            f"| Metric | Value |",
            f"|--------|-------|",
            f"| Total runs | {ab_analysis['total_runs']} |",
            f"| MCP wins | {gw['mcp']} ({wr['mcp']*100:.1f}%) |",
            f"| CLI wins | {gw['cli']} ({wr['cli']*100:.1f}%) |",
            f"| Ties | {gw['tie']} |",
            f"| Avg token delta (MCP–CLI) | {delta:+.1f} tokens |",
            f"| Interpretation | {interp} |",
            "",
        ]

        lines += ["### Per-Operation Results", ""]
        lines += [
            "| Operation | MCP wins | CLI wins | Ties | Token delta | MCP chars | CLI chars | MCP ms | CLI ms |",
            "|-----------|----------|----------|------|-------------|-----------|-----------|--------|--------|",
        ]
        for op_key, data in sorted(ab_analysis["per_operation"].items()):
            winner_marker = " **MCP**" if data["winner"] == "mcp" else \
                            " **CLI**" if data["winner"] == "cli" else ""
            lines.append(
                f"| `{op_key}`{winner_marker} | {data['mcp_wins']} | {data['cli_wins']} | "
                f"{data['ties']} | {data['avg_token_delta']:+.0f}t | "
                f"{int(data['avg_mcp_chars'])} | {int(data['avg_cli_chars'])} | "
                f"{int(data['avg_mcp_ms'])}ms | {int(data['avg_cli_ms'])}ms |"
            )
        lines.append("")

        # Recommendations: thresholds of ±50 tokens and 60% win rate pick
        # which canned guidance lines appear.
        lines += ["### Recommendations", ""]
        if delta > 50:
            lines.append("- **MCP adds significant token overhead.** Consider whether MCP envelope verbosity can be reduced for high-frequency operations.")
        elif delta < -50:
            lines.append("- **CLI is more verbose than MCP.** CLI output may include formatting/ANSI tokens not useful to agents.")
        else:
            lines.append("- **MCP and CLI have similar token costs.** Interface choice should be based on other factors (protocol compliance, auditability).")

        if wr.get("mcp", 0) > 0.6:
            lines.append("- **MCP output quality is consistently higher.** Reinforces MCP-first agent protocol recommendation.")
        elif wr.get("cli", 0) > 0.6:
            lines.append("- **CLI output quality is consistently higher.** Investigate MCP envelope structure for potential improvements.")

        lines.append("")

    # --- Token efficiency section ---
    if focus == "token-delta" or (grade_analysis and ab_analysis):
        lines += [
            "---",
            "",
            "## Token Efficiency Summary",
            "",
        ]
        if grade_analysis:
            avg_tok = grade_analysis["overall"].get("total_estimated_tokens", 0)
            # BUG FIX: "total_runs" is a top-level key; reading it from
            # "overall" always fell back to the default 1 and inflated the
            # per-run average.
            runs = grade_analysis.get("total_runs", 1)
            lines.append(f"- Average scenario cost: ~{int(avg_tok/max(runs,1))} estimated tokens/run")
        if ab_analysis:
            delta = ab_analysis["avg_token_delta_mcp_minus_cli"]
            sign = "+" if delta > 0 else ""
            lines.append(f"- MCP interface overhead vs CLI: {sign}{delta:.1f} tokens/operation")
            lines.append(f"- High-cost operations (MCP > CLI by >100t): " +
                         ", ".join(f"`{op}`" for op, d in ab_analysis["per_operation"].items()
                                   if d.get("avg_token_delta", 0) > 100) or "none detected")
        lines.append("")

    lines += [
        "---",
        "",
        f"*Report generated by ct-grade v2.1 `generate_report.py`*",
        "",
    ]

    return "\n".join(lines)
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
# ---------------------------------------------------------------------------
|
|
359
|
+
# Main
|
|
360
|
+
# ---------------------------------------------------------------------------
|
|
361
|
+
|
|
362
|
+
def main():
    """CLI entry point: locate result summaries, analyze them, emit reports.

    Exits with status 1 when the input directory is missing or contains no
    recognizable summary.json files. Writes report.md and/or report.json
    next to the results (or at --output) depending on --format.
    """
    parser = argparse.ArgumentParser(description="Generate CLEO grade comparative analysis report")
    parser.add_argument("--input-dir", required=True, help="Directory with grade/AB test results")
    parser.add_argument("--format", default="both", choices=["markdown", "json", "both"])
    parser.add_argument("--output", default=None, help="Output file (default: <input-dir>/report.md)")
    parser.add_argument("--focus", default=None,
                        choices=["token-delta", "grade-scores", "ab-wins"],
                        help="Focus the report on a specific aspect")
    parser.add_argument("--json", action="store_true", help="Print analysis JSON to stdout")
    args = parser.parse_args()

    input_dir = Path(args.input_dir)
    if not input_dir.exists():
        print(f"ERROR: --input-dir does not exist: {input_dir}", file=sys.stderr)
        sys.exit(1)

    # Load results: the two finders classify summary.json files by content.
    grade_summaries = find_grade_summaries(input_dir)
    ab_summaries = find_ab_summaries(input_dir)

    if not grade_summaries and not ab_summaries:
        print(f"ERROR: No summary.json files found under {input_dir}", file=sys.stderr)
        print("Run run_scenario.py or run_ab_test.py first.", file=sys.stderr)
        sys.exit(1)

    print(f"Found {len(grade_summaries)} grade summary file(s) and {len(ab_summaries)} A/B summary file(s)")

    # Analyze each result family independently; either may be absent.
    grade_analysis = analyze_grade_results(grade_summaries) if grade_summaries else None
    ab_analysis = analyze_ab_results(ab_summaries) if ab_summaries else None

    # Build the machine-readable analysis payload.
    analysis = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "input_dir": str(input_dir),
        "grade_analysis": grade_analysis,
        "ab_analysis": ab_analysis,
    }

    # BUG FIX: the previous `args.output.rsplit(".", 1)[0]` split on the last
    # dot anywhere in the path, so an extensionless --output inside a dotted
    # directory (e.g. ./results.v2/report) was truncated to ./results.
    # Path.with_suffix("") strips only the final component's extension.
    if args.output:
        output_base = str(Path(args.output).with_suffix(""))
    else:
        output_base = str(input_dir / "report")

    if args.format in ("json", "both"):
        json_path = output_base + ".json" if not (args.output and args.output.endswith(".json")) else args.output
        Path(json_path).write_text(json.dumps(analysis, indent=2))
        print(f"Saved JSON: {json_path}")

    if args.format in ("markdown", "both"):
        md_content = generate_markdown(grade_analysis, ab_analysis, input_dir, focus=args.focus)
        md_path = output_base + ".md" if not (args.output and args.output.endswith(".md")) else args.output
        Path(md_path).write_text(md_content)
        print(f"Saved Markdown: {md_path}")

    if args.json:
        print(json.dumps(analysis, indent=2))
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
# Entry-point guard: run main() only when executed as a script, so the
# module can be imported (e.g. by other tooling) without side effects.
if __name__ == "__main__":
    main()
|