@cleocode/skills 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171)
  1. package/dispatch-config.json +404 -0
  2. package/index.d.ts +178 -0
  3. package/index.js +405 -0
  4. package/package.json +14 -0
  5. package/profiles/core.json +7 -0
  6. package/profiles/full.json +10 -0
  7. package/profiles/minimal.json +7 -0
  8. package/profiles/recommended.json +7 -0
  9. package/provider-skills-map.json +97 -0
  10. package/skills/_shared/cleo-style-guide.md +84 -0
  11. package/skills/_shared/manifest-operations.md +810 -0
  12. package/skills/_shared/placeholders.json +433 -0
  13. package/skills/_shared/skill-chaining-patterns.md +237 -0
  14. package/skills/_shared/subagent-protocol-base.md +223 -0
  15. package/skills/_shared/task-system-integration.md +232 -0
  16. package/skills/_shared/testing-framework-config.md +110 -0
  17. package/skills/ct-cleo/SKILL.md +490 -0
  18. package/skills/ct-cleo/references/anti-patterns.md +19 -0
  19. package/skills/ct-cleo/references/loom-lifecycle.md +136 -0
  20. package/skills/ct-cleo/references/orchestrator-constraints.md +55 -0
  21. package/skills/ct-cleo/references/session-protocol.md +162 -0
  22. package/skills/ct-codebase-mapper/SKILL.md +82 -0
  23. package/skills/ct-contribution/SKILL.md +521 -0
  24. package/skills/ct-contribution/templates/contribution-init.json +21 -0
  25. package/skills/ct-dev-workflow/SKILL.md +423 -0
  26. package/skills/ct-docs-lookup/SKILL.md +66 -0
  27. package/skills/ct-docs-review/SKILL.md +175 -0
  28. package/skills/ct-docs-write/SKILL.md +108 -0
  29. package/skills/ct-documentor/SKILL.md +231 -0
  30. package/skills/ct-epic-architect/SKILL.md +305 -0
  31. package/skills/ct-epic-architect/references/bug-epic-example.md +172 -0
  32. package/skills/ct-epic-architect/references/commands.md +201 -0
  33. package/skills/ct-epic-architect/references/feature-epic-example.md +210 -0
  34. package/skills/ct-epic-architect/references/migration-epic-example.md +244 -0
  35. package/skills/ct-epic-architect/references/output-format.md +92 -0
  36. package/skills/ct-epic-architect/references/patterns.md +284 -0
  37. package/skills/ct-epic-architect/references/refactor-epic-example.md +412 -0
  38. package/skills/ct-epic-architect/references/research-epic-example.md +226 -0
  39. package/skills/ct-epic-architect/references/shell-escaping.md +86 -0
  40. package/skills/ct-epic-architect/references/skill-aware-execution.md +195 -0
  41. package/skills/ct-grade/SKILL.md +230 -0
  42. package/skills/ct-grade/agents/analysis-reporter.md +203 -0
  43. package/skills/ct-grade/agents/blind-comparator.md +157 -0
  44. package/skills/ct-grade/agents/scenario-runner.md +134 -0
  45. package/skills/ct-grade/eval-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  46. package/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
  47. package/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
  48. package/skills/ct-grade/eval-viewer/generate_review.py +283 -0
  49. package/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
  50. package/skills/ct-grade/eval-viewer/viewer.html +219 -0
  51. package/skills/ct-grade/evals/evals.json +94 -0
  52. package/skills/ct-grade/references/ab-test-methodology.md +150 -0
  53. package/skills/ct-grade/references/domains.md +137 -0
  54. package/skills/ct-grade/references/grade-spec.md +236 -0
  55. package/skills/ct-grade/references/scenario-playbook.md +234 -0
  56. package/skills/ct-grade/references/token-tracking.md +120 -0
  57. package/skills/ct-grade/scripts/__pycache__/audit_analyzer.cpython-314.pyc +0 -0
  58. package/skills/ct-grade/scripts/__pycache__/run_ab_test.cpython-314.pyc +0 -0
  59. package/skills/ct-grade/scripts/__pycache__/run_all.cpython-314.pyc +0 -0
  60. package/skills/ct-grade/scripts/__pycache__/token_tracker.cpython-314.pyc +0 -0
  61. package/skills/ct-grade/scripts/audit_analyzer.py +279 -0
  62. package/skills/ct-grade/scripts/generate_report.py +283 -0
  63. package/skills/ct-grade/scripts/run_ab_test.py +504 -0
  64. package/skills/ct-grade/scripts/run_all.py +287 -0
  65. package/skills/ct-grade/scripts/setup_run.py +183 -0
  66. package/skills/ct-grade/scripts/token_tracker.py +630 -0
  67. package/skills/ct-grade-v2-1/SKILL.md +237 -0
  68. package/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
  69. package/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
  70. package/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
  71. package/skills/ct-grade-v2-1/evals/evals.json +74 -0
  72. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/build_op_stats.cpython-314.pyc +0 -0
  73. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  74. package/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
  75. package/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
  76. package/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
  77. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
  78. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
  79. package/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
  80. package/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
  81. package/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
  82. package/skills/ct-grade-v2-1/manifest-entry.json +31 -0
  83. package/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
  84. package/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
  85. package/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
  86. package/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
  87. package/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
  88. package/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
  89. package/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
  90. package/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
  91. package/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
  92. package/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
  93. package/skills/ct-memory/SKILL.md +84 -0
  94. package/skills/ct-orchestrator/INSTALL.md +61 -0
  95. package/skills/ct-orchestrator/README.md +69 -0
  96. package/skills/ct-orchestrator/SKILL.md +380 -0
  97. package/skills/ct-orchestrator/manifest-entry.json +19 -0
  98. package/skills/ct-orchestrator/orchestrator-prompt.txt +17 -0
  99. package/skills/ct-orchestrator/references/SUBAGENT-PROTOCOL-BLOCK.md +66 -0
  100. package/skills/ct-orchestrator/references/autonomous-operation.md +167 -0
  101. package/skills/ct-orchestrator/references/lifecycle-gates.md +98 -0
  102. package/skills/ct-orchestrator/references/orchestrator-compliance.md +271 -0
  103. package/skills/ct-orchestrator/references/orchestrator-handoffs.md +85 -0
  104. package/skills/ct-orchestrator/references/orchestrator-patterns.md +164 -0
  105. package/skills/ct-orchestrator/references/orchestrator-recovery.md +113 -0
  106. package/skills/ct-orchestrator/references/orchestrator-spawning.md +271 -0
  107. package/skills/ct-orchestrator/references/orchestrator-tokens.md +180 -0
  108. package/skills/ct-research-agent/SKILL.md +226 -0
  109. package/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
  110. package/skills/ct-skill-creator/.cleo/logs/cleo.2026-03-07.1.log +24 -0
  111. package/skills/ct-skill-creator/.cleo/tasks.db +0 -0
  112. package/skills/ct-skill-creator/SKILL.md +356 -0
  113. package/skills/ct-skill-creator/agents/analyzer.md +276 -0
  114. package/skills/ct-skill-creator/agents/comparator.md +204 -0
  115. package/skills/ct-skill-creator/agents/grader.md +225 -0
  116. package/skills/ct-skill-creator/assets/eval_review.html +146 -0
  117. package/skills/ct-skill-creator/eval-viewer/__pycache__/generate_review.cpython-314.pyc +0 -0
  118. package/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
  119. package/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
  120. package/skills/ct-skill-creator/manifest-entry.json +17 -0
  121. package/skills/ct-skill-creator/references/dynamic-context.md +228 -0
  122. package/skills/ct-skill-creator/references/frontmatter.md +83 -0
  123. package/skills/ct-skill-creator/references/invocation-control.md +165 -0
  124. package/skills/ct-skill-creator/references/output-patterns.md +86 -0
  125. package/skills/ct-skill-creator/references/provider-deployment.md +175 -0
  126. package/skills/ct-skill-creator/references/schemas.md +430 -0
  127. package/skills/ct-skill-creator/references/workflows.md +28 -0
  128. package/skills/ct-skill-creator/scripts/__init__.py +1 -0
  129. package/skills/ct-skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  130. package/skills/ct-skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
  131. package/skills/ct-skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
  132. package/skills/ct-skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
  133. package/skills/ct-skill-creator/scripts/__pycache__/init_skill.cpython-314.pyc +0 -0
  134. package/skills/ct-skill-creator/scripts/__pycache__/quick_validate.cpython-314.pyc +0 -0
  135. package/skills/ct-skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
  136. package/skills/ct-skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
  137. package/skills/ct-skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
  138. package/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
  139. package/skills/ct-skill-creator/scripts/generate_report.py +326 -0
  140. package/skills/ct-skill-creator/scripts/improve_description.py +247 -0
  141. package/skills/ct-skill-creator/scripts/init_skill.py +306 -0
  142. package/skills/ct-skill-creator/scripts/package_skill.py +110 -0
  143. package/skills/ct-skill-creator/scripts/quick_validate.py +97 -0
  144. package/skills/ct-skill-creator/scripts/run_eval.py +310 -0
  145. package/skills/ct-skill-creator/scripts/run_loop.py +328 -0
  146. package/skills/ct-skill-creator/scripts/utils.py +47 -0
  147. package/skills/ct-skill-validator/SKILL.md +178 -0
  148. package/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
  149. package/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
  150. package/skills/ct-skill-validator/evals/eval_set.json +14 -0
  151. package/skills/ct-skill-validator/evals/evals.json +52 -0
  152. package/skills/ct-skill-validator/manifest-entry.json +20 -0
  153. package/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
  154. package/skills/ct-skill-validator/references/validation-rules.md +168 -0
  155. package/skills/ct-skill-validator/scripts/__init__.py +0 -0
  156. package/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc +0 -0
  157. package/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc +0 -0
  158. package/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc +0 -0
  159. package/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc +0 -0
  160. package/skills/ct-skill-validator/scripts/audit_body.py +242 -0
  161. package/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
  162. package/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
  163. package/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
  164. package/skills/ct-skill-validator/scripts/validate.py +422 -0
  165. package/skills/ct-spec-writer/SKILL.md +189 -0
  166. package/skills/ct-stickynote/README.md +14 -0
  167. package/skills/ct-stickynote/SKILL.md +46 -0
  168. package/skills/ct-task-executor/SKILL.md +296 -0
  169. package/skills/ct-validator/SKILL.md +216 -0
  170. package/skills/manifest.json +469 -0
  171. package/skills.json +281 -0
@@ -0,0 +1,419 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Generate a comparative analysis report from grade scenario and/or A/B test results.
4
+
5
+ Reads output directories from run_scenario.py and/or run_ab_test.py and produces:
6
+ - Markdown report (human-readable)
7
+ - analysis.json (machine-readable)
8
+
9
+ Usage:
10
+ python generate_report.py --input-dir ./grade-results --format markdown
11
+ python generate_report.py --input-dir ./ab-results --format markdown --focus token-delta
12
+ python generate_report.py --input-dir ./results --format both --output ./report.md
13
+ """
14
+
15
+ import argparse
16
+ import json
17
+ import sys
18
+ from datetime import datetime, timezone
19
+ from pathlib import Path
20
+ from statistics import mean, stdev
21
+
22
+
23
+ # ---------------------------------------------------------------------------
24
+ # Loaders
25
+ # ---------------------------------------------------------------------------
26
+
27
def load_json_safe(path):
    """Parse *path* as JSON; swallow any error and return None instead.

    Failures covered include a missing/unreadable file and malformed JSON —
    callers use the None to skip over non-matching files.
    """
    try:
        raw = Path(path).read_text()
    except Exception:
        return None
    try:
        return json.loads(raw)
    except Exception:
        return None
33
+
34
+
35
def find_summary_files(input_dir):
    """Recursively collect every summary.json path under *input_dir*."""
    root = Path(input_dir)
    return [p for p in root.rglob("summary.json")]
38
+
39
+
40
def find_metrics_files(input_dir):
    """Recursively collect every metrics.json path (scenario runs) under *input_dir*."""
    root = Path(input_dir)
    return [p for p in root.rglob("metrics.json")]
43
+
44
+
45
def find_ab_summaries(input_dir):
    """Return (path, data) pairs for summary.json files holding A/B results.

    A summary counts as an A/B test summary when it parses as JSON and has
    a top-level "global_wins" key; unreadable files are skipped silently.
    """
    matches = []
    for candidate in Path(input_dir).rglob("summary.json"):
        parsed = load_json_safe(candidate)
        if not parsed:
            continue
        if "global_wins" in parsed:
            matches.append((candidate, parsed))
    return matches
53
+
54
+
55
def find_grade_summaries(input_dir):
    """Return (path, data) pairs for summary.json files holding grade results.

    A summary counts as a grade scenario summary when it parses as JSON and
    has a top-level "grade_summary" key; unreadable files are skipped silently.
    """
    matches = []
    for candidate in Path(input_dir).rglob("summary.json"):
        parsed = load_json_safe(candidate)
        if not parsed:
            continue
        if "grade_summary" in parsed:
            matches.append((candidate, parsed))
    return matches
63
+
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Analysis
67
+ # ---------------------------------------------------------------------------
68
+
69
def analyze_grade_results(grade_summaries):
    """Aggregate grade scenario results across one or more summary files.

    Args:
        grade_summaries: list of (path, summary_dict) pairs, each summary
            carrying a "grade_summary" list of per-run records with keys
            "scenario", "score", "flags", "estimated_tokens".

    Returns:
        dict with "total_runs", "overall" stats, and a "by_scenario"
        breakdown — or None when there is nothing to analyze.
    """
    if not grade_summaries:
        return None

    scores_all = []
    per_scenario = {}
    flags_total = 0
    tokens_total = 0

    for _path, summary in grade_summaries:
        for record in summary.get("grade_summary", []):
            name = record.get("scenario", "?")
            score = record.get("score")
            # "or 0" coerces explicit nulls as well as missing keys.
            flags = record.get("flags") or 0
            tokens = record.get("estimated_tokens") or 0

            # A run only counts toward score stats when it was scored.
            if score is not None:
                scores_all.append(score)
            flags_total += flags
            tokens_total += tokens

            bucket = per_scenario.setdefault(
                name, {"scores": [], "flags": [], "tokens": []}
            )
            if score is not None:
                bucket["scores"].append(score)
            bucket["flags"].append(flags)
            bucket["tokens"].append(tokens)

    result = {
        "total_runs": len(scores_all),
        "overall": {
            "mean_score": round(mean(scores_all), 1) if scores_all else None,
            "min_score": min(scores_all) if scores_all else None,
            "max_score": max(scores_all) if scores_all else None,
            # stdev needs at least two samples.
            "stddev_score": round(stdev(scores_all), 2) if len(scores_all) > 1 else 0,
            "total_flags": flags_total,
            "total_estimated_tokens": tokens_total,
        },
        "by_scenario": {},
    }

    for name, bucket in per_scenario.items():
        s = bucket["scores"]
        f = bucket["flags"]
        # Zero token counts mean "not recorded" — exclude from the average.
        t = [v for v in bucket["tokens"] if v > 0]
        result["by_scenario"][name] = {
            "runs": len(s),
            "mean_score": round(mean(s), 1) if s else None,
            "min_score": min(s) if s else None,
            "max_score": max(s) if s else None,
            "total_flags": sum(f),
            "avg_flags_per_run": round(mean(f), 2) if f else 0,
            "avg_estimated_tokens": round(mean(t), 0) if t else 0,
        }

    return result
126
+
127
+
128
def analyze_ab_results(ab_summaries):
    """Aggregate MCP-vs-CLI blind A/B test results across summary files.

    Args:
        ab_summaries: list of (path, summary_dict) pairs, each summary
            carrying "global_wins", "total_runs", an average token delta,
            and a "per_operation" list of per-op result dicts.

    Returns:
        dict with global win counts/rates, the average MCP-minus-CLI token
        delta, a textual interpretation, and a "per_operation" breakdown —
        or None when there is nothing to analyze.
    """
    if not ab_summaries:
        return None

    wins = {"mcp": 0, "cli": 0, "tie": 0}
    runs_total = 0
    deltas = []
    ops = {}

    for _path, summary in ab_summaries:
        gw = summary.get("global_wins", {})
        wins["mcp"] += gw.get("mcp", 0)
        wins["cli"] += gw.get("cli", 0)
        wins["tie"] += gw.get("tie", 0)
        runs_total += summary.get("total_runs", 0)
        d = summary.get("avg_token_delta_mcp_minus_cli")
        if d is not None:
            deltas.append(d)

        for op in summary.get("per_operation", []):
            key = op.get("operation", "?")
            slot = ops.setdefault(key, {
                "mcp_wins": 0, "cli_wins": 0, "ties": 0,
                "token_deltas": [],
                "mcp_chars": [], "cli_chars": [],
                "mcp_ms": [], "cli_ms": [],
            })
            ow = op.get("wins", {})
            slot["mcp_wins"] += ow.get("mcp", 0)
            slot["cli_wins"] += ow.get("cli", 0)
            slot["ties"] += ow.get("tie", 0)
            if op.get("avg_token_delta_mcp_minus_cli") is not None:
                slot["token_deltas"].append(op["avg_token_delta_mcp_minus_cli"])
            # Char/ms averages only count when truthy (non-zero/non-null).
            for src, dst in (("avg_mcp_chars", "mcp_chars"),
                             ("avg_cli_chars", "cli_chars"),
                             ("avg_mcp_ms", "mcp_ms"),
                             ("avg_cli_ms", "cli_ms")):
                if op.get(src):
                    slot[dst].append(op[src])

    if wins["mcp"] > wins["cli"]:
        overall = "mcp"
    elif wins["cli"] > wins["mcp"]:
        overall = "cli"
    else:
        overall = "tie"
    delta_avg = mean(deltas) if deltas else 0

    if delta_avg > 0:
        interpretation = "MCP uses more tokens on average"
    elif delta_avg < 0:
        interpretation = "CLI uses more tokens on average"
    else:
        interpretation = "MCP and CLI have equivalent token costs"

    out = {
        "total_runs": runs_total,
        "overall_winner": overall,
        "global_wins": {
            "mcp": wins["mcp"],
            "cli": wins["cli"],
            "tie": wins["tie"],
        },
        "global_win_rate": {
            # max(..., 1) guards the division when no runs were recorded.
            "mcp": round(wins["mcp"] / max(runs_total, 1), 3),
            "cli": round(wins["cli"] / max(runs_total, 1), 3),
        },
        "avg_token_delta_mcp_minus_cli": round(delta_avg, 1),
        "interpretation": interpretation,
        "per_operation": {},
    }

    for key, slot in ops.items():
        n_op = slot["mcp_wins"] + slot["cli_wins"] + slot["ties"]
        if slot["mcp_wins"] > slot["cli_wins"]:
            op_winner = "mcp"
        elif slot["cli_wins"] > slot["mcp_wins"]:
            op_winner = "cli"
        else:
            op_winner = "tie"
        out["per_operation"][key] = {
            "total_runs": n_op,
            "mcp_wins": slot["mcp_wins"],
            "cli_wins": slot["cli_wins"],
            "ties": slot["ties"],
            "winner": op_winner,
            "avg_token_delta": round(mean(slot["token_deltas"]), 1) if slot["token_deltas"] else 0,
            "avg_mcp_chars": round(mean(slot["mcp_chars"]), 0) if slot["mcp_chars"] else 0,
            "avg_cli_chars": round(mean(slot["cli_chars"]), 0) if slot["cli_chars"] else 0,
            "avg_mcp_ms": round(mean(slot["mcp_ms"]), 0) if slot["mcp_ms"] else 0,
            "avg_cli_ms": round(mean(slot["cli_ms"]), 0) if slot["cli_ms"] else 0,
        }

    return out
214
+
215
+
216
+ # ---------------------------------------------------------------------------
217
+ # Markdown report generator
218
+ # ---------------------------------------------------------------------------
219
+
220
def generate_markdown(grade_analysis, ab_analysis, input_dir, focus=None):
    """Produce a markdown comparative analysis report.

    Args:
        grade_analysis: dict from analyze_grade_results(), or None.
        ab_analysis: dict from analyze_ab_results(), or None.
        input_dir: the results directory, shown in the report header.
        focus: optional emphasis; "token-delta" suppresses the grade
            scenario section and forces the token-efficiency summary.

    Returns:
        The complete report as a single markdown string.
    """
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    lines = [
        "# CLEO Grade v2.1 — Comparative Analysis Report",
        "",
        f"**Generated:** {ts} ",
        f"**Source:** `{input_dir}`",
        "",
    ]

    # --- Grade scenario section ---
    if grade_analysis and focus != "token-delta":
        lines += [
            "---",
            "",
            "## Grade Scenario Results",
            "",
        ]
        ov = grade_analysis["overall"]
        lines += [
            "| Metric | Value |",
            "|--------|-------|",
            # BUG FIX: "total_runs" is a top-level key of the analysis dict,
            # not a member of "overall" — ov['total_runs'] raised KeyError.
            f"| Total runs | {grade_analysis['total_runs']} |",
            f"| Mean score | {ov['mean_score']}/100 |",
            f"| Score range | {ov['min_score']}–{ov['max_score']} |",
            f"| Score stddev | {ov['stddev_score']} |",
            f"| Total flags | {ov['total_flags']} |",
            f"| Total est. tokens | {ov['total_estimated_tokens']:,} |",
            "",
        ]

        lines += ["### Per-Scenario Breakdown", ""]
        lines += [
            "| Scenario | Runs | Mean Score | Min | Max | Flags/Run | Avg Tokens |",
            "|----------|------|-----------|-----|-----|-----------|------------|",
        ]
        for scenario, data in sorted(grade_analysis["by_scenario"].items()):
            score_str = f"{data['mean_score']}/100" if data["mean_score"] is not None else "N/A"
            lines.append(
                f"| {scenario} | {data['runs']} | {score_str} | "
                f"{data['min_score']} | {data['max_score']} | "
                f"{data['avg_flags_per_run']:.1f} | "
                f"~{int(data['avg_estimated_tokens'])}t |"
            )
        lines.append("")

    # --- A/B test section ---
    if ab_analysis:
        lines += [
            "---",
            "",
            "## MCP vs CLI Blind A/B Results",
            "",
        ]
        ow = ab_analysis["overall_winner"].upper()
        wr = ab_analysis["global_win_rate"]
        gw = ab_analysis["global_wins"]
        delta = ab_analysis["avg_token_delta_mcp_minus_cli"]
        interp = ab_analysis["interpretation"]

        lines += [
            f"**Overall winner: {ow}** ",
            "",
            "| Metric | Value |",
            "|--------|-------|",
            f"| Total runs | {ab_analysis['total_runs']} |",
            f"| MCP wins | {gw['mcp']} ({wr['mcp']*100:.1f}%) |",
            f"| CLI wins | {gw['cli']} ({wr['cli']*100:.1f}%) |",
            f"| Ties | {gw['tie']} |",
            f"| Avg token delta (MCP–CLI) | {delta:+.1f} tokens |",
            f"| Interpretation | {interp} |",
            "",
        ]

        lines += ["### Per-Operation Results", ""]
        lines += [
            "| Operation | MCP wins | CLI wins | Ties | Token delta | MCP chars | CLI chars | MCP ms | CLI ms |",
            "|-----------|----------|----------|------|-------------|-----------|-----------|--------|--------|",
        ]
        for op_key, data in sorted(ab_analysis["per_operation"].items()):
            winner_marker = " **MCP**" if data["winner"] == "mcp" else \
                            " **CLI**" if data["winner"] == "cli" else ""
            lines.append(
                f"| `{op_key}`{winner_marker} | {data['mcp_wins']} | {data['cli_wins']} | "
                f"{data['ties']} | {data['avg_token_delta']:+.0f}t | "
                f"{int(data['avg_mcp_chars'])} | {int(data['avg_cli_chars'])} | "
                f"{int(data['avg_mcp_ms'])}ms | {int(data['avg_cli_ms'])}ms |"
            )
        lines.append("")

        # Recommendations
        lines += ["### Recommendations", ""]
        if delta > 50:
            lines.append("- **MCP adds significant token overhead.** Consider whether MCP envelope verbosity can be reduced for high-frequency operations.")
        elif delta < -50:
            lines.append("- **CLI is more verbose than MCP.** CLI output may include formatting/ANSI tokens not useful to agents.")
        else:
            lines.append("- **MCP and CLI have similar token costs.** Interface choice should be based on other factors (protocol compliance, auditability).")

        if wr.get("mcp", 0) > 0.6:
            lines.append("- **MCP output quality is consistently higher.** Reinforces MCP-first agent protocol recommendation.")
        elif wr.get("cli", 0) > 0.6:
            lines.append("- **CLI output quality is consistently higher.** Investigate MCP envelope structure for potential improvements.")

        lines.append("")

    # --- Token efficiency section ---
    if focus == "token-delta" or (grade_analysis and ab_analysis):
        lines += [
            "---",
            "",
            "## Token Efficiency Summary",
            "",
        ]
        if grade_analysis:
            avg_tok = grade_analysis["overall"].get("total_estimated_tokens", 0)
            # BUG FIX: "total_runs" lives at the top level; the old lookup
            # on "overall" always fell back to 1, inflating the average.
            runs = grade_analysis.get("total_runs", 1)
            lines.append(f"- Average scenario cost: ~{int(avg_tok/max(runs,1))} estimated tokens/run")
        if ab_analysis:
            delta = ab_analysis["avg_token_delta_mcp_minus_cli"]
            sign = "+" if delta > 0 else ""
            lines.append(f"- MCP interface overhead vs CLI: {sign}{delta:.1f} tokens/operation")
            high_cost = ", ".join(
                f"`{op}`" for op, d in ab_analysis["per_operation"].items()
                if d.get("avg_token_delta", 0) > 100
            )
            # BUG FIX: "+" binds tighter than "or", so the old one-liner's
            # "none detected" fallback was unreachable (the non-empty prefix
            # made the concatenation always truthy).
            lines.append("- High-cost operations (MCP > CLI by >100t): "
                         + (high_cost or "none detected"))
        lines.append("")

    lines += [
        "---",
        "",
        "*Report generated by ct-grade v2.1 `generate_report.py`*",
        "",
    ]

    return "\n".join(lines)
356
+
357
+
358
+ # ---------------------------------------------------------------------------
359
+ # Main
360
+ # ---------------------------------------------------------------------------
361
+
362
def main():
    """CLI entry point: locate result summaries, analyze, and write reports.

    Exits with status 1 when the input directory is missing or contains no
    recognizable summary.json files.
    """
    parser = argparse.ArgumentParser(description="Generate CLEO grade comparative analysis report")
    parser.add_argument("--input-dir", required=True, help="Directory with grade/AB test results")
    parser.add_argument("--format", default="both", choices=["markdown", "json", "both"])
    parser.add_argument("--output", default=None, help="Output file (default: <input-dir>/report.md)")
    parser.add_argument("--focus", default=None,
                        choices=["token-delta", "grade-scores", "ab-wins"],
                        help="Focus the report on a specific aspect")
    parser.add_argument("--json", action="store_true", help="Print analysis JSON to stdout")
    args = parser.parse_args()

    root = Path(args.input_dir)
    if not root.exists():
        print(f"ERROR: --input-dir does not exist: {root}", file=sys.stderr)
        sys.exit(1)

    # Discover result files of both flavors.
    grade_summaries = find_grade_summaries(root)
    ab_summaries = find_ab_summaries(root)

    if not grade_summaries and not ab_summaries:
        print(f"ERROR: No summary.json files found under {root}", file=sys.stderr)
        print("Run run_scenario.py or run_ab_test.py first.", file=sys.stderr)
        sys.exit(1)

    print(f"Found {len(grade_summaries)} grade summary file(s) and {len(ab_summaries)} A/B summary file(s)")

    # Aggregate whichever result sets are present.
    grade_analysis = analyze_grade_results(grade_summaries) if grade_summaries else None
    ab_analysis = analyze_ab_results(ab_summaries) if ab_summaries else None

    # Machine-readable analysis payload.
    analysis = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "input_dir": str(root),
        "grade_analysis": grade_analysis,
        "ab_analysis": ab_analysis,
    }

    # Derive the extension-less output base path.
    if args.output:
        base = args.output.rsplit(".", 1)[0]
    else:
        base = str(root / "report")

    if args.format != "markdown":  # "json" or "both"
        # Honor an explicit .json --output path verbatim.
        if args.output and args.output.endswith(".json"):
            json_path = args.output
        else:
            json_path = base + ".json"
        Path(json_path).write_text(json.dumps(analysis, indent=2))
        print(f"Saved JSON: {json_path}")

    if args.format != "json":  # "markdown" or "both"
        report = generate_markdown(grade_analysis, ab_analysis, root, focus=args.focus)
        # Honor an explicit .md --output path verbatim.
        if args.output and args.output.endswith(".md"):
            md_path = args.output
        else:
            md_path = base + ".md"
        Path(md_path).write_text(report)
        print(f"Saved Markdown: {md_path}")

    if args.json:
        print(json.dumps(analysis, indent=2))


if __name__ == "__main__":
    main()