@cleocode/skills 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (171) hide show
  1. package/dispatch-config.json +404 -0
  2. package/index.d.ts +178 -0
  3. package/index.js +405 -0
  4. package/package.json +14 -0
  5. package/profiles/core.json +7 -0
  6. package/profiles/full.json +10 -0
  7. package/profiles/minimal.json +7 -0
  8. package/profiles/recommended.json +7 -0
  9. package/provider-skills-map.json +97 -0
  10. package/skills/_shared/cleo-style-guide.md +84 -0
  11. package/skills/_shared/manifest-operations.md +810 -0
  12. package/skills/_shared/placeholders.json +433 -0
  13. package/skills/_shared/skill-chaining-patterns.md +237 -0
  14. package/skills/_shared/subagent-protocol-base.md +223 -0
  15. package/skills/_shared/task-system-integration.md +232 -0
  16. package/skills/_shared/testing-framework-config.md +110 -0
  17. package/skills/ct-cleo/SKILL.md +490 -0
  18. package/skills/ct-cleo/references/anti-patterns.md +19 -0
  19. package/skills/ct-cleo/references/loom-lifecycle.md +136 -0
  20. package/skills/ct-cleo/references/orchestrator-constraints.md +55 -0
  21. package/skills/ct-cleo/references/session-protocol.md +162 -0
  22. package/skills/ct-codebase-mapper/SKILL.md +82 -0
  23. package/skills/ct-contribution/SKILL.md +521 -0
  24. package/skills/ct-contribution/templates/contribution-init.json +21 -0
  25. package/skills/ct-dev-workflow/SKILL.md +423 -0
  26. package/skills/ct-docs-lookup/SKILL.md +66 -0
  27. package/skills/ct-docs-review/SKILL.md +175 -0
  28. package/skills/ct-docs-write/SKILL.md +108 -0
  29. package/skills/ct-documentor/SKILL.md +231 -0
  30. package/skills/ct-epic-architect/SKILL.md +305 -0
  31. package/skills/ct-epic-architect/references/bug-epic-example.md +172 -0
  32. package/skills/ct-epic-architect/references/commands.md +201 -0
  33. package/skills/ct-epic-architect/references/feature-epic-example.md +210 -0
  34. package/skills/ct-epic-architect/references/migration-epic-example.md +244 -0
  35. package/skills/ct-epic-architect/references/output-format.md +92 -0
  36. package/skills/ct-epic-architect/references/patterns.md +284 -0
  37. package/skills/ct-epic-architect/references/refactor-epic-example.md +412 -0
  38. package/skills/ct-epic-architect/references/research-epic-example.md +226 -0
  39. package/skills/ct-epic-architect/references/shell-escaping.md +86 -0
  40. package/skills/ct-epic-architect/references/skill-aware-execution.md +195 -0
  41. package/skills/ct-grade/SKILL.md +230 -0
  42. package/skills/ct-grade/agents/analysis-reporter.md +203 -0
  43. package/skills/ct-grade/agents/blind-comparator.md +157 -0
  44. package/skills/ct-grade/agents/scenario-runner.md +134 -0
  45. package/skills/ct-grade/eval-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  46. package/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
  47. package/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
  48. package/skills/ct-grade/eval-viewer/generate_review.py +283 -0
  49. package/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
  50. package/skills/ct-grade/eval-viewer/viewer.html +219 -0
  51. package/skills/ct-grade/evals/evals.json +94 -0
  52. package/skills/ct-grade/references/ab-test-methodology.md +150 -0
  53. package/skills/ct-grade/references/domains.md +137 -0
  54. package/skills/ct-grade/references/grade-spec.md +236 -0
  55. package/skills/ct-grade/references/scenario-playbook.md +234 -0
  56. package/skills/ct-grade/references/token-tracking.md +120 -0
  57. package/skills/ct-grade/scripts/__pycache__/audit_analyzer.cpython-314.pyc +0 -0
  58. package/skills/ct-grade/scripts/__pycache__/run_ab_test.cpython-314.pyc +0 -0
  59. package/skills/ct-grade/scripts/__pycache__/run_all.cpython-314.pyc +0 -0
  60. package/skills/ct-grade/scripts/__pycache__/token_tracker.cpython-314.pyc +0 -0
  61. package/skills/ct-grade/scripts/audit_analyzer.py +279 -0
  62. package/skills/ct-grade/scripts/generate_report.py +283 -0
  63. package/skills/ct-grade/scripts/run_ab_test.py +504 -0
  64. package/skills/ct-grade/scripts/run_all.py +287 -0
  65. package/skills/ct-grade/scripts/setup_run.py +183 -0
  66. package/skills/ct-grade/scripts/token_tracker.py +630 -0
  67. package/skills/ct-grade-v2-1/SKILL.md +237 -0
  68. package/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
  69. package/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
  70. package/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
  71. package/skills/ct-grade-v2-1/evals/evals.json +74 -0
  72. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/build_op_stats.cpython-314.pyc +0 -0
  73. package/skills/ct-grade-v2-1/grade-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
  74. package/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
  75. package/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
  76. package/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
  77. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
  78. package/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
  79. package/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
  80. package/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
  81. package/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
  82. package/skills/ct-grade-v2-1/manifest-entry.json +31 -0
  83. package/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
  84. package/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
  85. package/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
  86. package/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
  87. package/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
  88. package/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
  89. package/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
  90. package/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
  91. package/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
  92. package/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
  93. package/skills/ct-memory/SKILL.md +84 -0
  94. package/skills/ct-orchestrator/INSTALL.md +61 -0
  95. package/skills/ct-orchestrator/README.md +69 -0
  96. package/skills/ct-orchestrator/SKILL.md +380 -0
  97. package/skills/ct-orchestrator/manifest-entry.json +19 -0
  98. package/skills/ct-orchestrator/orchestrator-prompt.txt +17 -0
  99. package/skills/ct-orchestrator/references/SUBAGENT-PROTOCOL-BLOCK.md +66 -0
  100. package/skills/ct-orchestrator/references/autonomous-operation.md +167 -0
  101. package/skills/ct-orchestrator/references/lifecycle-gates.md +98 -0
  102. package/skills/ct-orchestrator/references/orchestrator-compliance.md +271 -0
  103. package/skills/ct-orchestrator/references/orchestrator-handoffs.md +85 -0
  104. package/skills/ct-orchestrator/references/orchestrator-patterns.md +164 -0
  105. package/skills/ct-orchestrator/references/orchestrator-recovery.md +113 -0
  106. package/skills/ct-orchestrator/references/orchestrator-spawning.md +271 -0
  107. package/skills/ct-orchestrator/references/orchestrator-tokens.md +180 -0
  108. package/skills/ct-research-agent/SKILL.md +226 -0
  109. package/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
  110. package/skills/ct-skill-creator/.cleo/logs/cleo.2026-03-07.1.log +24 -0
  111. package/skills/ct-skill-creator/.cleo/tasks.db +0 -0
  112. package/skills/ct-skill-creator/SKILL.md +356 -0
  113. package/skills/ct-skill-creator/agents/analyzer.md +276 -0
  114. package/skills/ct-skill-creator/agents/comparator.md +204 -0
  115. package/skills/ct-skill-creator/agents/grader.md +225 -0
  116. package/skills/ct-skill-creator/assets/eval_review.html +146 -0
  117. package/skills/ct-skill-creator/eval-viewer/__pycache__/generate_review.cpython-314.pyc +0 -0
  118. package/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
  119. package/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
  120. package/skills/ct-skill-creator/manifest-entry.json +17 -0
  121. package/skills/ct-skill-creator/references/dynamic-context.md +228 -0
  122. package/skills/ct-skill-creator/references/frontmatter.md +83 -0
  123. package/skills/ct-skill-creator/references/invocation-control.md +165 -0
  124. package/skills/ct-skill-creator/references/output-patterns.md +86 -0
  125. package/skills/ct-skill-creator/references/provider-deployment.md +175 -0
  126. package/skills/ct-skill-creator/references/schemas.md +430 -0
  127. package/skills/ct-skill-creator/references/workflows.md +28 -0
  128. package/skills/ct-skill-creator/scripts/__init__.py +1 -0
  129. package/skills/ct-skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
  130. package/skills/ct-skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
  131. package/skills/ct-skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
  132. package/skills/ct-skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
  133. package/skills/ct-skill-creator/scripts/__pycache__/init_skill.cpython-314.pyc +0 -0
  134. package/skills/ct-skill-creator/scripts/__pycache__/quick_validate.cpython-314.pyc +0 -0
  135. package/skills/ct-skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
  136. package/skills/ct-skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
  137. package/skills/ct-skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
  138. package/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
  139. package/skills/ct-skill-creator/scripts/generate_report.py +326 -0
  140. package/skills/ct-skill-creator/scripts/improve_description.py +247 -0
  141. package/skills/ct-skill-creator/scripts/init_skill.py +306 -0
  142. package/skills/ct-skill-creator/scripts/package_skill.py +110 -0
  143. package/skills/ct-skill-creator/scripts/quick_validate.py +97 -0
  144. package/skills/ct-skill-creator/scripts/run_eval.py +310 -0
  145. package/skills/ct-skill-creator/scripts/run_loop.py +328 -0
  146. package/skills/ct-skill-creator/scripts/utils.py +47 -0
  147. package/skills/ct-skill-validator/SKILL.md +178 -0
  148. package/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
  149. package/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
  150. package/skills/ct-skill-validator/evals/eval_set.json +14 -0
  151. package/skills/ct-skill-validator/evals/evals.json +52 -0
  152. package/skills/ct-skill-validator/manifest-entry.json +20 -0
  153. package/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
  154. package/skills/ct-skill-validator/references/validation-rules.md +168 -0
  155. package/skills/ct-skill-validator/scripts/__init__.py +0 -0
  156. package/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc +0 -0
  157. package/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc +0 -0
  158. package/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc +0 -0
  159. package/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc +0 -0
  160. package/skills/ct-skill-validator/scripts/audit_body.py +242 -0
  161. package/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
  162. package/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
  163. package/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
  164. package/skills/ct-skill-validator/scripts/validate.py +422 -0
  165. package/skills/ct-spec-writer/SKILL.md +189 -0
  166. package/skills/ct-stickynote/README.md +14 -0
  167. package/skills/ct-stickynote/SKILL.md +46 -0
  168. package/skills/ct-task-executor/SKILL.md +296 -0
  169. package/skills/ct-validator/SKILL.md +216 -0
  170. package/skills/manifest.json +469 -0
  171. package/skills.json +281 -0
@@ -0,0 +1,279 @@
1
+ """
2
+ audit_analyzer.py — Read CLEO tasks.db audit_log and extract MCP vs CLI performance stats.
3
+
4
+ Usage:
5
+ python scripts/audit_analyzer.py [--project-dir .] [--output-dir ab-results] [--json]
6
+ """
7
+
8
+ import argparse
9
+ import json
10
+ import sqlite3
11
+ import sys
12
+ from datetime import datetime, timezone
13
+ from pathlib import Path
14
+
15
+
16
+ def find_tasks_db(project_dir: Path) -> Path | None:
17
+ """Walk up from project_dir to find .cleo/tasks.db (up to 5 levels)."""
18
+ current = project_dir.resolve()
19
+ for _ in range(5):
20
+ candidate = current / ".cleo" / "tasks.db"
21
+ if candidate.exists():
22
+ return candidate
23
+ parent = current.parent
24
+ if parent == current:
25
+ break
26
+ current = parent
27
+ return None
28
+
29
+
30
def query_per_operation(conn: sqlite3.Connection) -> list[dict]:
    """Aggregate audit_log rows per (domain, operation, source).

    Returns one dict per group with call/success counts, duration stats
    (avg/min/max ms) and the average serialized-details length in chars.
    """
    cursor = conn.execute(
        """
        SELECT
            domain,
            operation,
            source,
            COUNT(*) AS call_count,
            SUM(CASE WHEN success=1 THEN 1 ELSE 0 END) AS success_count,
            AVG(duration_ms) AS avg_ms,
            MIN(duration_ms) AS min_ms,
            MAX(duration_ms) AS max_ms,
            AVG(LENGTH(COALESCE(details_json, ''))) AS avg_chars
        FROM audit_log
        WHERE domain IS NOT NULL AND operation IS NOT NULL
        GROUP BY domain, operation, source
        ORDER BY domain, operation, source
        """
    )
    names = [col[0] for col in cursor.description]
    return [dict(zip(names, values)) for values in cursor.fetchall()]
50
+
51
+
52
def query_session_ratio(conn: sqlite3.Connection) -> list[dict]:
    """Count distinct sessions and total operations per source.

    One dict per source value found in audit_log (NULL sources excluded).
    """
    cursor = conn.execute(
        """
        SELECT
            source,
            COUNT(DISTINCT session_id) AS session_count,
            COUNT(*) AS total_ops
        FROM audit_log
        WHERE source IS NOT NULL
        GROUP BY source
        """
    )
    names = [col[0] for col in cursor.description]
    return [dict(zip(names, values)) for values in cursor.fetchall()]
65
+
66
+
67
def build_per_operation_stats(rows: list[dict], session_rows: list[dict]) -> dict:
    """Shape raw SQL aggregates into the per_operation_stats.json payload.

    Args:
        rows: output of query_per_operation (one dict per domain/op/source).
        session_rows: output of query_session_ratio (one dict per source).

    Returns:
        Dict with a UTC timestamp, total op count, per-operation stats keyed
        by "domain.operation" then source, and the per-source session ratio.
    """
    stamp = datetime.now(timezone.utc).isoformat()
    by_operation: dict[str, dict] = {}
    total_ops = 0

    for row in rows:
        op_key = f"{row['domain']}.{row['operation']}"
        src = row["source"] or "unknown"
        calls = int(row["call_count"] or 0)
        successes = int(row["success_count"] or 0)

        # NULL aggregates (empty groups) collapse to 0 / 0.0 defaults.
        by_operation.setdefault(op_key, {})[src] = {
            "calls": calls,
            "success_rate": round(successes / calls, 4) if calls > 0 else 0.0,
            "avg_ms": round(float(row["avg_ms"]), 2) if row["avg_ms"] is not None else 0.0,
            "min_ms": int(row["min_ms"]) if row["min_ms"] is not None else 0,
            "max_ms": int(row["max_ms"]) if row["max_ms"] is not None else 0,
            "avg_chars": round(float(row["avg_chars"]), 2) if row["avg_chars"] is not None else 0.0,
        }
        total_ops += calls

    session_ratio = {
        (sr["source"] or "unknown"): {
            "session_count": int(sr["session_count"] or 0),
            "total_ops": int(sr["total_ops"] or 0),
        }
        for sr in session_rows
    }

    return {
        "generated_at": stamp,
        "total_ops_analyzed": total_ops,
        "by_operation": by_operation,
        "session_ratio": session_ratio,
    }
111
+
112
+
113
def build_operation_coverage(rows: list[dict]) -> dict:
    """Summarize which domain.operation pairs were exercised, per source.

    Sources other than "mcp"/"cli" still create a coverage entry but do not
    contribute to either call counter (matching the original behavior).
    """
    stamp = datetime.now(timezone.utc).isoformat()
    coverage: dict[str, dict] = {}
    domains: set[str] = set()

    for row in rows:
        domains.add(row["domain"])
        op_key = f"{row['domain']}.{row['operation']}"
        entry = coverage.setdefault(op_key, {"tested": True, "mcp_calls": 0, "cli_calls": 0})
        calls = int(row["call_count"] or 0)
        src = row["source"] or "unknown"
        if src == "mcp":
            entry["mcp_calls"] += calls
        elif src == "cli":
            entry["cli_calls"] += calls

    return {
        "generated_at": stamp,
        "coverage": coverage,
        "total_operations_seen": len(coverage),
        "domains_seen": sorted(domains),
    }
138
+
139
+
140
def print_summary(
    db_path: Path,
    stats: dict,
    coverage: dict,
    output_dir: Path,
) -> None:
    """Print a human-readable summary of the generated stats to stdout."""
    total_ops = stats["total_ops_analyzed"]
    ratio = stats.get("session_ratio", {})
    mcp_ops = ratio.get("mcp", {}).get("total_ops", 0)
    cli_ops = ratio.get("cli", {}).get("total_ops", 0)
    combined = mcp_ops + cli_ops
    if combined > 0:
        mcp_pct = round(mcp_ops / combined * 100)
        cli_pct = round(cli_ops / combined * 100)
    else:
        mcp_pct = cli_pct = 0

    # Rank operations by combined (mcp + cli + other) call volume.
    ranked = []
    for op_key, sources in stats["by_operation"].items():
        combined_calls = sum(entry["calls"] for entry in sources.values())
        mcp_calls = sources.get("mcp", {}).get("calls", 0)
        cli_calls = sources.get("cli", {}).get("calls", 0)
        ranked.append((op_key, combined_calls, mcp_calls, cli_calls))
    ranked.sort(key=lambda item: item[1], reverse=True)

    print(" Audit Analyzer")
    print(" " + "\u2500" * 34)
    print(f" DB path : {db_path}")
    print(f" Total ops : {total_ops:,} rows analyzed")
    print(f" Operations: {coverage['total_operations_seen']} unique domain.operation pairs")
    print(f" MCP ops : {mcp_ops:,} ({mcp_pct}%)")
    print(f" CLI ops : {cli_ops:,} ({cli_pct}%)")
    print()
    print(" Top operations by call count:")
    for op_key, _combined, mcp_c, cli_c in ranked[:10]:
        print(f" {op_key:<20} mcp={mcp_c:<6} cli={cli_c}")
    print()
    print(f" Written: {output_dir / 'per_operation_stats.json'}")
    print(f" Written: {output_dir / 'operation_coverage.json'}")
178
+
179
+
180
def empty_stats() -> dict:
    """Return the stats payload used when no audit data is available."""
    return dict(
        generated_at=datetime.now(timezone.utc).isoformat(),
        total_ops_analyzed=0,
        by_operation={},
        session_ratio={},
    )
187
+
188
+
189
def empty_coverage() -> dict:
    """Return the coverage payload used when no audit data is available."""
    return dict(
        generated_at=datetime.now(timezone.utc).isoformat(),
        coverage={},
        total_operations_seen=0,
        domains_seen=[],
    )
196
+
197
+
198
def write_json(path: Path, data: dict) -> None:
    """Serialize *data* as 2-space-indented JSON to *path* (UTF-8)."""
    serialized = json.dumps(data, indent=2)
    path.write_text(serialized, encoding="utf-8")
200
+
201
+
202
def main() -> int:
    """CLI entry point: locate tasks.db, aggregate audit stats, write JSON files.

    Always returns 0; missing database / missing audit_log table degrade to
    writing empty output files with a warning on stderr, so downstream
    tooling always finds the expected artifacts.
    """
    parser = argparse.ArgumentParser(
        description="Extract MCP vs CLI performance stats from CLEO audit_log."
    )
    parser.add_argument(
        "--project-dir",
        default=".",
        help="Root of the CLEO project (default: current directory)",
    )
    parser.add_argument(
        "--output-dir",
        default="ab-results",
        help="Directory to write output JSON files (default: ab-results)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Print machine-readable JSON summary to stdout instead of human-readable text",
    )
    options = parser.parse_args()

    project_dir = Path(options.project_dir)
    output_dir = Path(options.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    stats_path = output_dir / "per_operation_stats.json"
    coverage_path = output_dir / "operation_coverage.json"

    def bail_empty() -> int:
        # Shared degraded path: emit empty artifacts and report success.
        write_json(stats_path, empty_stats())
        write_json(coverage_path, empty_coverage())
        return 0

    db_path = find_tasks_db(project_dir)
    if db_path is None:
        print(
            f"Warning: could not find .cleo/tasks.db under {project_dir.resolve()} "
            "(searched up to 5 levels). Writing empty output files.",
            file=sys.stderr,
        )
        return bail_empty()

    try:
        conn = sqlite3.connect(str(db_path))
        conn.row_factory = sqlite3.Row
        try:
            per_op_rows = query_per_operation(conn)
            session_rows = query_session_ratio(conn)
        except sqlite3.OperationalError as exc:
            # Typically: audit_log table absent in an older/partial database.
            print(
                f"Warning: audit_log table not found or query failed ({exc}). "
                "Writing empty output files.",
                file=sys.stderr,
            )
            return bail_empty()
        finally:
            conn.close()
    except sqlite3.DatabaseError as exc:
        print(f"Warning: could not open {db_path}: {exc}. Writing empty output files.", file=sys.stderr)
        return bail_empty()

    stats = build_per_operation_stats(per_op_rows, session_rows)
    coverage = build_operation_coverage(per_op_rows)

    write_json(stats_path, stats)
    write_json(coverage_path, coverage)

    if options.json:
        print(json.dumps({"stats": stats, "coverage": coverage}, indent=2))
    else:
        print_summary(db_path, stats, coverage, output_dir)

    return 0


if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,283 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ generate_report.py — Generate a comparative analysis report from ct-grade A/B results.
4
+
5
+ Usage:
6
+ python generate_report.py --run-dir ./ab_results/run-001 --mode ab [--html]
7
+
8
+ Reads: run-manifest.json, token-summary.json, */run-*/comparison.json, */run-*/arm-*/grade.json
9
+ Writes: <run-dir>/report.md (and optionally report.html)
10
+ """
11
+
12
+ import argparse
13
+ import json
14
+ import os
15
+ import sys
16
+ from pathlib import Path
17
+ from datetime import datetime, timezone
18
+
19
+
20
# Display labels for the five grading dimensions (keys match grade.json).
DIMENSION_LABELS = {
    "sessionDiscipline": "S1 Session Discipline",
    "discoveryEfficiency": "S2 Discovery Efficiency",
    "taskHygiene": "S3 Task Hygiene",
    "errorProtocol": "S4 Error Protocol",
    "disclosureUse": "S5 Progressive Disclosure",
}

# Display labels for the five evaluation scenarios.
SCENARIO_LABELS = {
    "s1": "Fresh Discovery",
    "s2": "Task Creation Hygiene",
    "s3": "Error Recovery",
    "s4": "Full Lifecycle",
    "s5": "Multi-Domain Analysis",
}

# Score floors for letter grades, highest first.
GRADE_THRESHOLDS = [
    (90, "A"), (75, "B"), (60, "C"), (45, "D"), (0, "F")
]


def letter_grade(score):
    """Return the letter whose floor *score* meets; "F" for anything below 0."""
    matches = (letter for floor, letter in GRADE_THRESHOLDS if score >= floor)
    return next(matches, "F")
46
+
47
+
48
def find_json(path, filename):
    """Load JSON from *path*/*filename*; return None if absent or unparseable."""
    target = Path(path) / filename
    if not target.exists():
        return None
    try:
        return json.loads(target.read_text())
    except Exception:
        # Best-effort loader: corrupt files are treated the same as missing.
        return None
56
+
57
+
58
def find_all_comparison_files(run_dir):
    """Return every comparison.json path beneath *run_dir* (recursive)."""
    root = Path(run_dir)
    return list(root.rglob("comparison.json"))
60
+
61
+
62
def find_grade_files(run_dir):
    """Return every grade.json path beneath *run_dir* (recursive)."""
    root = Path(run_dir)
    return list(root.rglob("grade.json"))
64
+
65
+
66
def load_grade(path):
    """Parse a grade.json file; return None on any read or parse failure."""
    try:
        text = Path(path).read_text()
        return json.loads(text)
    except Exception:
        return None
71
+
72
+
73
def mean(values):
    """Arithmetic mean of *values*, or None for an empty sequence."""
    if not values:
        return None
    return sum(values) / len(values)
75
+
76
+
77
def collect_scores(run_dir):
    """Collect total and per-dimension grade scores, bucketed by arm.

    Walks every grade.json under *run_dir*, infers the arm from the first
    path component starting with "arm-", and skips files whose arm is not
    arm-A/arm-B or that lack a "totalScore" field.
    """
    totals = {"arm-A": [], "arm-B": []}
    dim_totals = {arm: {dim: [] for dim in DIMENSION_LABELS} for arm in totals}

    for grade_path in find_grade_files(run_dir):
        arm = next(
            (part for part in Path(grade_path).parts if part.startswith("arm-")),
            None,
        )
        if arm not in totals:
            continue
        grade = load_grade(grade_path)
        if not grade or "totalScore" not in grade:
            continue
        totals[arm].append(grade["totalScore"])
        dims = grade.get("dimensions", {})
        for dim in DIMENSION_LABELS:
            if dim in dims:
                dim_totals[arm][dim].append(dims[dim].get("score", 0))

    return totals, dim_totals
99
+
100
+
101
def collect_comparisons(run_dir):
    """Tally winners from every comparison.json under *run_dir*.

    Returns:
        (wins, comparisons): wins is {"arm-A": n, "arm-B": n, "tie": n};
        comparisons is the list of parsed comparison payloads.

    Bug fix: the previous version lower-cased the winner label and then
    looked it up in a dict keyed "arm-A"/"arm-B" — "arm-A".lower() ==
    "arm-a" never matched, so arm wins were silently dropped and only ties
    were counted. Winners are now normalized case-insensitively to the
    canonical keys (accepting "arm-A"/"arm-B" as well as bare "A"/"B").
    """
    wins = {"arm-A": 0, "arm-B": 0, "tie": 0}
    canonical = {"arm-a": "arm-A", "a": "arm-A", "arm-b": "arm-B", "b": "arm-B", "tie": "tie"}
    comparisons = []
    for cfile in find_all_comparison_files(run_dir):
        data = find_json(os.path.dirname(cfile), "comparison.json")
        if not data:
            continue
        raw = str(data.get("winner", "tie")).strip().lower()
        key = canonical.get(raw)
        if key is not None:
            wins[key] += 1
        comparisons.append(data)
    return wins, comparisons
112
+
113
+
114
def build_report(run_dir, mode, manifest, token_summary, scores, dim_scores, wins, comparisons):
    """Render the markdown A/B analysis report.

    Args:
        run_dir: Run directory (display only, shown in the header).
        mode: Run mode label ("scenario", "ab", or "blind").
        manifest: Parsed run-manifest.json (may be an empty dict).
        token_summary: Parsed token-summary.json, or None when missing.
        scores: {"arm-A": [...], "arm-B": [...]} total scores per arm.
        dim_scores: per-arm dict of per-dimension score lists.
        wins: {"arm-A": n, "arm-B": n, "tie": n} comparison tallies.
        comparisons: list of parsed comparison.json payloads.

    Returns:
        The complete markdown report as one string.

    Fixes vs. previous version: mean/grade guards now use ``is not None``
    instead of truthiness, so a legitimate score of 0.0 renders as "0.0"/"F"
    rather than "N/A"/"?" (matching the per-dimension section, which already
    used ``is not None``). Unused locals a_win_rate/winner_arm removed.
    """
    arm_a_scores = scores.get("arm-A", [])
    arm_b_scores = scores.get("arm-B", [])
    arm_a_mean = mean(arm_a_scores)
    arm_b_mean = mean(arm_b_scores)

    arm_a_config = manifest.get("arms", {}).get("A", {}).get("label", "Arm A")
    arm_b_config = manifest.get("arms", {}).get("B", {}).get("label", "Arm B")

    token_a = (token_summary or {}).get("by_arm", {}).get("arm-A", {}).get("total_tokens", {})
    token_b = (token_summary or {}).get("by_arm", {}).get("arm-B", {}).get("total_tokens", {})
    token_delta = (token_summary or {}).get("delta_A_vs_B", {})

    total_runs = wins["arm-A"] + wins["arm-B"] + wins["tie"]

    # Determine overall winner
    if wins["arm-A"] > wins["arm-B"]:
        overall_winner = f"Arm A ({arm_a_config})"
    elif wins["arm-B"] > wins["arm-A"]:
        overall_winner = f"Arm B ({arm_b_config})"
    else:
        overall_winner = "Tie"

    lines = []
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    lines.append("# CLEO Grade A/B Analysis Report")
    lines.append(f"**Generated**: {ts} **Mode**: {mode} **Run dir**: `{run_dir}`\n")

    # Executive Summary
    lines.append("## Executive Summary\n")
    a_mean_str = f"{arm_a_mean:.1f}" if arm_a_mean is not None else "N/A"
    b_mean_str = f"{arm_b_mean:.1f}" if arm_b_mean is not None else "N/A"
    delta_str = (
        f"{arm_a_mean - arm_b_mean:+.1f}"
        if arm_a_mean is not None and arm_b_mean is not None
        else "N/A"
    )
    a_grade = letter_grade(arm_a_mean) if arm_a_mean is not None else "?"
    b_grade = letter_grade(arm_b_mean) if arm_b_mean is not None else "?"

    tok_a_mean = token_a.get("mean") or "N/A"
    tok_b_mean = token_b.get("mean") or "N/A"
    tok_delta = token_delta.get("percent", "N/A")
    tok_note = token_delta.get("note", "")

    lines.append(f"| Metric | Arm A ({arm_a_config}) | Arm B ({arm_b_config}) | Delta |")
    lines.append("|---|---|---|---|")
    lines.append(f"| Mean Score | {a_mean_str}/100 | {b_mean_str}/100 | {delta_str} |")
    lines.append(f"| Grade | {a_grade} | {b_grade} | — |")
    lines.append(f"| Mean Tokens | {tok_a_mean} | {tok_b_mean} | {tok_delta} |")
    lines.append(f"| Win Rate | {wins['arm-A']}/{total_runs} | {wins['arm-B']}/{total_runs} | — |")
    lines.append(f"| Ties | — | — | {wins['tie']} |")
    lines.append("")
    lines.append(f"**Overall Winner: {overall_winner}**")
    if tok_note:
        lines.append(f"Token note: {tok_note}")
    lines.append("")

    # Per-dimension breakdown
    lines.append("## Per-Dimension Scores (Mean)\n")
    lines.append("| Dimension | Arm A | Arm B | Delta | Max |")
    lines.append("|---|---|---|---|---|")
    for dim, label in DIMENSION_LABELS.items():
        a_vals = dim_scores.get("arm-A", {}).get(dim, [])
        b_vals = dim_scores.get("arm-B", {}).get(dim, [])
        a_m = mean(a_vals)
        b_m = mean(b_vals)
        a_str = f"{a_m:.1f}" if a_m is not None else "N/A"
        b_str = f"{b_m:.1f}" if b_m is not None else "N/A"
        d_str = f"{a_m - b_m:+.1f}" if a_m is not None and b_m is not None else "N/A"
        lines.append(f"| {label} | {a_str} | {b_str} | {d_str} | 20 |")
    lines.append("")

    # Comparison results
    if comparisons:
        lines.append("## Comparison Results\n")
        lines.append("| Run | Slot | Winner | A Score | B Score | A Flags | B Flags |")
        lines.append("|---|---|---|---|---|---|---|")
        for i, c in enumerate(comparisons, 1):
            winner = c.get("winner", "?")
            gc = c.get("grade_comparison", {})
            a_total = gc.get("A", {}).get("total_score", "?")
            b_total = gc.get("B", {}).get("total_score", "?")
            a_flags = len(gc.get("A", {}).get("flags", []))
            b_flags = len(gc.get("B", {}).get("flags", []))
            lines.append(f"| {i} | — | {winner} | {a_total} | {b_total} | {a_flags} | {b_flags} |")
        lines.append("")

    # Token Economy
    lines.append("## Token Economy\n")
    if token_a.get("mean") and token_b.get("mean"):
        a_tok = token_a["mean"]
        b_tok = token_b["mean"]
        # `is not None` so a 0.0 mean score still yields a score-per-token row.
        a_spt = (arm_a_mean / a_tok * 1000) if arm_a_mean is not None and a_tok else None
        b_spt = (arm_b_mean / b_tok * 1000) if arm_b_mean is not None and b_tok else None
        lines.append("| Metric | Arm A | Arm B |")
        lines.append("|---|---|---|")
        lines.append(f"| Mean tokens | {a_tok:.0f} | {b_tok:.0f} |")
        lines.append(f"| Stddev | {token_a.get('stddev', 0):.0f} | {token_b.get('stddev', 0):.0f} |")
        if a_spt is not None and b_spt is not None:
            lines.append(f"| Score per 1k tokens | {a_spt:.1f} | {b_spt:.1f} |")
        lines.append("")
        lines.append(f"**Token delta**: {tok_delta} — {tok_note}")
    else:
        lines.append("_Token data incomplete. Fill `total_tokens` in timing.json from task notifications._")
    lines.append("")

    # Recommendations placeholder
    lines.append("## Recommendations\n")
    lines.append("_Run `agents/analysis-reporter.md` for AI-generated recommendations based on full pattern analysis._\n")

    return "\n".join(lines)
226
+
227
+
228
def main():
    """CLI entry: collect A/B results under --run-dir, write report.md (+ HTML).

    Exits 1 if the run directory does not exist; warns (but continues) when
    token-summary.json is missing.

    Fixes vs. previous version: the markdown report is HTML-escaped before
    being embedded in the <pre> block (literal <, >, & could previously break
    or inject into the page), and both output files are written as UTF-8
    explicitly (the report contains em-dashes, which fail under e.g. cp1252).
    """
    parser = argparse.ArgumentParser(description="Generate ct-grade A/B comparison report")
    parser.add_argument("--run-dir", required=True)
    parser.add_argument("--mode", default="ab", choices=["scenario", "ab", "blind"])
    parser.add_argument("--html", action="store_true", help="Also generate report.html")
    args = parser.parse_args()

    run_dir = args.run_dir
    if not os.path.isdir(run_dir):
        print(f"ERROR: Run dir not found: {run_dir}", file=sys.stderr)
        sys.exit(1)

    manifest = find_json(run_dir, "run-manifest.json") or {}
    token_summary = find_json(run_dir, "token-summary.json")

    if token_summary is None:
        print("WARN: token-summary.json not found. Run token_tracker.py first.", file=sys.stderr)

    scores, dim_scores = collect_scores(run_dir)
    wins, comparisons = collect_comparisons(run_dir)

    report = build_report(run_dir, args.mode, manifest, token_summary,
                          scores, dim_scores, wins, comparisons)

    report_path = os.path.join(run_dir, "report.md")
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(report)

    print(f"\nReport written: {report_path}")

    if args.html:
        # Local import: avoids shadowing by the `html` string variable below.
        from html import escape

        # Basic HTML wrapper; report is escaped so markdown content cannot
        # break out of the <pre> block.
        html = f"""<!DOCTYPE html>
<html><head><meta charset="utf-8"><title>ct-grade Report</title>
<style>body{{font-family:sans-serif;max-width:900px;margin:40px auto;padding:0 20px}}
table{{border-collapse:collapse;width:100%}}td,th{{border:1px solid #ddd;padding:8px;text-align:left}}
th{{background:#f5f5f5}}code{{background:#f5f5f5;padding:2px 4px;border-radius:3px}}</style>
</head><body>
<pre>{escape(report)}</pre>
</body></html>"""
        html_path = os.path.join(run_dir, "report.html")
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(html)
        print(f"HTML report written: {html_path}")

    # Summary
    total_a = scores.get("arm-A", [])
    total_b = scores.get("arm-B", [])
    print(f"\nScore summary:")
    print(f" Arm A: mean={mean(total_a):.1f} n={len(total_a)}" if total_a else " Arm A: no data")
    print(f" Arm B: mean={mean(total_b):.1f} n={len(total_b)}" if total_b else " Arm B: no data")
    print(f" Wins: A={wins['arm-A']}, B={wins['arm-B']}, tie={wins['tie']}")


if __name__ == "__main__":
    main()