@cleocode/cleo 2026.3.20 → 2026.3.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. package/dist/cli/index.js +39394 -38817
  2. package/dist/cli/index.js.map +4 -4
  3. package/dist/mcp/index.js +35841 -36702
  4. package/dist/mcp/index.js.map +4 -4
  5. package/drizzle-brain.config.ts +7 -0
  6. package/drizzle-nexus.config.ts +7 -0
  7. package/drizzle-tasks.config.ts +7 -0
  8. package/migrations/drizzle-brain/20260301230215_workable_spitfire/migration.sql +68 -0
  9. package/migrations/drizzle-brain/20260301230215_workable_spitfire/snapshot.json +651 -0
  10. package/migrations/drizzle-brain/20260302050325_unknown_justin_hammer/migration.sql +23 -0
  11. package/migrations/drizzle-brain/20260302050325_unknown_justin_hammer/snapshot.json +884 -0
  12. package/migrations/drizzle-brain/20260302061755_unusual_jamie_braddock/migration.sql +2 -0
  13. package/migrations/drizzle-brain/20260302061755_unusual_jamie_braddock/snapshot.json +908 -0
  14. package/migrations/drizzle-brain/20260302193548_luxuriant_glorian/migration.sql +20 -0
  15. package/migrations/drizzle-brain/20260302193548_luxuriant_glorian/snapshot.json +1078 -0
  16. package/migrations/drizzle-brain/20260304045002_white_thunderbolt_ross/migration.sql +16 -0
  17. package/migrations/drizzle-brain/20260304045002_white_thunderbolt_ross/snapshot.json +1233 -0
  18. package/migrations/drizzle-nexus/20260305070805_quick_ted_forrester/migration.sql +46 -0
  19. package/migrations/drizzle-nexus/20260305070805_quick_ted_forrester/snapshot.json +461 -0
  20. package/migrations/drizzle-tasks/20260308024513_oval_king_bedlam/migration.sql +32 -0
  21. package/migrations/drizzle-tasks/20260308024513_oval_king_bedlam/snapshot.json +3727 -0
  22. package/package.json +14 -4
  23. package/packages/ct-skills/skills/ct-cleo/SKILL.md +344 -81
  24. package/packages/ct-skills/skills/ct-grade/SKILL.md +20 -4
  25. package/packages/ct-skills/skills/ct-grade/agents/analysis-reporter.md +203 -0
  26. package/packages/ct-skills/skills/ct-grade/agents/blind-comparator.md +157 -0
  27. package/packages/ct-skills/skills/ct-grade/agents/scenario-runner.md +134 -0
  28. package/packages/ct-skills/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
  29. package/packages/ct-skills/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
  30. package/packages/ct-skills/skills/ct-grade/eval-viewer/generate_review.py +283 -0
  31. package/packages/ct-skills/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
  32. package/packages/ct-skills/skills/ct-grade/eval-viewer/viewer.html +219 -0
  33. package/packages/ct-skills/skills/ct-grade/evals/evals.json +94 -0
  34. package/packages/ct-skills/skills/ct-grade/references/ab-test-methodology.md +150 -0
  35. package/packages/ct-skills/skills/ct-grade/references/domains.md +137 -0
  36. package/packages/ct-skills/skills/ct-grade/references/grade-spec.md +236 -0
  37. package/packages/ct-skills/skills/ct-grade/references/scenario-playbook.md +234 -0
  38. package/packages/ct-skills/skills/ct-grade/references/token-tracking.md +120 -0
  39. package/packages/ct-skills/skills/ct-grade/scripts/audit_analyzer.py +279 -0
  40. package/packages/ct-skills/skills/ct-grade/scripts/generate_report.py +283 -0
  41. package/packages/ct-skills/skills/ct-grade/scripts/run_ab_test.py +504 -0
  42. package/packages/ct-skills/skills/ct-grade/scripts/run_all.py +287 -0
  43. package/packages/ct-skills/skills/ct-grade/scripts/setup_run.py +183 -0
  44. package/packages/ct-skills/skills/ct-grade/scripts/token_tracker.py +630 -0
  45. package/packages/ct-skills/skills/ct-grade-v2-1/SKILL.md +237 -0
  46. package/packages/ct-skills/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
  47. package/packages/ct-skills/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
  48. package/packages/ct-skills/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
  49. package/packages/ct-skills/skills/ct-grade-v2-1/evals/evals.json +74 -0
  50. package/packages/ct-skills/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
  51. package/packages/ct-skills/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
  52. package/packages/ct-skills/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
  53. package/packages/ct-skills/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
  54. package/packages/ct-skills/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
  55. package/packages/ct-skills/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
  56. package/packages/ct-skills/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
  57. package/packages/ct-skills/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
  58. package/packages/ct-skills/skills/ct-grade-v2-1/manifest-entry.json +31 -0
  59. package/packages/ct-skills/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
  60. package/packages/ct-skills/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
  61. package/packages/ct-skills/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
  62. package/packages/ct-skills/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
  63. package/packages/ct-skills/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
  64. package/packages/ct-skills/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
  65. package/packages/ct-skills/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
  66. package/packages/ct-skills/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
  67. package/packages/ct-skills/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
  68. package/packages/ct-skills/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
  69. package/packages/ct-skills/skills/ct-orchestrator/SKILL.md +1 -29
  70. package/packages/ct-skills/skills/ct-orchestrator/manifest-entry.json +19 -0
  71. package/packages/ct-skills/skills/ct-skill-creator/SKILL.md +0 -12
  72. package/packages/ct-skills/skills/ct-skill-creator/agents/analyzer.md +276 -0
  73. package/packages/ct-skills/skills/ct-skill-creator/agents/comparator.md +204 -0
  74. package/packages/ct-skills/skills/ct-skill-creator/agents/grader.md +225 -0
  75. package/packages/ct-skills/skills/ct-skill-creator/assets/eval_review.html +146 -0
  76. package/packages/ct-skills/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
  77. package/packages/ct-skills/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
  78. package/packages/ct-skills/skills/ct-skill-creator/manifest-entry.json +17 -0
  79. package/packages/ct-skills/skills/ct-skill-creator/references/dynamic-context.md +228 -0
  80. package/packages/ct-skills/skills/ct-skill-creator/references/frontmatter.md +83 -0
  81. package/packages/ct-skills/skills/ct-skill-creator/references/invocation-control.md +165 -0
  82. package/packages/ct-skills/skills/ct-skill-creator/references/provider-deployment.md +175 -0
  83. package/packages/ct-skills/skills/ct-skill-creator/references/schemas.md +430 -0
  84. package/packages/ct-skills/skills/ct-skill-creator/scripts/__init__.py +1 -0
  85. package/packages/ct-skills/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
  86. package/packages/ct-skills/skills/ct-skill-creator/scripts/generate_report.py +326 -0
  87. package/packages/ct-skills/skills/ct-skill-creator/scripts/improve_description.py +247 -0
  88. package/packages/ct-skills/skills/ct-skill-creator/scripts/run_eval.py +310 -0
  89. package/packages/ct-skills/skills/ct-skill-creator/scripts/run_loop.py +328 -0
  90. package/packages/ct-skills/skills/ct-skill-creator/scripts/utils.py +47 -0
  91. package/packages/ct-skills/skills/ct-skill-validator/SKILL.md +178 -0
  92. package/packages/ct-skills/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
  93. package/packages/ct-skills/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
  94. package/packages/ct-skills/skills/ct-skill-validator/evals/eval_set.json +14 -0
  95. package/packages/ct-skills/skills/ct-skill-validator/evals/evals.json +52 -0
  96. package/packages/ct-skills/skills/ct-skill-validator/manifest-entry.json +20 -0
  97. package/packages/ct-skills/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
  98. package/packages/ct-skills/skills/ct-skill-validator/references/validation-rules.md +168 -0
  99. package/packages/ct-skills/skills/ct-skill-validator/scripts/__init__.py +0 -0
  100. package/packages/ct-skills/skills/ct-skill-validator/scripts/audit_body.py +242 -0
  101. package/packages/ct-skills/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
  102. package/packages/ct-skills/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
  103. package/packages/ct-skills/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
  104. package/packages/ct-skills/skills/ct-skill-validator/scripts/validate.py +422 -0
  105. /package/{drizzle → migrations/drizzle-tasks}/20260224040019_baseline/migration.sql +0 -0
  106. /package/{drizzle → migrations/drizzle-tasks}/20260224040019_baseline/snapshot.json +0 -0
  107. /package/{drizzle → migrations/drizzle-tasks}/20260224040238_add-audit-log/migration.sql +0 -0
  108. /package/{drizzle → migrations/drizzle-tasks}/20260224040238_add-audit-log/snapshot.json +0 -0
  109. /package/{drizzle → migrations/drizzle-tasks}/20260224144602_closed_grim_reaper/migration.sql +0 -0
  110. /package/{drizzle → migrations/drizzle-tasks}/20260224144602_closed_grim_reaper/snapshot.json +0 -0
  111. /package/{drizzle → migrations/drizzle-tasks}/20260225024442_sync-lifecycle-enums-and-arch-decisions/migration.sql +0 -0
  112. /package/{drizzle → migrations/drizzle-tasks}/20260225024442_sync-lifecycle-enums-and-arch-decisions/snapshot.json +0 -0
  113. /package/{drizzle → migrations/drizzle-tasks}/20260227014821_adr-system-and-status-registry/migration.sql +0 -0
  114. /package/{drizzle → migrations/drizzle-tasks}/20260227014821_adr-system-and-status-registry/snapshot.json +0 -0
  115. /package/{drizzle → migrations/drizzle-tasks}/20260227021231_add-cancelled-pipeline-status/migration.sql +0 -0
  116. /package/{drizzle → migrations/drizzle-tasks}/20260227021231_add-cancelled-pipeline-status/snapshot.json +0 -0
  117. /package/{drizzle → migrations/drizzle-tasks}/20260227022417_adr-cognitive-search-fields/migration.sql +0 -0
  118. /package/{drizzle → migrations/drizzle-tasks}/20260227022417_adr-cognitive-search-fields/snapshot.json +0 -0
  119. /package/{drizzle → migrations/drizzle-tasks}/20260227172236_freezing_grey_gargoyle/migration.sql +0 -0
  120. /package/{drizzle → migrations/drizzle-tasks}/20260227172236_freezing_grey_gargoyle/snapshot.json +0 -0
  121. /package/{drizzle → migrations/drizzle-tasks}/20260227183444_fix-orphaned-parent-ids/migration.sql +0 -0
  122. /package/{drizzle → migrations/drizzle-tasks}/20260227183444_fix-orphaned-parent-ids/snapshot.json +0 -0
  123. /package/{drizzle → migrations/drizzle-tasks}/20260227183521_parent-id-on-delete-set-null/migration.sql +0 -0
  124. /package/{drizzle → migrations/drizzle-tasks}/20260227183521_parent-id-on-delete-set-null/snapshot.json +0 -0
  125. /package/{drizzle → migrations/drizzle-tasks}/20260227200430_numerous_mysterio/migration.sql +0 -0
  126. /package/{drizzle → migrations/drizzle-tasks}/20260227200430_numerous_mysterio/snapshot.json +0 -0
  127. /package/{drizzle → migrations/drizzle-tasks}/20260227235745_add-audit-log-dispatch-columns/migration.sql +0 -0
  128. /package/{drizzle → migrations/drizzle-tasks}/20260227235745_add-audit-log-dispatch-columns/snapshot.json +0 -0
  129. /package/{drizzle → migrations/drizzle-tasks}/20260301053344_careless_changeling/migration.sql +0 -0
  130. /package/{drizzle → migrations/drizzle-tasks}/20260301053344_careless_changeling/snapshot.json +0 -0
  131. /package/{drizzle → migrations/drizzle-tasks}/20260301175940_futuristic_eternity/migration.sql +0 -0
  132. /package/{drizzle → migrations/drizzle-tasks}/20260301175940_futuristic_eternity/snapshot.json +0 -0
  133. /package/{drizzle → migrations/drizzle-tasks}/20260301180528_update-task-relations-check-constraint/migration.sql +0 -0
  134. /package/{drizzle → migrations/drizzle-tasks}/20260301180528_update-task-relations-check-constraint/snapshot.json +0 -0
  135. /package/{drizzle → migrations/drizzle-tasks}/20260302163443_free_silk_fever/migration.sql +0 -0
  136. /package/{drizzle → migrations/drizzle-tasks}/20260302163443_free_silk_fever/snapshot.json +0 -0
  137. /package/{drizzle → migrations/drizzle-tasks}/20260302163457_robust_johnny_storm/migration.sql +0 -0
  138. /package/{drizzle → migrations/drizzle-tasks}/20260302163457_robust_johnny_storm/snapshot.json +0 -0
  139. /package/{drizzle → migrations/drizzle-tasks}/20260302163511_late_sphinx/migration.sql +0 -0
  140. /package/{drizzle → migrations/drizzle-tasks}/20260302163511_late_sphinx/snapshot.json +0 -0
  141. /package/{drizzle → migrations/drizzle-tasks}/20260305011924_cheerful_mongu/migration.sql +0 -0
  142. /package/{drizzle → migrations/drizzle-tasks}/20260305011924_cheerful_mongu/snapshot.json +0 -0
  143. /package/{drizzle → migrations/drizzle-tasks}/20260305203927_demonic_storm/migration.sql +0 -0
  144. /package/{drizzle → migrations/drizzle-tasks}/20260305203927_demonic_storm/snapshot.json +0 -0
  145. /package/{drizzle → migrations/drizzle-tasks}/20260306001243_spooky_rage/migration.sql +0 -0
  146. /package/{drizzle → migrations/drizzle-tasks}/20260306001243_spooky_rage/snapshot.json +0 -0
  147. /package/{drizzle → migrations/drizzle-tasks}/20260306193138_young_morbius/migration.sql +0 -0
  148. /package/{drizzle → migrations/drizzle-tasks}/20260306193138_young_morbius/snapshot.json +0 -0
  149. /package/{drizzle → migrations/drizzle-tasks}/20260306194959_sticky_captain_flint/migration.sql +0 -0
  150. /package/{drizzle → migrations/drizzle-tasks}/20260306194959_sticky_captain_flint/snapshot.json +0 -0
@@ -0,0 +1,283 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ generate_review.py — Serve an interactive eval review for ct-grade.
4
+
5
+ Reads eval run outputs from a workspace directory, embeds all data into
6
+ the viewer.html template, and serves it at localhost:3118.
7
+
8
+ Usage:
9
+ # Serve (live-reloading on refresh):
10
+ python eval-viewer/generate_review.py <workspace-path> [--port 3118]
11
+
12
+ # Write static HTML file instead:
13
+ python eval-viewer/generate_review.py <workspace-path> --static output.html
14
+
15
+ # Include benchmark data:
16
+ python eval-viewer/generate_review.py <workspace-path> --benchmark benchmark.json
17
+
18
+ No external dependencies — stdlib only.
19
+ """
20
+
21
+ import argparse
22
+ import json
23
+ import os
24
+ import re
25
+ import signal
26
+ import subprocess
27
+ import sys
28
+ import time
29
+ import webbrowser
30
+ from functools import partial
31
+ from http.server import HTTPServer, BaseHTTPRequestHandler
32
+ from pathlib import Path
33
+
34
+
35
# Lower-cased file suffixes whose contents are embedded in the viewer as text.
# Files with any other suffix are embedded as base64-encoded binary.
TEXT_EXTENSIONS = {
    ".css",
    ".csv",
    ".html",
    ".js",
    ".json",
    ".jsonl",
    ".md",
    ".py",
    ".sh",
    ".ts",
    ".txt",
    ".yaml",
    ".yml",
}

# File names that are left out of a run's list of output files.
METADATA_FILES = {
    "metrics.json",
    "transcript.md",
    "user_notes.md",
}
41
+
42
+
43
def find_runs(workspace: Path) -> list[dict]:
    """Find eval run dirs — directories with an outputs/ subdirectory.

    Args:
        workspace: Directory that is searched recursively for runs.

    Returns:
        The run records, ordered by ``eval_id`` and then by run ``id``.
        Runs without an ``eval_id`` are placed after all runs that have one.
    """
    runs: list[dict] = []
    _find_recursive(workspace, workspace, runs)

    def _order(run: dict) -> tuple:
        eval_id = run.get("eval_id")
        # A missing/empty eval_id sorts last.  Numeric zero is a real id and
        # must keep its position; a plain ``eval_id or inf`` would wrongly
        # push eval 0 to the end of the list.
        if not eval_id and eval_id != 0:
            eval_id = float("inf")
        return (eval_id, run["id"])

    runs.sort(key=_order)
    return runs
49
+
50
+
51
def _find_recursive(root: Path, current: Path, runs: list) -> None:
    """Walk *current* depth-first and append every run found to *runs*.

    A directory that contains an ``outputs/`` subdirectory is treated as a
    run: its record is built with ``_build_run`` and the walk does not
    descend any further below it.  Otherwise each child directory is visited
    in sorted order, except for the tooling/asset directory names listed in
    the ignore set.

    Args:
        root: Workspace root, passed through to ``_build_run``.
        current: Directory being inspected; anything that is not a directory
            is ignored.
        runs: Output list, extended in place.
    """
    if not current.is_dir():
        return

    if (current / "outputs").is_dir():
        record = _build_run(root, current)
        if record:
            runs.append(record)
        # A run directory is a leaf of the search.
        return

    ignored = {"node_modules", ".git", "__pycache__", "eval-viewer", "assets", "scripts"}
    for entry in sorted(current.iterdir()):
        if entry.name in ignored or not entry.is_dir():
            continue
        _find_recursive(root, entry, runs)
64
+
65
+
66
+ def _build_run(root: Path, run_dir: Path) -> dict | None:
67
+ prompt = ""
68
+ eval_id = None
69
+
70
+ for candidate in [run_dir / "eval_metadata.json", run_dir.parent / "eval_metadata.json"]:
71
+ if candidate.exists():
72
+ try:
73
+ meta = json.loads(candidate.read_text())
74
+ prompt = meta.get("prompt", "")
75
+ eval_id = meta.get("eval_id")
76
+ except Exception:
77
+ pass
78
+ if prompt:
79
+ break
80
+
81
+ if not prompt:
82
+ for candidate in [run_dir / "transcript.md", run_dir / "outputs" / "transcript.md"]:
83
+ if candidate.exists():
84
+ try:
85
+ text = candidate.read_text()
86
+ m = re.search(r"## Eval Prompt\n\n([\s\S]*?)(?=\n##|$)", text)
87
+ if m:
88
+ prompt = m.group(1).strip()
89
+ except Exception:
90
+ pass
91
+ if prompt:
92
+ break
93
+
94
+ prompt = prompt or "(No prompt found)"
95
+ run_id = str(run_dir.relative_to(root)).replace("/", "-").replace("\\", "-")
96
+
97
+ outputs_dir = run_dir / "outputs"
98
+ output_files = []
99
+ if outputs_dir.is_dir():
100
+ for f in sorted(outputs_dir.iterdir()):
101
+ if f.is_file() and f.name not in METADATA_FILES:
102
+ output_files.append(_embed_file(f))
103
+
104
+ grading = None
105
+ for candidate in [run_dir / "grading.json", run_dir.parent / "grading.json"]:
106
+ if candidate.exists():
107
+ try:
108
+ grading = json.loads(candidate.read_text())
109
+ except Exception:
110
+ pass
111
+ if grading:
112
+ break
113
+
114
+ return {
115
+ "id": run_id,
116
+ "prompt": prompt,
117
+ "eval_id": eval_id,
118
+ "outputs": output_files,
119
+ "grading": grading,
120
+ }
121
+
122
+
123
def _embed_file(path: Path) -> dict:
    """Load one output file into a JSON-serialisable record for the viewer.

    Files whose lower-cased suffix is in ``TEXT_EXTENSIONS`` are embedded as
    text; everything else is embedded as base64.

    Args:
        path: File to embed.

    Returns:
        ``{"name", "type": "text", "content"}`` for text files (with a
        placeholder message as content when the file cannot be read),
        ``{"name", "type": "binary", "data_b64"}`` for other files, or
        ``{"name", "type": "error", "content"}`` when a non-text file cannot
        be read.
    """
    if path.suffix.lower() in TEXT_EXTENSIONS:
        try:
            # Decode explicitly as UTF-8 instead of the locale default so the
            # result does not depend on the platform; undecodable bytes are
            # replaced rather than failing the whole file.
            content = path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            content = "(Error reading file)"
        return {"name": path.name, "type": "text", "content": content}

    import base64  # local import: only needed for non-text files

    try:
        encoded = base64.b64encode(path.read_bytes()).decode("ascii")
    except OSError:
        return {"name": path.name, "type": "error", "content": "(Error reading file)"}
    return {"name": path.name, "type": "binary", "data_b64": encoded}
139
+
140
+
141
+ def _generate_html(runs: list[dict], skill_name: str, benchmark: dict | None = None) -> str:
142
+ template_path = Path(__file__).parent / "viewer.html"
143
+ template = template_path.read_text()
144
+ embedded = {"skill_name": skill_name, "runs": runs, "previous_feedback": {}, "previous_outputs": {}}
145
+ if benchmark:
146
+ embedded["benchmark"] = benchmark
147
+ data_json = json.dumps(embedded)
148
+ return template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};")
149
+
150
+
151
def _kill_port(port: int) -> None:
    """Best-effort: send SIGTERM to every process that ``lsof`` reports on *port*.

    Nothing is raised when ``lsof`` is unavailable or times out, or when a
    reported process cannot be signalled.  When at least one PID was
    reported, the function waits half a second before returning.

    Args:
        port: TCP port number to free.
    """
    try:
        result = subprocess.run(
            ["lsof", "-ti", f":{port}"], capture_output=True, text=True, timeout=5
        )
    except (subprocess.TimeoutExpired, FileNotFoundError):
        return

    pids = result.stdout.split()
    for pid_str in pids:
        try:
            os.kill(int(pid_str), signal.SIGTERM)
        except (OSError, ValueError):
            # OSError covers ProcessLookupError (already gone) and
            # PermissionError (process owned by another user).  Neither may
            # abort start-up: this is only a courtesy clean-up.
            pass

    if pids:
        time.sleep(0.5)
164
+
165
+
166
class Handler(BaseHTTPRequestHandler):
    """HTTP handler for the eval viewer.

    Routes:
        GET  /, /index.html  -> the viewer page, regenerated on every request
        GET  /api/feedback   -> contents of the feedback file, or ``{}``
        POST /api/feedback   -> replace the feedback file with the posted JSON

    Any other path yields 404.
    """

    def __init__(self, workspace, skill_name, feedback_path, benchmark_path, *args, **kwargs):
        """Store the viewer configuration, then let the base class handle the request.

        Args:
            workspace: Directory searched for runs on each page request.
            skill_name: Skill name embedded in the page.
            feedback_path: File that stores reviewer feedback.
            benchmark_path: Optional path to a benchmark JSON file (may be None).
            *args, **kwargs: Forwarded to ``BaseHTTPRequestHandler``.
        """
        # The base-class constructor processes the request before returning,
        # so the attributes have to exist before it is called.
        self.workspace = workspace
        self.skill_name = skill_name
        self.feedback_path = feedback_path
        self.benchmark_path = benchmark_path
        super().__init__(*args, **kwargs)

    def _send_body(self, status: int, content_type: str, body: bytes) -> None:
        """Send a complete response with the given status, content type and body."""
        self.send_response(status)
        self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self) -> None:
        """Serve the viewer page or the stored feedback."""
        if self.path in ("/", "/index.html"):
            # Rebuilt per request so a browser refresh picks up new runs.
            runs = find_runs(self.workspace)
            benchmark = None
            if self.benchmark_path and self.benchmark_path.exists():
                try:
                    benchmark = json.loads(self.benchmark_path.read_text(encoding="utf-8"))
                except (OSError, ValueError):
                    # Benchmark data is optional; render the page without it.
                    pass
            html = _generate_html(runs, self.skill_name, benchmark)
            self._send_body(200, "text/html; charset=utf-8", html.encode("utf-8"))
        elif self.path == "/api/feedback":
            data = self.feedback_path.read_bytes() if self.feedback_path.exists() else b"{}"
            self._send_body(200, "application/json", data)
        else:
            self.send_error(404)

    def do_POST(self) -> None:
        """Persist posted feedback JSON.

        Responds 200 ``{"ok":true}`` on success, 400 with ``{"error": ...}``
        when the Content-Length header or the JSON body is invalid, and 500
        with ``{"error": ...}`` when the feedback file cannot be written.
        """
        if self.path != "/api/feedback":
            self.send_error(404)
            return

        try:
            length = int(self.headers.get("Content-Length", 0))
            # Never pass a negative size to read(): that would block until EOF.
            data = json.loads(self.rfile.read(max(length, 0)))
        except ValueError as exc:
            # Malformed header or body is the client's fault, not a server error.
            self._send_body(400, "application/json", json.dumps({"error": str(exc)}).encode())
            return

        try:
            self.feedback_path.write_text(json.dumps(data, indent=2) + "\n")
        except OSError as exc:
            self._send_body(500, "application/json", json.dumps({"error": str(exc)}).encode())
            return

        self._send_body(200, "application/json", b'{"ok":true}')

    def log_message(self, fmt, *args):
        """Suppress the default per-request logging."""
        pass
221
+
222
+
223
def main() -> None:
    """Command-line entry point: write a static viewer page or serve it locally.

    Exits with status 1 when the workspace is not a directory or contains no
    runs.  With ``--static`` the page is written to the given file and the
    process exits with status 0.  Otherwise an HTTP server is started on
    127.0.0.1 (on the requested port, or on a free port if binding fails),
    the page is opened in the default browser, and the server runs until
    interrupted with Ctrl+C.
    """
    parser = argparse.ArgumentParser(description="ct-grade eval review viewer")
    parser.add_argument("workspace", type=Path, help="Workspace directory with eval runs")
    parser.add_argument("--port", "-p", type=int, default=3118)
    parser.add_argument("--skill-name", "-n", default="ct-grade")
    parser.add_argument("--benchmark", type=Path, default=None)
    parser.add_argument("--static", "-s", type=Path, default=None, help="Write static HTML, don't serve")
    args = parser.parse_args()

    workspace = args.workspace.resolve()
    if not workspace.is_dir():
        print(f"Error: {workspace} is not a directory", file=sys.stderr)
        sys.exit(1)

    runs = find_runs(workspace)
    if not runs:
        print(f"No eval runs found in {workspace}", file=sys.stderr)
        print("Runs need an outputs/ subdirectory with result files.", file=sys.stderr)
        sys.exit(1)

    benchmark = None
    if args.benchmark:
        try:
            benchmark = json.loads(args.benchmark.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # Benchmark data is optional, so carry on without it — but tell
            # the user instead of silently dropping what they asked for.
            print(f"Warning: ignoring benchmark {args.benchmark}: {exc}", file=sys.stderr)

    if args.static:
        html = _generate_html(runs, args.skill_name, benchmark)
        args.static.parent.mkdir(parents=True, exist_ok=True)
        # Write UTF-8 explicitly: the locale default encoding may be unable
        # to represent characters in the template or may mis-label the page.
        args.static.write_text(html, encoding="utf-8")
        print(f"\n Static viewer: {args.static}\n")
        sys.exit(0)

    port = args.port
    _kill_port(port)
    feedback_path = workspace / "feedback.json"
    handler = partial(Handler, workspace, args.skill_name, feedback_path, args.benchmark)
    try:
        server = HTTPServer(("127.0.0.1", port), handler)
    except OSError:
        # Requested port unavailable: let the OS pick a free one.
        server = HTTPServer(("127.0.0.1", 0), handler)
        port = server.server_address[1]

    url = f"http://localhost:{port}"
    print("\n ct-grade Eval Viewer")
    print(" ───────────────────────────")
    print(f" URL: {url}")
    print(f" Workspace: {workspace}")
    print(f" Runs: {len(runs)} found")
    print("\n Press Ctrl+C to stop.\n")
    webbrowser.open(url)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("\nStopped.")
    finally:
        # Release the listening socket on every exit path, not only Ctrl+C.
        server.server_close()
280
+
281
+
282
# Run the viewer only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()