@cleocode/skills 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dispatch-config.json +404 -0
- package/index.d.ts +178 -0
- package/index.js +405 -0
- package/package.json +14 -0
- package/profiles/core.json +7 -0
- package/profiles/full.json +10 -0
- package/profiles/minimal.json +7 -0
- package/profiles/recommended.json +7 -0
- package/provider-skills-map.json +97 -0
- package/skills/_shared/cleo-style-guide.md +84 -0
- package/skills/_shared/manifest-operations.md +810 -0
- package/skills/_shared/placeholders.json +433 -0
- package/skills/_shared/skill-chaining-patterns.md +237 -0
- package/skills/_shared/subagent-protocol-base.md +223 -0
- package/skills/_shared/task-system-integration.md +232 -0
- package/skills/_shared/testing-framework-config.md +110 -0
- package/skills/ct-cleo/SKILL.md +490 -0
- package/skills/ct-cleo/references/anti-patterns.md +19 -0
- package/skills/ct-cleo/references/loom-lifecycle.md +136 -0
- package/skills/ct-cleo/references/orchestrator-constraints.md +55 -0
- package/skills/ct-cleo/references/session-protocol.md +162 -0
- package/skills/ct-codebase-mapper/SKILL.md +82 -0
- package/skills/ct-contribution/SKILL.md +521 -0
- package/skills/ct-contribution/templates/contribution-init.json +21 -0
- package/skills/ct-dev-workflow/SKILL.md +423 -0
- package/skills/ct-docs-lookup/SKILL.md +66 -0
- package/skills/ct-docs-review/SKILL.md +175 -0
- package/skills/ct-docs-write/SKILL.md +108 -0
- package/skills/ct-documentor/SKILL.md +231 -0
- package/skills/ct-epic-architect/SKILL.md +305 -0
- package/skills/ct-epic-architect/references/bug-epic-example.md +172 -0
- package/skills/ct-epic-architect/references/commands.md +201 -0
- package/skills/ct-epic-architect/references/feature-epic-example.md +210 -0
- package/skills/ct-epic-architect/references/migration-epic-example.md +244 -0
- package/skills/ct-epic-architect/references/output-format.md +92 -0
- package/skills/ct-epic-architect/references/patterns.md +284 -0
- package/skills/ct-epic-architect/references/refactor-epic-example.md +412 -0
- package/skills/ct-epic-architect/references/research-epic-example.md +226 -0
- package/skills/ct-epic-architect/references/shell-escaping.md +86 -0
- package/skills/ct-epic-architect/references/skill-aware-execution.md +195 -0
- package/skills/ct-grade/SKILL.md +230 -0
- package/skills/ct-grade/agents/analysis-reporter.md +203 -0
- package/skills/ct-grade/agents/blind-comparator.md +157 -0
- package/skills/ct-grade/agents/scenario-runner.md +134 -0
- package/skills/ct-grade/eval-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
- package/skills/ct-grade/eval-viewer/generate_grade_review.py +1138 -0
- package/skills/ct-grade/eval-viewer/generate_grade_viewer.py +544 -0
- package/skills/ct-grade/eval-viewer/generate_review.py +283 -0
- package/skills/ct-grade/eval-viewer/grade-review.html +1574 -0
- package/skills/ct-grade/eval-viewer/viewer.html +219 -0
- package/skills/ct-grade/evals/evals.json +94 -0
- package/skills/ct-grade/references/ab-test-methodology.md +150 -0
- package/skills/ct-grade/references/domains.md +137 -0
- package/skills/ct-grade/references/grade-spec.md +236 -0
- package/skills/ct-grade/references/scenario-playbook.md +234 -0
- package/skills/ct-grade/references/token-tracking.md +120 -0
- package/skills/ct-grade/scripts/__pycache__/audit_analyzer.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/run_ab_test.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/run_all.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/__pycache__/token_tracker.cpython-314.pyc +0 -0
- package/skills/ct-grade/scripts/audit_analyzer.py +279 -0
- package/skills/ct-grade/scripts/generate_report.py +283 -0
- package/skills/ct-grade/scripts/run_ab_test.py +504 -0
- package/skills/ct-grade/scripts/run_all.py +287 -0
- package/skills/ct-grade/scripts/setup_run.py +183 -0
- package/skills/ct-grade/scripts/token_tracker.py +630 -0
- package/skills/ct-grade-v2-1/SKILL.md +237 -0
- package/skills/ct-grade-v2-1/agents/analysis-reporter.md +203 -0
- package/skills/ct-grade-v2-1/agents/blind-comparator.md +157 -0
- package/skills/ct-grade-v2-1/agents/scenario-runner.md +179 -0
- package/skills/ct-grade-v2-1/evals/evals.json +74 -0
- package/skills/ct-grade-v2-1/grade-viewer/__pycache__/build_op_stats.cpython-314.pyc +0 -0
- package/skills/ct-grade-v2-1/grade-viewer/__pycache__/generate_grade_review.cpython-314.pyc +0 -0
- package/skills/ct-grade-v2-1/grade-viewer/build_op_stats.py +174 -0
- package/skills/ct-grade-v2-1/grade-viewer/eval-analysis.json +41 -0
- package/skills/ct-grade-v2-1/grade-viewer/eval-report.md +34 -0
- package/skills/ct-grade-v2-1/grade-viewer/generate_grade_review.py +1023 -0
- package/skills/ct-grade-v2-1/grade-viewer/generate_grade_viewer.py +548 -0
- package/skills/ct-grade-v2-1/grade-viewer/grade-review-eval.html +613 -0
- package/skills/ct-grade-v2-1/grade-viewer/grade-review.html +1532 -0
- package/skills/ct-grade-v2-1/grade-viewer/viewer.html +620 -0
- package/skills/ct-grade-v2-1/manifest-entry.json +31 -0
- package/skills/ct-grade-v2-1/references/ab-testing.md +233 -0
- package/skills/ct-grade-v2-1/references/domains-ssot.md +156 -0
- package/skills/ct-grade-v2-1/references/grade-spec-v2.md +167 -0
- package/skills/ct-grade-v2-1/references/playbook-v2.md +393 -0
- package/skills/ct-grade-v2-1/references/token-tracking.md +202 -0
- package/skills/ct-grade-v2-1/scripts/generate_report.py +419 -0
- package/skills/ct-grade-v2-1/scripts/run_ab_test.py +493 -0
- package/skills/ct-grade-v2-1/scripts/run_scenario.py +396 -0
- package/skills/ct-grade-v2-1/scripts/setup_run.py +207 -0
- package/skills/ct-grade-v2-1/scripts/token_tracker.py +175 -0
- package/skills/ct-memory/SKILL.md +84 -0
- package/skills/ct-orchestrator/INSTALL.md +61 -0
- package/skills/ct-orchestrator/README.md +69 -0
- package/skills/ct-orchestrator/SKILL.md +380 -0
- package/skills/ct-orchestrator/manifest-entry.json +19 -0
- package/skills/ct-orchestrator/orchestrator-prompt.txt +17 -0
- package/skills/ct-orchestrator/references/SUBAGENT-PROTOCOL-BLOCK.md +66 -0
- package/skills/ct-orchestrator/references/autonomous-operation.md +167 -0
- package/skills/ct-orchestrator/references/lifecycle-gates.md +98 -0
- package/skills/ct-orchestrator/references/orchestrator-compliance.md +271 -0
- package/skills/ct-orchestrator/references/orchestrator-handoffs.md +85 -0
- package/skills/ct-orchestrator/references/orchestrator-patterns.md +164 -0
- package/skills/ct-orchestrator/references/orchestrator-recovery.md +113 -0
- package/skills/ct-orchestrator/references/orchestrator-spawning.md +271 -0
- package/skills/ct-orchestrator/references/orchestrator-tokens.md +180 -0
- package/skills/ct-research-agent/SKILL.md +226 -0
- package/skills/ct-skill-creator/.cleo/.context-state.json +13 -0
- package/skills/ct-skill-creator/.cleo/logs/cleo.2026-03-07.1.log +24 -0
- package/skills/ct-skill-creator/.cleo/tasks.db +0 -0
- package/skills/ct-skill-creator/SKILL.md +356 -0
- package/skills/ct-skill-creator/agents/analyzer.md +276 -0
- package/skills/ct-skill-creator/agents/comparator.md +204 -0
- package/skills/ct-skill-creator/agents/grader.md +225 -0
- package/skills/ct-skill-creator/assets/eval_review.html +146 -0
- package/skills/ct-skill-creator/eval-viewer/__pycache__/generate_review.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/eval-viewer/generate_review.py +471 -0
- package/skills/ct-skill-creator/eval-viewer/viewer.html +1325 -0
- package/skills/ct-skill-creator/manifest-entry.json +17 -0
- package/skills/ct-skill-creator/references/dynamic-context.md +228 -0
- package/skills/ct-skill-creator/references/frontmatter.md +83 -0
- package/skills/ct-skill-creator/references/invocation-control.md +165 -0
- package/skills/ct-skill-creator/references/output-patterns.md +86 -0
- package/skills/ct-skill-creator/references/provider-deployment.md +175 -0
- package/skills/ct-skill-creator/references/schemas.md +430 -0
- package/skills/ct-skill-creator/references/workflows.md +28 -0
- package/skills/ct-skill-creator/scripts/__init__.py +1 -0
- package/skills/ct-skill-creator/scripts/__pycache__/__init__.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/aggregate_benchmark.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/generate_report.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/improve_description.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/init_skill.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/quick_validate.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/run_eval.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/run_loop.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/__pycache__/utils.cpython-314.pyc +0 -0
- package/skills/ct-skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/skills/ct-skill-creator/scripts/generate_report.py +326 -0
- package/skills/ct-skill-creator/scripts/improve_description.py +247 -0
- package/skills/ct-skill-creator/scripts/init_skill.py +306 -0
- package/skills/ct-skill-creator/scripts/package_skill.py +110 -0
- package/skills/ct-skill-creator/scripts/quick_validate.py +97 -0
- package/skills/ct-skill-creator/scripts/run_eval.py +310 -0
- package/skills/ct-skill-creator/scripts/run_loop.py +328 -0
- package/skills/ct-skill-creator/scripts/utils.py +47 -0
- package/skills/ct-skill-validator/SKILL.md +178 -0
- package/skills/ct-skill-validator/agents/ecosystem-checker.md +151 -0
- package/skills/ct-skill-validator/assets/valid-skill-example.md +13 -0
- package/skills/ct-skill-validator/evals/eval_set.json +14 -0
- package/skills/ct-skill-validator/evals/evals.json +52 -0
- package/skills/ct-skill-validator/manifest-entry.json +20 -0
- package/skills/ct-skill-validator/references/cleo-ecosystem-rules.md +163 -0
- package/skills/ct-skill-validator/references/validation-rules.md +168 -0
- package/skills/ct-skill-validator/scripts/__init__.py +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc +0 -0
- package/skills/ct-skill-validator/scripts/audit_body.py +242 -0
- package/skills/ct-skill-validator/scripts/check_ecosystem.py +169 -0
- package/skills/ct-skill-validator/scripts/check_manifest.py +172 -0
- package/skills/ct-skill-validator/scripts/generate_validation_report.py +442 -0
- package/skills/ct-skill-validator/scripts/validate.py +422 -0
- package/skills/ct-spec-writer/SKILL.md +189 -0
- package/skills/ct-stickynote/README.md +14 -0
- package/skills/ct-stickynote/SKILL.md +46 -0
- package/skills/ct-task-executor/SKILL.md +296 -0
- package/skills/ct-validator/SKILL.md +216 -0
- package/skills/manifest.json +469 -0
- package/skills.json +281 -0
package/skills/ct-skill-creator/eval-viewer/generate_review.py
@@ -0,0 +1,471 @@
```python
#!/usr/bin/env python3
"""Generate and serve a review page for eval results.

Reads the workspace directory, discovers runs (directories with outputs/),
embeds all output data into a self-contained HTML page, and serves it via
a tiny HTTP server. Feedback auto-saves to feedback.json in the workspace.

Usage:
    python generate_review.py <workspace-path> [--port PORT] [--skill-name NAME]
    python generate_review.py <workspace-path> --previous-workspace /path/to/old/workspace

No dependencies beyond the Python stdlib are required.
"""

import argparse
import base64
import json
import mimetypes
import os
import re
import signal
import subprocess
import sys
import time
import webbrowser
from functools import partial
from http.server import HTTPServer, BaseHTTPRequestHandler
from pathlib import Path

# Files to exclude from output listings
METADATA_FILES = {"transcript.md", "user_notes.md", "metrics.json"}

# Extensions we render as inline text
TEXT_EXTENSIONS = {
    ".txt", ".md", ".json", ".csv", ".py", ".js", ".ts", ".tsx", ".jsx",
    ".yaml", ".yml", ".xml", ".html", ".css", ".sh", ".rb", ".go", ".rs",
    ".java", ".c", ".cpp", ".h", ".hpp", ".sql", ".r", ".toml",
}

# Extensions we render as inline images
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp"}

# MIME type overrides for common types
MIME_OVERRIDES = {
    ".svg": "image/svg+xml",
    ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
}


def get_mime_type(path: Path) -> str:
    ext = path.suffix.lower()
    if ext in MIME_OVERRIDES:
        return MIME_OVERRIDES[ext]
    mime, _ = mimetypes.guess_type(str(path))
    return mime or "application/octet-stream"


def find_runs(workspace: Path) -> list[dict]:
    """Recursively find directories that contain an outputs/ subdirectory."""
    runs: list[dict] = []
    _find_runs_recursive(workspace, workspace, runs)
    # Runs without an eval_id sort last; eval_id may be None, which is not
    # comparable with ints, so map it to +inf for the sort key.
    runs.sort(key=lambda r: (
        r["eval_id"] if r.get("eval_id") is not None else float("inf"),
        r["id"],
    ))
    return runs


def _find_runs_recursive(root: Path, current: Path, runs: list[dict]) -> None:
    if not current.is_dir():
        return

    outputs_dir = current / "outputs"
    if outputs_dir.is_dir():
        run = build_run(root, current)
        if run:
            runs.append(run)
        return

    skip = {"node_modules", ".git", "__pycache__", "skill", "inputs"}
    for child in sorted(current.iterdir()):
        if child.is_dir() and child.name not in skip:
            _find_runs_recursive(root, child, runs)


def build_run(root: Path, run_dir: Path) -> dict | None:
    """Build a run dict with prompt, outputs, and grading data."""
    prompt = ""
    eval_id = None

    # Try eval_metadata.json
    for candidate in [run_dir / "eval_metadata.json", run_dir.parent / "eval_metadata.json"]:
        if candidate.exists():
            try:
                metadata = json.loads(candidate.read_text())
                prompt = metadata.get("prompt", "")
                eval_id = metadata.get("eval_id")
            except (json.JSONDecodeError, OSError):
                pass
        if prompt:
            break

    # Fall back to transcript.md
    if not prompt:
        for candidate in [run_dir / "transcript.md", run_dir / "outputs" / "transcript.md"]:
            if candidate.exists():
                try:
                    text = candidate.read_text()
                    match = re.search(r"## Eval Prompt\n\n([\s\S]*?)(?=\n##|$)", text)
                    if match:
                        prompt = match.group(1).strip()
                except OSError:
                    pass
            if prompt:
                break

    if not prompt:
        prompt = "(No prompt found)"

    run_id = str(run_dir.relative_to(root)).replace("/", "-").replace("\\", "-")

    # Collect output files
    outputs_dir = run_dir / "outputs"
    output_files: list[dict] = []
    if outputs_dir.is_dir():
        for f in sorted(outputs_dir.iterdir()):
            if f.is_file() and f.name not in METADATA_FILES:
                output_files.append(embed_file(f))

    # Load grading if present
    grading = None
    for candidate in [run_dir / "grading.json", run_dir.parent / "grading.json"]:
        if candidate.exists():
            try:
                grading = json.loads(candidate.read_text())
            except (json.JSONDecodeError, OSError):
                pass
        if grading:
            break

    return {
        "id": run_id,
        "prompt": prompt,
        "eval_id": eval_id,
        "outputs": output_files,
        "grading": grading,
    }


def embed_file(path: Path) -> dict:
    """Read a file and return an embedded representation."""
    ext = path.suffix.lower()
    mime = get_mime_type(path)

    if ext in TEXT_EXTENSIONS:
        try:
            content = path.read_text(errors="replace")
        except OSError:
            content = "(Error reading file)"
        return {
            "name": path.name,
            "type": "text",
            "content": content,
        }
    elif ext in IMAGE_EXTENSIONS:
        try:
            raw = path.read_bytes()
            b64 = base64.b64encode(raw).decode("ascii")
        except OSError:
            return {"name": path.name, "type": "error", "content": "(Error reading file)"}
        return {
            "name": path.name,
            "type": "image",
            "mime": mime,
            "data_uri": f"data:{mime};base64,{b64}",
        }
    elif ext == ".pdf":
        try:
            raw = path.read_bytes()
            b64 = base64.b64encode(raw).decode("ascii")
        except OSError:
            return {"name": path.name, "type": "error", "content": "(Error reading file)"}
        return {
            "name": path.name,
            "type": "pdf",
            "data_uri": f"data:{mime};base64,{b64}",
        }
    elif ext == ".xlsx":
        try:
            raw = path.read_bytes()
            b64 = base64.b64encode(raw).decode("ascii")
        except OSError:
            return {"name": path.name, "type": "error", "content": "(Error reading file)"}
        return {
            "name": path.name,
            "type": "xlsx",
            "data_b64": b64,
        }
    else:
        # Binary / unknown: base64 download link
        try:
            raw = path.read_bytes()
            b64 = base64.b64encode(raw).decode("ascii")
        except OSError:
            return {"name": path.name, "type": "error", "content": "(Error reading file)"}
        return {
            "name": path.name,
            "type": "binary",
            "mime": mime,
            "data_uri": f"data:{mime};base64,{b64}",
        }
```
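The discovery contract above is small: a run is any directory with an `outputs/` child, and every file in it is embedded by type. A quick sketch of the round trip (the workspace layout and names here are made up, and it assumes the script's directory is on `sys.path` so it imports as a module):

```python
from pathlib import Path

from generate_review import find_runs  # assumes the script's directory is importable

# Hypothetical workspace: one run with a single markdown output.
ws = Path("demo-workspace")
(ws / "eval-1" / "outputs").mkdir(parents=True, exist_ok=True)
(ws / "eval-1" / "outputs" / "answer.md").write_text("# Result\n")

runs = find_runs(ws)
print(runs[0]["id"])                  # "eval-1"
print(runs[0]["prompt"])              # "(No prompt found)" without eval_metadata.json
print(runs[0]["outputs"][0]["type"])  # "text", since .md is in TEXT_EXTENSIONS
```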
```python
def load_previous_iteration(workspace: Path) -> dict[str, dict]:
    """Load previous iteration's feedback and outputs.

    Returns a map of run_id -> {"feedback": str, "outputs": list[dict]}.
    """
    result: dict[str, dict] = {}

    # Load feedback
    feedback_map: dict[str, str] = {}
    feedback_path = workspace / "feedback.json"
    if feedback_path.exists():
        try:
            data = json.loads(feedback_path.read_text())
            feedback_map = {
                r["run_id"]: r["feedback"]
                for r in data.get("reviews", [])
                if r.get("feedback", "").strip()
            }
        except (json.JSONDecodeError, OSError, KeyError):
            pass

    # Load runs (to get outputs)
    prev_runs = find_runs(workspace)
    for run in prev_runs:
        result[run["id"]] = {
            "feedback": feedback_map.get(run["id"], ""),
            "outputs": run.get("outputs", []),
        }

    # Also add feedback for run_ids that had feedback but no matching run
    for run_id, fb in feedback_map.items():
        if run_id not in result:
            result[run_id] = {"feedback": fb, "outputs": []}

    return result


def generate_html(
    runs: list[dict],
    skill_name: str,
    previous: dict[str, dict] | None = None,
    benchmark: dict | None = None,
) -> str:
    """Generate the complete standalone HTML page with embedded data."""
    template_path = Path(__file__).parent / "viewer.html"
    template = template_path.read_text()

    # Build previous_feedback and previous_outputs maps for the template
    previous_feedback: dict[str, str] = {}
    previous_outputs: dict[str, list[dict]] = {}
    if previous:
        for run_id, data in previous.items():
            if data.get("feedback"):
                previous_feedback[run_id] = data["feedback"]
            if data.get("outputs"):
                previous_outputs[run_id] = data["outputs"]

    embedded = {
        "skill_name": skill_name,
        "runs": runs,
        "previous_feedback": previous_feedback,
        "previous_outputs": previous_outputs,
    }
    if benchmark:
        embedded["benchmark"] = benchmark

    data_json = json.dumps(embedded)

    return template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};")
```
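`generate_html` is a plain string substitution: the `viewer.html` template ships with a `/*__EMBEDDED_DATA__*/` placeholder, and the Python side swaps in a `const` declaration. The same mechanism on a toy template, for illustration (the template string here is a stand-in, not the real `viewer.html`):

```python
import json

template = "<script>/*__EMBEDDED_DATA__*/</script>"  # stand-in for viewer.html
data_json = json.dumps({"skill_name": "demo", "runs": []})
html = template.replace("/*__EMBEDDED_DATA__*/", f"const EMBEDDED_DATA = {data_json};")
print(html)
# <script>const EMBEDDED_DATA = {"skill_name": "demo", "runs": []};</script>
```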
```python
# ---------------------------------------------------------------------------
# HTTP server (stdlib only, zero dependencies)
# ---------------------------------------------------------------------------

def _kill_port(port: int) -> None:
    """Kill any process listening on the given port."""
    try:
        result = subprocess.run(
            ["lsof", "-ti", f":{port}"],
            capture_output=True, text=True, timeout=5,
        )
        for pid_str in result.stdout.strip().split("\n"):
            if pid_str.strip():
                try:
                    os.kill(int(pid_str.strip()), signal.SIGTERM)
                except (ProcessLookupError, ValueError):
                    pass
        if result.stdout.strip():
            time.sleep(0.5)
    except subprocess.TimeoutExpired:
        pass
    except FileNotFoundError:
        print("Note: lsof not found, cannot check if port is in use", file=sys.stderr)


class ReviewHandler(BaseHTTPRequestHandler):
    """Serves the review HTML and handles feedback saves.

    Regenerates the HTML on each page load so that refreshing the browser
    picks up new eval outputs without restarting the server.
    """

    def __init__(
        self,
        workspace: Path,
        skill_name: str,
        feedback_path: Path,
        previous: dict[str, dict],
        benchmark_path: Path | None,
        *args,
        **kwargs,
    ):
        self.workspace = workspace
        self.skill_name = skill_name
        self.feedback_path = feedback_path
        self.previous = previous
        self.benchmark_path = benchmark_path
        super().__init__(*args, **kwargs)

    def do_GET(self) -> None:
        if self.path == "/" or self.path == "/index.html":
            # Regenerate HTML on each request (re-scans workspace for new outputs)
            runs = find_runs(self.workspace)
            benchmark = None
            if self.benchmark_path and self.benchmark_path.exists():
                try:
                    benchmark = json.loads(self.benchmark_path.read_text())
                except (json.JSONDecodeError, OSError):
                    pass
            html = generate_html(runs, self.skill_name, self.previous, benchmark)
            content = html.encode("utf-8")
            self.send_response(200)
            self.send_header("Content-Type", "text/html; charset=utf-8")
            self.send_header("Content-Length", str(len(content)))
            self.end_headers()
            self.wfile.write(content)
        elif self.path == "/api/feedback":
            data = b"{}"
            if self.feedback_path.exists():
                data = self.feedback_path.read_bytes()
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(data)))
            self.end_headers()
            self.wfile.write(data)
        else:
            self.send_error(404)

    def do_POST(self) -> None:
        if self.path == "/api/feedback":
            length = int(self.headers.get("Content-Length", 0))
            body = self.rfile.read(length)
            try:
                data = json.loads(body)
                if not isinstance(data, dict) or "reviews" not in data:
                    raise ValueError("Expected JSON object with 'reviews' key")
                self.feedback_path.write_text(json.dumps(data, indent=2) + "\n")
                resp = b'{"ok":true}'
                self.send_response(200)
            except (json.JSONDecodeError, OSError, ValueError) as e:
                resp = json.dumps({"error": str(e)}).encode()
                self.send_response(500)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(resp)))
            self.end_headers()
            self.wfile.write(resp)
        else:
            self.send_error(404)

    def log_message(self, format: str, *args: object) -> None:
        # Suppress request logging to keep terminal clean
        pass
```
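The feedback API is symmetrical: GET `/api/feedback` returns the saved JSON, POST replaces it, and the body must be an object with a `reviews` key (the same shape `load_previous_iteration` parses: entries with `run_id` and `feedback`). A minimal client sketch against a locally running server, with a made-up run id:

```python
import json
import urllib.request

payload = {"reviews": [{"run_id": "eval-1", "feedback": "Chart is missing axis labels."}]}
req = urllib.request.Request(
    "http://localhost:3117/api/feedback",  # default port; adjust if --port was used
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    print(resp.read().decode())  # {"ok":true} on success
```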
```python
def main() -> None:
    parser = argparse.ArgumentParser(description="Generate and serve eval review")
    parser.add_argument("workspace", type=Path, help="Path to workspace directory")
    parser.add_argument("--port", "-p", type=int, default=3117, help="Server port (default: 3117)")
    parser.add_argument("--skill-name", "-n", type=str, default=None, help="Skill name for header")
    parser.add_argument(
        "--previous-workspace", type=Path, default=None,
        help="Path to previous iteration's workspace (shows old outputs and feedback as context)",
    )
    parser.add_argument(
        "--benchmark", type=Path, default=None,
        help="Path to benchmark.json to show in the Benchmark tab",
    )
    parser.add_argument(
        "--static", "-s", type=Path, default=None,
        help="Write standalone HTML to this path instead of starting a server",
    )
    args = parser.parse_args()

    workspace = args.workspace.resolve()
    if not workspace.is_dir():
        print(f"Error: {workspace} is not a directory", file=sys.stderr)
        sys.exit(1)

    runs = find_runs(workspace)
    if not runs:
        print(f"No runs found in {workspace}", file=sys.stderr)
        sys.exit(1)

    skill_name = args.skill_name or workspace.name.replace("-workspace", "")
    feedback_path = workspace / "feedback.json"

    previous: dict[str, dict] = {}
    if args.previous_workspace:
        previous = load_previous_iteration(args.previous_workspace.resolve())

    benchmark_path = args.benchmark.resolve() if args.benchmark else None
    benchmark = None
    if benchmark_path and benchmark_path.exists():
        try:
            benchmark = json.loads(benchmark_path.read_text())
        except (json.JSONDecodeError, OSError):
            pass

    if args.static:
        html = generate_html(runs, skill_name, previous, benchmark)
        args.static.parent.mkdir(parents=True, exist_ok=True)
        args.static.write_text(html)
        print(f"\n Static viewer written to: {args.static}\n")
        sys.exit(0)

    # Kill any existing process on the target port
    port = args.port
    _kill_port(port)
    handler = partial(ReviewHandler, workspace, skill_name, feedback_path, previous, benchmark_path)
    try:
        server = HTTPServer(("127.0.0.1", port), handler)
    except OSError:
        # Port still in use after kill attempt; fall back to a free one
        server = HTTPServer(("127.0.0.1", 0), handler)
        port = server.server_address[1]

    url = f"http://localhost:{port}"
    print("\n Eval Viewer")
    print(" ─────────────────────────────────")
    print(f" URL: {url}")
    print(f" Workspace: {workspace}")
    print(f" Feedback: {feedback_path}")
    if previous:
        print(f" Previous: {args.previous_workspace} ({len(previous)} runs)")
    if benchmark_path:
        print(f" Benchmark: {benchmark_path}")
    print("\n Press Ctrl+C to stop.\n")

    webbrowser.open(url)

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("\nStopped.")
        server.server_close()


if __name__ == "__main__":
    main()
```
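Feedback saved this way feeds the next iteration: point `--previous-workspace` at the old workspace and `load_previous_iteration` pairs each run's outputs with its saved feedback. A round-trip sketch under the same importability assumption as above, with hypothetical names:

```python
import json
from pathlib import Path

from generate_review import load_previous_iteration

# Hypothetical previous-iteration workspace with one run and saved feedback.
old = Path("old-workspace")
(old / "eval-1" / "outputs").mkdir(parents=True, exist_ok=True)
(old / "eval-1" / "outputs" / "report.md").write_text("# Old report\n")
(old / "feedback.json").write_text(json.dumps(
    {"reviews": [{"run_id": "eval-1", "feedback": "Too verbose."}]}
))

previous = load_previous_iteration(old)
print(previous["eval-1"]["feedback"])            # "Too verbose."
print(previous["eval-1"]["outputs"][0]["name"])  # "report.md"
```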