astron-eval 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +119 -0
- package/bin/astron-eval.mjs +111 -0
- package/package.json +24 -0
- package/skills/astron-eval/SKILL.md +60 -0
- package/skills/model-evaluation/SKILL.md +180 -0
- package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/262/276/347/241/256/347/273/264/345/272/246.json +19 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/346/227/205/346/270/270/345/207/272/350/241/214.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/210/233/346/204/217/346/200/247-/345/220/270/345/274/225/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/dimensions//345/210/233/346/226/260/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/275/242/345/274/217/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//345/277/240/350/257/232/345/272/246/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/214/207/344/273/244/351/201/265/345/276/252/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/226/207/346/234/254/345/267/256/345/274/202/345/272/246-TER/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/dimensions//346/240/270/345/277/203/345/205/203/347/264/240/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//346/240/274/345/274/217/351/201/265/345/276/252/347/273/264/345/272/246.json +19 -0
- package/skills/model-evaluation/assets/dimensions//347/211/271/350/211/262/344/272/256/347/202/271/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/224/250/344/276/213/347/272/247/350/257/204/346/265/213/347/273/264/345/272/246/346/250/241/346/235/277.json +25 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-BERTScore/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-Cosine/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-ROUGE/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-BLUE/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-COMET/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/345/220/210/347/220/206/346/200/247/347/273/264/345/272/246.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
- package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246.json +21 -0
- package/skills/model-evaluation/assets/eval-judge.json +11 -0
- package/skills/model-evaluation/assets/experts/business-process-automation.json +71 -0
- package/skills/model-evaluation/assets/experts/content-generation.json +75 -0
- package/skills/model-evaluation/assets/experts/content-match.json +37 -0
- package/skills/model-evaluation/assets/experts/information-analysis.json +87 -0
- package/skills/model-evaluation/assets/experts/marketing-digital-human.json +27 -0
- package/skills/model-evaluation/assets/experts/personalized-planning.json +87 -0
- package/skills/model-evaluation/assets/experts/text-translation.json +103 -0
- package/skills/model-evaluation/assets/experts/tourism-travel.json +119 -0
- package/skills/model-evaluation/assets/templates/custom-dimension.template.json +30 -0
- package/skills/model-evaluation/eval-build.md +281 -0
- package/skills/model-evaluation/eval-execute.md +196 -0
- package/skills/model-evaluation/eval-init.md +237 -0
- package/skills/model-evaluation/processes/dimension-process.md +207 -0
- package/skills/model-evaluation/processes/evalset-create-process.md +184 -0
- package/skills/model-evaluation/processes/evalset-parse-process.md +171 -0
- package/skills/model-evaluation/processes/evalset-supplement-process.md +136 -0
- package/skills/model-evaluation/processes/keypoint-process.md +148 -0
- package/skills/model-evaluation/processes/python-env-process.md +113 -0
- package/skills/model-evaluation/references//344/270/255/351/227/264/344/272/247/347/211/251/350/257/264/346/230/216.md +340 -0
- package/skills/model-evaluation/references//345/206/205/347/275/256/346/250/241/346/235/277/350/257/264/346/230/216.md +149 -0
- package/skills/model-evaluation/references//350/204/232/346/234/254/345/256/232/344/271/211.md +274 -0
- package/skills/model-evaluation/references//350/256/244/350/257/201/346/234/215/345/212/241/346/216/245/345/217/243/350/257/264/346/230/216.md +271 -0
- package/skills/model-evaluation/references//350/257/204/346/265/213/346/234/215/345/212/241/346/216/245/345/217/243/350/257/264/346/230/216.md +455 -0
- package/skills/model-evaluation/references//350/257/204/346/265/213/347/273/264/345/272/246/350/257/264/346/230/216.md +171 -0
- package/skills/model-evaluation/scripts/cfg/eval-auth.cfg +16 -0
- package/skills/model-evaluation/scripts/cfg/eval-server.cfg +1 -0
- package/skills/model-evaluation/scripts/clients/__init__.py +33 -0
- package/skills/model-evaluation/scripts/clients/api_client.py +97 -0
- package/skills/model-evaluation/scripts/clients/auth_client.py +96 -0
- package/skills/model-evaluation/scripts/clients/http_client.py +199 -0
- package/skills/model-evaluation/scripts/clients/oauth_callback.py +397 -0
- package/skills/model-evaluation/scripts/clients/token_manager.py +53 -0
- package/skills/model-evaluation/scripts/eval_auth.py +588 -0
- package/skills/model-evaluation/scripts/eval_dimension.py +240 -0
- package/skills/model-evaluation/scripts/eval_set.py +410 -0
- package/skills/model-evaluation/scripts/eval_task.py +324 -0
- package/skills/model-evaluation/scripts/files/__init__.py +38 -0
- package/skills/model-evaluation/scripts/files/file_utils.py +330 -0
- package/skills/model-evaluation/scripts/files/streaming.py +245 -0
- package/skills/model-evaluation/scripts/utils/__init__.py +128 -0
- package/skills/model-evaluation/scripts/utils/constants.py +101 -0
- package/skills/model-evaluation/scripts/utils/datetime_utils.py +60 -0
- package/skills/model-evaluation/scripts/utils/errors.py +244 -0
- package/skills/model-evaluation/scripts/utils/keypoint_prompts.py +73 -0
- package/skills/skill-driven-eval/SKILL.md +456 -0
- package/skills/skill-driven-eval/agents/grader.md +144 -0
- package/skills/skill-driven-eval/eval-viewer/__init__.py +1 -0
- package/skills/skill-driven-eval/eval-viewer/generate_report.py +485 -0
- package/skills/skill-driven-eval/eval-viewer/viewer.html +767 -0
- package/skills/skill-driven-eval/references/schemas.md +282 -0
- package/skills/skill-driven-eval/scripts/__init__.py +1 -0
- package/skills/skill-driven-eval/scripts/__main__.py +70 -0
- package/skills/skill-driven-eval/scripts/aggregate_results.py +681 -0
- package/skills/skill-driven-eval/scripts/extract_transcript.py +294 -0
- package/skills/skill-driven-eval/scripts/test_aggregate.py +244 -0
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Extract and format session transcript into human-readable markdown.
|
|
4
|
+
|
|
5
|
+
This script reads a Claude Code session transcript (JSONL format) and generates
|
|
6
|
+
a readable markdown document showing the conversation flow:
|
|
7
|
+
- User messages
|
|
8
|
+
- Assistant responses (with thinking blocks)
|
|
9
|
+
- Tool calls and their results
|
|
10
|
+
|
|
11
|
+
Usage:
|
|
12
|
+
python extract_transcript.py <session.jsonl> --output <output.md>
|
|
13
|
+
|
|
14
|
+
The output is suitable for human review but does NOT include model identification,
|
|
15
|
+
maintaining blind evaluation integrity.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
import json
|
|
20
|
+
import sys
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from datetime import datetime
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def format_timestamp(ts: float | None) -> str:
|
|
26
|
+
"""Format Unix timestamp to readable string."""
|
|
27
|
+
if ts is None:
|
|
28
|
+
return ""
|
|
29
|
+
try:
|
|
30
|
+
return datetime.fromtimestamp(ts).strftime("%H:%M:%S")
|
|
31
|
+
except (ValueError, TypeError):
|
|
32
|
+
return ""
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def format_content_block(block: dict, indent: str = "") -> list[str]:
    """Render one message content block as markdown lines.

    Supports "text", "thinking", "tool_use", and "redacted_thinking" block
    types; any other type yields no output.

    Args:
        block: A single content block dict from a session entry.
        indent: Prefix prepended to every emitted line.

    Returns:
        Markdown lines for this block (possibly empty).
    """
    lines = []
    block_type = block.get("type", "unknown")

    if block_type == "text":
        text = block.get("text", "")
        if text.strip():
            lines.append(f"{indent}{text}")

    elif block_type == "thinking":
        thinking = block.get("thinking", "")
        if thinking.strip():
            lines.append(f"{indent}### Thinking")
            lines.append(f"{indent}```")
            lines.append(f"{indent}{thinking}")
            lines.append(f"{indent}```")
            lines.append("")

    elif block_type == "tool_use":
        tool_name = block.get("name", "unknown")
        # NOTE: the original also read block["id"] into an unused local;
        # removed (the id is only needed by callers that track results).
        tool_input = block.get("input", {})

        lines.append(f"{indent}### Tool: `{tool_name}`")
        lines.append(f"{indent}```json")
        # Format input nicely, truncate if too long
        input_str = json.dumps(tool_input, indent=2, ensure_ascii=False)
        if len(input_str) > 2000:
            input_str = input_str[:2000] + "\n... (truncated)"
        lines.append(f"{indent}{input_str}")
        lines.append(f"{indent}```")
        lines.append("")

    elif block_type == "redacted_thinking":
        lines.append(f"{indent}*[Redacted thinking block]*")
        lines.append("")

    return lines
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def format_tool_result(result: dict, indent: str = "") -> list[str]:
    """Render a tool_result block as a fenced markdown section.

    Args:
        result: A "tool_result" content block; its "content" may be a plain
            string, a list of blocks (text / image / other), or any other
            value (stringified as a fallback).
        indent: Prefix prepended to every emitted line.

    Returns:
        Markdown lines for the result; long content is truncated.
    """
    lines = []
    # NOTE: the original also read result["tool_use_id"] into an unused
    # local; removed (the id never appears in the rendered output).
    content = result.get("content", "")
    is_error = result.get("is_error", False)

    header = "### Tool Result"
    if is_error:
        header = "### Tool Result (Error)"

    lines.append(f"{indent}{header}")
    lines.append(f"{indent}```")

    # Handle different content types
    if isinstance(content, str):
        # Truncate long results
        if len(content) > 3000:
            content = content[:3000] + "\n... (truncated)"
        lines.append(f"{indent}{content}")
    elif isinstance(content, list):
        for item in content:
            if isinstance(item, dict) and item.get("type") == "text":
                text = item.get("text", "")
                if len(text) > 3000:
                    text = text[:3000] + "\n... (truncated)"
                lines.append(f"{indent}{text}")
            elif isinstance(item, dict) and item.get("type") == "image":
                lines.append(f"{indent}[Image: {item.get('source', {}).get('media_type', 'unknown')}]")
            else:
                # Fallback: stringify unrecognized items with a tighter cap.
                item_str = str(item)
                if len(item_str) > 500:
                    item_str = item_str[:500] + "... (truncated)"
                lines.append(f"{indent}{item_str}")
    else:
        content_str = str(content)
        if len(content_str) > 3000:
            content_str = content_str[:3000] + "\n... (truncated)"
        lines.append(f"{indent}{content_str}")

    lines.append(f"{indent}```")
    lines.append("")

    return lines
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def extract_transcript(input_path: Path, output_path: Path, include_metadata: bool = True) -> None:
    """
    Extract transcript from JSONL session file to markdown.

    Reads the session line by line (each line one JSON entry), renders user
    messages, assistant responses, and tool results into markdown, and writes
    the result to output_path. Exits the process with status 1 if the input
    file does not exist.

    Args:
        input_path: Path to session.jsonl file
        output_path: Path to output markdown file
        include_metadata: Whether to include timing and token metadata
    """
    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        sys.exit(1)

    # Accumulates every markdown line; joined once at the end.
    lines = []

    # Header
    lines.append("# Session Transcript")
    lines.append("")
    lines.append(f"*Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*")
    lines.append("")
    lines.append("---")
    lines.append("")

    # Track message order for tool results
    # NOTE(review): pending_tool_results is populated below but never read
    # back — looks like call/result matching was planned but not finished;
    # confirm before relying on it.
    pending_tool_results = {}
    message_count = 0
    total_tokens = 0

    with open(input_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                # Malformed lines are reported inline rather than aborting.
                lines.append(f"*Error parsing line {line_num}: {e}*")
                lines.append("")
                continue

            entry_type = entry.get("type", "")

            # Handle different entry types
            if entry_type == "summary":
                # Skip summary entries
                continue

            # Get role - could be at top level or in message
            role = entry.get("role", "")
            if not role and "message" in entry:
                role = entry.get("message", {}).get("role", "")

            # Get content blocks
            content = entry.get("content", [])
            if not content and "message" in entry:
                content = entry.get("message", {}).get("content", [])

            # Get timestamp (top level wins; falls back to nested message)
            timestamp = entry.get("timestamp") or entry.get("message", {}).get("timestamp")

            # Get usage info
            usage = entry.get("usage") or entry.get("message", {}).get("usage", {})
            if usage:
                total_tokens += usage.get("input_tokens", 0) + usage.get("output_tokens", 0)

            if role == "user":
                message_count += 1
                ts_str = format_timestamp(timestamp)
                lines.append(f"## User Message {message_count}")
                if ts_str and include_metadata:
                    lines.append(f"*Time: {ts_str}*")
                lines.append("")

                # Handle content that is a string directly
                if isinstance(content, str):
                    lines.append(content)
                elif isinstance(content, list):
                    for block in content:
                        if isinstance(block, dict):
                            lines.extend(format_content_block(block))
                        elif isinstance(block, str):
                            lines.append(block)
                lines.append("---")
                lines.append("")

            elif role == "assistant":
                message_count += 1
                ts_str = format_timestamp(timestamp)
                lines.append(f"## Assistant Response {message_count}")
                if ts_str and include_metadata:
                    lines.append(f"*Time: {ts_str}*")
                    # NOTE(review): nesting reconstructed from a mangled diff —
                    # token metadata is assumed to be gated on include_metadata
                    # (and a present timestamp) like the summary footer; confirm
                    # against the original file.
                    if usage:
                        input_t = usage.get("input_tokens", 0)
                        output_t = usage.get("output_tokens", 0)
                        lines.append(f"*Tokens: {input_t} in, {output_t} out*")
                lines.append("")

                # Handle content that is a string directly
                if isinstance(content, str):
                    lines.append(content)
                elif isinstance(content, list):
                    for block in content:
                        if isinstance(block, dict):
                            block_type = block.get("type", "")
                            if block_type == "tool_use":
                                # Store tool call for potential result matching
                                tool_id = block.get("id", "")
                                pending_tool_results[tool_id] = message_count
                            lines.extend(format_content_block(block))
                        elif isinstance(block, str):
                            lines.append(block)
                lines.append("---")
                lines.append("")

            # Handle tool results in user content
            # (format_content_block ignores "tool_result" blocks, so these
            # were not rendered by the user branch above — no duplication.)
            if role == "user" and isinstance(content, list):
                for block in content:
                    if isinstance(block, dict) and block.get("type") == "tool_result":
                        lines.extend(format_tool_result(block))

    # Footer with summary
    lines.append("")
    lines.append("---")
    lines.append("")
    lines.append("## Summary")
    lines.append("")
    lines.append(f"- **Total messages**: {message_count}")
    if include_metadata and total_tokens > 0:
        lines.append(f"- **Total tokens**: {total_tokens}")
    lines.append("")

    # Write output
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text("\n".join(lines), encoding='utf-8')
    print(f"Generated: {output_path}")
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def main():
    """CLI entry point: parse arguments and run the transcript extraction."""
    arg_parser = argparse.ArgumentParser(
        description="Extract session transcript to human-readable markdown"
    )
    arg_parser.add_argument(
        "input",
        type=Path,
        help="Path to session.jsonl file"
    )
    arg_parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output markdown file path (default: same name with .md extension)"
    )
    arg_parser.add_argument(
        "--no-metadata",
        action="store_true",
        help="Exclude timing and token metadata"
    )

    opts = arg_parser.parse_args()

    # Default the destination to <input>.md when --output is omitted.
    destination = opts.output if opts.output is not None else opts.input.with_suffix(".md")

    extract_transcript(
        opts.input,
        destination,
        include_metadata=not opts.no_metadata
    )


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Test script for aggregate_results.py.
|
|
3
|
+
|
|
4
|
+
Creates sample test data with anonymous run IDs and mapping.json,
|
|
5
|
+
then runs the aggregation to verify it works correctly.
|
|
6
|
+
|
|
7
|
+
Run with: python test_aggregate.py
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import shutil
|
|
12
|
+
import sys
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def create_test_workspace():
    """Create a test workspace with sample data using anonymous run IDs.

    Builds a throwaway directory containing:
      - evals.json: eval definitions for the "pdf" target skill
      - run-XXX/ directories with blind grading.json and timing.json
        (no model information, preserving blind-evaluation integrity)
      - mapping.json: run ID -> model mapping (normally written by the
        MainAgent AFTER grading; created upfront here so the aggregation
        step can be exercised)

    Returns:
        Path to the created workspace directory.
    """
    test_dir = Path(__file__).parent.parent / "test-workspace"

    # Clean up if exists
    if test_dir.exists():
        shutil.rmtree(test_dir)

    test_dir.mkdir(parents=True)

    # Create evals.json
    evals = {
        "target_skill": "pdf",
        "target_skill_path": "/path/to/pdf-skill",
        "models_to_compare": ["opus", "sonnet"],
        "evals": [
            {
                "id": 1,
                "name": "Form Filling",
                "prompt": "Fill out the PDF form with the following data: name=John Doe, email=john@example.com",
                "expected_output": "A filled PDF form with all fields populated",
                "assertions": [
                    "The output is a PDF file",
                    "The name field contains 'John Doe'",
                    "The email field contains 'john@example.com'"
                ]
            },
            {
                "id": 2,
                "name": "Text Extraction",
                "prompt": "Extract all text from the invoice PDF and create a summary",
                "expected_output": "A text summary of the invoice contents",
                "assertions": [
                    "The output contains extracted text",
                    "The total amount is mentioned",
                    "The date is correctly extracted"
                ]
            }
        ]
    }

    with open(test_dir / "evals.json", "w") as f:
        json.dump(evals, f, indent=2)

    # Mapping from run IDs to models (created by MainAgent, but for testing we create it now)
    # In real usage, this would be created AFTER grading is complete
    mapping = {
        "run-001": {"model": "opus", "eval_id": 1, "eval_name": "Form Filling"},
        "run-002": {"model": "sonnet", "eval_id": 1, "eval_name": "Form Filling"},
        "run-003": {"model": "opus", "eval_id": 2, "eval_name": "Text Extraction"},
        "run-004": {"model": "sonnet", "eval_id": 2, "eval_name": "Text Extraction"}
    }

    # Per-run blind grading/timing fixtures, keyed by anonymous run ID.
    runs_data = {
        "run-001": {
            "grading": {
                "run_id": "run-001",
                "expectations": [
                    {"text": "The output is a PDF file", "passed": True, "evidence": "Output file is filled_form.pdf"},
                    {"text": "The name field contains 'John Doe'", "passed": True, "evidence": "Found in field mapping"},
                    {"text": "The email field contains 'john@example.com'", "passed": True, "evidence": "Verified in output"}
                ],
                "summary": {"passed": 3, "failed": 0, "total": 3, "pass_rate": 1.0},
                "execution_metrics": {"total_tool_calls": 12, "errors_encountered": 0}
            },
            "timing": {
                "run_id": "run-001",
                "total_tokens": 45000,
                "duration_ms": 45200,
                "total_duration_seconds": 45.2
            }
        },
        "run-002": {
            "grading": {
                "run_id": "run-002",
                "expectations": [
                    {"text": "The output is a PDF file", "passed": True, "evidence": "Output file is filled_form.pdf"},
                    {"text": "The name field contains 'John Doe'", "passed": True, "evidence": "Found in field mapping"},
                    {"text": "The email field contains 'john@example.com'", "passed": False, "evidence": "Email was truncated"}
                ],
                "summary": {"passed": 2, "failed": 1, "total": 3, "pass_rate": 0.67},
                "execution_metrics": {"total_tool_calls": 10, "errors_encountered": 0}
            },
            "timing": {
                "run_id": "run-002",
                "total_tokens": 28000,
                "duration_ms": 32100,
                "total_duration_seconds": 32.1
            }
        },
        "run-003": {
            "grading": {
                "run_id": "run-003",
                "expectations": [
                    {"text": "The output contains extracted text", "passed": True, "evidence": "Text extracted successfully"},
                    {"text": "The total amount is mentioned", "passed": True, "evidence": "Found '$1,234.56' in output"},
                    {"text": "The date is correctly extracted", "passed": True, "evidence": "Date: 2024-03-15"}
                ],
                "summary": {"passed": 3, "failed": 0, "total": 3, "pass_rate": 1.0},
                "execution_metrics": {"total_tool_calls": 8, "errors_encountered": 0}
            },
            "timing": {
                "run_id": "run-003",
                "total_tokens": 38000,
                "duration_ms": 28500,
                "total_duration_seconds": 28.5
            }
        },
        "run-004": {
            "grading": {
                "run_id": "run-004",
                "expectations": [
                    {"text": "The output contains extracted text", "passed": True, "evidence": "Text extracted"},
                    {"text": "The total amount is mentioned", "passed": False, "evidence": "Amount not found in output"},
                    {"text": "The date is correctly extracted", "passed": True, "evidence": "Date: 2024-03-15"}
                ],
                "summary": {"passed": 2, "failed": 1, "total": 3, "pass_rate": 0.67},
                "execution_metrics": {"total_tool_calls": 7, "errors_encountered": 0}
            },
            "timing": {
                "run_id": "run-004",
                "total_tokens": 24000,
                "duration_ms": 22300,
                "total_duration_seconds": 22.3
            }
        }
    }

    # Create run directories
    for run_id, data in runs_data.items():
        run_dir = test_dir / run_id
        run_dir.mkdir()
        (run_dir / "outputs").mkdir()

        # Write grading.json (blind - no model info)
        with open(run_dir / "grading.json", "w") as f:
            json.dump(data["grading"], f, indent=2)

        # Write timing.json (blind - no model info)
        with open(run_dir / "timing.json", "w") as f:
            json.dump(data["timing"], f, indent=2)

    # Write mapping.json (would be created by MainAgent after grading)
    # For testing, we create it now
    with open(test_dir / "mapping.json", "w") as f:
        json.dump(mapping, f, indent=2)

    print(f"Created test workspace at: {test_dir}")
    # Plain strings below: the originals were f-strings with no placeholders
    # (ruff F541).
    print("\nNote: In real usage, mapping.json is created by MainAgent AFTER grading.")
    print("For this test, we create it upfront to verify aggregation.")
    return test_dir
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def run_aggregation(test_dir):
    """Aggregate the test workspace, print a summary, and write outputs.

    Loads mapping.json, builds the benchmark via aggregate_results, prints
    a console summary, and writes benchmark.json / benchmark.md into
    test_dir. Returns the benchmark dict.
    """
    # Make the sibling aggregate_results module importable.
    sys.path.insert(0, str(Path(__file__).parent))
    from aggregate_results import generate_benchmark, generate_markdown, load_mapping

    # Load the run-id -> model mapping, then build the benchmark from it.
    run_mapping = load_mapping(test_dir)
    results = generate_benchmark(test_dir, run_mapping)

    meta = results["metadata"]
    print("\n=== Benchmark Summary ===")
    print(f"Target Skill: {meta['target_skill']}")
    print(f"Models Compared: {', '.join(meta['models_compared'])}")
    print(f"Evals Run: {meta['evals_run']}")
    print(f"Note: {meta['note']}")

    print("\n=== Model Summary ===")
    for model_name, stats in results["model_summary"].items():
        mean_pass = stats["pass_rate"]["mean"]
        mean_secs = stats["time_seconds"]["mean"]
        mean_toks = stats["tokens"]["mean"]
        print(f" {model_name}: {mean_pass*100:.0f}% pass rate, {mean_secs:.1f}s, {mean_toks:.0f} tokens")

    print("\n=== Comparison ===")
    comparison = results["comparison"]
    print(f" Pass Rate Delta: {comparison['pass_rate_delta']}")
    print(f" Time Delta: {comparison['time_delta']}")
    print(f" Token Delta: {comparison['token_delta']}")
    print(f" Cost Efficiency: {comparison['cost_efficiency']}")

    print("\n=== Recommendations (data-driven) ===")
    for rec in results["recommendations"]:
        print(f" [{rec['scenario']}] Use {rec['recommended_model']}: {rec['reason']}")

    print("\n=== Notes ===")
    for note in results["notes"]:
        print(f" - {note}")

    # Persist the machine-readable benchmark.
    json_target = test_dir / "benchmark.json"
    with open(json_target, "w") as fh:
        json.dump(results, fh, indent=2)
    print(f"\nWritten: {json_target}")

    # Persist the human-readable report.
    md_target = test_dir / "benchmark.md"
    with open(md_target, "w") as fh:
        fh.write(generate_markdown(results))
    print(f"Written: {md_target}")

    return results
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def main():
    """Drive the end-to-end test: build the workspace, then aggregate it."""
    print("Creating test workspace with anonymous run IDs...")
    workspace = create_test_workspace()

    print("\nRunning aggregation with mapping.json...")
    run_aggregation(workspace)

    print("\n=== Test Complete ===")
    print(f"Test workspace: {workspace}")
    print("\nKey points verified:")
    for point in (
        " 1. Run directories use anonymous IDs (run-001, run-002, etc.)",
        " 2. Grading files do not contain model information",
        " 3. Mapping.json maps run IDs to models",
        " 4. Aggregation combines grading + mapping for benchmark",
    ):
        print(point)
    print("\nYou can now test the report viewer:")
    print(f" python -m eval-viewer.generate_report {workspace} --serve")


if __name__ == "__main__":
    main()
|