astron-eval 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +119 -0
  3. package/bin/astron-eval.mjs +111 -0
  4. package/package.json +24 -0
  5. package/skills/astron-eval/SKILL.md +60 -0
  6. package/skills/model-evaluation/SKILL.md +180 -0
  7. package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
  8. package/skills/model-evaluation/assets/dimensions//345/206/205/345/256/271/347/262/276/347/241/256/347/273/264/345/272/246.json +19 -0
  9. package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
  10. package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
  11. package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246-/346/227/205/346/270/270/345/207/272/350/241/214.json +20 -0
  12. package/skills/model-evaluation/assets/dimensions//345/207/206/347/241/256/346/200/247/347/273/264/345/272/246.json +20 -0
  13. package/skills/model-evaluation/assets/dimensions//345/210/233/346/204/217/346/200/247-/345/220/270/345/274/225/346/200/247/347/273/264/345/272/246.json +21 -0
  14. package/skills/model-evaluation/assets/dimensions//345/210/233/346/226/260/346/200/247/347/273/264/345/272/246.json +20 -0
  15. package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
  16. package/skills/model-evaluation/assets/dimensions//345/256/214/346/225/264/346/200/247/347/273/264/345/272/246.json +20 -0
  17. package/skills/model-evaluation/assets/dimensions//345/275/242/345/274/217/347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +20 -0
  18. package/skills/model-evaluation/assets/dimensions//345/277/240/350/257/232/345/272/246/347/273/264/345/272/246.json +20 -0
  19. package/skills/model-evaluation/assets/dimensions//346/214/207/344/273/244/351/201/265/345/276/252/347/273/264/345/272/246.json +20 -0
  20. package/skills/model-evaluation/assets/dimensions//346/226/207/346/234/254/345/267/256/345/274/202/345/272/246-TER/347/273/264/345/272/246.json +20 -0
  21. package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
  22. package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
  23. package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
  24. package/skills/model-evaluation/assets/dimensions//346/234/211/346/225/210/346/200/247/347/273/264/345/272/246.json +21 -0
  25. package/skills/model-evaluation/assets/dimensions//346/240/270/345/277/203/345/205/203/347/264/240/347/273/264/345/272/246.json +20 -0
  26. package/skills/model-evaluation/assets/dimensions//346/240/274/345/274/217/351/201/265/345/276/252/347/273/264/345/272/246.json +19 -0
  27. package/skills/model-evaluation/assets/dimensions//347/211/271/350/211/262/344/272/256/347/202/271/347/273/264/345/272/246.json +20 -0
  28. package/skills/model-evaluation/assets/dimensions//347/224/250/344/276/213/347/272/247/350/257/204/346/265/213/347/273/264/345/272/246/346/250/241/346/235/277.json +25 -0
  29. package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-BERTScore/347/273/264/345/272/246.json +20 -0
  30. package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-Cosine/347/273/264/345/272/246.json +20 -0
  31. package/skills/model-evaluation/assets/dimensions//347/233/270/344/274/274/345/272/246-ROUGE/347/273/264/345/272/246.json +20 -0
  32. package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
  33. package/skills/model-evaluation/assets/dimensions//347/233/270/345/205/263/346/200/247/347/273/264/345/272/246.json +21 -0
  34. package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-BLUE/347/273/264/345/272/246.json +20 -0
  35. package/skills/model-evaluation/assets/dimensions//347/262/276/347/241/256/346/200/247-COMET/347/273/264/345/272/246.json +20 -0
  36. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/345/220/210/347/220/206/346/200/247/347/273/264/345/272/246.json +20 -0
  37. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/270/252/346/200/247/345/214/226/350/247/204/345/210/222.json +20 -0
  38. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/344/277/241/346/201/257/345/210/206/346/236/220.json +20 -0
  39. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246-/346/265/201/347/250/213/350/207/252/345/212/250/345/214/226.json +20 -0
  40. package/skills/model-evaluation/assets/dimensions//351/200/273/350/276/221/350/277/236/350/264/257/346/200/247/347/273/264/345/272/246.json +21 -0
  41. package/skills/model-evaluation/assets/eval-judge.json +11 -0
  42. package/skills/model-evaluation/assets/experts/business-process-automation.json +71 -0
  43. package/skills/model-evaluation/assets/experts/content-generation.json +75 -0
  44. package/skills/model-evaluation/assets/experts/content-match.json +37 -0
  45. package/skills/model-evaluation/assets/experts/information-analysis.json +87 -0
  46. package/skills/model-evaluation/assets/experts/marketing-digital-human.json +27 -0
  47. package/skills/model-evaluation/assets/experts/personalized-planning.json +87 -0
  48. package/skills/model-evaluation/assets/experts/text-translation.json +103 -0
  49. package/skills/model-evaluation/assets/experts/tourism-travel.json +119 -0
  50. package/skills/model-evaluation/assets/templates/custom-dimension.template.json +30 -0
  51. package/skills/model-evaluation/eval-build.md +281 -0
  52. package/skills/model-evaluation/eval-execute.md +196 -0
  53. package/skills/model-evaluation/eval-init.md +237 -0
  54. package/skills/model-evaluation/processes/dimension-process.md +207 -0
  55. package/skills/model-evaluation/processes/evalset-create-process.md +184 -0
  56. package/skills/model-evaluation/processes/evalset-parse-process.md +171 -0
  57. package/skills/model-evaluation/processes/evalset-supplement-process.md +136 -0
  58. package/skills/model-evaluation/processes/keypoint-process.md +148 -0
  59. package/skills/model-evaluation/processes/python-env-process.md +113 -0
  60. package/skills/model-evaluation/references//344/270/255/351/227/264/344/272/247/347/211/251/350/257/264/346/230/216.md +340 -0
  61. package/skills/model-evaluation/references//345/206/205/347/275/256/346/250/241/346/235/277/350/257/264/346/230/216.md +149 -0
  62. package/skills/model-evaluation/references//350/204/232/346/234/254/345/256/232/344/271/211.md +274 -0
  63. package/skills/model-evaluation/references//350/256/244/350/257/201/346/234/215/345/212/241/346/216/245/345/217/243/350/257/264/346/230/216.md +271 -0
  64. package/skills/model-evaluation/references//350/257/204/346/265/213/346/234/215/345/212/241/346/216/245/345/217/243/350/257/264/346/230/216.md +455 -0
  65. package/skills/model-evaluation/references//350/257/204/346/265/213/347/273/264/345/272/246/350/257/264/346/230/216.md +171 -0
  66. package/skills/model-evaluation/scripts/cfg/eval-auth.cfg +16 -0
  67. package/skills/model-evaluation/scripts/cfg/eval-server.cfg +1 -0
  68. package/skills/model-evaluation/scripts/clients/__init__.py +33 -0
  69. package/skills/model-evaluation/scripts/clients/api_client.py +97 -0
  70. package/skills/model-evaluation/scripts/clients/auth_client.py +96 -0
  71. package/skills/model-evaluation/scripts/clients/http_client.py +199 -0
  72. package/skills/model-evaluation/scripts/clients/oauth_callback.py +397 -0
  73. package/skills/model-evaluation/scripts/clients/token_manager.py +53 -0
  74. package/skills/model-evaluation/scripts/eval_auth.py +588 -0
  75. package/skills/model-evaluation/scripts/eval_dimension.py +240 -0
  76. package/skills/model-evaluation/scripts/eval_set.py +410 -0
  77. package/skills/model-evaluation/scripts/eval_task.py +324 -0
  78. package/skills/model-evaluation/scripts/files/__init__.py +38 -0
  79. package/skills/model-evaluation/scripts/files/file_utils.py +330 -0
  80. package/skills/model-evaluation/scripts/files/streaming.py +245 -0
  81. package/skills/model-evaluation/scripts/utils/__init__.py +128 -0
  82. package/skills/model-evaluation/scripts/utils/constants.py +101 -0
  83. package/skills/model-evaluation/scripts/utils/datetime_utils.py +60 -0
  84. package/skills/model-evaluation/scripts/utils/errors.py +244 -0
  85. package/skills/model-evaluation/scripts/utils/keypoint_prompts.py +73 -0
  86. package/skills/skill-driven-eval/SKILL.md +456 -0
  87. package/skills/skill-driven-eval/agents/grader.md +144 -0
  88. package/skills/skill-driven-eval/eval-viewer/__init__.py +1 -0
  89. package/skills/skill-driven-eval/eval-viewer/generate_report.py +485 -0
  90. package/skills/skill-driven-eval/eval-viewer/viewer.html +767 -0
  91. package/skills/skill-driven-eval/references/schemas.md +282 -0
  92. package/skills/skill-driven-eval/scripts/__init__.py +1 -0
  93. package/skills/skill-driven-eval/scripts/__main__.py +70 -0
  94. package/skills/skill-driven-eval/scripts/aggregate_results.py +681 -0
  95. package/skills/skill-driven-eval/scripts/extract_transcript.py +294 -0
  96. package/skills/skill-driven-eval/scripts/test_aggregate.py +244 -0
@@ -0,0 +1,294 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Extract and format session transcript into human-readable markdown.
4
+
5
+ This script reads a Claude Code session transcript (JSONL format) and generates
6
+ a readable markdown document showing the conversation flow:
7
+ - User messages
8
+ - Assistant responses (with thinking blocks)
9
+ - Tool calls and their results
10
+
11
+ Usage:
12
+ python extract_transcript.py <session.jsonl> --output <output.md>
13
+
14
+ The output is suitable for human review but does NOT include model identification,
15
+ maintaining blind evaluation integrity.
16
+ """
17
+
18
+ import argparse
19
+ import json
20
+ import sys
21
+ from pathlib import Path
22
+ from datetime import datetime
23
+
24
+
25
+ def format_timestamp(ts: float | None) -> str:
26
+ """Format Unix timestamp to readable string."""
27
+ if ts is None:
28
+ return ""
29
+ try:
30
+ return datetime.fromtimestamp(ts).strftime("%H:%M:%S")
31
+ except (ValueError, TypeError):
32
+ return ""
33
+
34
+
35
def format_content_block(block: dict, indent: str = "") -> list[str]:
    """Format a single message content block into markdown lines.

    Handles "text", "thinking", "tool_use" and "redacted_thinking" block
    types; any other type yields no output.

    Args:
        block: Content block dict with at least a "type" key.
        indent: Prefix prepended to every emitted line.

    Returns:
        List of markdown lines (possibly empty).
    """
    lines = []
    block_type = block.get("type", "unknown")

    if block_type == "text":
        text = block.get("text", "")
        if text.strip():  # skip whitespace-only text blocks
            lines.append(f"{indent}{text}")

    elif block_type == "thinking":
        thinking = block.get("thinking", "")
        if thinking.strip():
            lines.append(f"{indent}### Thinking")
            lines.append(f"{indent}```")
            lines.append(f"{indent}{thinking}")
            lines.append(f"{indent}```")
            lines.append("")

    elif block_type == "tool_use":
        tool_name = block.get("name", "unknown")
        tool_input = block.get("input", {})

        lines.append(f"{indent}### Tool: `{tool_name}`")
        lines.append(f"{indent}```json")
        # Format input nicely, truncate if too long
        input_str = json.dumps(tool_input, indent=2, ensure_ascii=False)
        if len(input_str) > 2000:
            input_str = input_str[:2000] + "\n... (truncated)"
        lines.append(f"{indent}{input_str}")
        lines.append(f"{indent}```")
        lines.append("")

    elif block_type == "redacted_thinking":
        lines.append(f"{indent}*[Redacted thinking block]*")
        lines.append("")

    return lines
74
+
75
+
76
def format_tool_result(result: dict, indent: str = "") -> list[str]:
    """Format a tool_result content block into markdown lines.

    Args:
        result: Tool-result block dict ("content", optional "is_error").
        indent: Prefix prepended to every emitted line.

    Returns:
        Markdown lines: a header, a fenced code block with the (possibly
        truncated) result content, and a trailing blank line.
    """
    lines = []
    content = result.get("content", "")
    is_error = result.get("is_error", False)

    # Flag errors in the header so failures stand out during review.
    header = "### Tool Result (Error)" if is_error else "### Tool Result"
    lines.append(f"{indent}{header}")
    lines.append(f"{indent}```")

    # Handle different content types
    if isinstance(content, str):
        # Truncate long results to keep the transcript readable.
        if len(content) > 3000:
            content = content[:3000] + "\n... (truncated)"
        lines.append(f"{indent}{content}")
    elif isinstance(content, list):
        for item in content:
            if isinstance(item, dict) and item.get("type") == "text":
                text = item.get("text", "")
                if len(text) > 3000:
                    text = text[:3000] + "\n... (truncated)"
                lines.append(f"{indent}{text}")
            elif isinstance(item, dict) and item.get("type") == "image":
                lines.append(f"{indent}[Image: {item.get('source', {}).get('media_type', 'unknown')}]")
            else:
                # Unknown item shapes: fall back to str() with a tighter cap.
                item_str = str(item)
                if len(item_str) > 500:
                    item_str = item_str[:500] + "... (truncated)"
                lines.append(f"{indent}{item_str}")
    else:
        content_str = str(content)
        if len(content_str) > 3000:
            content_str = content_str[:3000] + "\n... (truncated)"
        lines.append(f"{indent}{content_str}")

    lines.append(f"{indent}```")
    lines.append("")

    return lines
120
+
121
+
122
def extract_transcript(input_path: Path, output_path: Path, include_metadata: bool = True):
    """
    Extract transcript from JSONL session file to markdown.

    Reads the session file line by line (each line one JSON entry), renders
    user/assistant messages with their content blocks, and writes the whole
    document to output_path. Malformed JSON lines are noted inline rather
    than aborting. Exits the process with status 1 if input_path is missing.

    Args:
        input_path: Path to session.jsonl file
        output_path: Path to output markdown file
        include_metadata: Whether to include timing and token metadata
    """
    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        sys.exit(1)

    lines = []

    # Header
    lines.append("# Session Transcript")
    lines.append("")
    lines.append(f"*Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}*")
    lines.append("")
    lines.append("---")
    lines.append("")

    message_count = 0
    total_tokens = 0

    with open(input_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue

            try:
                entry = json.loads(line)
            except json.JSONDecodeError as e:
                # Note the bad line in the output instead of aborting.
                lines.append(f"*Error parsing line {line_num}: {e}*")
                lines.append("")
                continue

            entry_type = entry.get("type", "")

            # Handle different entry types
            if entry_type == "summary":
                # Skip summary entries
                continue

            # Get role - could be at top level or in message
            role = entry.get("role", "")
            if not role and "message" in entry:
                role = entry.get("message", {}).get("role", "")

            # Get content blocks
            content = entry.get("content", [])
            if not content and "message" in entry:
                content = entry.get("message", {}).get("content", [])

            # Get timestamp
            timestamp = entry.get("timestamp") or entry.get("message", {}).get("timestamp")

            # Get usage info
            usage = entry.get("usage") or entry.get("message", {}).get("usage", {})
            if usage:
                total_tokens += usage.get("input_tokens", 0) + usage.get("output_tokens", 0)

            if role == "user":
                message_count += 1
                ts_str = format_timestamp(timestamp)
                lines.append(f"## User Message {message_count}")
                if ts_str and include_metadata:
                    lines.append(f"*Time: {ts_str}*")
                lines.append("")

                # Handle content that is a string directly
                if isinstance(content, str):
                    lines.append(content)
                elif isinstance(content, list):
                    for block in content:
                        if isinstance(block, dict):
                            lines.extend(format_content_block(block))
                        elif isinstance(block, str):
                            lines.append(block)
                lines.append("---")
                lines.append("")

            elif role == "assistant":
                message_count += 1
                ts_str = format_timestamp(timestamp)
                lines.append(f"## Assistant Response {message_count}")
                if ts_str and include_metadata:
                    lines.append(f"*Time: {ts_str}*")
                if usage:
                    input_t = usage.get("input_tokens", 0)
                    output_t = usage.get("output_tokens", 0)
                    lines.append(f"*Tokens: {input_t} in, {output_t} out*")
                lines.append("")

                # Handle content that is a string directly
                if isinstance(content, str):
                    lines.append(content)
                elif isinstance(content, list):
                    for block in content:
                        if isinstance(block, dict):
                            # format_content_block renders tool_use blocks
                            # itself; no extra bookkeeping is needed here.
                            lines.extend(format_content_block(block))
                        elif isinstance(block, str):
                            lines.append(block)
                lines.append("---")
                lines.append("")

            # Handle tool results in user content (rendered after the
            # user message section, since format_content_block skips them)
            if role == "user" and isinstance(content, list):
                for block in content:
                    if isinstance(block, dict) and block.get("type") == "tool_result":
                        lines.extend(format_tool_result(block))

    # Footer with summary
    lines.append("")
    lines.append("---")
    lines.append("")
    lines.append("## Summary")
    lines.append("")
    lines.append(f"- **Total messages**: {message_count}")
    if include_metadata and total_tokens > 0:
        lines.append(f"- **Total tokens**: {total_tokens}")
    lines.append("")

    # Write output
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text("\n".join(lines), encoding='utf-8')
    print(f"Generated: {output_path}")
258
+
259
+
260
def main():
    """Command-line entry point: parse arguments and run the extraction."""
    parser = argparse.ArgumentParser(
        description="Extract session transcript to human-readable markdown"
    )
    parser.add_argument("input", type=Path, help="Path to session.jsonl file")
    parser.add_argument(
        "--output", "-o",
        type=Path,
        help="Output markdown file path (default: same name with .md extension)",
    )
    parser.add_argument(
        "--no-metadata",
        action="store_true",
        help="Exclude timing and token metadata",
    )
    args = parser.parse_args()

    # Default the output path to <input>.md when --output is not given.
    destination = args.output if args.output is not None else args.input.with_suffix(".md")

    extract_transcript(args.input, destination, include_metadata=not args.no_metadata)
291
+
292
+
293
# Script entry point: run the CLI when executed directly.
if __name__ == "__main__":
    main()
@@ -0,0 +1,244 @@
1
+ #!/usr/bin/env python3
2
+ """Test script for aggregate_results.py.
3
+
4
+ Creates sample test data with anonymous run IDs and mapping.json,
5
+ then runs the aggregation to verify it works correctly.
6
+
7
+ Run with: python test_aggregate.py
8
+ """
9
+
10
+ import json
11
+ import shutil
12
+ import sys
13
+ from pathlib import Path
14
+
15
+
16
def create_test_workspace():
    """Create a test workspace with sample data using anonymous run IDs.

    Builds <repo>/test-workspace containing evals.json, mapping.json and
    four run directories (run-001..run-004), each with blind grading.json
    and timing.json files (no model information).

    Returns:
        Path to the created workspace directory.
    """
    test_dir = Path(__file__).parent.parent / "test-workspace"

    # Clean up if exists
    if test_dir.exists():
        shutil.rmtree(test_dir)

    test_dir.mkdir(parents=True)

    # Create evals.json
    evals = {
        "target_skill": "pdf",
        "target_skill_path": "/path/to/pdf-skill",
        "models_to_compare": ["opus", "sonnet"],
        "evals": [
            {
                "id": 1,
                "name": "Form Filling",
                "prompt": "Fill out the PDF form with the following data: name=John Doe, email=john@example.com",
                "expected_output": "A filled PDF form with all fields populated",
                "assertions": [
                    "The output is a PDF file",
                    "The name field contains 'John Doe'",
                    "The email field contains 'john@example.com'"
                ]
            },
            {
                "id": 2,
                "name": "Text Extraction",
                "prompt": "Extract all text from the invoice PDF and create a summary",
                "expected_output": "A text summary of the invoice contents",
                "assertions": [
                    "The output contains extracted text",
                    "The total amount is mentioned",
                    "The date is correctly extracted"
                ]
            }
        ]
    }

    with open(test_dir / "evals.json", "w") as f:
        json.dump(evals, f, indent=2)

    # Mapping from run IDs to models (created by MainAgent, but for testing we create it now)
    # In real usage, this would be created AFTER grading is complete
    mapping = {
        "run-001": {"model": "opus", "eval_id": 1, "eval_name": "Form Filling"},
        "run-002": {"model": "sonnet", "eval_id": 1, "eval_name": "Form Filling"},
        "run-003": {"model": "opus", "eval_id": 2, "eval_name": "Text Extraction"},
        "run-004": {"model": "sonnet", "eval_id": 2, "eval_name": "Text Extraction"}
    }

    # Create run directories with anonymous IDs
    runs_data = {
        "run-001": {
            "grading": {
                "run_id": "run-001",
                "expectations": [
                    {"text": "The output is a PDF file", "passed": True, "evidence": "Output file is filled_form.pdf"},
                    {"text": "The name field contains 'John Doe'", "passed": True, "evidence": "Found in field mapping"},
                    {"text": "The email field contains 'john@example.com'", "passed": True, "evidence": "Verified in output"}
                ],
                "summary": {"passed": 3, "failed": 0, "total": 3, "pass_rate": 1.0},
                "execution_metrics": {"total_tool_calls": 12, "errors_encountered": 0}
            },
            "timing": {
                "run_id": "run-001",
                "total_tokens": 45000,
                "duration_ms": 45200,
                "total_duration_seconds": 45.2
            }
        },
        "run-002": {
            "grading": {
                "run_id": "run-002",
                "expectations": [
                    {"text": "The output is a PDF file", "passed": True, "evidence": "Output file is filled_form.pdf"},
                    {"text": "The name field contains 'John Doe'", "passed": True, "evidence": "Found in field mapping"},
                    {"text": "The email field contains 'john@example.com'", "passed": False, "evidence": "Email was truncated"}
                ],
                "summary": {"passed": 2, "failed": 1, "total": 3, "pass_rate": 0.67},
                "execution_metrics": {"total_tool_calls": 10, "errors_encountered": 0}
            },
            "timing": {
                "run_id": "run-002",
                "total_tokens": 28000,
                "duration_ms": 32100,
                "total_duration_seconds": 32.1
            }
        },
        "run-003": {
            "grading": {
                "run_id": "run-003",
                "expectations": [
                    {"text": "The output contains extracted text", "passed": True, "evidence": "Text extracted successfully"},
                    {"text": "The total amount is mentioned", "passed": True, "evidence": "Found '$1,234.56' in output"},
                    {"text": "The date is correctly extracted", "passed": True, "evidence": "Date: 2024-03-15"}
                ],
                "summary": {"passed": 3, "failed": 0, "total": 3, "pass_rate": 1.0},
                "execution_metrics": {"total_tool_calls": 8, "errors_encountered": 0}
            },
            "timing": {
                "run_id": "run-003",
                "total_tokens": 38000,
                "duration_ms": 28500,
                "total_duration_seconds": 28.5
            }
        },
        "run-004": {
            "grading": {
                "run_id": "run-004",
                "expectations": [
                    {"text": "The output contains extracted text", "passed": True, "evidence": "Text extracted"},
                    {"text": "The total amount is mentioned", "passed": False, "evidence": "Amount not found in output"},
                    {"text": "The date is correctly extracted", "passed": True, "evidence": "Date: 2024-03-15"}
                ],
                "summary": {"passed": 2, "failed": 1, "total": 3, "pass_rate": 0.67},
                "execution_metrics": {"total_tool_calls": 7, "errors_encountered": 0}
            },
            "timing": {
                "run_id": "run-004",
                "total_tokens": 24000,
                "duration_ms": 22300,
                "total_duration_seconds": 22.3
            }
        }
    }

    # Create run directories
    for run_id, data in runs_data.items():
        run_dir = test_dir / run_id
        run_dir.mkdir()
        (run_dir / "outputs").mkdir()

        # Write grading.json (blind - no model info)
        with open(run_dir / "grading.json", "w") as f:
            json.dump(data["grading"], f, indent=2)

        # Write timing.json (blind - no model info)
        with open(run_dir / "timing.json", "w") as f:
            json.dump(data["timing"], f, indent=2)

    # Write mapping.json (would be created by MainAgent after grading)
    # For testing, we create it now
    with open(test_dir / "mapping.json", "w") as f:
        json.dump(mapping, f, indent=2)

    print(f"Created test workspace at: {test_dir}")
    # Plain strings: these had pointless f-prefixes (no placeholders).
    print("\nNote: In real usage, mapping.json is created by MainAgent AFTER grading.")
    print("For this test, we create it upfront to verify aggregation.")
    return test_dir
168
+
169
+
170
def run_aggregation(test_dir):
    """Run the aggregation script on the test workspace.

    Loads mapping.json, builds the benchmark, prints a console summary,
    and writes benchmark.json / benchmark.md into test_dir.

    Returns:
        The benchmark dict produced by generate_benchmark.
    """
    # Make aggregate_results importable from this script's directory.
    sys.path.insert(0, str(Path(__file__).parent))
    from aggregate_results import generate_benchmark, generate_markdown, load_mapping

    run_mapping = load_mapping(test_dir)
    benchmark = generate_benchmark(test_dir, run_mapping)

    # Console summary of the aggregated results.
    meta = benchmark['metadata']
    print("\n=== Benchmark Summary ===")
    print(f"Target Skill: {meta['target_skill']}")
    print(f"Models Compared: {', '.join(meta['models_compared'])}")
    print(f"Evals Run: {meta['evals_run']}")
    print(f"Note: {meta['note']}")

    print("\n=== Model Summary ===")
    for model_name, stats in benchmark['model_summary'].items():
        mean_pass = stats['pass_rate']['mean']
        mean_secs = stats['time_seconds']['mean']
        mean_toks = stats['tokens']['mean']
        print(f" {model_name}: {mean_pass*100:.0f}% pass rate, {mean_secs:.1f}s, {mean_toks:.0f} tokens")

    comp = benchmark['comparison']
    print("\n=== Comparison ===")
    print(f" Pass Rate Delta: {comp['pass_rate_delta']}")
    print(f" Time Delta: {comp['time_delta']}")
    print(f" Token Delta: {comp['token_delta']}")
    print(f" Cost Efficiency: {comp['cost_efficiency']}")

    print("\n=== Recommendations (data-driven) ===")
    for rec in benchmark['recommendations']:
        print(f" [{rec['scenario']}] Use {rec['recommended_model']}: {rec['reason']}")

    print("\n=== Notes ===")
    for note in benchmark['notes']:
        print(f" - {note}")

    # Persist the JSON benchmark.
    benchmark_path = test_dir / "benchmark.json"
    with open(benchmark_path, "w") as f:
        json.dump(benchmark, f, indent=2)
    print(f"\nWritten: {benchmark_path}")

    # Persist the markdown rendering.
    md_path = test_dir / "benchmark.md"
    with open(md_path, "w") as f:
        f.write(generate_markdown(benchmark))
    print(f"Written: {md_path}")

    return benchmark
223
+
224
+
225
def main():
    """Build the sample workspace, run aggregation, and print verification notes."""
    print("Creating test workspace with anonymous run IDs...")
    test_dir = create_test_workspace()

    print("\nRunning aggregation with mapping.json...")
    # Return value intentionally discarded; run_aggregation already prints
    # the summary and writes benchmark.json/benchmark.md into test_dir.
    run_aggregation(test_dir)

    print("\n=== Test Complete ===")
    print(f"Test workspace: {test_dir}")
    print("\nKey points verified:")
    print(" 1. Run directories use anonymous IDs (run-001, run-002, etc.)")
    print(" 2. Grading files do not contain model information")
    print(" 3. Mapping.json maps run IDs to models")
    print(" 4. Aggregation combines grading + mapping for benchmark")
    print("\nYou can now test the report viewer:")
    print(f" python -m eval-viewer.generate_report {test_dir} --serve")
+
242
+
243
# Script entry point: build the fixture workspace and run the smoke test.
if __name__ == "__main__":
    main()