isage_benchmark_agent-0.1.0.1-cp311-none-any.whl
This diff shows the content of a publicly available package version released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
- isage_benchmark_agent-0.1.0.1.dist-info/METADATA +91 -0
- isage_benchmark_agent-0.1.0.1.dist-info/RECORD +51 -0
- isage_benchmark_agent-0.1.0.1.dist-info/WHEEL +5 -0
- isage_benchmark_agent-0.1.0.1.dist-info/entry_points.txt +2 -0
- isage_benchmark_agent-0.1.0.1.dist-info/licenses/LICENSE +21 -0
- isage_benchmark_agent-0.1.0.1.dist-info/top_level.txt +1 -0
- sage/__init__.py +0 -0
- sage/benchmark/__init__.py +0 -0
- sage/benchmark/benchmark_agent/__init__.py +108 -0
- sage/benchmark/benchmark_agent/__main__.py +177 -0
- sage/benchmark/benchmark_agent/acebench_loader.py +369 -0
- sage/benchmark/benchmark_agent/adapter_registry.py +3036 -0
- sage/benchmark/benchmark_agent/config/config_loader.py +176 -0
- sage/benchmark/benchmark_agent/config/default_config.yaml +24 -0
- sage/benchmark/benchmark_agent/config/planning_exp.yaml +34 -0
- sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml +34 -0
- sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml +32 -0
- sage/benchmark/benchmark_agent/data_paths.py +332 -0
- sage/benchmark/benchmark_agent/evaluation/__init__.py +217 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py +11 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.py +111 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.py +135 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.py +124 -0
- sage/benchmark/benchmark_agent/evaluation/evaluator.py +228 -0
- sage/benchmark/benchmark_agent/evaluation/metrics.py +650 -0
- sage/benchmark/benchmark_agent/evaluation/report_builder.py +217 -0
- sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.py +602 -0
- sage/benchmark/benchmark_agent/experiments/__init__.py +63 -0
- sage/benchmark/benchmark_agent/experiments/base_experiment.py +263 -0
- sage/benchmark/benchmark_agent/experiments/method_comparison.py +742 -0
- sage/benchmark/benchmark_agent/experiments/planning_exp.py +262 -0
- sage/benchmark/benchmark_agent/experiments/timing_detection_exp.py +198 -0
- sage/benchmark/benchmark_agent/experiments/tool_selection_exp.py +250 -0
- sage/benchmark/benchmark_agent/scripts/__init__.py +26 -0
- sage/benchmark/benchmark_agent/scripts/experiments/__init__.py +40 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.py +425 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.py +400 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.py +439 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.py +565 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.py +406 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.py +315 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.py +344 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.py +270 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.py +620 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.py +427 -0
- sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.py +677 -0
- sage/benchmark/benchmark_agent/scripts/experiments/llm_service.py +332 -0
- sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.py +627 -0
- sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.py +422 -0
- sage/benchmark/benchmark_agent/scripts/experiments/table_generator.py +430 -0
- sage/benchmark/benchmark_agent/tools_loader.py +212 -0
sage/benchmark/benchmark_agent/evaluation/report_builder.py
@@ -0,0 +1,217 @@
+"""
+Report builders for generating JSON and Markdown evaluation reports.
+"""
+
+import json
+from pathlib import Path
+from typing import Any, Optional
+
+from . import EvaluationReport
+
+
+class JsonReportBuilder:
+    """Build JSON format evaluation reports."""
+
+    def build(self, report: EvaluationReport, output_path: Path) -> Path:
+        """
+        Build and save JSON report.
+
+        Args:
+            report: EvaluationReport to format
+            output_path: Path to save report
+
+        Returns:
+            Path to saved report file
+        """
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Convert report to dict and handle Path objects
+        report_dict = report.model_dump()
+        report_dict["artifacts"] = {k: str(v) for k, v in report.artifacts.items()}
+
+        # Write JSON with pretty formatting
+        with open(output_path, "w", encoding="utf-8") as f:
+            json.dump(report_dict, f, indent=2, ensure_ascii=False)
+
+        return output_path
+
+
+class MarkdownReportBuilder:
+    """Build Markdown format evaluation reports."""
+
+    def __init__(self, template: Optional[str] = None):
+        """
+        Initialize builder.
+
+        Args:
+            template: Optional custom template string
+        """
+        self.template = template or self._default_template()
+
+    def build(self, report: EvaluationReport, output_path: Path) -> Path:
+        """
+        Build and save Markdown report.
+
+        Args:
+            report: EvaluationReport to format
+            output_path: Path to save report
+
+        Returns:
+            Path to saved report file
+        """
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Generate markdown content
+        content = self._generate_markdown(report)
+
+        # Write to file
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(content)
+
+        return output_path
+
+    def _generate_markdown(self, report: EvaluationReport) -> str:
+        """Generate markdown content from report."""
+        lines = []
+
+        # Header
+        lines.append(f"# Evaluation Report: {report.task.replace('_', ' ').title()}")
+        lines.append("")
+        lines.append(f"**Experiment ID**: `{report.experiment_id}` ")
+        lines.append(f"**Timestamp**: {report.timestamp} ")
+        lines.append("")
+        lines.append("---")
+        lines.append("")
+
+        # Metrics section
+        lines.append("## 📊 Metrics")
+        lines.append("")
+
+        if report.metrics:
+            lines.append("| Metric | Value |")
+            lines.append("|--------|-------|")
+            for metric_name, value in sorted(report.metrics.items()):
+                formatted_value = f"{value:.4f}" if isinstance(value, float) else str(value)
+                lines.append(f"| {metric_name} | {formatted_value} |")
+            lines.append("")
+        else:
+            lines.append("*No metrics computed*")
+            lines.append("")
+
+        # Breakdowns section
+        if report.breakdowns:
+            lines.append("---")
+            lines.append("")
+            lines.append("## 🔍 Detailed Analysis")
+            lines.append("")
+
+            for section_name, section_data in report.breakdowns.items():
+                lines.append(f"### {section_name.replace('_', ' ').title()}")
+                lines.append("")
+                lines.append(self._format_section(section_data))
+                lines.append("")
+
+        # Artifacts section
+        if report.artifacts:
+            lines.append("---")
+            lines.append("")
+            lines.append("## 📁 Artifacts")
+            lines.append("")
+            for name, path in report.artifacts.items():
+                lines.append(f"- **{name}**: `{path}`")
+            lines.append("")
+
+        return "\n".join(lines)
+
+    def _format_section(self, data: Any, indent: int = 0) -> str:
+        """Recursively format section data."""
+        lines = []
+        prefix = " " * indent
+
+        if isinstance(data, dict):
+            for key, value in data.items():
+                if isinstance(value, (dict, list)) and len(str(value)) > 50:
+                    lines.append(f"{prefix}- **{key}**:")
+                    lines.append(self._format_section(value, indent + 1))
+                else:
+                    formatted_value = self._format_value(value)
+                    lines.append(f"{prefix}- **{key}**: {formatted_value}")
+        elif isinstance(data, list):
+            if len(data) > 0 and isinstance(data[0], dict):
+                # Format as table if list of dicts
+                lines.append(self._format_table(data, indent))
+            else:
+                for item in data[:10]:  # Limit display
+                    lines.append(f"{prefix}- {self._format_value(item)}")
+                if len(data) > 10:
+                    lines.append(f"{prefix} *(... and {len(data) - 10} more)*")
+        else:
+            lines.append(f"{prefix}{self._format_value(data)}")
+
+        return "\n".join(lines)
+
+    def _format_value(self, value: Any) -> str:
+        """Format individual value."""
+        if isinstance(value, float):
+            return f"{value:.4f}"
+        elif isinstance(value, (list, tuple)) and len(value) <= 5:
+            return ", ".join(str(v) for v in value)
+        else:
+            return str(value)
+
+    def _format_table(self, data: list, indent: int = 0) -> str:
+        """Format list of dicts as markdown table."""
+        if not data:
+            return ""
+
+        prefix = " " * indent
+        lines = []
+
+        # Get all keys
+        keys = list(data[0].keys())
+
+        # Header
+        lines.append(prefix + "| " + " | ".join(keys) + " |")
+        lines.append(prefix + "|" + "|".join([" --- "] * len(keys)) + "|")
+
+        # Rows (limit to 10)
+        for row in data[:10]:
+            values = [self._format_value(row.get(k, "")) for k in keys]
+            lines.append(prefix + "| " + " | ".join(values) + " |")
+
+        if len(data) > 10:
+            lines.append(prefix + f"*... and {len(data) - 10} more rows*")
+
+        return "\n".join(lines)
+
+    @staticmethod
+    def _default_template() -> str:
+        """Return default markdown template."""
+        return ""  # We use programmatic generation instead
+
+
+def create_report_builders(
+    formats: list[str],
+) -> dict[str, JsonReportBuilder | MarkdownReportBuilder]:
+    """
+    Create report builders for specified formats.
+
+    Args:
+        formats: List of format names ("json", "markdown")
+
+    Returns:
+        Dictionary mapping format to builder instance
+    """
+    builders: dict[str, JsonReportBuilder | MarkdownReportBuilder] = {}
+
+    for fmt in formats:
+        if fmt == "json":
+            builders["json"] = JsonReportBuilder()
+        elif fmt == "markdown":
+            builders["markdown"] = MarkdownReportBuilder()
+        else:
+            raise ValueError(f"Unknown report format: {fmt}")
+
+    return builders