isage-benchmark-agent 0.1.0.1 (cp311-none-any.whl)

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. isage_benchmark_agent-0.1.0.1.dist-info/METADATA +91 -0
  2. isage_benchmark_agent-0.1.0.1.dist-info/RECORD +51 -0
  3. isage_benchmark_agent-0.1.0.1.dist-info/WHEEL +5 -0
  4. isage_benchmark_agent-0.1.0.1.dist-info/entry_points.txt +2 -0
  5. isage_benchmark_agent-0.1.0.1.dist-info/licenses/LICENSE +21 -0
  6. isage_benchmark_agent-0.1.0.1.dist-info/top_level.txt +1 -0
  7. sage/__init__.py +0 -0
  8. sage/benchmark/__init__.py +0 -0
  9. sage/benchmark/benchmark_agent/__init__.py +108 -0
  10. sage/benchmark/benchmark_agent/__main__.py +177 -0
  11. sage/benchmark/benchmark_agent/acebench_loader.py +369 -0
  12. sage/benchmark/benchmark_agent/adapter_registry.py +3036 -0
  13. sage/benchmark/benchmark_agent/config/config_loader.py +176 -0
  14. sage/benchmark/benchmark_agent/config/default_config.yaml +24 -0
  15. sage/benchmark/benchmark_agent/config/planning_exp.yaml +34 -0
  16. sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml +34 -0
  17. sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml +32 -0
  18. sage/benchmark/benchmark_agent/data_paths.py +332 -0
  19. sage/benchmark/benchmark_agent/evaluation/__init__.py +217 -0
  20. sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py +11 -0
  21. sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.py +111 -0
  22. sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.py +135 -0
  23. sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.py +124 -0
  24. sage/benchmark/benchmark_agent/evaluation/evaluator.py +228 -0
  25. sage/benchmark/benchmark_agent/evaluation/metrics.py +650 -0
  26. sage/benchmark/benchmark_agent/evaluation/report_builder.py +217 -0
  27. sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.py +602 -0
  28. sage/benchmark/benchmark_agent/experiments/__init__.py +63 -0
  29. sage/benchmark/benchmark_agent/experiments/base_experiment.py +263 -0
  30. sage/benchmark/benchmark_agent/experiments/method_comparison.py +742 -0
  31. sage/benchmark/benchmark_agent/experiments/planning_exp.py +262 -0
  32. sage/benchmark/benchmark_agent/experiments/timing_detection_exp.py +198 -0
  33. sage/benchmark/benchmark_agent/experiments/tool_selection_exp.py +250 -0
  34. sage/benchmark/benchmark_agent/scripts/__init__.py +26 -0
  35. sage/benchmark/benchmark_agent/scripts/experiments/__init__.py +40 -0
  36. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.py +425 -0
  37. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.py +400 -0
  38. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.py +439 -0
  39. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.py +565 -0
  40. sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.py +406 -0
  41. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.py +315 -0
  42. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.py +344 -0
  43. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.py +270 -0
  44. sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.py +620 -0
  45. sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.py +427 -0
  46. sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.py +677 -0
  47. sage/benchmark/benchmark_agent/scripts/experiments/llm_service.py +332 -0
  48. sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.py +627 -0
  49. sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.py +422 -0
  50. sage/benchmark/benchmark_agent/scripts/experiments/table_generator.py +430 -0
  51. sage/benchmark/benchmark_agent/tools_loader.py +212 -0
sage/benchmark/benchmark_agent/evaluation/report_builder.py
@@ -0,0 +1,217 @@
+"""
+Report builders for generating JSON and Markdown evaluation reports.
+"""
+
+import json
+from pathlib import Path
+from typing import Any, Optional
+
+from . import EvaluationReport
+
+
+class JsonReportBuilder:
+    """Build JSON format evaluation reports."""
+
+    def build(self, report: EvaluationReport, output_path: Path) -> Path:
+        """
+        Build and save JSON report.
+
+        Args:
+            report: EvaluationReport to format
+            output_path: Path to save report
+
+        Returns:
+            Path to saved report file
+        """
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Convert report to dict and handle Path objects
+        report_dict = report.model_dump()
+        report_dict["artifacts"] = {k: str(v) for k, v in report.artifacts.items()}
+
+        # Write JSON with pretty formatting
+        with open(output_path, "w", encoding="utf-8") as f:
+            json.dump(report_dict, f, indent=2, ensure_ascii=False)
+
+        return output_path
+
+
+class MarkdownReportBuilder:
+    """Build Markdown format evaluation reports."""
+
+    def __init__(self, template: Optional[str] = None):
+        """
+        Initialize builder.
+
+        Args:
+            template: Optional custom template string
+        """
+        self.template = template or self._default_template()
+
+    def build(self, report: EvaluationReport, output_path: Path) -> Path:
+        """
+        Build and save Markdown report.
+
+        Args:
+            report: EvaluationReport to format
+            output_path: Path to save report
+
+        Returns:
+            Path to saved report file
+        """
+        output_path = Path(output_path)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+
+        # Generate markdown content
+        content = self._generate_markdown(report)
+
+        # Write to file
+        with open(output_path, "w", encoding="utf-8") as f:
+            f.write(content)
+
+        return output_path
+
+    def _generate_markdown(self, report: EvaluationReport) -> str:
+        """Generate markdown content from report."""
+        lines = []
+
+        # Header
+        lines.append(f"# Evaluation Report: {report.task.replace('_', ' ').title()}")
+        lines.append("")
+        lines.append(f"**Experiment ID**: `{report.experiment_id}` ")
+        lines.append(f"**Timestamp**: {report.timestamp} ")
+        lines.append("")
+        lines.append("---")
+        lines.append("")
+
+        # Metrics section
+        lines.append("## 📊 Metrics")
+        lines.append("")
+
+        if report.metrics:
+            lines.append("| Metric | Value |")
+            lines.append("|--------|-------|")
+            for metric_name, value in sorted(report.metrics.items()):
+                formatted_value = f"{value:.4f}" if isinstance(value, float) else str(value)
+                lines.append(f"| {metric_name} | {formatted_value} |")
+            lines.append("")
+        else:
+            lines.append("*No metrics computed*")
+            lines.append("")
+
+        # Breakdowns section
+        if report.breakdowns:
+            lines.append("---")
+            lines.append("")
+            lines.append("## 🔍 Detailed Analysis")
+            lines.append("")
+
+            for section_name, section_data in report.breakdowns.items():
+                lines.append(f"### {section_name.replace('_', ' ').title()}")
+                lines.append("")
+                lines.append(self._format_section(section_data))
+                lines.append("")
+
+        # Artifacts section
+        if report.artifacts:
+            lines.append("---")
+            lines.append("")
+            lines.append("## 📁 Artifacts")
+            lines.append("")
+            for name, path in report.artifacts.items():
+                lines.append(f"- **{name}**: `{path}`")
+            lines.append("")
+
+        return "\n".join(lines)
+
+    def _format_section(self, data: Any, indent: int = 0) -> str:
+        """Recursively format section data."""
+        lines = []
+        prefix = " " * indent
+
+        if isinstance(data, dict):
+            for key, value in data.items():
+                if isinstance(value, (dict, list)) and len(str(value)) > 50:
+                    lines.append(f"{prefix}- **{key}**:")
+                    lines.append(self._format_section(value, indent + 1))
+                else:
+                    formatted_value = self._format_value(value)
+                    lines.append(f"{prefix}- **{key}**: {formatted_value}")
+        elif isinstance(data, list):
+            if len(data) > 0 and isinstance(data[0], dict):
+                # Format as table if list of dicts
+                lines.append(self._format_table(data, indent))
+            else:
+                for item in data[:10]:  # Limit display
+                    lines.append(f"{prefix}- {self._format_value(item)}")
+                if len(data) > 10:
+                    lines.append(f"{prefix} *(... and {len(data) - 10} more)*")
+        else:
+            lines.append(f"{prefix}{self._format_value(data)}")
+
+        return "\n".join(lines)
+
+    def _format_value(self, value: Any) -> str:
+        """Format individual value."""
+        if isinstance(value, float):
+            return f"{value:.4f}"
+        elif isinstance(value, (list, tuple)) and len(value) <= 5:
+            return ", ".join(str(v) for v in value)
+        else:
+            return str(value)
+
+    def _format_table(self, data: list, indent: int = 0) -> str:
+        """Format list of dicts as markdown table."""
+        if not data:
+            return ""
+
+        prefix = " " * indent
+        lines = []
+
+        # Get all keys
+        keys = list(data[0].keys())
+
+        # Header
+        lines.append(prefix + "| " + " | ".join(keys) + " |")
+        lines.append(prefix + "|" + "|".join([" --- "] * len(keys)) + "|")
+
+        # Rows (limit to 10)
+        for row in data[:10]:
+            values = [self._format_value(row.get(k, "")) for k in keys]
+            lines.append(prefix + "| " + " | ".join(values) + " |")
+
+        if len(data) > 10:
+            lines.append(prefix + f"*... and {len(data) - 10} more rows*")
+
+        return "\n".join(lines)
+
+    @staticmethod
+    def _default_template() -> str:
+        """Return default markdown template."""
+        return ""  # We use programmatic generation instead
+
+
+def create_report_builders(
+    formats: list[str],
+) -> dict[str, JsonReportBuilder | MarkdownReportBuilder]:
+    """
+    Create report builders for specified formats.
+
+    Args:
+        formats: List of format names ("json", "markdown")
+
+    Returns:
+        Dictionary mapping format to builder instance
+    """
+    builders: dict[str, JsonReportBuilder | MarkdownReportBuilder] = {}
+
+    for fmt in formats:
+        if fmt == "json":
+            builders["json"] = JsonReportBuilder()
+        elif fmt == "markdown":
+            builders["markdown"] = MarkdownReportBuilder()
+        else:
+            raise ValueError(f"Unknown report format: {fmt}")
+
+    return builders
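
For context, here is a minimal usage sketch of the builders added in this file. It assumes EvaluationReport (re-exported by the package's evaluation module, per the "from . import EvaluationReport" statement above) is a Pydantic model whose constructor accepts the fields the builders read (task, experiment_id, timestamp, metrics, breakdowns, artifacts); the field values and output paths are illustrative only, not part of the package.

    from pathlib import Path

    from sage.benchmark.benchmark_agent.evaluation import EvaluationReport
    from sage.benchmark.benchmark_agent.evaluation.report_builder import create_report_builders

    # Illustrative report object; real instances come from the package's evaluation
    # pipeline, and the actual required fields of EvaluationReport may differ.
    report = EvaluationReport(
        task="tool_selection",
        experiment_id="exp-001",
        timestamp="2025-01-01T00:00:00",
        metrics={"accuracy": 0.8125, "f1": 0.7643},
        breakdowns={"by_category": {"search": {"accuracy": 0.9}, "math": {"accuracy": 0.7}}},
        artifacts={"predictions": Path("results/predictions.jsonl")},
    )

    # Build both report formats and write them next to the other run outputs.
    builders = create_report_builders(["json", "markdown"])
    json_path = builders["json"].build(report, Path("results/report.json"))
    md_path = builders["markdown"].build(report, Path("results/report.md"))
    print(f"Wrote {json_path} and {md_path}")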