isage_benchmark_agent-0.1.0.1-cp311-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isage_benchmark_agent-0.1.0.1.dist-info/METADATA +91 -0
- isage_benchmark_agent-0.1.0.1.dist-info/RECORD +51 -0
- isage_benchmark_agent-0.1.0.1.dist-info/WHEEL +5 -0
- isage_benchmark_agent-0.1.0.1.dist-info/entry_points.txt +2 -0
- isage_benchmark_agent-0.1.0.1.dist-info/licenses/LICENSE +21 -0
- isage_benchmark_agent-0.1.0.1.dist-info/top_level.txt +1 -0
- sage/__init__.py +0 -0
- sage/benchmark/__init__.py +0 -0
- sage/benchmark/benchmark_agent/__init__.py +108 -0
- sage/benchmark/benchmark_agent/__main__.py +177 -0
- sage/benchmark/benchmark_agent/acebench_loader.py +369 -0
- sage/benchmark/benchmark_agent/adapter_registry.py +3036 -0
- sage/benchmark/benchmark_agent/config/config_loader.py +176 -0
- sage/benchmark/benchmark_agent/config/default_config.yaml +24 -0
- sage/benchmark/benchmark_agent/config/planning_exp.yaml +34 -0
- sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml +34 -0
- sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml +32 -0
- sage/benchmark/benchmark_agent/data_paths.py +332 -0
- sage/benchmark/benchmark_agent/evaluation/__init__.py +217 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py +11 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.py +111 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.py +135 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.py +124 -0
- sage/benchmark/benchmark_agent/evaluation/evaluator.py +228 -0
- sage/benchmark/benchmark_agent/evaluation/metrics.py +650 -0
- sage/benchmark/benchmark_agent/evaluation/report_builder.py +217 -0
- sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.py +602 -0
- sage/benchmark/benchmark_agent/experiments/__init__.py +63 -0
- sage/benchmark/benchmark_agent/experiments/base_experiment.py +263 -0
- sage/benchmark/benchmark_agent/experiments/method_comparison.py +742 -0
- sage/benchmark/benchmark_agent/experiments/planning_exp.py +262 -0
- sage/benchmark/benchmark_agent/experiments/timing_detection_exp.py +198 -0
- sage/benchmark/benchmark_agent/experiments/tool_selection_exp.py +250 -0
- sage/benchmark/benchmark_agent/scripts/__init__.py +26 -0
- sage/benchmark/benchmark_agent/scripts/experiments/__init__.py +40 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.py +425 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.py +400 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.py +439 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.py +565 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.py +406 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.py +315 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.py +344 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.py +270 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.py +620 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.py +427 -0
- sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.py +677 -0
- sage/benchmark/benchmark_agent/scripts/experiments/llm_service.py +332 -0
- sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.py +627 -0
- sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.py +422 -0
- sage/benchmark/benchmark_agent/scripts/experiments/table_generator.py +430 -0
- sage/benchmark/benchmark_agent/tools_loader.py +212 -0
sage/benchmark/benchmark_agent/evaluation/evaluator.py
@@ -0,0 +1,228 @@
+"""
+Evaluation pipeline orchestrator.
+
+Coordinates metrics computation, analysis, and report generation.
+"""
+
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Optional
+
+from ..experiments.base_experiment import ExperimentConfig, ExperimentResult
+from . import EvaluationReport
+from .analyzers import PlanningAnalyzer, TimingAnalyzer, ToolSelectionAnalyzer
+from .metrics import load_metrics
+from .report_builder import create_report_builders
+
+logger = logging.getLogger(__name__)
+
+
+class EvaluationPipeline:
+    """
+    Pipeline for evaluating experiment results.
+
+    Orchestrates:
+    1. Metric computation
+    2. Detailed analysis
+    3. Report generation
+    """
+
+    def __init__(self, metrics: list[Any], analyzers: list[Any], report_builders: dict[str, Any]):
+        """
+        Initialize pipeline.
+
+        Args:
+            metrics: List of metric instances
+            analyzers: List of analyzer instances
+            report_builders: Dict mapping format to builder instance
+        """
+        self.metrics = metrics
+        self.analyzers = analyzers
+        self.report_builders = report_builders
+
+    @classmethod
+    def from_config(
+        cls, config: ExperimentConfig, metrics_registry: Any = None
+    ) -> "EvaluationPipeline":
+        """
+        Create pipeline from experiment config.
+
+        Args:
+            config: Experiment configuration
+            metrics_registry: Optional custom metrics registry
+
+        Returns:
+            Initialized EvaluationPipeline
+        """
+        # Load metrics
+        metric_names = config.metrics if config.metrics else []
+        metrics = load_metrics(metric_names)
+
+        # Create task-specific analyzer
+        task = config.experiment
+        analyzers: list[ToolSelectionAnalyzer | PlanningAnalyzer | TimingAnalyzer] = []
+
+        if task == "tool_selection":
+            analyzers.append(ToolSelectionAnalyzer())
+        elif task == "planning":
+            analyzers.append(PlanningAnalyzer())
+        elif task == "timing_detection":
+            analyzers.append(TimingAnalyzer())
+
+        # Create report builders
+        report_formats: list[str] = list(
+            config.report.format if hasattr(config.report, "format") else ["json", "markdown"]
+        )
+        builders = create_report_builders(report_formats)
+
+        return cls(metrics=metrics, analyzers=analyzers, report_builders=builders)
+
+    def evaluate(self, result: ExperimentResult, config: ExperimentConfig) -> EvaluationReport:
+        """
+        Evaluate experiment result.
+
+        Args:
+            result: Experiment result to evaluate
+            config: Experiment configuration
+
+        Returns:
+            EvaluationReport with metrics and analysis
+        """
+        logger.info(f"Evaluating experiment: {result.experiment_id}")
+
+        # Extract predictions and references based on task type
+        predictions, references = self._extract_pred_ref(result)
+
+        # Compute metrics
+        logger.info(f"Computing {len(self.metrics)} metrics...")
+        metric_results = {}
+        metric_details = {}
+
+        for metric in self.metrics:
+            try:
+                logger.debug(f"Computing metric: {metric.name}")
+                output = metric.compute(predictions, references)
+                metric_results[metric.name] = output.value
+                metric_details[metric.name] = output.details
+            except Exception as e:
+                logger.error(f"Error computing metric {metric.name}: {e}")
+                metric_results[metric.name] = 0.0
+                metric_details[metric.name] = {"error": str(e)}
+
+        # Run analyzers
+        logger.info(f"Running {len(self.analyzers)} analyzers...")
+        breakdowns = {}
+
+        for analyzer in self.analyzers:
+            try:
+                logger.debug(f"Running analyzer: {analyzer.name}")
+                analysis = analyzer.analyze(predictions, references, result.metadata)
+                breakdowns[analyzer.name] = analysis
+            except Exception as e:
+                logger.error(f"Error in analyzer {analyzer.name}: {e}")
+                breakdowns[analyzer.name] = {"error": str(e)}
+
+        # Add metric details to breakdowns
+        if metric_details:
+            breakdowns["metric_details"] = metric_details
+
+        # Create evaluation report
+        report = EvaluationReport(
+            task=result.task,
+            experiment_id=result.experiment_id,
+            metrics=metric_results,
+            breakdowns=breakdowns,
+            artifacts={},
+            timestamp=datetime.utcnow().isoformat(),
+        )
+
+        # Generate and save reports
+        output_dir = Path(config.report.path) / result.experiment_id
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        logger.info(f"Generating reports in {output_dir}")
+        for format_name, builder in self.report_builders.items():
+            try:
+                output_file = output_dir / f"evaluation_report.{format_name}"
+                saved_path = builder.build(report, output_file)
+                report.artifacts[f"report_{format_name}"] = saved_path
+                logger.info(f"Saved {format_name} report to: {saved_path}")
+            except Exception as e:
+                logger.error(f"Error building {format_name} report: {e}")
+
+        logger.info(f"Evaluation complete. Overall metrics: {metric_results}")
+
+        return report
+
+    def _extract_pred_ref(self, result: ExperimentResult):
+        """
+        Extract predictions and references from result.
+
+        Args:
+            result: ExperimentResult
+
+        Returns:
+            Tuple of (predictions, references)
+        """
+        task = result.task
+
+        if task == "tool_selection":
+            # Extract tool ID lists
+            predictions = []
+            for pred in result.predictions:
+                pred_tools = pred.get("predicted_tools", pred.get("tools", []))
+                # Handle both list of dicts and list of strings
+                if pred_tools and isinstance(pred_tools[0], dict):
+                    pred_tools = [t.get("tool_id", t.get("id")) for t in pred_tools]
+                predictions.append(pred_tools)
+
+            references = []
+            for ref in result.references:
+                ref_tools = ref.get("ground_truth_tools", ref.get("tools", ref.get("top_k", [])))
+                references.append(ref_tools)
+
+        elif task == "planning":
+            # Extract tool sequences
+            predictions = [
+                pred.get("tool_sequence", pred.get("steps", [])) for pred in result.predictions
+            ]
+            references = [
+                ref.get("tool_sequence", ref.get("steps", [])) for ref in result.references
+            ]
+
+        elif task == "timing_detection":
+            # Extract boolean decisions
+            predictions = [
+                pred.get("should_call_tool", pred.get("decision", False))
+                for pred in result.predictions
+            ]
+            references = [
+                ref.get("should_call_tool", ref.get("decision", False)) for ref in result.references
+            ]
+
+        else:
+            raise ValueError(f"Unknown task type: {task}")
+
+        return predictions, references
+
+
+def save_report(report: EvaluationReport, output_dir: Path, formats: Optional[list[str]] = None):
+    """
+    Save evaluation report to files.
+
+    Args:
+        report: EvaluationReport to save
+        output_dir: Directory to save reports
+        formats: List of formats to save (default: ["json", "markdown"])
+    """
+    formats = formats or ["json", "markdown"]
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    builders = create_report_builders(formats)
+
+    for format_name, builder in builders.items():
+        output_file = output_dir / f"evaluation_report.{format_name}"
+        builder.build(report, output_file)
+        logger.info(f"Saved {format_name} report to: {output_file}")
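
The class added above is config-driven: EvaluationPipeline.from_config() reads the task name, metric list, and report settings from an ExperimentConfig, and evaluate() pulls predictions and references off an ExperimentResult using the dict keys shown in _extract_pred_ref(). The snippet below is a minimal usage sketch, not part of the wheel: the SimpleNamespace stand-ins, the demo_run_001 id, the ./reports path, and the sample records are illustrative assumptions about objects that, in the real package, come from sage.benchmark.benchmark_agent.experiments.base_experiment.

# Minimal usage sketch (illustrative; not shipped in the wheel).
# The SimpleNamespace stand-ins expose only the attributes the pipeline reads;
# the real ExperimentConfig / ExperimentResult classes live in
# sage.benchmark.benchmark_agent.experiments.base_experiment.
from types import SimpleNamespace

from sage.benchmark.benchmark_agent.evaluation.evaluator import EvaluationPipeline

config = SimpleNamespace(
    experiment="timing_detection",        # selects TimingAnalyzer in from_config()
    metrics=[],                           # metric names are resolved by load_metrics()
    report=SimpleNamespace(format=["json", "markdown"], path="./reports"),
)

result = SimpleNamespace(
    task="timing_detection",
    experiment_id="demo_run_001",
    predictions=[{"should_call_tool": True}, {"should_call_tool": False}],
    references=[{"should_call_tool": True}, {"should_call_tool": True}],
    metadata={},
)

pipeline = EvaluationPipeline.from_config(config)
report = pipeline.evaluate(result, config)   # report files land under ./reports/demo_run_001/
print(report.metrics, list(report.breakdowns))

With metrics=[] the metric loop is skipped, so the report carries only the TimingAnalyzer breakdown; a real run would pass whatever metric names the package's load_metrics() registry recognizes.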