isage_benchmark_agent-0.1.0.1-cp311-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. isage_benchmark_agent-0.1.0.1.dist-info/METADATA +91 -0
  2. isage_benchmark_agent-0.1.0.1.dist-info/RECORD +51 -0
  3. isage_benchmark_agent-0.1.0.1.dist-info/WHEEL +5 -0
  4. isage_benchmark_agent-0.1.0.1.dist-info/entry_points.txt +2 -0
  5. isage_benchmark_agent-0.1.0.1.dist-info/licenses/LICENSE +21 -0
  6. isage_benchmark_agent-0.1.0.1.dist-info/top_level.txt +1 -0
  7. sage/__init__.py +0 -0
  8. sage/benchmark/__init__.py +0 -0
  9. sage/benchmark/benchmark_agent/__init__.py +108 -0
  10. sage/benchmark/benchmark_agent/__main__.py +177 -0
  11. sage/benchmark/benchmark_agent/acebench_loader.py +369 -0
  12. sage/benchmark/benchmark_agent/adapter_registry.py +3036 -0
  13. sage/benchmark/benchmark_agent/config/config_loader.py +176 -0
  14. sage/benchmark/benchmark_agent/config/default_config.yaml +24 -0
  15. sage/benchmark/benchmark_agent/config/planning_exp.yaml +34 -0
  16. sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml +34 -0
  17. sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml +32 -0
  18. sage/benchmark/benchmark_agent/data_paths.py +332 -0
  19. sage/benchmark/benchmark_agent/evaluation/__init__.py +217 -0
  20. sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py +11 -0
  21. sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.py +111 -0
  22. sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.py +135 -0
  23. sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.py +124 -0
  24. sage/benchmark/benchmark_agent/evaluation/evaluator.py +228 -0
  25. sage/benchmark/benchmark_agent/evaluation/metrics.py +650 -0
  26. sage/benchmark/benchmark_agent/evaluation/report_builder.py +217 -0
  27. sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.py +602 -0
  28. sage/benchmark/benchmark_agent/experiments/__init__.py +63 -0
  29. sage/benchmark/benchmark_agent/experiments/base_experiment.py +263 -0
  30. sage/benchmark/benchmark_agent/experiments/method_comparison.py +742 -0
  31. sage/benchmark/benchmark_agent/experiments/planning_exp.py +262 -0
  32. sage/benchmark/benchmark_agent/experiments/timing_detection_exp.py +198 -0
  33. sage/benchmark/benchmark_agent/experiments/tool_selection_exp.py +250 -0
  34. sage/benchmark/benchmark_agent/scripts/__init__.py +26 -0
  35. sage/benchmark/benchmark_agent/scripts/experiments/__init__.py +40 -0
  36. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.py +425 -0
  37. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.py +400 -0
  38. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.py +439 -0
  39. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.py +565 -0
  40. sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.py +406 -0
  41. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.py +315 -0
  42. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.py +344 -0
  43. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.py +270 -0
  44. sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.py +620 -0
  45. sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.py +427 -0
  46. sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.py +677 -0
  47. sage/benchmark/benchmark_agent/scripts/experiments/llm_service.py +332 -0
  48. sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.py +627 -0
  49. sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.py +422 -0
  50. sage/benchmark/benchmark_agent/scripts/experiments/table_generator.py +430 -0
  51. sage/benchmark/benchmark_agent/tools_loader.py +212 -0
@@ -0,0 +1,228 @@
+"""
+Evaluation pipeline orchestrator.
+
+Coordinates metrics computation, analysis, and report generation.
+"""
+
+import logging
+from datetime import datetime
+from pathlib import Path
+from typing import Any, Optional
+
+from ..experiments.base_experiment import ExperimentConfig, ExperimentResult
+from . import EvaluationReport
+from .analyzers import PlanningAnalyzer, TimingAnalyzer, ToolSelectionAnalyzer
+from .metrics import load_metrics
+from .report_builder import create_report_builders
+
+logger = logging.getLogger(__name__)
+
+
+class EvaluationPipeline:
+    """
+    Pipeline for evaluating experiment results.
+
+    Orchestrates:
+    1. Metric computation
+    2. Detailed analysis
+    3. Report generation
+    """
+
+    def __init__(self, metrics: list[Any], analyzers: list[Any], report_builders: dict[str, Any]):
+        """
+        Initialize pipeline.
+
+        Args:
+            metrics: List of metric instances
+            analyzers: List of analyzer instances
+            report_builders: Dict mapping format to builder instance
+        """
+        self.metrics = metrics
+        self.analyzers = analyzers
+        self.report_builders = report_builders
+
+    @classmethod
+    def from_config(
+        cls, config: ExperimentConfig, metrics_registry: Any = None
+    ) -> "EvaluationPipeline":
+        """
+        Create pipeline from experiment config.
+
+        Args:
+            config: Experiment configuration
+            metrics_registry: Optional custom metrics registry
+
+        Returns:
+            Initialized EvaluationPipeline
+        """
+        # Load metrics
+        metric_names = config.metrics if config.metrics else []
+        metrics = load_metrics(metric_names)
+
+        # Create task-specific analyzer
+        task = config.experiment
+        analyzers: list[ToolSelectionAnalyzer | PlanningAnalyzer | TimingAnalyzer] = []
+
+        if task == "tool_selection":
+            analyzers.append(ToolSelectionAnalyzer())
+        elif task == "planning":
+            analyzers.append(PlanningAnalyzer())
+        elif task == "timing_detection":
+            analyzers.append(TimingAnalyzer())
+
+        # Create report builders
+        report_formats: list[str] = list(
+            config.report.format if hasattr(config.report, "format") else ["json", "markdown"]
+        )
+        builders = create_report_builders(report_formats)
+
+        return cls(metrics=metrics, analyzers=analyzers, report_builders=builders)
+
+    def evaluate(self, result: ExperimentResult, config: ExperimentConfig) -> EvaluationReport:
+        """
+        Evaluate experiment result.
+
+        Args:
+            result: Experiment result to evaluate
+            config: Experiment configuration
+
+        Returns:
+            EvaluationReport with metrics and analysis
+        """
+        logger.info(f"Evaluating experiment: {result.experiment_id}")
+
+        # Extract predictions and references based on task type
+        predictions, references = self._extract_pred_ref(result)
+
+        # Compute metrics
+        logger.info(f"Computing {len(self.metrics)} metrics...")
+        metric_results = {}
+        metric_details = {}
+
+        for metric in self.metrics:
+            try:
+                logger.debug(f"Computing metric: {metric.name}")
+                output = metric.compute(predictions, references)
+                metric_results[metric.name] = output.value
+                metric_details[metric.name] = output.details
+            except Exception as e:
+                logger.error(f"Error computing metric {metric.name}: {e}")
+                metric_results[metric.name] = 0.0
+                metric_details[metric.name] = {"error": str(e)}
+
+        # Run analyzers
+        logger.info(f"Running {len(self.analyzers)} analyzers...")
+        breakdowns = {}
+
+        for analyzer in self.analyzers:
+            try:
+                logger.debug(f"Running analyzer: {analyzer.name}")
+                analysis = analyzer.analyze(predictions, references, result.metadata)
+                breakdowns[analyzer.name] = analysis
+            except Exception as e:
+                logger.error(f"Error in analyzer {analyzer.name}: {e}")
+                breakdowns[analyzer.name] = {"error": str(e)}
+
+        # Add metric details to breakdowns
+        if metric_details:
+            breakdowns["metric_details"] = metric_details
+
+        # Create evaluation report
+        report = EvaluationReport(
+            task=result.task,
+            experiment_id=result.experiment_id,
+            metrics=metric_results,
+            breakdowns=breakdowns,
+            artifacts={},
+            timestamp=datetime.utcnow().isoformat(),
+        )
+
+        # Generate and save reports
+        output_dir = Path(config.report.path) / result.experiment_id
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        logger.info(f"Generating reports in {output_dir}")
+        for format_name, builder in self.report_builders.items():
+            try:
+                output_file = output_dir / f"evaluation_report.{format_name}"
+                saved_path = builder.build(report, output_file)
+                report.artifacts[f"report_{format_name}"] = saved_path
+                logger.info(f"Saved {format_name} report to: {saved_path}")
+            except Exception as e:
+                logger.error(f"Error building {format_name} report: {e}")
+
+        logger.info(f"Evaluation complete. Overall metrics: {metric_results}")
+
+        return report
+
+    def _extract_pred_ref(self, result: ExperimentResult):
+        """
+        Extract predictions and references from result.
+
+        Args:
+            result: ExperimentResult
+
+        Returns:
+            Tuple of (predictions, references)
+        """
+        task = result.task
+
+        if task == "tool_selection":
+            # Extract tool ID lists
+            predictions = []
+            for pred in result.predictions:
+                pred_tools = pred.get("predicted_tools", pred.get("tools", []))
+                # Handle both list of dicts and list of strings
+                if pred_tools and isinstance(pred_tools[0], dict):
+                    pred_tools = [t.get("tool_id", t.get("id")) for t in pred_tools]
+                predictions.append(pred_tools)
+
+            references = []
+            for ref in result.references:
+                ref_tools = ref.get("ground_truth_tools", ref.get("tools", ref.get("top_k", [])))
+                references.append(ref_tools)
+
+        elif task == "planning":
+            # Extract tool sequences
+            predictions = [
+                pred.get("tool_sequence", pred.get("steps", [])) for pred in result.predictions
+            ]
+            references = [
+                ref.get("tool_sequence", ref.get("steps", [])) for ref in result.references
+            ]
+
+        elif task == "timing_detection":
+            # Extract boolean decisions
+            predictions = [
+                pred.get("should_call_tool", pred.get("decision", False))
+                for pred in result.predictions
+            ]
+            references = [
+                ref.get("should_call_tool", ref.get("decision", False)) for ref in result.references
+            ]
+
+        else:
+            raise ValueError(f"Unknown task type: {task}")
+
+        return predictions, references
+
+
+def save_report(report: EvaluationReport, output_dir: Path, formats: Optional[list[str]] = None):
+    """
+    Save evaluation report to files.
+
+    Args:
+        report: EvaluationReport to save
+        output_dir: Directory to save reports
+        formats: List of formats to save (default: ["json", "markdown"])
+    """
+    formats = formats or ["json", "markdown"]
+    output_dir = Path(output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    builders = create_report_builders(formats)
+
+    for format_name, builder in builders.items():
+        output_file = output_dir / f"evaluation_report.{format_name}"
+        builder.build(report, output_file)
+        logger.info(f"Saved {format_name} report to: {output_file}")
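For orientation, a minimal driver sketch (not part of the package) showing how the evaluator in this diff might be wired together. Only EvaluationPipeline.from_config, evaluate, and save_report come from the file above; the evaluate_run function, the archive path, and the assumption that ExperimentConfig/ExperimentResult instances arrive from the experiments module and that EvaluationReport exposes its fields as attributes are illustrative.

# Hypothetical usage sketch; names not shown in the diff are assumptions.
from pathlib import Path

from sage.benchmark.benchmark_agent.evaluation.evaluator import (
    EvaluationPipeline,
    save_report,
)
from sage.benchmark.benchmark_agent.experiments.base_experiment import (
    ExperimentConfig,
    ExperimentResult,
)


def evaluate_run(config: ExperimentConfig, result: ExperimentResult) -> None:
    # Builds metrics, one task-specific analyzer, and report builders from the config.
    pipeline = EvaluationPipeline.from_config(config)

    # Computes metrics, runs analyzers, and writes reports under
    # <config.report.path>/<result.experiment_id>/evaluation_report.<format>.
    report = pipeline.evaluate(result, config)

    # Optionally re-export the same report elsewhere (illustrative path).
    save_report(report, Path("archive") / result.experiment_id, formats=["json"])

    print(report.metrics)

Note that failures inside individual metrics or analyzers are caught within evaluate(): a failing metric is reported as 0.0 with an "error" entry in the breakdowns instead of aborting the run.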