isage-benchmark-agent 0.1.0.1__cp311-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isage_benchmark_agent-0.1.0.1.dist-info/METADATA +91 -0
- isage_benchmark_agent-0.1.0.1.dist-info/RECORD +51 -0
- isage_benchmark_agent-0.1.0.1.dist-info/WHEEL +5 -0
- isage_benchmark_agent-0.1.0.1.dist-info/entry_points.txt +2 -0
- isage_benchmark_agent-0.1.0.1.dist-info/licenses/LICENSE +21 -0
- isage_benchmark_agent-0.1.0.1.dist-info/top_level.txt +1 -0
- sage/__init__.py +0 -0
- sage/benchmark/__init__.py +0 -0
- sage/benchmark/benchmark_agent/__init__.py +108 -0
- sage/benchmark/benchmark_agent/__main__.py +177 -0
- sage/benchmark/benchmark_agent/acebench_loader.py +369 -0
- sage/benchmark/benchmark_agent/adapter_registry.py +3036 -0
- sage/benchmark/benchmark_agent/config/config_loader.py +176 -0
- sage/benchmark/benchmark_agent/config/default_config.yaml +24 -0
- sage/benchmark/benchmark_agent/config/planning_exp.yaml +34 -0
- sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml +34 -0
- sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml +32 -0
- sage/benchmark/benchmark_agent/data_paths.py +332 -0
- sage/benchmark/benchmark_agent/evaluation/__init__.py +217 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py +11 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.py +111 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.py +135 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.py +124 -0
- sage/benchmark/benchmark_agent/evaluation/evaluator.py +228 -0
- sage/benchmark/benchmark_agent/evaluation/metrics.py +650 -0
- sage/benchmark/benchmark_agent/evaluation/report_builder.py +217 -0
- sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.py +602 -0
- sage/benchmark/benchmark_agent/experiments/__init__.py +63 -0
- sage/benchmark/benchmark_agent/experiments/base_experiment.py +263 -0
- sage/benchmark/benchmark_agent/experiments/method_comparison.py +742 -0
- sage/benchmark/benchmark_agent/experiments/planning_exp.py +262 -0
- sage/benchmark/benchmark_agent/experiments/timing_detection_exp.py +198 -0
- sage/benchmark/benchmark_agent/experiments/tool_selection_exp.py +250 -0
- sage/benchmark/benchmark_agent/scripts/__init__.py +26 -0
- sage/benchmark/benchmark_agent/scripts/experiments/__init__.py +40 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.py +425 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.py +400 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.py +439 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.py +565 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.py +406 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.py +315 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.py +344 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.py +270 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.py +620 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.py +427 -0
- sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.py +677 -0
- sage/benchmark/benchmark_agent/scripts/experiments/llm_service.py +332 -0
- sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.py +627 -0
- sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.py +422 -0
- sage/benchmark/benchmark_agent/scripts/experiments/table_generator.py +430 -0
- sage/benchmark/benchmark_agent/tools_loader.py +212 -0
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base Experiment Framework for Agent Benchmarking
|
|
3
|
+
|
|
4
|
+
This module provides the abstract base class and configuration models for
|
|
5
|
+
agent capability experiments (tool selection, planning, timing judgment).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import os
from abc import ABC, abstractmethod
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Literal, Optional

from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ReportConfig(BaseModel):
    """Settings that control how experiment reports are produced."""

    # Output formats to emit; pydantic deep-copies this default per instance.
    format: list[Literal["json", "markdown"]] = Field(default=["json", "markdown"])
    # Whether per-category breakdown tables are included in the report.
    include_breakdowns: bool = Field(default=True)
    # Destination directory; may contain a ${PROJECT_ROOT} placeholder.
    path: str = Field(default="${PROJECT_ROOT}/outputs/agent_benchmark")
    # Optional custom template used for markdown rendering.
    markdown_template: Optional[str] = Field(default=None)

    @field_validator("path")
    @classmethod
    def resolve_path(cls, v):
        """Expand the ${PROJECT_ROOT} placeholder from the environment (cwd fallback)."""
        placeholder = "${PROJECT_ROOT}"
        if placeholder not in v:
            return v
        root = os.environ.get("PROJECT_ROOT", os.getcwd())
        return v.replace(placeholder, root)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class ExperimentConfig(BaseModel):
    """Base configuration for all experiments.

    Subclasses (ToolSelectionConfig, PlanningConfig, TimingDetectionConfig)
    narrow the ``experiment`` discriminator and add strategy-specific fields.
    """

    # Discriminator selecting the experiment type; subclasses pin it to one value.
    experiment: Literal["tool_selection", "planning", "timing_detection"]
    profile: str = Field(default="quick_eval", description="DataManager usage profile")
    # Which dataset split the experiment draws samples from.
    split: Literal["train", "dev", "test"] = Field(default="dev")
    # Metric names to compute; empty list means the experiment's defaults apply.
    metrics: list[str] = Field(default_factory=list)
    # Report-generation settings (format, output path, breakdowns).
    report: ReportConfig = Field(default_factory=ReportConfig)
    max_samples: Optional[int] = Field(default=None, description="Limit number of samples")
    seed: int = Field(default=42, description="Random seed for reproducibility")

    # Allow extra keys so experiment-specific YAML options pass through validation.
    model_config = ConfigDict(extra="allow")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class ToolSelectionConfig(ExperimentConfig):
    """Configuration for tool selection experiments."""

    # Discriminator fixed to the tool-selection task.
    experiment: Literal["tool_selection"] = "tool_selection"
    selector: str = Field(default="baseline.keyword", description="Selector strategy name")
    top_k: int = Field(default=5, description="Number of tools to select")
    # Extra keyword arguments forwarded to the selector strategy.
    selector_params: dict[str, Any] = Field(default_factory=dict)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class PlanningConfig(ExperimentConfig):
    """Configuration for task planning experiments."""

    # Discriminator fixed to the planning task.
    experiment: Literal["planning"] = "planning"
    planner: str = Field(default="baseline.template", description="Planning strategy name")
    # Bounds on the number of plan steps the planner may produce.
    min_steps: int = Field(default=5)
    max_steps: int = Field(default=10)
    # Extra keyword arguments forwarded to the planner strategy.
    planner_params: dict[str, Any] = Field(default_factory=dict)
    verbose: bool = Field(default=False, description="Enable verbose output")
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class TimingDetectionConfig(ExperimentConfig):
    """Configuration for timing judgment experiments."""

    # Discriminator fixed to the timing-detection task.
    experiment: Literal["timing_detection"] = "timing_detection"
    detector: str = Field(default="baseline.threshold", description="Timing detector strategy")
    threshold: float = Field(default=0.5, description="Decision threshold")
    # Extra keyword arguments forwarded to the detector strategy.
    detector_params: dict[str, Any] = Field(default_factory=dict)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class ToolPrediction(BaseModel):
    """Prediction for tool selection."""

    # Identifier of the predicted tool.
    tool_id: str
    # Prediction score, constrained by pydantic to the [0, 1] range.
    score: float = Field(ge=0.0, le=1.0)
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class PlanStep(BaseModel):
    """Single step in a planning prediction."""

    # Position of this step within the plan.
    step_id: int
    # Human-readable description of what the step does.
    description: str
    # Tool invoked at this step; must align with PlanningPrediction.tool_sequence.
    tool_id: str
    # Confidence in this step, constrained to [0, 1]; defaults to certain.
    confidence: float = Field(default=1.0, ge=0.0, le=1.0)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class PlanningPrediction(BaseModel):
    """Prediction for task planning: ordered steps plus the flat tool sequence."""

    # Ordered plan steps; validated before tool_sequence (declaration order).
    steps: list[PlanStep]
    # Flat list of tool ids; must equal [step.tool_id for step in steps].
    tool_sequence: list[str]

    @field_validator("tool_sequence")
    @classmethod
    def validate_sequence(cls, v, info):
        """Reject a tool_sequence that disagrees with the per-step tool ids."""
        # info.data lacks "steps" if that field failed validation; skip then.
        validated_steps = info.data.get("steps")
        if validated_steps is not None:
            expected = [step.tool_id for step in validated_steps]
            if expected != v:
                raise ValueError("tool_sequence must match step tool_ids")
        return v
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class TimingDecision(BaseModel):
    """Decision for timing judgment."""

    # Whether the agent should invoke a tool at this point.
    should_call_tool: bool
    # Confidence in the decision, constrained to [0, 1].
    confidence: float = Field(ge=0.0, le=1.0)
    # Optional free-text justification for the decision.
    reasoning: Optional[str] = Field(default=None)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class ExperimentResult(BaseModel):
    """Result of an experiment run.

    Bundles the predictions and references produced by an experiment together
    with the configuration that generated them, so a run can be analyzed or
    re-reported from its serialized result alone.
    """

    # Task type this result belongs to (mirrors ExperimentConfig.experiment).
    task: Literal["tool_selection", "planning", "timing_detection"]
    # Unique run identifier (experiment name + timestamp, set by BaseExperiment).
    experiment_id: str
    # Dump of the config used for this run.
    config: dict[str, Any]
    # Per-sample predictions and matching ground-truth references.
    predictions: list[dict[str, Any]]
    references: list[dict[str, Any]]
    metadata: dict[str, Any] = Field(default_factory=dict)
    # datetime.utcnow() is deprecated since Python 3.12 and yields a naive
    # datetime; use an explicit timezone-aware UTC timestamp instead (the
    # isoformat string now carries a "+00:00" offset, still ISO 8601).
    timestamp: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )

    # Allow extra keys so experiments can attach ad-hoc result fields.
    model_config = ConfigDict(extra="allow")
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
class BaseExperiment(ABC):
    """
    Abstract base class for agent capability experiments.

    Defines the lifecycle and interface for running experiments:
    1. prepare() - Load data and initialize components
    2. run() - Execute experiment and collect results
    3. finalize() - Cleanup and save artifacts

    Subclasses must implement the run() method.
    """

    def __init__(
        self, config: ExperimentConfig, data_manager: Any = None, adapter_registry: Any = None
    ):
        """
        Initialize experiment.

        Args:
            config: Experiment configuration
            data_manager: DataManager instance for data loading
            adapter_registry: Registry for strategy adapters
        """
        run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        self.config = config
        self.dm = data_manager
        self.adapter_registry = adapter_registry
        # Unique id combining the task name with the wall-clock start time.
        self.experiment_id = f"{config.experiment}_{run_stamp}"

        # Populated lazily by subclasses (typically during prepare()).
        self.benchmark_loader = None
        self.tools_loader = None
        self.strategy = None

    def prepare(self):
        """
        Prepare experiment: load data, initialize strategy.

        This method loads the benchmark data and tool catalog from DataManager,
        and initializes the strategy adapter based on config.

        Raises:
            ValueError: If no DataManager was supplied at construction time.
        """
        if self.dm is None:
            raise ValueError("DataManager is required for data loading")
        # Actual data loading is experiment-specific and left to subclasses.

    @abstractmethod
    def run(self) -> ExperimentResult:
        """
        Run the experiment and return results.

        Returns:
            ExperimentResult containing predictions, references, and metadata
        """

    def finalize(self):
        """
        Finalize experiment: save artifacts, cleanup resources.

        Default implementation only ensures the report output directory
        exists; subclasses can override to add custom cleanup logic.
        """
        Path(self.config.report.path).mkdir(parents=True, exist_ok=True)

    def run_iteration(self, sample: Any) -> Any:
        """
        Process a single sample (template method).

        Args:
            sample: A single benchmark sample

        Returns:
            Prediction for this sample

        Raises:
            NotImplementedError: Always, unless a subclass overrides this hook.
        """
        raise NotImplementedError("run_iteration not implemented")

    def _create_result(
        self,
        predictions: list[dict[str, Any]],
        references: list[dict[str, Any]],
        metadata: Optional[dict[str, Any]] = None,
    ) -> ExperimentResult:
        """
        Create ExperimentResult from predictions and references.

        Args:
            predictions: List of prediction dicts
            references: List of reference/ground truth dicts
            metadata: Optional metadata dict

        Returns:
            ExperimentResult object
        """
        result_metadata = metadata if metadata else {}
        return ExperimentResult(
            task=self.config.experiment,
            experiment_id=self.experiment_id,
            config=self.config.model_dump(),
            predictions=predictions,
            references=references,
            metadata=result_metadata,
        )
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
# Config type mapping for dynamic loading
# Maps the "experiment" discriminator value to its typed config class;
# consumed by create_config() below.
CONFIG_TYPES: dict[str, type[ExperimentConfig]] = {
    "tool_selection": ToolSelectionConfig,
    "planning": PlanningConfig,
    "timing_detection": TimingDetectionConfig,
}
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def create_config(config_dict: dict[str, Any]) -> ExperimentConfig:
    """
    Create appropriate config object from dictionary.

    Args:
        config_dict: Configuration dictionary (from YAML)

    Returns:
        Typed ExperimentConfig subclass instance

    Raises:
        ValueError: If the "experiment" key is missing or unrecognized.
    """
    experiment_type = config_dict.get("experiment")
    config_class = CONFIG_TYPES.get(experiment_type)
    if config_class is None:
        raise ValueError(f"Unknown experiment type: {experiment_type}")
    return config_class(**config_dict)
|