isage-benchmark-agent 0.1.0.1 (cp311-none-any.whl)

Files changed (51)
  1. isage_benchmark_agent-0.1.0.1.dist-info/METADATA +91 -0
  2. isage_benchmark_agent-0.1.0.1.dist-info/RECORD +51 -0
  3. isage_benchmark_agent-0.1.0.1.dist-info/WHEEL +5 -0
  4. isage_benchmark_agent-0.1.0.1.dist-info/entry_points.txt +2 -0
  5. isage_benchmark_agent-0.1.0.1.dist-info/licenses/LICENSE +21 -0
  6. isage_benchmark_agent-0.1.0.1.dist-info/top_level.txt +1 -0
  7. sage/__init__.py +0 -0
  8. sage/benchmark/__init__.py +0 -0
  9. sage/benchmark/benchmark_agent/__init__.py +108 -0
  10. sage/benchmark/benchmark_agent/__main__.py +177 -0
  11. sage/benchmark/benchmark_agent/acebench_loader.py +369 -0
  12. sage/benchmark/benchmark_agent/adapter_registry.py +3036 -0
  13. sage/benchmark/benchmark_agent/config/config_loader.py +176 -0
  14. sage/benchmark/benchmark_agent/config/default_config.yaml +24 -0
  15. sage/benchmark/benchmark_agent/config/planning_exp.yaml +34 -0
  16. sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml +34 -0
  17. sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml +32 -0
  18. sage/benchmark/benchmark_agent/data_paths.py +332 -0
  19. sage/benchmark/benchmark_agent/evaluation/__init__.py +217 -0
  20. sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py +11 -0
  21. sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.py +111 -0
  22. sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.py +135 -0
  23. sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.py +124 -0
  24. sage/benchmark/benchmark_agent/evaluation/evaluator.py +228 -0
  25. sage/benchmark/benchmark_agent/evaluation/metrics.py +650 -0
  26. sage/benchmark/benchmark_agent/evaluation/report_builder.py +217 -0
  27. sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.py +602 -0
  28. sage/benchmark/benchmark_agent/experiments/__init__.py +63 -0
  29. sage/benchmark/benchmark_agent/experiments/base_experiment.py +263 -0
  30. sage/benchmark/benchmark_agent/experiments/method_comparison.py +742 -0
  31. sage/benchmark/benchmark_agent/experiments/planning_exp.py +262 -0
  32. sage/benchmark/benchmark_agent/experiments/timing_detection_exp.py +198 -0
  33. sage/benchmark/benchmark_agent/experiments/tool_selection_exp.py +250 -0
  34. sage/benchmark/benchmark_agent/scripts/__init__.py +26 -0
  35. sage/benchmark/benchmark_agent/scripts/experiments/__init__.py +40 -0
  36. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.py +425 -0
  37. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.py +400 -0
  38. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.py +439 -0
  39. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.py +565 -0
  40. sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.py +406 -0
  41. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.py +315 -0
  42. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.py +344 -0
  43. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.py +270 -0
  44. sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.py +620 -0
  45. sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.py +427 -0
  46. sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.py +677 -0
  47. sage/benchmark/benchmark_agent/scripts/experiments/llm_service.py +332 -0
  48. sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.py +627 -0
  49. sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.py +422 -0
  50. sage/benchmark/benchmark_agent/scripts/experiments/table_generator.py +430 -0
  51. sage/benchmark/benchmark_agent/tools_loader.py +212 -0
sage/benchmark/benchmark_agent/experiments/base_experiment.py
@@ -0,0 +1,263 @@
+ """
+ Base Experiment Framework for Agent Benchmarking
+
+ This module provides the abstract base class and configuration models for
+ agent capability experiments (tool selection, planning, timing judgment).
+ """
+
+ import os
+ from abc import ABC, abstractmethod
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Any, Literal, Optional
+
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+
+ class ReportConfig(BaseModel):
+     """Configuration for experiment report generation."""
+
+     format: list[Literal["json", "markdown"]] = Field(default=["json", "markdown"])
+     include_breakdowns: bool = Field(default=True)
+     path: str = Field(default="${PROJECT_ROOT}/outputs/agent_benchmark")
+     markdown_template: Optional[str] = Field(default=None)
+
+     @field_validator("path")
+     @classmethod
+     def resolve_path(cls, v):
+         """Resolve environment variables in path."""
+         if "${PROJECT_ROOT}" in v:
+             project_root = os.environ.get("PROJECT_ROOT", os.getcwd())
+             v = v.replace("${PROJECT_ROOT}", project_root)
+         return v
+
+
+ class ExperimentConfig(BaseModel):
+     """Base configuration for all experiments."""
+
+     experiment: Literal["tool_selection", "planning", "timing_detection"]
+     profile: str = Field(default="quick_eval", description="DataManager usage profile")
+     split: Literal["train", "dev", "test"] = Field(default="dev")
+     metrics: list[str] = Field(default_factory=list)
+     report: ReportConfig = Field(default_factory=ReportConfig)
+     max_samples: Optional[int] = Field(default=None, description="Limit number of samples")
+     seed: int = Field(default=42, description="Random seed for reproducibility")
+
+     model_config = ConfigDict(extra="allow")
+
+
+ class ToolSelectionConfig(ExperimentConfig):
+     """Configuration for tool selection experiments."""
+
+     experiment: Literal["tool_selection"] = "tool_selection"
+     selector: str = Field(default="baseline.keyword", description="Selector strategy name")
+     top_k: int = Field(default=5, description="Number of tools to select")
+     selector_params: dict[str, Any] = Field(default_factory=dict)
+
+
+ class PlanningConfig(ExperimentConfig):
+     """Configuration for task planning experiments."""
+
+     experiment: Literal["planning"] = "planning"
+     planner: str = Field(default="baseline.template", description="Planning strategy name")
+     min_steps: int = Field(default=5)
+     max_steps: int = Field(default=10)
+     planner_params: dict[str, Any] = Field(default_factory=dict)
+     verbose: bool = Field(default=False, description="Enable verbose output")
+
+
+ class TimingDetectionConfig(ExperimentConfig):
+     """Configuration for timing judgment experiments."""
+
+     experiment: Literal["timing_detection"] = "timing_detection"
+     detector: str = Field(default="baseline.threshold", description="Timing detector strategy")
+     threshold: float = Field(default=0.5, description="Decision threshold")
+     detector_params: dict[str, Any] = Field(default_factory=dict)
+
+
+ class ToolPrediction(BaseModel):
+     """Prediction for tool selection."""
+
+     tool_id: str
+     score: float = Field(ge=0.0, le=1.0)
+
+
+ class PlanStep(BaseModel):
+     """Single step in a planning prediction."""
+
+     step_id: int
+     description: str
+     tool_id: str
+     confidence: float = Field(default=1.0, ge=0.0, le=1.0)
+
+
+ class PlanningPrediction(BaseModel):
+     """Prediction for task planning."""
+
+     steps: list[PlanStep]
+     tool_sequence: list[str]
+
+     @field_validator("tool_sequence")
+     @classmethod
+     def validate_sequence(cls, v, info):
+         """Ensure tool_sequence matches steps."""
+         if "steps" in info.data:
+             step_tools = [step.tool_id for step in info.data["steps"]]
+             if v != step_tools:
+                 raise ValueError("tool_sequence must match step tool_ids")
+         return v
+
+
+ class TimingDecision(BaseModel):
+     """Decision for timing judgment."""
+
+     should_call_tool: bool
+     confidence: float = Field(ge=0.0, le=1.0)
+     reasoning: Optional[str] = Field(default=None)
+
+
+ class ExperimentResult(BaseModel):
+     """Result of an experiment run."""
+
+     task: Literal["tool_selection", "planning", "timing_detection"]
+     experiment_id: str
+     config: dict[str, Any]
+     predictions: list[dict[str, Any]]
+     references: list[dict[str, Any]]
+     metadata: dict[str, Any] = Field(default_factory=dict)
+     timestamp: str = Field(default_factory=lambda: datetime.utcnow().isoformat())
+
+     model_config = ConfigDict(extra="allow")
+
+
+ class BaseExperiment(ABC):
+     """
+     Abstract base class for agent capability experiments.
+
+     Defines the lifecycle and interface for running experiments:
+     1. prepare() - Load data and initialize components
+     2. run() - Execute experiment and collect results
+     3. finalize() - Cleanup and save artifacts
+
+     Subclasses must implement the run() method.
+     """
+
+     def __init__(
+         self, config: ExperimentConfig, data_manager: Any = None, adapter_registry: Any = None
+     ):
+         """
+         Initialize experiment.
+
+         Args:
+             config: Experiment configuration
+             data_manager: DataManager instance for data loading
+             adapter_registry: Registry for strategy adapters
+         """
+         self.config = config
+         self.dm = data_manager
+         self.adapter_registry = adapter_registry
+         self.experiment_id = f"{config.experiment}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+
+         # Will be populated by subclasses
+         self.benchmark_loader = None
+         self.tools_loader = None
+         self.strategy = None
+
+     def prepare(self):
+         """
+         Prepare experiment: load data, initialize strategy.
+
+         This method loads the benchmark data and tool catalog from DataManager,
+         and initializes the strategy adapter based on config.
+         """
+         if self.dm is None:
+             raise ValueError("DataManager is required for data loading")
+
+         # Load data sources through DataManager
+         # This will be implemented by subclasses based on their needs
+         pass
+
+     @abstractmethod
+     def run(self) -> ExperimentResult:
+         """
+         Run the experiment and return results.
+
+         Returns:
+             ExperimentResult containing predictions, references, and metadata
+         """
+         pass
+
+     def finalize(self):
+         """
+         Finalize experiment: save artifacts, cleanup resources.
+
+         Default implementation creates output directory.
+         Subclasses can override to add custom cleanup logic.
+         """
+         output_dir = Path(self.config.report.path)
+         output_dir.mkdir(parents=True, exist_ok=True)
+
+     def run_iteration(self, sample: Any) -> Any:
+         """
+         Process a single sample (template method).
+
+         Args:
+             sample: A single benchmark sample
+
+         Returns:
+             Prediction for this sample
+         """
+         # To be implemented by subclasses if needed
+         raise NotImplementedError("run_iteration not implemented")
+
+     def _create_result(
+         self,
+         predictions: list[dict[str, Any]],
+         references: list[dict[str, Any]],
+         metadata: Optional[dict[str, Any]] = None,
+     ) -> ExperimentResult:
+         """
+         Create ExperimentResult from predictions and references.
+
+         Args:
+             predictions: List of prediction dicts
+             references: List of reference/ground truth dicts
+             metadata: Optional metadata dict
+
+         Returns:
+             ExperimentResult object
+         """
+         return ExperimentResult(
+             task=self.config.experiment,
+             experiment_id=self.experiment_id,
+             config=self.config.model_dump(),
+             predictions=predictions,
+             references=references,
+             metadata=metadata or {},
+         )
+
+
+ # Config type mapping for dynamic loading
+ CONFIG_TYPES = {
+     "tool_selection": ToolSelectionConfig,
+     "planning": PlanningConfig,
+     "timing_detection": TimingDetectionConfig,
+ }
+
+
+ def create_config(config_dict: dict[str, Any]) -> ExperimentConfig:
+     """
+     Create appropriate config object from dictionary.
+
+     Args:
+         config_dict: Configuration dictionary (from YAML)
+
+     Returns:
+         Typed ExperimentConfig subclass instance
+     """
+     experiment_type = config_dict.get("experiment")
+     if experiment_type not in CONFIG_TYPES:
+         raise ValueError(f"Unknown experiment type: {experiment_type}")
+
+     config_class = CONFIG_TYPES[experiment_type]
+     return config_class(**config_dict)
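
For orientation, the sketch below shows how the pieces added in this module appear intended to fit together: a BaseExperiment subclass implementing run(), and a config dict routed through create_config(). It is a minimal sketch based only on the code above; the KeywordSelectionDemo class, the placeholder DataManager object, and the inline config values are hypothetical illustrations, not code shipped in the wheel.

    from sage.benchmark.benchmark_agent.experiments.base_experiment import (
        BaseExperiment,
        ExperimentResult,
        create_config,
    )


    class KeywordSelectionDemo(BaseExperiment):
        """Hypothetical subclass used only to illustrate the lifecycle."""

        def run(self) -> ExperimentResult:
            # Toy predictions/references; a real subclass would iterate over
            # samples loaded through the DataManager passed to __init__.
            predictions = [{"tool_id": "calculator", "score": 0.9}]
            references = [{"tool_id": "calculator"}]
            return self._create_result(predictions, references, metadata={"note": "demo"})


    # A config dict shaped like the YAML files under config/ (illustrative values).
    config = create_config(
        {
            "experiment": "tool_selection",
            "split": "dev",
            "top_k": 3,
            "report": {"path": "./outputs/agent_benchmark"},
        }
    )

    exp = KeywordSelectionDemo(config, data_manager=object())  # placeholder DataManager
    exp.prepare()   # base implementation only checks that a DataManager was supplied
    result = exp.run()
    exp.finalize()  # creates the report output directory
    print(result.experiment_id, result.task)

The shipped experiment modules (planning_exp.py, timing_detection_exp.py, tool_selection_exp.py in the file list above) presumably follow this same prepare/run/finalize lifecycle with a real DataManager and adapter registry.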