isage-benchmark-agent 0.1.0.1__cp311-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. isage_benchmark_agent-0.1.0.1.dist-info/METADATA +91 -0
  2. isage_benchmark_agent-0.1.0.1.dist-info/RECORD +51 -0
  3. isage_benchmark_agent-0.1.0.1.dist-info/WHEEL +5 -0
  4. isage_benchmark_agent-0.1.0.1.dist-info/entry_points.txt +2 -0
  5. isage_benchmark_agent-0.1.0.1.dist-info/licenses/LICENSE +21 -0
  6. isage_benchmark_agent-0.1.0.1.dist-info/top_level.txt +1 -0
  7. sage/__init__.py +0 -0
  8. sage/benchmark/__init__.py +0 -0
  9. sage/benchmark/benchmark_agent/__init__.py +108 -0
  10. sage/benchmark/benchmark_agent/__main__.py +177 -0
  11. sage/benchmark/benchmark_agent/acebench_loader.py +369 -0
  12. sage/benchmark/benchmark_agent/adapter_registry.py +3036 -0
  13. sage/benchmark/benchmark_agent/config/config_loader.py +176 -0
  14. sage/benchmark/benchmark_agent/config/default_config.yaml +24 -0
  15. sage/benchmark/benchmark_agent/config/planning_exp.yaml +34 -0
  16. sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml +34 -0
  17. sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml +32 -0
  18. sage/benchmark/benchmark_agent/data_paths.py +332 -0
  19. sage/benchmark/benchmark_agent/evaluation/__init__.py +217 -0
  20. sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py +11 -0
  21. sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.py +111 -0
  22. sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.py +135 -0
  23. sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.py +124 -0
  24. sage/benchmark/benchmark_agent/evaluation/evaluator.py +228 -0
  25. sage/benchmark/benchmark_agent/evaluation/metrics.py +650 -0
  26. sage/benchmark/benchmark_agent/evaluation/report_builder.py +217 -0
  27. sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.py +602 -0
  28. sage/benchmark/benchmark_agent/experiments/__init__.py +63 -0
  29. sage/benchmark/benchmark_agent/experiments/base_experiment.py +263 -0
  30. sage/benchmark/benchmark_agent/experiments/method_comparison.py +742 -0
  31. sage/benchmark/benchmark_agent/experiments/planning_exp.py +262 -0
  32. sage/benchmark/benchmark_agent/experiments/timing_detection_exp.py +198 -0
  33. sage/benchmark/benchmark_agent/experiments/tool_selection_exp.py +250 -0
  34. sage/benchmark/benchmark_agent/scripts/__init__.py +26 -0
  35. sage/benchmark/benchmark_agent/scripts/experiments/__init__.py +40 -0
  36. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.py +425 -0
  37. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.py +400 -0
  38. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.py +439 -0
  39. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.py +565 -0
  40. sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.py +406 -0
  41. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.py +315 -0
  42. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.py +344 -0
  43. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.py +270 -0
  44. sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.py +620 -0
  45. sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.py +427 -0
  46. sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.py +677 -0
  47. sage/benchmark/benchmark_agent/scripts/experiments/llm_service.py +332 -0
  48. sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.py +627 -0
  49. sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.py +422 -0
  50. sage/benchmark/benchmark_agent/scripts/experiments/table_generator.py +430 -0
  51. sage/benchmark/benchmark_agent/tools_loader.py +212 -0
@@ -0,0 +1,91 @@
1
+ Metadata-Version: 2.4
2
+ Name: isage-benchmark-agent
3
+ Version: 0.1.0.1
4
+ Summary: SAGE Benchmark Agent - Agent capability evaluation framework
5
+ Author-email: IntelliStream Team <shuhao_zhang@hust.edu.cn>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/intellistream/sage-agent-benchmark
8
+ Project-URL: Documentation, https://github.com/intellistream/sage-agent-benchmark#readme
9
+ Project-URL: Repository, https://github.com/intellistream/sage-agent-benchmark
10
+ Project-URL: Issues, https://github.com/intellistream/sage-agent-benchmark/issues
11
+ Keywords: sage,benchmark,agent,tool-selection,planning,evaluation,intellistream
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3 :: Only
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Requires-Python: >=3.11
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: isage-common
25
+ Requires-Dist: isage-libs
26
+ Requires-Dist: pyyaml>=6.0
27
+ Requires-Dist: pandas>=2.0.0
28
+ Requires-Dist: numpy<2.3.0,>=1.26.0
29
+ Requires-Dist: typer<1.0.0,>=0.15.0
30
+ Requires-Dist: rich<14.0.0,>=13.0.0
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
33
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
34
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
35
+ Requires-Dist: black>=23.0.0; extra == "dev"
36
+ Requires-Dist: ruff==0.14.6; extra == "dev"
37
+ Requires-Dist: pre-commit>=3.0.0; extra == "dev"
38
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
39
+ Requires-Dist: types-PyYAML>=6.0.0; extra == "dev"
40
+ Provides-Extra: all
41
+ Requires-Dist: isage-benchmark-agent[dev]; extra == "all"
42
+ Dynamic: license-file
43
+
44
+ # SAGE Benchmark Agent
45
+
46
+ Configuration-driven experiment framework for evaluating agent capabilities.
47
+
48
+ ## Features
49
+
50
+ - **Tool Selection Evaluation**: Tool retrieval and ranking benchmarks
51
+ - **Planning Evaluation**: Multi-step planning with tool composition
52
+ - **Timing Detection**: Timing judgment for tool invocation decisions
53
+
54
+ ## Quick Start
55
+
56
+ ```bash
57
+ # Install
58
+ pip install isage-benchmark-agent
59
+
60
+ # Run tool selection experiment
61
+ sage-agent-bench --config config/tool_selection_exp.yaml
62
+
63
+ # Run planning experiment
64
+ sage-agent-bench --config config/planning_exp.yaml
65
+ ```
66
+
67
+ ## Documentation
68
+
69
+ See [benchmark_agent/README.md](src/sage/benchmark/benchmark_agent/README.md) for detailed documentation.
70
+
71
+ ## Development
72
+
73
+ ```bash
74
+ # Clone
75
+ git clone https://github.com/intellistream/sage-agent-benchmark.git
76
+ cd sage-agent-benchmark
77
+
78
+ # Setup virtual environment
79
+ python -m venv .venv
80
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
81
+
82
+ # Install in development mode
83
+ pip install -e ".[dev]"
84
+
85
+ # Run tests
86
+ pytest
87
+ ```
88
+
89
+ ## License
90
+
91
+ MIT License - see [LICENSE](LICENSE) for details.
@@ -0,0 +1,51 @@
1
+ isage_benchmark_agent-0.1.0.1.dist-info/licenses/LICENSE,sha256=vBNVIGkYYZY0B8f0Ui1ITYwRu7WNtSwyxvIAVGYS6jU,1075
2
+ sage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ sage/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ sage/benchmark/benchmark_agent/__init__.py,sha256=Bnlx9jxIxIq0xo1Q-3Hrr2VRJZq0g6nHhXcmffotBYc,2807
5
+ sage/benchmark/benchmark_agent/__main__.py,sha256=3yDJ34Wuk9TaNnOoKWAsZL-74H8xiw6xFTeaobN77ew,5464
6
+ sage/benchmark/benchmark_agent/acebench_loader.py,sha256=NHCuCoJXczv2OhjaSsQmSk2l8f4wc04Nyga79Wn26VQ,11571
7
+ sage/benchmark/benchmark_agent/adapter_registry.py,sha256=MjliEzacQHNiHMohpYe8AHdxIR06e2h25vDHX2yLdYo,123890
8
+ sage/benchmark/benchmark_agent/data_paths.py,sha256=qpVxRNkFe7B2ktuFvGFp30JhVCfL4YBWiLHZ3c4FjZs,9761
9
+ sage/benchmark/benchmark_agent/tools_loader.py,sha256=25OT8_8F2NyuydrmCcRkqaOWc2FneAg3mqcoOHFMlOs,6324
10
+ sage/benchmark/benchmark_agent/config/config_loader.py,sha256=a5Du-HIIMjpa9rzWdlObksrwVU3cDtLAK4QfepnbmI4,4954
11
+ sage/benchmark/benchmark_agent/config/default_config.yaml,sha256=iHNVAkou8UxcIOMBEuJheNeVI0fAlfe-BV9inzxw9_s,532
12
+ sage/benchmark/benchmark_agent/config/planning_exp.yaml,sha256=nyimmpQ725skpvEF1eNJpYudkCxYuh2hz6u2e1velNE,607
13
+ sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml,sha256=058TwEeeP8RkzjV7hUbYWFBctrzpcbAXKAlZzw6fJdI,604
14
+ sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml,sha256=dfar6m9XJ1AHsoqVqEN3iN5x8Ka2M3UpvzhpXlCkc6o,544
15
+ sage/benchmark/benchmark_agent/evaluation/__init__.py,sha256=Ypoer04sin7kTryIJWOqIImifonmXvhHWhOAwrhHK10,6615
16
+ sage/benchmark/benchmark_agent/evaluation/evaluator.py,sha256=kvve7bWrkDLfDNub-bL4EYoCRCHGkjVhNIS10_aMAGM,8003
17
+ sage/benchmark/benchmark_agent/evaluation/metrics.py,sha256=6SkPkFexcRpoUvpF7F9AWgGVVhSvazdKTZ6NlADULcU,19060
18
+ sage/benchmark/benchmark_agent/evaluation/report_builder.py,sha256=OYh2JdxsgyE9ovWS0o-DKBeoJWuPV8qx5fIVjrcwFdg,7029
19
+ sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.py,sha256=7UqR6RwyO440oO3Dto6XUnzGO2zr0l07A2cY7fQrYl8,19665
20
+ sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py,sha256=e_kzn4nWKe6l4L0OtSd4-V7Tt064bzwvlz3BsfQXBZI,282
21
+ sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.py,sha256=4RHWITAYgm4oefaPHFJaoeGW5v95_VgtLwKUP4DcBlQ,3931
22
+ sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.py,sha256=pgID5v_lcHNQi1U0NH2XJhXFkCBuAMUOITzTdLmvgl8,4808
23
+ sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.py,sha256=nWFxeGMwJO7ZOR_KB4KJAVZ5voQ_c1QFlM7PoFP1QHI,4086
24
+ sage/benchmark/benchmark_agent/experiments/__init__.py,sha256=w3aKDv8nfgXceZX81KFtOwxuRXhbETLSxUeQDD7HONY,1574
25
+ sage/benchmark/benchmark_agent/experiments/base_experiment.py,sha256=6u5kVNIDysGIck6qAbs9ihSn_bBBqiLWPn_p-EBLqEE,8519
26
+ sage/benchmark/benchmark_agent/experiments/method_comparison.py,sha256=3LP8x7PjnhssTnKabQiBK29TxkCQ0ZIqz6zJ8i33rLI,27680
27
+ sage/benchmark/benchmark_agent/experiments/planning_exp.py,sha256=-qrC6urBQTaS5YCOQjn2uWzt6KBwSRZwAmPJQ1zzwA8,9751
28
+ sage/benchmark/benchmark_agent/experiments/timing_detection_exp.py,sha256=ifkAdd7Nym21HrrbxriY3QNtRnprtft73WrlGiMdAwE,6992
29
+ sage/benchmark/benchmark_agent/experiments/tool_selection_exp.py,sha256=9-BNYZMZ_FIujuAo1cAAH6j8llwxeeThBffG9fCMtRc,9320
30
+ sage/benchmark/benchmark_agent/scripts/__init__.py,sha256=C8YqglL5eDKIyB8fKg7mC5NZJZ_Fn7LlmEFl2e8RVRw,668
31
+ sage/benchmark/benchmark_agent/scripts/experiments/__init__.py,sha256=d3VxS2Qfuz7WujRnREz25IUUOVOOatokoD5rhAExy14,1125
32
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.py,sha256=78aNKL0dhw5rDU9aiIQ80O0j7CAl0JPHYxWEFINFcOs,13238
33
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.py,sha256=kXp2e3hteVvil1nMhjBLxvJvtiV00XbgmmhqZzS5yfk,13422
34
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.py,sha256=vMmeXskW-kmWjWGr6sPGEw_Y45x9cLchKk5LMUtjyd0,14445
35
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.py,sha256=uXfzkT8HV1OQahyDJ_ns6hYJj7YxlvqjuGenmcpIIyM,17725
36
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.py,sha256=v6Ku0hgKmeUo0CcYD2UK-kjDZi0K6TmS0FwUDB2ZcHw,12179
37
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.py,sha256=uXPCFcnn1FC8DMGPvcylVNwT6tjDfEU7CJc7950pWSs,10845
38
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.py,sha256=yM1Dkxh6iguHzBKG1saABCDkr5QqslNmfPM9CxctYa4,11423
39
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.py,sha256=Q5ZvPf-MumLCIsV7Dtvb7wt1R9GA39rzcqMjhWQ9dLc,8674
40
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.py,sha256=BWgkS37KM0lUcIEs1iofQbfvrIIyzq7iQ9g2gx5vyFs,20490
41
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.py,sha256=5F8F4UvS1eTdNJ7YpDk0crT2pYzqAehX4eqEshZHxrI,12875
42
+ sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.py,sha256=hIcGCGQ28TVDb90SVLbO1MqBaZyi9RsJfpnm0kFZMV0,20136
43
+ sage/benchmark/benchmark_agent/scripts/experiments/llm_service.py,sha256=na_bR0Gvj0088GEHQzfg0xakDRv5cWX-UlLE3odbc1k,8893
44
+ sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.py,sha256=cZ1z8fYULZ2ezIg56ITBOXAGWHmwpL_qiY0CgAYs53E,20854
45
+ sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.py,sha256=m_MqNJIfC1motfodbjd1WPsv37wCTIafP5lwJUKsHrg,14950
46
+ sage/benchmark/benchmark_agent/scripts/experiments/table_generator.py,sha256=Mxqo3lmcQsEpTuaiwklQVEo3xn6G-KRVsCizbsMeJM0,12885
47
+ isage_benchmark_agent-0.1.0.1.dist-info/METADATA,sha256=GQNMGxPXYmBtSRcJ9TOu5XznKYVbQLPHTZzWmceCWPg,3016
48
+ isage_benchmark_agent-0.1.0.1.dist-info/WHEEL,sha256=yk-B4c9kYsinhQ_MzhPAVcDm9mhkAVmdo0rg0jgFCmo,94
49
+ isage_benchmark_agent-0.1.0.1.dist-info/entry_points.txt,sha256=g34HO224bwnCvIklVp2JQw4wTNDgNW7u61F-cL02pSA,82
50
+ isage_benchmark_agent-0.1.0.1.dist-info/top_level.txt,sha256=hibFyzQHiLOMK68qL1OWsNKaXOmSXqZjeLTBem6Yy7I,5
51
+ isage_benchmark_agent-0.1.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.10.2)
3
+ Root-Is-Purelib: true
4
+ Tag: cp311-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ sage-agent-bench = sage.benchmark.benchmark_agent.__main__:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 IntelliStream Team
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
sage/__init__.py ADDED
File without changes
File without changes
@@ -0,0 +1,108 @@
1
+ """
2
+ Agent Capability Benchmark Module
3
+
4
+ This module provides infrastructure for evaluating agent capabilities including:
5
+ - Tool selection
6
+ - Task planning
7
+ - Timing detection
8
+
9
+ Architecture:
10
+ config/ Configuration files and loaders
11
+ experiments/ Experiment runners and base classes
12
+ adapter_registry.py Strategy adapter registry
13
+
14
+ Usage:
15
+ # Via CLI
16
+ python -m sage.benchmark.benchmark_agent --config config/tool_selection_exp.yaml
17
+
18
+ # Programmatic
19
+ from sage.benchmark.benchmark_agent import ToolSelectionExperiment
20
+ from sage.benchmark.benchmark_agent.config.config_loader import ConfigLoader
21
+ from sage.benchmark.benchmark_agent.adapter_registry import get_adapter_registry
22
+ from sage.data import DataManager
23
+
24
+ loader = ConfigLoader()
25
+ config = loader.load_config("config/tool_selection_exp.yaml")
26
+
27
+ dm = DataManager.get_instance()
28
+ registry = get_adapter_registry()
29
+
30
+ exp = ToolSelectionExperiment(config, data_manager=dm, adapter_registry=registry)
31
+ exp.prepare()
32
+ result = exp.run()
33
+ exp.finalize()
34
+ """
35
+
36
+ from sage.benchmark.benchmark_agent.adapter_registry import (
37
+ AdapterRegistry,
38
+ PlannerAdapter,
39
+ SelectorAdapter,
40
+ TimingAdapter,
41
+ get_adapter_registry,
42
+ register_strategy,
43
+ )
44
+
45
+ # Data paths management
46
+ from sage.benchmark.benchmark_agent.data_paths import (
47
+ DataPathsConfig,
48
+ RuntimePaths,
49
+ SourcePaths,
50
+ ensure_runtime_dirs,
51
+ get_data_paths_config,
52
+ get_runtime_paths,
53
+ get_source_paths,
54
+ )
55
+ from sage.benchmark.benchmark_agent.experiments import ( # Base classes; Configs; Experiments
56
+ BaseExperiment,
57
+ ExperimentConfig,
58
+ ExperimentResult,
59
+ PlanningConfig,
60
+ PlanningExperiment,
61
+ TimingDetectionConfig,
62
+ TimingDetectionExperiment,
63
+ ToolSelectionConfig,
64
+ ToolSelectionExperiment,
65
+ )
66
+ from sage.benchmark.benchmark_agent.experiments.method_comparison import (
67
+ ExperimentResult as ComparisonResult,
68
+ )
69
+ from sage.benchmark.benchmark_agent.experiments.method_comparison import (
70
+ MethodComparisonExperiment,
71
+ MethodConfig,
72
+ MethodRegistry,
73
+ run_full_comparison,
74
+ run_quick_comparison,
75
+ )
76
+
77
# Matches the distribution version declared in the package metadata (0.1.0.1).
__version__ = "0.1.0.1"

# Public API of the package. Every name listed here is imported at the top of
# this module; the method-comparison helpers are included so that the names
# this module imports for re-export are also visible to ``from ... import *``.
__all__ = [
    "__version__",
    # Experiments
    "ToolSelectionExperiment",
    "PlanningExperiment",
    "TimingDetectionExperiment",
    # Base
    "BaseExperiment",
    "ExperimentConfig",
    "ExperimentResult",
    # Configs
    "ToolSelectionConfig",
    "PlanningConfig",
    "TimingDetectionConfig",
    # Adapter Registry
    "AdapterRegistry",
    "SelectorAdapter",
    "PlannerAdapter",
    "TimingAdapter",
    "get_adapter_registry",
    "register_strategy",
    # Data Paths
    "get_source_paths",
    "get_runtime_paths",
    "get_data_paths_config",
    "ensure_runtime_dirs",
    "SourcePaths",
    "RuntimePaths",
    "DataPathsConfig",
    # Method comparison
    "ComparisonResult",
    "MethodComparisonExperiment",
    "MethodConfig",
    "MethodRegistry",
    "run_full_comparison",
    "run_quick_comparison",
]
@@ -0,0 +1,177 @@
1
+ """
2
+ CLI entry point for running agent benchmark experiments.
3
+
4
+ Usage:
5
+ python -m sage.benchmark.benchmark_agent --config config/tool_selection_exp.yaml
6
+ python -m sage.benchmark.benchmark_agent --config config/planning_exp.yaml
7
+ python -m sage.benchmark.benchmark_agent --config config/timing_detection_exp.yaml
8
+ """
9
+
10
+ import argparse
11
+ import sys
12
+ from pathlib import Path
13
+
14
+ from sage.benchmark.benchmark_agent.adapter_registry import get_adapter_registry
15
+ from sage.benchmark.benchmark_agent.config.config_loader import ConfigLoader
16
+ from sage.benchmark.benchmark_agent.experiments import (
17
+ PlanningExperiment,
18
+ TimingDetectionExperiment,
19
+ ToolSelectionExperiment,
20
+ )
21
+ from sage.data import DataManager
22
+
23
+
24
+ def create_experiment(config, data_manager, adapter_registry):
25
+ """
26
+ Create appropriate experiment instance based on config type.
27
+
28
+ Args:
29
+ config: Experiment configuration
30
+ data_manager: DataManager instance
31
+ adapter_registry: AdapterRegistry instance
32
+
33
+ Returns:
34
+ Experiment instance
35
+
36
+ Raises:
37
+ ValueError: If experiment type not recognized
38
+ """
39
+ experiment_type = config.experiment
40
+
41
+ if experiment_type == "tool_selection":
42
+ return ToolSelectionExperiment(
43
+ config, data_manager=data_manager, adapter_registry=adapter_registry
44
+ )
45
+ elif experiment_type == "planning":
46
+ return PlanningExperiment(
47
+ config, data_manager=data_manager, adapter_registry=adapter_registry
48
+ )
49
+ elif experiment_type == "timing_detection":
50
+ return TimingDetectionExperiment(
51
+ config, data_manager=data_manager, adapter_registry=adapter_registry
52
+ )
53
+ else:
54
+ raise ValueError(f"Unknown experiment type: {experiment_type}")
55
+
56
+
57
def _build_parser():
    """Build the argument parser for the benchmark CLI."""
    parser = argparse.ArgumentParser(
        description="Run agent benchmark experiments",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Run tool selection experiment
  python -m sage.benchmark.benchmark_agent --config config/tool_selection_exp.yaml

  # Run planning experiment with custom output
  python -m sage.benchmark.benchmark_agent \\
      --config config/planning_exp.yaml \\
      --output results/planning_results.json

  # Run timing detection with verbose output
  python -m sage.benchmark.benchmark_agent \\
      --config config/timing_detection_exp.yaml \\
      --verbose
        """,
    )

    parser.add_argument(
        "--config", type=str, required=True, help="Path to experiment configuration YAML file"
    )

    parser.add_argument("--output", type=str, help="Override output path for results (optional)")

    parser.add_argument("--verbose", action="store_true", help="Enable verbose output")

    parser.add_argument(
        "--dry-run", action="store_true", help="Validate config without running experiment"
    )

    return parser


def main(argv=None):
    """
    Main CLI entry point.

    Loads the experiment configuration, applies command-line overrides,
    initializes the DataManager and AdapterRegistry, then prepares, runs and
    finalizes the experiment.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When None, argparse reads ``sys.argv[1:]``, which is what
            the console-script entry point relies on.

    Returns:
        0 when the experiment completes or when ``--dry-run`` validation
        succeeds; 130 when the run is interrupted with Ctrl-C.

    Raises:
        SystemExit: With code 1 when the config file is missing or fails to
            load, when the managers or the experiment cannot be created, or
            when the experiment run raises. argparse also exits on invalid
            arguments.
    """
    args = _build_parser().parse_args(argv)

    # Load configuration
    try:
        config_path = Path(args.config)
        if not config_path.exists():
            print(f"Error: Config file not found: {config_path}")
            sys.exit(1)

        loader = ConfigLoader()
        config = loader.load_config(str(config_path))

        # Override from CLI args
        if args.output:
            config.report.path = args.output
        if args.verbose:
            # The flag must win over the config file: the previous
            # getattr(config, "verbose", True) kept an existing False value,
            # so --verbose had no effect on configs that define the field.
            config.verbose = True

        print(f"Loaded config: {config.experiment}")

        if args.dry_run:
            print("Config validation successful (dry-run mode)")
            print(f" Experiment type: {config.experiment}")
            print(f" Profile: {config.profile}")
            print(f" Split: {config.split}")
            print(f" Output: {config.report.path}")
            return 0

    except Exception as e:
        # SystemExit is not an Exception subclass, so the sys.exit(1) above
        # passes through this handler untouched.
        print(f"Error loading config: {e}")
        sys.exit(1)

    # Initialize DataManager and AdapterRegistry
    try:
        print("Initializing DataManager...")
        data_manager = DataManager.get_instance()

        print("Initializing AdapterRegistry...")
        adapter_registry = get_adapter_registry()

        print(f" Available strategies: {adapter_registry.list_strategies()}")

    except Exception as e:
        print(f"Error initializing managers: {e}")
        sys.exit(1)

    # Create experiment
    try:
        experiment = create_experiment(config, data_manager, adapter_registry)
        print(f"Created experiment: {experiment.__class__.__name__}")
    except Exception as e:
        print(f"Error creating experiment: {e}")
        sys.exit(1)

    # Run experiment
    try:
        print("\n" + "=" * 60)
        print("Starting experiment...")
        print("=" * 60 + "\n")

        experiment.prepare()
        result = experiment.run()
        experiment.finalize()

        print("\n" + "=" * 60)
        print("Experiment completed successfully!")
        print("=" * 60)
        print(f"Results saved to: {config.report.path}")
        print(f"Total samples: {result.metadata.get('total_samples', 0)}")
        print(f"Failed samples: {result.metadata.get('failed_samples', 0)}")

        return 0

    except KeyboardInterrupt:
        print("\n\nExperiment interrupted by user")
        # 130 = 128 + SIGINT, the conventional shell status for Ctrl-C.
        return 130

    except Exception as e:
        print(f"\nError running experiment: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)
174
+
175
+
176
# Script entry: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())