isage-benchmark-agent 0.1.0.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. isage_benchmark_agent-0.1.0.1.dist-info/METADATA +91 -0
  2. isage_benchmark_agent-0.1.0.1.dist-info/RECORD +59 -0
  3. isage_benchmark_agent-0.1.0.1.dist-info/WHEEL +6 -0
  4. isage_benchmark_agent-0.1.0.1.dist-info/entry_points.txt +2 -0
  5. isage_benchmark_agent-0.1.0.1.dist-info/licenses/LICENSE +21 -0
  6. isage_benchmark_agent-0.1.0.1.dist-info/top_level.txt +1 -0
  7. sage/__init__.py +0 -0
  8. sage/__init__.pyc +0 -0
  9. sage/benchmark/__init__.py +0 -0
  10. sage/benchmark/__init__.pyc +0 -0
  11. sage/benchmark/benchmark_agent/__init__.py +108 -0
  12. sage/benchmark/benchmark_agent/__init__.pyc +0 -0
  13. sage/benchmark/benchmark_agent/__main__.pyc +0 -0
  14. sage/benchmark/benchmark_agent/acebench_loader.pyc +0 -0
  15. sage/benchmark/benchmark_agent/adapter_registry.pyc +0 -0
  16. sage/benchmark/benchmark_agent/config/config_loader.pyc +0 -0
  17. sage/benchmark/benchmark_agent/config/default_config.yaml +24 -0
  18. sage/benchmark/benchmark_agent/config/planning_exp.yaml +34 -0
  19. sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml +34 -0
  20. sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml +32 -0
  21. sage/benchmark/benchmark_agent/data_paths.pyc +0 -0
  22. sage/benchmark/benchmark_agent/evaluation/__init__.py +217 -0
  23. sage/benchmark/benchmark_agent/evaluation/__init__.pyc +0 -0
  24. sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py +11 -0
  25. sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.pyc +0 -0
  26. sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.pyc +0 -0
  27. sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.pyc +0 -0
  28. sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.pyc +0 -0
  29. sage/benchmark/benchmark_agent/evaluation/evaluator.pyc +0 -0
  30. sage/benchmark/benchmark_agent/evaluation/metrics.pyc +0 -0
  31. sage/benchmark/benchmark_agent/evaluation/report_builder.pyc +0 -0
  32. sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.pyc +0 -0
  33. sage/benchmark/benchmark_agent/experiments/__init__.py +63 -0
  34. sage/benchmark/benchmark_agent/experiments/__init__.pyc +0 -0
  35. sage/benchmark/benchmark_agent/experiments/base_experiment.pyc +0 -0
  36. sage/benchmark/benchmark_agent/experiments/method_comparison.pyc +0 -0
  37. sage/benchmark/benchmark_agent/experiments/planning_exp.pyc +0 -0
  38. sage/benchmark/benchmark_agent/experiments/timing_detection_exp.pyc +0 -0
  39. sage/benchmark/benchmark_agent/experiments/tool_selection_exp.pyc +0 -0
  40. sage/benchmark/benchmark_agent/scripts/__init__.py +26 -0
  41. sage/benchmark/benchmark_agent/scripts/__init__.pyc +0 -0
  42. sage/benchmark/benchmark_agent/scripts/experiments/__init__.py +40 -0
  43. sage/benchmark/benchmark_agent/scripts/experiments/__init__.pyc +0 -0
  44. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.pyc +0 -0
  45. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.pyc +0 -0
  46. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.pyc +0 -0
  47. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.pyc +0 -0
  48. sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.pyc +0 -0
  49. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.pyc +0 -0
  50. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.pyc +0 -0
  51. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.pyc +0 -0
  52. sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.pyc +0 -0
  53. sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.pyc +0 -0
  54. sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.pyc +0 -0
  55. sage/benchmark/benchmark_agent/scripts/experiments/llm_service.pyc +0 -0
  56. sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.pyc +0 -0
  57. sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.pyc +0 -0
  58. sage/benchmark/benchmark_agent/scripts/experiments/table_generator.pyc +0 -0
  59. sage/benchmark/benchmark_agent/tools_loader.pyc +0 -0
@@ -0,0 +1,91 @@
+ Metadata-Version: 2.4
+ Name: isage-benchmark-agent
+ Version: 0.1.0.1
+ Summary: SAGE Benchmark Agent - Agent capability evaluation framework
+ Author-email: IntelliStream Team <shuhao_zhang@hust.edu.cn>
+ License-Expression: MIT
+ Project-URL: Homepage, https://github.com/intellistream/sage-agent-benchmark
+ Project-URL: Documentation, https://github.com/intellistream/sage-agent-benchmark#readme
+ Project-URL: Repository, https://github.com/intellistream/sage-agent-benchmark
+ Project-URL: Issues, https://github.com/intellistream/sage-agent-benchmark/issues
+ Keywords: sage,benchmark,agent,tool-selection,planning,evaluation,intellistream
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3 :: Only
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Requires-Python: ==3.11.*
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: isage-common
+ Requires-Dist: isage-libs
+ Requires-Dist: pyyaml>=6.0
+ Requires-Dist: pandas>=2.0.0
+ Requires-Dist: numpy<2.3.0,>=1.26.0
+ Requires-Dist: typer<1.0.0,>=0.15.0
+ Requires-Dist: rich<14.0.0,>=13.0.0
+ Provides-Extra: dev
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
+ Requires-Dist: black>=23.0.0; extra == "dev"
+ Requires-Dist: ruff==0.14.6; extra == "dev"
+ Requires-Dist: pre-commit>=3.0.0; extra == "dev"
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
+ Requires-Dist: types-PyYAML>=6.0.0; extra == "dev"
+ Provides-Extra: all
+ Requires-Dist: isage-benchmark-agent[dev]; extra == "all"
+ Dynamic: license-file
+
+ # SAGE Benchmark Agent
+
+ Configuration-driven experiment framework for evaluating agent capabilities.
+
+ ## Features
+
+ - **Tool Selection Evaluation**: Tool retrieval and ranking benchmarks
+ - **Planning Evaluation**: Multi-step planning with tool composition
+ - **Timing Detection**: Timing judgment for tool invocation decisions
+
+ ## Quick Start
+
+ ```bash
+ # Install
+ pip install isage-benchmark-agent
+
+ # Run tool selection experiment
+ sage-agent-bench tool-selection --config config/tool_selection_exp.yaml
+
+ # Run planning experiment
+ sage-agent-bench planning --config config/planning_exp.yaml
+ ```
+
+ ## Documentation
+
+ See [benchmark_agent/README.md](src/sage/benchmark/benchmark_agent/README.md) for detailed documentation.
+
+ ## Development
+
+ ```bash
+ # Clone
+ git clone https://github.com/intellistream/sage-agent-benchmark.git
+ cd sage-agent-benchmark
+
+ # Set up virtual environment
+ python -m venv .venv
+ source .venv/bin/activate  # On Windows: .venv\Scripts\activate
+
+ # Install in development mode
+ pip install -e ".[dev]"
+
+ # Run tests
+ pytest
+ ```
+
+ ## License
+
+ MIT License - see [LICENSE](LICENSE) for details.
@@ -0,0 +1,59 @@
+ isage_benchmark_agent-0.1.0.1.dist-info/licenses/LICENSE,sha256=vBNVIGkYYZY0B8f0Ui1ITYwRu7WNtSwyxvIAVGYS6jU,1075
+ sage/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sage/__init__.pyc,sha256=8sSJ7mfq8oAAPGQurhAtP-2HOTLofIZVpwYzrJJD1YM,125
+ sage/benchmark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sage/benchmark/__init__.pyc,sha256=3hwYkavnuAMyMha9hrX3i5bNBHfVAYz9I3TEeE1Ru0k,135
+ sage/benchmark/benchmark_agent/__init__.py,sha256=Bnlx9jxIxIq0xo1Q-3Hrr2VRJZq0g6nHhXcmffotBYc,2807
+ sage/benchmark/benchmark_agent/__init__.pyc,sha256=5wbDpUu6ZFmlHb8R_U1Hy9RzHkijQC7K4y9rfzjsl8Y,2820
+ sage/benchmark/benchmark_agent/__main__.pyc,sha256=wpjYGWkeFldKbzeiJGUkgk3JU0QYDLE7T7nIog0z5AU,8148
+ sage/benchmark/benchmark_agent/acebench_loader.pyc,sha256=Qyg66TJIa-CqZYusCguWKCvl2yVcwR0GpZ2BBNl8kNM,15940
+ sage/benchmark/benchmark_agent/adapter_registry.pyc,sha256=BvfWmqPx7cQyrn_Ql7oUZNDAPfwouoqUZOFTFCKygug,140109
+ sage/benchmark/benchmark_agent/data_paths.pyc,sha256=bcjsP7pvFRM0Wip9uCpVI3E2jzibqUD6FRDgrrCZSUQ,13674
+ sage/benchmark/benchmark_agent/tools_loader.pyc,sha256=5e2RdIfloId-kA5ahUPojOfNUu0blrkP5yC8tfnND-0,9898
+ sage/benchmark/benchmark_agent/config/config_loader.pyc,sha256=dQuSmaXDSbfkE17S_Cjy84_L4aV31OGy0Pvms4xURMg,7986
+ sage/benchmark/benchmark_agent/config/default_config.yaml,sha256=iHNVAkou8UxcIOMBEuJheNeVI0fAlfe-BV9inzxw9_s,532
+ sage/benchmark/benchmark_agent/config/planning_exp.yaml,sha256=nyimmpQ725skpvEF1eNJpYudkCxYuh2hz6u2e1velNE,607
+ sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml,sha256=058TwEeeP8RkzjV7hUbYWFBctrzpcbAXKAlZzw6fJdI,604
+ sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml,sha256=dfar6m9XJ1AHsoqVqEN3iN5x8Ka2M3UpvzhpXlCkc6o,544
+ sage/benchmark/benchmark_agent/evaluation/__init__.py,sha256=Ypoer04sin7kTryIJWOqIImifonmXvhHWhOAwrhHK10,6615
+ sage/benchmark/benchmark_agent/evaluation/__init__.pyc,sha256=LGkSU9NogatJm6IPN9KiXib5GNwU-QthoiI8bapO7nk,9188
+ sage/benchmark/benchmark_agent/evaluation/evaluator.pyc,sha256=m2DvOAQNdghDbx0pa9YdzaV4JYX8nzXJorZRqyG_3Bs,11834
+ sage/benchmark/benchmark_agent/evaluation/metrics.pyc,sha256=OBE68eXcN9flhN85cjO_XhcJN0Ra8HJxjmaKdxTYHv4,26906
+ sage/benchmark/benchmark_agent/evaluation/report_builder.pyc,sha256=kz0QCGqe-YbtTFZAWC8q93WogXCsYONFa4frmQUtOS8,12968
+ sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.pyc,sha256=7cbQ-ESoBow0Y3WaUk5eDZW3-q-fHn3lH9Y1gzps0X8,30176
+ sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py,sha256=e_kzn4nWKe6l4L0OtSd4-V7Tt064bzwvlz3BsfQXBZI,282
+ sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.pyc,sha256=V06E_4Y5JBZXxwl5hZX4flDqQJrJeCpCiU71PRux89U,502
+ sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.pyc,sha256=P8kgxnATy9X4goe8HgfXehFH5CVTnLPx3BItPEf8fCM,5747
+ sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.pyc,sha256=j58rnbJPOzrjYMUrTsqznUBeGDfeASHJuOkE9bzz3zA,5336
+ sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.pyc,sha256=YFhM8485b10cNQF9H7U-sKngrivS4_i8WpwQp5tIjpQ,5213
+ sage/benchmark/benchmark_agent/experiments/__init__.py,sha256=w3aKDv8nfgXceZX81KFtOwxuRXhbETLSxUeQDD7HONY,1574
+ sage/benchmark/benchmark_agent/experiments/__init__.pyc,sha256=RjkbMzkZ9hRmnYOgz08sJHQV1HKeDeyDVIyqJLo-33A,1649
+ sage/benchmark/benchmark_agent/experiments/base_experiment.pyc,sha256=tQdZ_cLBgBX8NW7Fbb59czSA46xYp7Tfp_Lt-PcjhVM,14444
+ sage/benchmark/benchmark_agent/experiments/method_comparison.pyc,sha256=tjV8lCsbCM9X-nS3HqZ1o3Gr9olshnDoLWiCVDhEtDo,35466
+ sage/benchmark/benchmark_agent/experiments/planning_exp.pyc,sha256=9_P2NcZiQDvRmMDJf19lgVSOq779L_r9Mn2PVmiM3SE,12677
+ sage/benchmark/benchmark_agent/experiments/timing_detection_exp.pyc,sha256=aUcKpUrchwe4Lq-pQfURKmVk4cBLLuxQ-0ZuZ1uKX9U,8773
+ sage/benchmark/benchmark_agent/experiments/tool_selection_exp.pyc,sha256=d2_jIGC8cNVN-_7W7a3bzJDtSsATMNYcBjfUlsuLWgA,12461
+ sage/benchmark/benchmark_agent/scripts/__init__.py,sha256=C8YqglL5eDKIyB8fKg7mC5NZJZ_Fn7LlmEFl2e8RVRw,668
+ sage/benchmark/benchmark_agent/scripts/__init__.pyc,sha256=hns6MbbPoHu8GNDw8s2XY0DGPf-JMqI1XI73po2s1ng,845
+ sage/benchmark/benchmark_agent/scripts/experiments/__init__.py,sha256=d3VxS2Qfuz7WujRnREz25IUUOVOOatokoD5rhAExy14,1125
+ sage/benchmark/benchmark_agent/scripts/experiments/__init__.pyc,sha256=MQOkJMGUuYrYITQ225WvtrKDiNFCvAuDE-Pk_v6K4rc,1322
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.pyc,sha256=J-pb1e3K8yDfIEnydkZ3MVZE-nQjCRAVrmGi0KET7JI,14701
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.pyc,sha256=MPoQzFjEAKaPvpq6eeVzkvfxkc21ZpnrV9LqbcR7Zh4,18167
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.pyc,sha256=U4eC1dOMGx34k_c4AYJ0TBh_dY6ifL-cHqZkIdmxyro,16056
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.pyc,sha256=V6FL4YHPDFddTBAvszsghCY7MpgnK6E8Tlcu5pU7eGU,23045
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.pyc,sha256=rNQuqNYSbc-MpPU7_l5GXBGJ-VaeecQm8avdYJsiT0k,16312
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.pyc,sha256=q7WSoKmHhTrt9uSsNDq6S0iLflcBEktZ0zWlHuOgkI4,14665
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.pyc,sha256=7SxuvuZI0W-iL48yF6jfdEmAZeG5V6gYnBy0_GQ22mA,16696
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.pyc,sha256=UPzcsz8HF8rQgENNDgKEbDGvFARO0u55atKQnvtuQ9k,10934
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.pyc,sha256=2AqFj1NpfCvOyryt9PlMBzWNsm3ASyJchixHVmzwOM4,26165
+ sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.pyc,sha256=JhcOpoFow2eFjJgX9VlxLXGKkSltJFYueUsyO3j9APk,18357
+ sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.pyc,sha256=Iodn1OUeH8K2RGVu5IVWjCpeC2XurVXC0hz5H46YgRM,29757
+ sage/benchmark/benchmark_agent/scripts/experiments/llm_service.pyc,sha256=GsEOHw1Vz5JLohLAfHWcA4LIfEP3z2EubN5KwbstLlg,13530
+ sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.pyc,sha256=qJHju7y2tWFX66c9wFRs-LbqkvEatGVr7CW4wcV1LF8,25575
+ sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.pyc,sha256=W9Q6L4spqf-KS2bMM-wnAu5dDKme0FS0rXKhpBqoosE,16987
+ sage/benchmark/benchmark_agent/scripts/experiments/table_generator.pyc,sha256=z11-4HOgnsn-JtP-nodk-zzWIE30YvCBF3DTzCYqGGk,17114
+ isage_benchmark_agent-0.1.0.1.dist-info/METADATA,sha256=2tuTsOY7txE2XqvJcAF0nDeB6gV_jRJDtia5aHKndjc,3018
+ isage_benchmark_agent-0.1.0.1.dist-info/WHEEL,sha256=Mk1ST5gDzEO5il5kYREiBnzzM469m5sI8ESPl7TRhJY,110
+ isage_benchmark_agent-0.1.0.1.dist-info/entry_points.txt,sha256=g34HO224bwnCvIklVp2JQw4wTNDgNW7u61F-cL02pSA,82
+ isage_benchmark_agent-0.1.0.1.dist-info/top_level.txt,sha256=hibFyzQHiLOMK68qL1OWsNKaXOmSXqZjeLTBem6Yy7I,5
+ isage_benchmark_agent-0.1.0.1.dist-info/RECORD,,
@@ -0,0 +1,6 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.10.2)
+ Root-Is-Purelib: true
+ Tag: py2-none-any
+ Tag: py3-none-any
+
@@ -0,0 +1,2 @@
+ [console_scripts]
+ sage-agent-bench = sage.benchmark.benchmark_agent.__main__:main
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 IntelliStream Team
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
sage/__init__.py ADDED
File without changes
sage/__init__.pyc ADDED
Binary file
sage/benchmark/__init__.py ADDED
File without changes
sage/benchmark/__init__.pyc ADDED
Binary file
@@ -0,0 +1,108 @@
+ """
+ Agent Capability Benchmark Module
+
+ This module provides infrastructure for evaluating agent capabilities including:
+ - Tool selection
+ - Task planning
+ - Timing detection
+
+ Architecture:
+     config/              Configuration files and loaders
+     experiments/         Experiment runners and base classes
+     adapter_registry.py  Strategy adapter registry
+
+ Usage:
+     # Via CLI
+     python -m sage.benchmark.benchmark_agent --config config/tool_selection_exp.yaml
+
+     # Programmatic
+     from sage.benchmark.benchmark_agent import ToolSelectionExperiment
+     from sage.benchmark.benchmark_agent.config import ConfigLoader
+     from sage.benchmark.benchmark_agent.adapter_registry import get_adapter_registry
+     from sage.data import DataManager
+
+     loader = ConfigLoader()
+     config = loader.load_config("config/tool_selection_exp.yaml")
+
+     dm = DataManager.get_instance()
+     registry = get_adapter_registry()
+
+     exp = ToolSelectionExperiment(config, data_manager=dm, adapter_registry=registry)
+     exp.prepare()
+     result = exp.run()
+     exp.finalize()
+ """
+
+ from sage.benchmark.benchmark_agent.adapter_registry import (
+     AdapterRegistry,
+     PlannerAdapter,
+     SelectorAdapter,
+     TimingAdapter,
+     get_adapter_registry,
+     register_strategy,
+ )
+
+ # Data paths management
+ from sage.benchmark.benchmark_agent.data_paths import (
+     DataPathsConfig,
+     RuntimePaths,
+     SourcePaths,
+     ensure_runtime_dirs,
+     get_data_paths_config,
+     get_runtime_paths,
+     get_source_paths,
+ )
+ from sage.benchmark.benchmark_agent.experiments import (  # Base classes; Configs; Experiments
+     BaseExperiment,
+     ExperimentConfig,
+     ExperimentResult,
+     PlanningConfig,
+     PlanningExperiment,
+     TimingDetectionConfig,
+     TimingDetectionExperiment,
+     ToolSelectionConfig,
+     ToolSelectionExperiment,
+ )
+ from sage.benchmark.benchmark_agent.experiments.method_comparison import (
+     ExperimentResult as ComparisonResult,
+ )
+ from sage.benchmark.benchmark_agent.experiments.method_comparison import (
+     MethodComparisonExperiment,
+     MethodConfig,
+     MethodRegistry,
+     run_full_comparison,
+     run_quick_comparison,
+ )
+
+ __version__ = "0.1.0"
+
+ __all__ = [
+     "__version__",
+     # Experiments
+     "ToolSelectionExperiment",
+     "PlanningExperiment",
+     "TimingDetectionExperiment",
+     # Base
+     "BaseExperiment",
+     "ExperimentConfig",
+     "ExperimentResult",
+     # Configs
+     "ToolSelectionConfig",
+     "PlanningConfig",
+     "TimingDetectionConfig",
+     # Adapter Registry
+     "AdapterRegistry",
+     "SelectorAdapter",
+     "PlannerAdapter",
+     "TimingAdapter",
+     "get_adapter_registry",
+     "register_strategy",
+     # Data Paths
+     "get_source_paths",
+     "get_runtime_paths",
+     "get_data_paths_config",
+     "ensure_runtime_dirs",
+     "SourcePaths",
+     "RuntimePaths",
+     "DataPathsConfig",
+ ]
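The adapter registry exported above is the package's public extension point for custom strategies. As a rough illustration, here is a minimal sketch of registering a custom selector; the actual `SelectorAdapter` interface and `register_strategy` signature are not visible in this diff, so the `select` method name, the `(name, class)` argument order, and the tool-dict shape are all assumptions:

```python
# Hypothetical sketch -- the real adapter interface is not shown in this diff.
from sage.benchmark.benchmark_agent import SelectorAdapter, register_strategy


class KeywordOverlapSelector(SelectorAdapter):
    """Toy selector: rank tools by keyword overlap with the query (illustrative only)."""

    def select(self, query: str, tools: list[dict], top_k: int = 5) -> list[dict]:
        words = set(query.lower().split())
        scored = sorted(
            tools,
            key=lambda t: len(words & set(t.get("description", "").lower().split())),
            reverse=True,
        )
        return scored[:top_k]


# Assumed registration call, named to mirror the "baseline.keyword" convention
# used in the YAML configs below.
register_strategy("custom.keyword_overlap", KeywordOverlapSelector)
```

Presumably a strategy registered this way could then be referenced from a config file via its `selector:` key, the same way `baseline.keyword` is.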
@@ -0,0 +1,24 @@
+ # Default Configuration for Agent Benchmark Experiments
+
+ # Data settings
+ profile: "quick_eval"  # agent_eval usage profile
+ split: "dev"  # Data split: train/dev/test
+ max_samples:  # Limit samples (null = all)
+
+ # Randomness
+ seed: 42
+
+ # Output and reporting
+ report:
+   format: ["json", "markdown"]
+   include_breakdowns: true
+   path: "${PROJECT_ROOT}/outputs/agent_benchmark"
+   markdown_template:
+
+ # Metrics (common defaults, overridden per experiment)
+ metrics:
+   - "accuracy"
+
+ # Logging
+ verbose: true
+ log_level: "INFO"
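Per the package docstring quoted earlier, configs are loaded through `ConfigLoader.load_config`. A short sketch of that call; whether per-experiment files are layered over these defaults automatically is an assumption based on the header comment above, not something this diff confirms:

```python
from sage.benchmark.benchmark_agent.config import ConfigLoader

loader = ConfigLoader()

# load_config is shown in the package docstring; the merge of
# default_config.yaml underneath an experiment file is an assumption.
config = loader.load_config("config/tool_selection_exp.yaml")

# Note: blank YAML values such as `max_samples:` parse as None,
# which the comment above documents as "use all samples".
```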
@@ -0,0 +1,34 @@
+ # Planning Experiment Configuration
+
+ experiment: planning
+
+ # Data configuration
+ profile: "full_eval"
+ split: "dev"
+ max_samples:
+
+ # Strategy configuration
+ planner: "baseline.template"
+ min_steps: 5
+ max_steps: 10
+ planner_params:
+   allow_tool_reuse: true
+   enforce_sequence: true
+
+ # Metrics to evaluate
+ metrics:
+   - "plan_success_rate"
+   - "step_accuracy"
+   - "tool_sequence_match"
+   - "average_plan_length"
+
+ # Report configuration
+ report:
+   format: ["json", "markdown"]
+   include_breakdowns: true
+   path: "${PROJECT_ROOT}/outputs/agent_benchmark/planning"
+   markdown_template:
+
+ # Reproducibility
+ seed: 42
+ verbose: true
@@ -0,0 +1,34 @@
+ # Timing Detection Experiment Configuration
+
+ experiment: timing_detection
+
+ # Data configuration
+ profile: "full_eval"
+ split: "dev"
+ max_samples:
+
+ # Strategy configuration
+ detector: "baseline.threshold"
+ threshold: 0.5
+ detector_params:
+   use_context: true
+   confidence_threshold: 0.7
+
+ # Metrics to evaluate
+ metrics:
+   - "f1_score"
+   - "precision"
+   - "recall"
+   - "accuracy"
+   - "confusion_matrix"
+
+ # Report configuration
+ report:
+   format: ["json", "markdown"]
+   include_breakdowns: true
+   path: "${PROJECT_ROOT}/outputs/agent_benchmark/timing_detection"
+   markdown_template:
+
+ # Reproducibility
+ seed: 42
+ verbose: true
@@ -0,0 +1,32 @@
+ # Tool Selection Experiment Configuration
+
+ experiment: tool_selection
+
+ # Data configuration
+ profile: "quick_eval"
+ split: "dev"
+ max_samples:
+
+ # Strategy configuration
+ selector: "baseline.keyword"
+ top_k: 5
+ selector_params:
+   min_score: 0.1
+
+ # Metrics to evaluate
+ metrics:
+   - "top_k_accuracy"
+   - "recall@5"
+   - "precision@5"
+   - "mrr"
+
+ # Report configuration
+ report:
+   format: ["json", "markdown"]
+   include_breakdowns: true
+   path: "${PROJECT_ROOT}/outputs/agent_benchmark/tool_selection"
+   markdown_template:
+
+ # Reproducibility
+ seed: 42
+ verbose: true
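For orientation, the ranking metrics requested here (`recall@5`, `precision@5`, `mrr`) follow their standard information-retrieval definitions. A small self-contained sketch of two of them, independent of the package's own `MetricRegistry` implementations:

```python
def recall_at_k(pred: list[str], ref: list[str], k: int = 5) -> float:
    """Fraction of ground-truth tools recovered in the top-k predictions."""
    if not ref:
        return 0.0
    return len(set(pred[:k]) & set(ref)) / len(ref)


def mrr(pred: list[str], ref: list[str]) -> float:
    """Reciprocal rank of the first relevant prediction (0.0 if none appears)."""
    for rank, tool in enumerate(pred, start=1):
        if tool in ref:
            return 1.0 / rank
    return 0.0


# Ground truth {search, calc}; the selector ranked a distractor first.
print(recall_at_k(["wiki", "search", "calc"], ["search", "calc"]))  # 1.0
print(mrr(["wiki", "search", "calc"], ["search", "calc"]))          # 0.5
```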
@@ -0,0 +1,217 @@
+ """
+ Evaluation module for Agent Capability Benchmark.
+
+ This module provides metrics, analyzers, and report builders for evaluating
+ agent performance across three capabilities: tool selection, task planning,
+ and timing judgment.
+ """
+
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional, Protocol, Sequence
+
+ from pydantic import BaseModel, ConfigDict, Field
+
+ __all__ = [
+     "MetricOutput",
+     "EvaluationReport",
+     "Metric",
+     "Analyzer",
+     "ReportBuilder",
+     "compute_metrics",
+     "MetricRegistry",
+ ]
+
+
+ class MetricOutput(BaseModel):
+     """Output from a metric computation."""
+
+     value: float
+     details: dict[str, Any] = Field(default_factory=dict)
+
+
+ class EvaluationReport(BaseModel):
+     """Complete evaluation report with metrics, breakdowns, and artifacts."""
+
+     task: str
+     experiment_id: str
+     metrics: dict[str, float]
+     breakdowns: dict[str, Any] = Field(default_factory=dict)
+     artifacts: dict[str, Path] = Field(default_factory=dict)
+     timestamp: str
+
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+
+
+ class Metric(Protocol):
+     """Protocol for metric implementations."""
+
+     name: str
+
+     def compute(self, predictions: Sequence[Any], references: Sequence[Any]) -> MetricOutput:
+         """
+         Compute metric from predictions and references.
+
+         Args:
+             predictions: Model predictions
+             references: Ground truth references
+
+         Returns:
+             MetricOutput with value and optional details
+         """
+         ...
+
+
+ class Analyzer(Protocol):
+     """Protocol for analyzer implementations."""
+
+     name: str
+
+     def analyze(
+         self, predictions: Sequence[Any], references: Sequence[Any], metadata: dict[str, Any]
+     ) -> dict[str, Any]:
+         """
+         Analyze predictions and produce breakdowns.
+
+         Args:
+             predictions: Model predictions
+             references: Ground truth references
+             metadata: Additional context from experiment
+
+         Returns:
+             Dictionary with analysis results
+         """
+         ...
+
+
+ class ReportBuilder(Protocol):
+     """Protocol for report builder implementations."""
+
+     def build(self, report: EvaluationReport, output_path: Path) -> Path:
+         """
+         Build and save report to file.
+
+         Args:
+             report: EvaluationReport to format
+             output_path: Path to save report
+
+         Returns:
+             Path to saved report file
+         """
+         ...
+
+
+ # Import metric registry after defining base classes
+ from sage.benchmark.benchmark_agent.evaluation.metrics import MetricRegistry
+
+
+ def compute_metrics(
+     task: str,
+     predictions: list[dict[str, Any]],
+     references: list[dict[str, Any]],
+     metrics: list[str],
+     k: int = 5,
+ ) -> dict[str, float]:
+     """
+     Compute evaluation metrics for experiment results.
+
+     Args:
+         task: Task type ('tool_selection', 'planning', 'timing_detection')
+         predictions: List of prediction dictionaries
+         references: List of reference dictionaries
+         metrics: List of metric names to compute
+         k: Top-k parameter for ranking metrics
+
+     Returns:
+         Dictionary mapping metric names to values
+     """
+     results = {}
+
+     if task == "tool_selection":
+         # Extract tool lists from predictions and references
+         pred_tools = []
+         ref_tools = []
+
+         for pred, ref in zip(predictions, references):
+             # Get predicted tool IDs
+             if "predicted_tools" in pred:
+                 tools = pred["predicted_tools"]
+                 if tools and isinstance(tools[0], dict):
+                     pred_tools.append([t["tool_id"] for t in tools])
+                 else:
+                     pred_tools.append(tools if tools else [])
+             else:
+                 pred_tools.append([])
+
+             # Get reference tool IDs
+             if "ground_truth_tools" in ref:
+                 ref_tools.append(ref["ground_truth_tools"])
+             elif "top_k" in ref:
+                 ref_tools.append(ref["top_k"])
+             else:
+                 ref_tools.append([])
+
+         # Compute each metric
+         for metric_name in metrics:
+             try:
+                 if metric_name in ("top_k_accuracy", "recall_at_k", "precision_at_k"):
+                     metric = MetricRegistry.get(metric_name, k=k)
+                 elif metric_name == "mrr":
+                     metric = MetricRegistry.get("mrr")
+                 else:
+                     continue
+
+                 output = metric.compute(pred_tools, ref_tools)
+                 results[metric_name] = output.value
+             except Exception as e:
+                 results[metric_name] = 0.0
+                 results[f"{metric_name}_error"] = str(e)
+
+     elif task == "timing_detection":
+         # Extract boolean decisions
+         pred_decisions = []
+         ref_decisions = []
+
+         for pred, ref in zip(predictions, references):
+             pred_decisions.append(pred.get("should_call_tool", False))
+             ref_decisions.append(ref.get("should_call_tool", False))
+
+         # Metric name mapping for timing detection
+         timing_metric_map = {
+             "accuracy": "timing_accuracy",
+             "precision": "timing_precision",
+             "recall": "timing_recall",
+             "f1": "timing_f1",
+         }
+
+         for metric_name in metrics:
+             try:
+                 # Map simple names to full metric names
+                 registry_name = timing_metric_map.get(metric_name, metric_name)
+                 metric = MetricRegistry.get(registry_name)
+                 output = metric.compute(pred_decisions, ref_decisions)
+                 results[metric_name] = output.value
+                 # Include details if available
+                 if hasattr(output, "details") and output.details:
+                     results[f"{metric_name}_details"] = output.details
+             except Exception as e:
+                 results[metric_name] = 0.0
+                 results[f"{metric_name}_error"] = str(e)
+
+     elif task == "planning":
+         # Extract tool sequences
+         pred_sequences = []
+         ref_sequences = []
+
+         for pred, ref in zip(predictions, references):
+             pred_sequences.append(pred.get("tool_sequence", []))
+             ref_sequences.append(ref.get("tool_sequence", []))
+
+         for metric_name in metrics:
+             try:
+                 metric = MetricRegistry.get(metric_name)
+                 output = metric.compute(pred_sequences, ref_sequences)
+                 results[metric_name] = output.value
+             except Exception:
+                 results[metric_name] = 0.0
+
+     return results
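Because `Metric` is a `typing.Protocol`, any class with a `name` attribute and a matching `compute` method satisfies it structurally; no subclassing or registration is needed for standalone use. A minimal sketch (plugging a metric into `compute_metrics` would additionally require adding it to `MetricRegistry`, whose registration API is not shown in this diff):

```python
from typing import Any, Sequence

from sage.benchmark.benchmark_agent.evaluation import MetricOutput


class ExactMatch:
    """Satisfies the Metric protocol: fraction of prediction/reference pairs that match exactly."""

    name = "exact_match"

    def compute(self, predictions: Sequence[Any], references: Sequence[Any]) -> MetricOutput:
        if not references:
            return MetricOutput(value=0.0)
        hits = sum(p == r for p, r in zip(predictions, references))
        return MetricOutput(value=hits / len(references), details={"hits": hits})


print(ExactMatch().compute([1, 2, 3], [1, 0, 3]).value)  # 0.666...
```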
@@ -0,0 +1,11 @@
+ """Analyzers package initialization."""
+
+ from .planning_analyzer import PlanningAnalyzer
+ from .timing_analyzer import TimingAnalyzer
+ from .tool_selection_analyzer import ToolSelectionAnalyzer
+
+ __all__ = [
+     "ToolSelectionAnalyzer",
+     "PlanningAnalyzer",
+     "TimingAnalyzer",
+ ]
@@ -0,0 +1,63 @@
+ """
+ Experiment implementations for agent benchmark evaluation.
+
+ Available experiments:
+ - ToolSelectionExperiment: Tool retrieval and ranking
+ - PlanningExperiment: Multi-step planning with tool composition
+ - TimingDetectionExperiment: Timing judgment for tool invocation
+ """
+
+ from sage.benchmark.benchmark_agent.experiments.base_experiment import (
+     CONFIG_TYPES,
+     BaseExperiment,
+     ExperimentConfig,
+     ExperimentResult,
+     PlanningConfig,
+     PlanningPrediction,
+     PlanStep,
+     ReportConfig,
+     TimingDecision,
+     TimingDetectionConfig,
+     ToolPrediction,
+     ToolSelectionConfig,
+     create_config,
+ )
+ from sage.benchmark.benchmark_agent.experiments.planning_exp import (
+     PlanningExperiment,
+     PlanningTask,
+ )
+ from sage.benchmark.benchmark_agent.experiments.timing_detection_exp import (
+     TimingDetectionExperiment,
+     TimingMessage,
+ )
+ from sage.benchmark.benchmark_agent.experiments.tool_selection_exp import (
+     ToolSelectionExperiment,
+     ToolSelectionQuery,
+ )
+
+ __all__ = [
+     # Base classes
+     "BaseExperiment",
+     "ExperimentConfig",
+     "ExperimentResult",
+     # Config models
+     "ToolSelectionConfig",
+     "PlanningConfig",
+     "TimingDetectionConfig",
+     "ReportConfig",
+     # Result/task models
+     "ToolPrediction",
+     "PlanStep",
+     "PlanningPrediction",
+     "TimingDecision",
+     "ToolSelectionQuery",
+     "PlanningTask",
+     "TimingMessage",
+     # Utilities
+     "CONFIG_TYPES",
+     "create_config",
+     # Experiment implementations
+     "ToolSelectionExperiment",
+     "PlanningExperiment",
+     "TimingDetectionExperiment",
+ ]
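All three experiment classes exported here share the prepare/run/finalize lifecycle shown in the top-level package docstring. A condensed sketch follows; `create_config` and `CONFIG_TYPES` are exported above but their exact shapes are not visible in this diff, so the dict-based dispatch and the single-argument constructor below are assumptions:

```python
from sage.benchmark.benchmark_agent.experiments import (
    CONFIG_TYPES,
    ToolSelectionExperiment,
    create_config,
)

# Presumably maps experiment names ("tool_selection", ...) to config models.
print(sorted(CONFIG_TYPES))

# Hypothetical call mirroring tool_selection_exp.yaml; the real signature may differ.
config = create_config(
    {"experiment": "tool_selection", "selector": "baseline.keyword", "top_k": 5}
)

# The package docstring also shows data_manager= and adapter_registry= kwargs.
exp = ToolSelectionExperiment(config)
exp.prepare()
result = exp.run()
exp.finalize()
```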
@@ -0,0 +1,26 @@
+ """
+ SAGE-Bench Scripts
+
+ Unified entry point for the benchmark experiment scripts.
+
+ All experiment functionality lives in the experiments/ subpackage:
+ - run_paper1_experiments.py: unified entry point for the Paper 1 experiments
+ - exp_main_*.py: Section 5.2 main experiments
+ - exp_analysis_*.py: Section 5.3 analysis experiments
+ - exp_cross_dataset.py: Section 5.4 cross-dataset generalization
+ - exp_training_comparison.py: Section 5.5 training method comparison
+
+ Usage:
+     # CLI entry points
+     sage-bench run --quick
+     sage-bench eval --dataset all
+     sage-bench train --dry-run
+     sage-bench llm status
+
+     # Direct invocation
+     python -m sage.benchmark.benchmark_agent.scripts.experiments.run_paper1_experiments --quick
+ """
+
+ __all__ = [
+     "experiments",
+ ]
@@ -0,0 +1,40 @@
+ """
+ SAGE-Bench Paper 1 Experiments Package
+
+ A collection of experiment scripts organized by the paper's experiment sections:
+
+ - Section 5.2 (Main Results):
+     - exp_main_timing.py          # RQ1: Timing Detection
+     - exp_main_planning.py        # RQ2: Task Planning
+     - exp_main_selection.py       # RQ3: Tool Selection
+
+ - Section 5.3 (Analysis):
+     - exp_analysis_error.py       # 5.3.1 Error Analysis
+     - exp_analysis_scaling.py     # 5.3.2 Scaling Analysis
+     - exp_analysis_robustness.py  # 5.3.3 Robustness Analysis
+     - exp_analysis_ablation.py    # 5.3.4 Ablation Studies
+
+ - Section 5.4 (Generalization):
+     - exp_cross_dataset.py        # Cross-dataset evaluation
+
+ Usage:
+     sage-bench paper1 run                 # Run all experiments
+     sage-bench paper1 run --section 5.2   # Main experiments only
+     sage-bench paper1 timing              # A single experiment
+ """
+
+ from .exp_utils import (
+     get_embedding_client,
+     get_llm_client,
+     load_benchmark_data,
+     save_results,
+     setup_experiment_env,
+ )
+
+ __all__ = [
+     "setup_experiment_env",
+     "load_benchmark_data",
+     "save_results",
+     "get_llm_client",
+     "get_embedding_client",
+ ]