isage-tooluse-benchmark 0.1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. isage_tooluse_benchmark-0.1.0.1/LICENSE +21 -0
  2. isage_tooluse_benchmark-0.1.0.1/PKG-INFO +91 -0
  3. isage_tooluse_benchmark-0.1.0.1/README.md +48 -0
  4. isage_tooluse_benchmark-0.1.0.1/pyproject.toml +139 -0
  5. isage_tooluse_benchmark-0.1.0.1/setup.cfg +4 -0
  6. isage_tooluse_benchmark-0.1.0.1/src/isage_tooluse_benchmark.egg-info/PKG-INFO +91 -0
  7. isage_tooluse_benchmark-0.1.0.1/src/isage_tooluse_benchmark.egg-info/SOURCES.txt +59 -0
  8. isage_tooluse_benchmark-0.1.0.1/src/isage_tooluse_benchmark.egg-info/dependency_links.txt +1 -0
  9. isage_tooluse_benchmark-0.1.0.1/src/isage_tooluse_benchmark.egg-info/entry_points.txt +2 -0
  10. isage_tooluse_benchmark-0.1.0.1/src/isage_tooluse_benchmark.egg-info/requires.txt +20 -0
  11. isage_tooluse_benchmark-0.1.0.1/src/isage_tooluse_benchmark.egg-info/top_level.txt +1 -0
  12. isage_tooluse_benchmark-0.1.0.1/src/sage/__init__.py +0 -0
  13. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/__init__.py +0 -0
  14. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/__init__.py +108 -0
  15. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/__main__.py +177 -0
  16. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/acebench_loader.py +369 -0
  17. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/adapter_registry.py +3036 -0
  18. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/config/config_loader.py +176 -0
  19. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/config/default_config.yaml +24 -0
  20. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/config/planning_exp.yaml +34 -0
  21. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml +34 -0
  22. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml +32 -0
  23. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/data_paths.py +332 -0
  24. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/evaluation/__init__.py +217 -0
  25. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py +11 -0
  26. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.py +111 -0
  27. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.py +135 -0
  28. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.py +124 -0
  29. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/evaluation/evaluator.py +228 -0
  30. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/evaluation/metrics.py +650 -0
  31. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/evaluation/report_builder.py +217 -0
  32. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.py +602 -0
  33. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/experiments/__init__.py +63 -0
  34. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/experiments/base_experiment.py +263 -0
  35. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/experiments/method_comparison.py +742 -0
  36. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/experiments/planning_exp.py +262 -0
  37. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/experiments/timing_detection_exp.py +198 -0
  38. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/experiments/tool_selection_exp.py +250 -0
  39. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/scripts/__init__.py +26 -0
  40. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/scripts/experiments/__init__.py +40 -0
  41. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.py +425 -0
  42. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.py +400 -0
  43. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.py +439 -0
  44. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.py +565 -0
  45. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.py +406 -0
  46. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.py +315 -0
  47. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.py +344 -0
  48. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.py +270 -0
  49. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.py +620 -0
  50. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.py +427 -0
  51. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.py +677 -0
  52. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/scripts/experiments/llm_service.py +332 -0
  53. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.py +627 -0
  54. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.py +422 -0
  55. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/scripts/experiments/table_generator.py +430 -0
  56. isage_tooluse_benchmark-0.1.0.1/src/sage/benchmark/benchmark_agent/tools_loader.py +212 -0
  57. isage_tooluse_benchmark-0.1.0.1/tests/test_evaluation.py +449 -0
  58. isage_tooluse_benchmark-0.1.0.1/tests/test_experiments.py +475 -0
  59. isage_tooluse_benchmark-0.1.0.1/tests/test_react_planner.py +318 -0
  60. isage_tooluse_benchmark-0.1.0.1/tests/test_rule_based_decider.py +418 -0
  61. isage_tooluse_benchmark-0.1.0.1/tests/test_toolalpaca_loader.py +309 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 IntelliStream Team
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,91 @@
1
+ Metadata-Version: 2.4
2
+ Name: isage-tooluse-benchmark
3
+ Version: 0.1.0.1
4
+ Summary: SAGE Tool Use Benchmark - Tool selection and use evaluation framework
5
+ Author-email: IntelliStream Team <shuhao_zhang@hust.edu.cn>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/intellistream/sage-agent-benchmark
8
+ Project-URL: Documentation, https://github.com/intellistream/sage-agent-benchmark#readme
9
+ Project-URL: Repository, https://github.com/intellistream/sage-agent-benchmark
10
+ Project-URL: Issues, https://github.com/intellistream/sage-agent-benchmark/issues
11
+ Keywords: sage,benchmark,tool-selection,tool-use,planning,timing-detection,evaluation,intellistream
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3 :: Only
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Requires-Python: >=3.11
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: isage-common
25
+ Requires-Dist: isage-libs
26
+ Requires-Dist: pyyaml>=6.0
27
+ Requires-Dist: pandas>=2.0.0
28
+ Requires-Dist: numpy<2.3.0,>=1.26.0
29
+ Requires-Dist: typer<1.0.0,>=0.15.0
30
+ Requires-Dist: rich<14.0.0,>=13.0.0
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
33
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
34
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
35
+ Requires-Dist: black>=23.0.0; extra == "dev"
36
+ Requires-Dist: ruff==0.14.6; extra == "dev"
37
+ Requires-Dist: pre-commit>=3.0.0; extra == "dev"
38
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
39
+ Requires-Dist: types-PyYAML>=6.0.0; extra == "dev"
40
+ Provides-Extra: all
41
+ Requires-Dist: isage-tooluse-benchmark[dev]; extra == "all"
42
+ Dynamic: license-file
43
+
44
+ # SAGE Tool Use Benchmark
45
+
46
+ Configuration-driven experiment framework for evaluating tool selection and use capabilities.
47
+
48
+ ## Features
49
+
50
+ - **Tool Selection Evaluation**: Tool retrieval and ranking benchmarks
51
+ - **Planning Evaluation**: Multi-step planning with tool composition
52
+ - **Timing Detection**: Timing judgment for tool invocation decisions
53
+
54
+ ## Quick Start
55
+
56
+ ```bash
57
+ # Install
58
+ pip install isage-tooluse-benchmark
59
+
60
+ # Run tool selection experiment
61
+ sage-agent-bench tool-selection --config config/tool_selection_exp.yaml
62
+
63
+ # Run planning experiment
64
+ sage-agent-bench planning --config config/planning_exp.yaml
65
+ ```
66
+
67
+ ## Documentation
68
+
69
+ See [benchmark_agent/README.md](src/sage/benchmark/benchmark_agent/README.md) for detailed documentation.
70
+
71
+ ## Development
72
+
73
+ ```bash
74
+ # Clone
75
+ git clone https://github.com/intellistream/sage-tooluse-benchmark.git
76
+ cd sage-tooluse-benchmark
77
+
78
+ # Setup virtual environment
79
+ python -m venv .venv
80
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
81
+
82
+ # Install in development mode
83
+ pip install -e ".[dev]"
84
+
85
+ # Run tests
86
+ pytest
87
+ ```
88
+
89
+ ## License
90
+
91
+ MIT License - see [LICENSE](LICENSE) for details.
@@ -0,0 +1,48 @@
1
+ # SAGE Tool Use Benchmark
2
+
3
+ Configuration-driven experiment framework for evaluating tool selection and use capabilities.
4
+
5
+ ## Features
6
+
7
+ - **Tool Selection Evaluation**: Tool retrieval and ranking benchmarks
8
+ - **Planning Evaluation**: Multi-step planning with tool composition
9
+ - **Timing Detection**: Timing judgment for tool invocation decisions
10
+
11
+ ## Quick Start
12
+
13
+ ```bash
14
+ # Install
15
+ pip install isage-tooluse-benchmark
16
+
17
+ # Run tool selection experiment
18
+ sage-agent-bench tool-selection --config config/tool_selection_exp.yaml
19
+
20
+ # Run planning experiment
21
+ sage-agent-bench planning --config config/planning_exp.yaml
22
+ ```
23
+
24
+ ## Documentation
25
+
26
+ See [benchmark_agent/README.md](src/sage/benchmark/benchmark_agent/README.md) for detailed documentation.
27
+
28
+ ## Development
29
+
30
+ ```bash
31
+ # Clone
32
+ git clone https://github.com/intellistream/sage-tooluse-benchmark.git
33
+ cd sage-tooluse-benchmark
34
+
35
+ # Setup virtual environment
36
+ python -m venv .venv
37
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
38
+
39
+ # Install in development mode
40
+ pip install -e ".[dev]"
41
+
42
+ # Run tests
43
+ pytest
44
+ ```
45
+
46
+ ## License
47
+
48
+ MIT License - see [LICENSE](LICENSE) for details.
@@ -0,0 +1,139 @@
1
+ [build-system]
2
+ requires = ["setuptools>=64", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "isage-tooluse-benchmark"
7
+ version = "0.1.0.1"
8
+ description = "SAGE Tool Use Benchmark - Tool selection and use evaluation framework"
9
+ readme = "README.md"
10
+ requires-python = ">=3.11"
11
+ authors = [
12
+ { name = "IntelliStream Team", email = "shuhao_zhang@hust.edu.cn" },
13
+ ]
14
+ keywords = [
15
+ "sage",
16
+ "benchmark",
17
+ "tool-selection",
18
+ "tool-use",
19
+ "planning",
20
+ "timing-detection",
21
+ "evaluation",
22
+ "intellistream",
23
+ ]
24
+ classifiers = [
25
+ "Development Status :: 4 - Beta",
26
+ "Intended Audience :: Developers",
27
+ "Intended Audience :: Science/Research",
28
+ "Programming Language :: Python :: 3",
29
+ "Programming Language :: Python :: 3.11",
30
+ "Programming Language :: Python :: 3.12",
31
+ "Programming Language :: Python :: 3 :: Only",
32
+ "Topic :: Software Development :: Libraries :: Python Modules",
33
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
34
+ ]
35
+
36
+ dependencies = [
37
+ # SAGE dependencies from PyPI
38
+ "isage-common",
39
+ "isage-libs", # For agent interfaces if needed
40
+
41
+ # Core dependencies
42
+ "pyyaml>=6.0",
43
+ "pandas>=2.0.0",
44
+ "numpy>=1.26.0,<2.3.0",
45
+
46
+ # CLI dependencies
47
+ "typer>=0.15.0,<1.0.0",
48
+ "rich>=13.0.0,<14.0.0",
49
+ ]
50
+
51
+ license = "MIT"
52
+
53
+ [project.optional-dependencies]
54
+ dev = [
55
+ "pytest>=7.0.0",
56
+ "pytest-asyncio>=0.21.0",
57
+ "pytest-cov>=4.0.0",
58
+ "black>=23.0.0",
59
+ "ruff==0.14.6",
60
+ "pre-commit>=3.0.0",
61
+ "mypy>=1.0.0",
62
+ "types-PyYAML>=6.0.0",
63
+ ]
64
+
65
+ all = [
66
+ "isage-tooluse-benchmark[dev]",
67
+ ]
68
+
69
+ [project.urls]
70
+ Homepage = "https://github.com/intellistream/sage-agent-benchmark"
71
+ Documentation = "https://github.com/intellistream/sage-agent-benchmark#readme"
72
+ Repository = "https://github.com/intellistream/sage-agent-benchmark"
73
+ Issues = "https://github.com/intellistream/sage-agent-benchmark/issues"
74
+
75
+ [project.scripts]
76
+ sage-agent-bench = "sage.benchmark.benchmark_agent.__main__:main"
77
+
78
+ [tool.setuptools]
79
+ package-dir = {"" = "src"}
80
+
81
+ [tool.setuptools.packages.find]
82
+ where = ["src"]
83
+ include = ["sage*"]
84
+ namespaces = true
85
+
86
+ [tool.setuptools.package-data]
87
+ "sage.benchmark.benchmark_agent" = [
88
+ "config/*.yaml",
89
+ "data/**/*",
90
+ ]
91
+
92
+ [tool.ruff]
93
+ line-length = 100
94
+ target-version = "py311"
95
+
96
+ [tool.ruff.lint]
97
+ select = ["E", "F", "I", "W"]
98
+ ignore = [
99
+ "E501", # Line too long
100
+ "F401", # Unused imports
101
+ "F841", # Unused local variable
102
+ ]
103
+
104
+ [tool.mypy]
105
+ ignore_missing_imports = true
106
+ python_version = "3.11"
107
+ warn_return_any = true
108
+ warn_unused_configs = true
109
+
110
+ [tool.pytest.ini_options]
111
+ testpaths = ["tests"]
112
+ python_files = ["test_*.py"]
113
+ addopts = [
114
+ "--strict-markers",
115
+ "--verbose",
116
+ "-ra",
117
+ ]
118
+ markers = [
119
+ "slow: marks tests as slow",
120
+ "integration: marks tests as integration tests",
121
+ "unit: marks tests as unit tests",
122
+ ]
123
+
124
+ [tool.coverage.run]
125
+ source = ["src/sage"]
126
+ omit = ["*/tests/*", "*/test_*.py"]
127
+
128
+ [tool.coverage.report]
129
+ exclude_lines = [
130
+ "pragma: no cover",
131
+ "def __repr__",
132
+ "raise AssertionError",
133
+ "raise NotImplementedError",
134
+ "if __name__ == .__main__.:",
135
+ ]
136
+
137
+ [tool.black]
138
+ line-length = 100
139
+ target-version = ['py311']
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,91 @@
1
+ Metadata-Version: 2.4
2
+ Name: isage-tooluse-benchmark
3
+ Version: 0.1.0.1
4
+ Summary: SAGE Tool Use Benchmark - Tool selection and use evaluation framework
5
+ Author-email: IntelliStream Team <shuhao_zhang@hust.edu.cn>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/intellistream/sage-agent-benchmark
8
+ Project-URL: Documentation, https://github.com/intellistream/sage-agent-benchmark#readme
9
+ Project-URL: Repository, https://github.com/intellistream/sage-agent-benchmark
10
+ Project-URL: Issues, https://github.com/intellistream/sage-agent-benchmark/issues
11
+ Keywords: sage,benchmark,tool-selection,tool-use,planning,timing-detection,evaluation,intellistream
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: Intended Audience :: Science/Research
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3 :: Only
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Requires-Python: >=3.11
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: isage-common
25
+ Requires-Dist: isage-libs
26
+ Requires-Dist: pyyaml>=6.0
27
+ Requires-Dist: pandas>=2.0.0
28
+ Requires-Dist: numpy<2.3.0,>=1.26.0
29
+ Requires-Dist: typer<1.0.0,>=0.15.0
30
+ Requires-Dist: rich<14.0.0,>=13.0.0
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
33
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
34
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
35
+ Requires-Dist: black>=23.0.0; extra == "dev"
36
+ Requires-Dist: ruff==0.14.6; extra == "dev"
37
+ Requires-Dist: pre-commit>=3.0.0; extra == "dev"
38
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
39
+ Requires-Dist: types-PyYAML>=6.0.0; extra == "dev"
40
+ Provides-Extra: all
41
+ Requires-Dist: isage-tooluse-benchmark[dev]; extra == "all"
42
+ Dynamic: license-file
43
+
44
+ # SAGE Tool Use Benchmark
45
+
46
+ Configuration-driven experiment framework for evaluating tool selection and use capabilities.
47
+
48
+ ## Features
49
+
50
+ - **Tool Selection Evaluation**: Tool retrieval and ranking benchmarks
51
+ - **Planning Evaluation**: Multi-step planning with tool composition
52
+ - **Timing Detection**: Timing judgment for tool invocation decisions
53
+
54
+ ## Quick Start
55
+
56
+ ```bash
57
+ # Install
58
+ pip install isage-tooluse-benchmark
59
+
60
+ # Run tool selection experiment
61
+ sage-agent-bench tool-selection --config config/tool_selection_exp.yaml
62
+
63
+ # Run planning experiment
64
+ sage-agent-bench planning --config config/planning_exp.yaml
65
+ ```
66
+
67
+ ## Documentation
68
+
69
+ See [benchmark_agent/README.md](src/sage/benchmark/benchmark_agent/README.md) for detailed documentation.
70
+
71
+ ## Development
72
+
73
+ ```bash
74
+ # Clone
75
+ git clone https://github.com/intellistream/sage-tooluse-benchmark.git
76
+ cd sage-tooluse-benchmark
77
+
78
+ # Setup virtual environment
79
+ python -m venv .venv
80
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
81
+
82
+ # Install in development mode
83
+ pip install -e ".[dev]"
84
+
85
+ # Run tests
86
+ pytest
87
+ ```
88
+
89
+ ## License
90
+
91
+ MIT License - see [LICENSE](LICENSE) for details.
@@ -0,0 +1,59 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/isage_tooluse_benchmark.egg-info/PKG-INFO
5
+ src/isage_tooluse_benchmark.egg-info/SOURCES.txt
6
+ src/isage_tooluse_benchmark.egg-info/dependency_links.txt
7
+ src/isage_tooluse_benchmark.egg-info/entry_points.txt
8
+ src/isage_tooluse_benchmark.egg-info/requires.txt
9
+ src/isage_tooluse_benchmark.egg-info/top_level.txt
10
+ src/sage/__init__.py
11
+ src/sage/benchmark/__init__.py
12
+ src/sage/benchmark/benchmark_agent/__init__.py
13
+ src/sage/benchmark/benchmark_agent/__main__.py
14
+ src/sage/benchmark/benchmark_agent/acebench_loader.py
15
+ src/sage/benchmark/benchmark_agent/adapter_registry.py
16
+ src/sage/benchmark/benchmark_agent/data_paths.py
17
+ src/sage/benchmark/benchmark_agent/tools_loader.py
18
+ src/sage/benchmark/benchmark_agent/config/config_loader.py
19
+ src/sage/benchmark/benchmark_agent/config/default_config.yaml
20
+ src/sage/benchmark/benchmark_agent/config/planning_exp.yaml
21
+ src/sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml
22
+ src/sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml
23
+ src/sage/benchmark/benchmark_agent/evaluation/__init__.py
24
+ src/sage/benchmark/benchmark_agent/evaluation/evaluator.py
25
+ src/sage/benchmark/benchmark_agent/evaluation/metrics.py
26
+ src/sage/benchmark/benchmark_agent/evaluation/report_builder.py
27
+ src/sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.py
28
+ src/sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py
29
+ src/sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.py
30
+ src/sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.py
31
+ src/sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.py
32
+ src/sage/benchmark/benchmark_agent/experiments/__init__.py
33
+ src/sage/benchmark/benchmark_agent/experiments/base_experiment.py
34
+ src/sage/benchmark/benchmark_agent/experiments/method_comparison.py
35
+ src/sage/benchmark/benchmark_agent/experiments/planning_exp.py
36
+ src/sage/benchmark/benchmark_agent/experiments/timing_detection_exp.py
37
+ src/sage/benchmark/benchmark_agent/experiments/tool_selection_exp.py
38
+ src/sage/benchmark/benchmark_agent/scripts/__init__.py
39
+ src/sage/benchmark/benchmark_agent/scripts/experiments/__init__.py
40
+ src/sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.py
41
+ src/sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.py
42
+ src/sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.py
43
+ src/sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.py
44
+ src/sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.py
45
+ src/sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.py
46
+ src/sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.py
47
+ src/sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.py
48
+ src/sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.py
49
+ src/sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.py
50
+ src/sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.py
51
+ src/sage/benchmark/benchmark_agent/scripts/experiments/llm_service.py
52
+ src/sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.py
53
+ src/sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.py
54
+ src/sage/benchmark/benchmark_agent/scripts/experiments/table_generator.py
55
+ tests/test_evaluation.py
56
+ tests/test_experiments.py
57
+ tests/test_react_planner.py
58
+ tests/test_rule_based_decider.py
59
+ tests/test_toolalpaca_loader.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ sage-agent-bench = sage.benchmark.benchmark_agent.__main__:main
@@ -0,0 +1,20 @@
1
+ isage-common
2
+ isage-libs
3
+ pyyaml>=6.0
4
+ pandas>=2.0.0
5
+ numpy<2.3.0,>=1.26.0
6
+ typer<1.0.0,>=0.15.0
7
+ rich<14.0.0,>=13.0.0
8
+
9
+ [all]
10
+ isage-tooluse-benchmark[dev]
11
+
12
+ [dev]
13
+ pytest>=7.0.0
14
+ pytest-asyncio>=0.21.0
15
+ pytest-cov>=4.0.0
16
+ black>=23.0.0
17
+ ruff==0.14.6
18
+ pre-commit>=3.0.0
19
+ mypy>=1.0.0
20
+ types-PyYAML>=6.0.0
File without changes
@@ -0,0 +1,108 @@
1
+ """
2
+ Agent Capability Benchmark Module
3
+
4
+ This module provides infrastructure for evaluating agent capabilities including:
5
+ - Tool selection
6
+ - Task planning
7
+ - Timing detection
8
+
9
+ Architecture:
10
+ config/ Configuration files and loaders
11
+ experiments/ Experiment runners and base classes
12
+ adapter_registry.py Strategy adapter registry
13
+
14
+ Usage:
15
+ # Via CLI
16
+ python -m sage.benchmark.benchmark_agent --config config/tool_selection_exp.yaml
17
+
18
+ # Programmatic
19
+ from sage.benchmark.benchmark_agent import ToolSelectionExperiment
20
+ from sage.benchmark.benchmark_agent.config.config_loader import ConfigLoader
21
+ from sage.benchmark.benchmark_agent.adapter_registry import get_adapter_registry
22
+ from sage.data import DataManager
23
+
24
+ loader = ConfigLoader()
25
+ config = loader.load_config("config/tool_selection_exp.yaml")
26
+
27
+ dm = DataManager.get_instance()
28
+ registry = get_adapter_registry()
29
+
30
+ exp = ToolSelectionExperiment(config, data_manager=dm, adapter_registry=registry)
31
+ exp.prepare()
32
+ result = exp.run()
33
+ exp.finalize()
34
+ """
35
+
36
+ from sage.benchmark.benchmark_agent.adapter_registry import (
37
+ AdapterRegistry,
38
+ PlannerAdapter,
39
+ SelectorAdapter,
40
+ TimingAdapter,
41
+ get_adapter_registry,
42
+ register_strategy,
43
+ )
44
+
45
+ # Data paths management
46
+ from sage.benchmark.benchmark_agent.data_paths import (
47
+ DataPathsConfig,
48
+ RuntimePaths,
49
+ SourcePaths,
50
+ ensure_runtime_dirs,
51
+ get_data_paths_config,
52
+ get_runtime_paths,
53
+ get_source_paths,
54
+ )
55
+ from sage.benchmark.benchmark_agent.experiments import ( # Base classes; Configs; Experiments
56
+ BaseExperiment,
57
+ ExperimentConfig,
58
+ ExperimentResult,
59
+ PlanningConfig,
60
+ PlanningExperiment,
61
+ TimingDetectionConfig,
62
+ TimingDetectionExperiment,
63
+ ToolSelectionConfig,
64
+ ToolSelectionExperiment,
65
+ )
66
+ from sage.benchmark.benchmark_agent.experiments.method_comparison import (
67
+ ExperimentResult as ComparisonResult,
68
+ )
69
+ from sage.benchmark.benchmark_agent.experiments.method_comparison import (
70
+ MethodComparisonExperiment,
71
+ MethodConfig,
72
+ MethodRegistry,
73
+ run_full_comparison,
74
+ run_quick_comparison,
75
+ )
76
+
77
+ __version__ = "0.1.0"
78
+
79
+ __all__ = [
80
+ "__version__",
81
+ # Experiments
82
+ "ToolSelectionExperiment",
83
+ "PlanningExperiment",
84
+ "TimingDetectionExperiment",
85
+ # Base
86
+ "BaseExperiment",
87
+ "ExperimentConfig",
88
+ "ExperimentResult",
89
+ # Configs
90
+ "ToolSelectionConfig",
91
+ "PlanningConfig",
92
+ "TimingDetectionConfig",
93
+ # Adapter Registry
94
+ "AdapterRegistry",
95
+ "SelectorAdapter",
96
+ "PlannerAdapter",
97
+ "TimingAdapter",
98
+ "get_adapter_registry",
99
+ "register_strategy",
100
+ # Data Paths
101
+ "get_source_paths",
102
+ "get_runtime_paths",
103
+ "get_data_paths_config",
104
+ "ensure_runtime_dirs",
105
+ "SourcePaths",
106
+ "RuntimePaths",
107
+ "DataPathsConfig",
108
+ ]