isage_benchmark_agent-0.1.0.1-cp311-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. isage_benchmark_agent-0.1.0.1.dist-info/METADATA +91 -0
  2. isage_benchmark_agent-0.1.0.1.dist-info/RECORD +51 -0
  3. isage_benchmark_agent-0.1.0.1.dist-info/WHEEL +5 -0
  4. isage_benchmark_agent-0.1.0.1.dist-info/entry_points.txt +2 -0
  5. isage_benchmark_agent-0.1.0.1.dist-info/licenses/LICENSE +21 -0
  6. isage_benchmark_agent-0.1.0.1.dist-info/top_level.txt +1 -0
  7. sage/__init__.py +0 -0
  8. sage/benchmark/__init__.py +0 -0
  9. sage/benchmark/benchmark_agent/__init__.py +108 -0
  10. sage/benchmark/benchmark_agent/__main__.py +177 -0
  11. sage/benchmark/benchmark_agent/acebench_loader.py +369 -0
  12. sage/benchmark/benchmark_agent/adapter_registry.py +3036 -0
  13. sage/benchmark/benchmark_agent/config/config_loader.py +176 -0
  14. sage/benchmark/benchmark_agent/config/default_config.yaml +24 -0
  15. sage/benchmark/benchmark_agent/config/planning_exp.yaml +34 -0
  16. sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml +34 -0
  17. sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml +32 -0
  18. sage/benchmark/benchmark_agent/data_paths.py +332 -0
  19. sage/benchmark/benchmark_agent/evaluation/__init__.py +217 -0
  20. sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py +11 -0
  21. sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.py +111 -0
  22. sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.py +135 -0
  23. sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.py +124 -0
  24. sage/benchmark/benchmark_agent/evaluation/evaluator.py +228 -0
  25. sage/benchmark/benchmark_agent/evaluation/metrics.py +650 -0
  26. sage/benchmark/benchmark_agent/evaluation/report_builder.py +217 -0
  27. sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.py +602 -0
  28. sage/benchmark/benchmark_agent/experiments/__init__.py +63 -0
  29. sage/benchmark/benchmark_agent/experiments/base_experiment.py +263 -0
  30. sage/benchmark/benchmark_agent/experiments/method_comparison.py +742 -0
  31. sage/benchmark/benchmark_agent/experiments/planning_exp.py +262 -0
  32. sage/benchmark/benchmark_agent/experiments/timing_detection_exp.py +198 -0
  33. sage/benchmark/benchmark_agent/experiments/tool_selection_exp.py +250 -0
  34. sage/benchmark/benchmark_agent/scripts/__init__.py +26 -0
  35. sage/benchmark/benchmark_agent/scripts/experiments/__init__.py +40 -0
  36. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.py +425 -0
  37. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.py +400 -0
  38. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.py +439 -0
  39. sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.py +565 -0
  40. sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.py +406 -0
  41. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.py +315 -0
  42. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.py +344 -0
  43. sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.py +270 -0
  44. sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.py +620 -0
  45. sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.py +427 -0
  46. sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.py +677 -0
  47. sage/benchmark/benchmark_agent/scripts/experiments/llm_service.py +332 -0
  48. sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.py +627 -0
  49. sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.py +422 -0
  50. sage/benchmark/benchmark_agent/scripts/experiments/table_generator.py +430 -0
  51. sage/benchmark/benchmark_agent/tools_loader.py +212 -0
sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.py
@@ -0,0 +1,422 @@
+ #!/usr/bin/env python3
+ """
+ SAGE Agent Bench CLI - command-line entry point for agent capability evaluation
+
+ This is the official entry point of the SAGE Agent Benchmark, used to evaluate agent tool-calling capabilities.
+
+ Usage:
+     sage-agent-bench <command> [options]
+
+ Commands:
+     run      Run the full benchmark experiments
+     eval     Tool selection evaluation (cross-dataset)
+     train    Training method comparison
+     llm      LLM service management
+     list     List available resources
+
+ Examples:
+     # Run the full benchmark
+     sage-agent-bench run --quick
+     sage-agent-bench run --section 5.2
+     sage-agent-bench run --exp timing
+
+     # Tool selection evaluation
+     sage-agent-bench eval --dataset sage --samples 100
+     sage-agent-bench eval --dataset acebench
+
+     # Training method comparison
+     sage-agent-bench train --methods A_baseline,D_combined
+     sage-agent-bench train --dry-run
+
+     # LLM service management
+     sage-agent-bench llm status
+     sage-agent-bench llm start
+     sage-agent-bench llm stop
+
+     # List resources
+     sage-agent-bench list datasets
+     sage-agent-bench list methods
+     sage-agent-bench list experiments
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import sys
+ from pathlib import Path
+
+ # Locate the directory containing this script
+ SCRIPT_DIR = Path(__file__).resolve().parent
+
+ # Add import paths
+ sys.path.insert(0, str(SCRIPT_DIR.parent.parent.parent.parent))
+ sys.path.insert(0, str(SCRIPT_DIR / "experiments"))
+
+
+ # =============================================================================
+ # Subcommand handlers
+ # =============================================================================
+
+
+ def cmd_run(args):
+     """Run benchmark experiments."""
+     from experiments.run_paper1_experiments import main as run_main
+
+     # Build the equivalent argparse argv
+     sys.argv = ["run_paper1_experiments.py"]
+
+     if args.section:
+         sys.argv.extend(["--section", args.section])
+     if args.exp:
+         sys.argv.extend(["--exp", args.exp])
+     if args.quick:
+         sys.argv.append("--quick")
+     if args.skip_llm:
+         sys.argv.append("--skip-llm")
+     if args.generate_paper:
+         sys.argv.append("--generate-paper")
+     if args.output:
+         sys.argv.extend(["--output-dir", args.output])
+
+     run_main()
+     return 0
+
+
+ def cmd_eval(args):
+     """Tool selection evaluation."""
+     from experiments.exp_cross_dataset import run_cross_dataset_evaluation
+     from experiments.exp_main_selection import run_selection_experiment
+     from experiments.exp_utils import setup_experiment_env
+
+     setup_experiment_env()
+
+     if args.dataset == "all":
+         # Cross-dataset evaluation
+         result = run_cross_dataset_evaluation(
+             datasets=["sage", "acebench"],
+             max_samples=args.samples,
+             verbose=True,
+         )
+     else:
+         # Single-dataset evaluation
+         result = run_selection_experiment(
+             max_samples=args.samples,
+             top_k=args.top_k,
+             skip_llm=False,
+             verbose=True,
+         )
+
+     return 0 if result else 1
+
+
+ def cmd_train(args):
+     """Training method comparison."""
+     from experiments.exp_training_comparison import run_training_comparison
+
+     methods = args.methods.split(",") if args.methods else ["A_baseline", "D_combined"]
+
+     run_training_comparison(
+         methods=methods,
+         base_model=args.model,
+         quick=args.quick,
+         dry_run=args.dry_run,
+         verbose=True,
+     )
+     return 0
+
+
+ def cmd_llm(args):
+     """LLM service management."""
+     from experiments.llm_service import (
+         print_llm_status,
+         start_llm_service,
+         stop_llm_service,
+     )
+
+     if args.llm_action == "status":
+         print_llm_status()
+         return 0
+
+     elif args.llm_action == "start":
+         success = start_llm_service(
+             model=args.model,
+             port=args.port,
+             gpu_memory=args.gpu_memory,
+         )
+         return 0 if success else 1
+
+     elif args.llm_action == "stop":
+         success = stop_llm_service()
+         return 0 if success else 1
+
+     return 0
+
+
+ def cmd_list(args):
+     """List available resources."""
+     if args.resource == "datasets":
+         print("\n" + "=" * 70)
+         print("Available Datasets for Tool Selection Evaluation")
+         print("=" * 70)
+         print()
+         print(f"{'Dataset':<15} {'Description':<50} {'Status'}")
+         print("-" * 70)
+
+         datasets = [
+             ("sage", "SAGE Agent Bench (1200 synthetic tools)", "Built-in"),
+             ("acebench", "ToolACE from HuggingFace", "HuggingFace"),
+             ("apibank", "API-Bank (Microsoft/Alibaba)", "External"),
+             ("toolalpaca", "ToolAlpaca (Microsoft)", "External"),
+             ("bfcl", "BFCL (Berkeley Function Calling)", "External"),
+             ("toolbench", "ToolBench (Tsinghua/OpenBMB)", "External"),
+             ("all", "Evaluate on ALL datasets", "-"),
+         ]
+         for name, desc, status in datasets:
+             print(f"{name:<15} {desc:<50} {status}")
+         print()
+         return 0
+
+     elif args.resource == "methods":
+         print("\n" + "=" * 70)
+         print("Available Methods")
+         print("=" * 70)
+
+         print("\n📋 Timing Detection Methods (RQ1):")
+         print("-" * 50)
+         timing_methods = [
+             ("rule_based", "Keyword + Regex rules", "Classic"),
+             ("embedding", "Semantic similarity", "Common"),
+             ("llm_based", "Direct LLM inference", "LLM"),
+             ("hybrid", "Rule filter + LLM judge", "Combined"),
+         ]
+         for name, desc, source in timing_methods:
+             print(f" {name:<15} {desc:<35} {source}")
+
+         print("\n📋 Task Planning Methods (RQ2):")
+         print("-" * 50)
+         planning_methods = [
+             ("simple", "Greedy matching", "Classic"),
+             ("hierarchical", "HuggingGPT-style decomposition", "ICML'23"),
+             ("llm_based", "CoT prompting", "Common"),
+             ("react", "ReAct interleaved execution", "ICLR'23"),
+             ("tot", "Tree-of-Thoughts search", "NeurIPS'23"),
+         ]
+         for name, desc, source in planning_methods:
+             print(f" {name:<15} {desc:<35} {source}")
+
+         print("\n📋 Tool Selection Methods (RQ3):")
+         print("-" * 50)
+         methods = [
+             ("keyword", "BM25 keyword matching", "Classic"),
+             ("embedding", "Semantic embedding similarity", "Common"),
+             ("hybrid", "Keyword + Embedding fusion", "Common"),
+             ("gorilla", "Retrieval + LLM reranking", "Berkeley"),
+             ("dfsdt", "Tree search (ToolLLM)", "Tsinghua"),
+         ]
+         for name, desc, source in methods:
+             print(f" {name:<15} {desc:<35} {source}")
+
+         print("\n📋 Training Methods (Section 5.5):")
+         print("-" * 50)
+         print(" Paper 1 (Benchmark) - Published SOTA:")
+         paper1_methods = [
+             ("A_baseline", "Standard SFT (full parameters)"),
+             ("A_lora", "LoRA (Hu et al., 2021)"),
+             ("A_qlora", "QLoRA (Dettmers et al., 2023)"),
+             ("A_dora", "DoRA (Liu et al., 2024)"),
+             ("A_fireact", "FireAct trajectory tuning"),
+             ("A_agenttuning", "AgentTuning multi-task"),
+             ("A_toolllm", "ToolLLM tool-augmented"),
+         ]
+         for name, desc in paper1_methods:
+             print(f" {name:<20} {desc}")
+
+         print("\n Paper 2 (SIAS) - from sage.libs.sias:")
+         sias_methods = [
+             ("B1_coreset_loss", "[SIAS] Select high-loss samples"),
+             ("B2_coreset_diversity", "[SIAS] Select diverse samples"),
+             ("B3_coreset_hybrid", "[SIAS] 60% loss + 40% diversity"),
+             ("C_continual", "[SIAS] Online learning with replay"),
+             ("D_combined", "[SIAS] Coreset + Continual Learning"),
+         ]
+         for name, desc in sias_methods:
+             print(f" {name:<20} {desc}")
+         print()
+         return 0
+
+     elif args.resource == "experiments":
+         print("\n" + "=" * 70)
+         print("Available Experiments (Paper 1: Benchmark)")
+         print("=" * 70)
+
+         print("\n📊 Section 5.2: Main Results")
+         print("-" * 50)
+         experiments = [
+             ("timing", "RQ1: Timing Detection", "~10 min"),
+             ("planning", "RQ2: Task Planning", "~15 min"),
+             ("selection", "RQ3: Tool Selection", "~20 min"),
+         ]
+         for exp_id, name, time_est in experiments:
+             print(f" {exp_id:<20} {name:<30} {time_est}")
+
+         print("\n🔬 Section 5.3: Analysis")
+         print("-" * 50)
+         experiments = [
+             ("error", "Error Type Breakdown", "~5 min"),
+             ("scaling", "Scaling Analysis", "~15 min"),
+             ("robustness", "Robustness Analysis", "~10 min"),
+             ("ablation", "Ablation Studies", "~10 min"),
+         ]
+         for exp_id, name, time_est in experiments:
+             print(f" {exp_id:<20} {name:<30} {time_est}")
+
+         print("\n🌐 Section 5.4: Generalization")
+         print("-" * 50)
+         print(f" {'cross-dataset':<20} {'Cross-Dataset Comparison':<30} ~30 min")
+
+         print("\n🎓 Section 5.5: Training Comparison")
+         print("-" * 50)
+         print(f" {'training':<20} {'Training Method Comparison':<30} ~2 hours")
+         print()
+         return 0
+
+     return 1
+
+
+ # =============================================================================
+ # Main entry point
+ # =============================================================================
+
+
+ def print_banner():
+     print(
+         """
+ ╔═════════════════════════════════════════════════════════════════════════════╗
+ ║                          SAGE Agent Bench CLI v3.0                           ║
+ ║                                                                               ║
+ ║      Unified benchmark for evaluating Agent tool-calling capabilities        ║
+ ║               Paper 1: Agent Capability Evaluation Framework                 ║
+ ╚═════════════════════════════════════════════════════════════════════════════╝
+ """
+     )
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         prog="sage-agent-bench",
+         description="SAGE Agent Bench CLI - command-line entry point for agent capability evaluation",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   sage-agent-bench run --quick          # Quick run of the full benchmark
+   sage-agent-bench eval --dataset all   # Cross-dataset evaluation
+   sage-agent-bench train --dry-run      # Dry-run training comparison
+   sage-agent-bench llm start            # Start the LLM service
+   sage-agent-bench list experiments     # List available experiments
+ """,
+     )
+
+     subparsers = parser.add_subparsers(dest="command", help="Subcommand")
+
+     # =========================================================================
+     # run subcommand
+     # =========================================================================
+     run_parser = subparsers.add_parser("run", help="Run benchmark experiments")
+     run_parser.add_argument(
+         "--section",
+         choices=["5.2", "5.3", "5.4", "5.5", "all"],
+         default="all",
+         help="Run the specified section",
+     )
+     run_parser.add_argument(
+         "--exp",
+         choices=[
+             "timing",
+             "planning",
+             "selection",
+             "error",
+             "scaling",
+             "robustness",
+             "ablation",
+             "cross-dataset",
+             "training",
+             "all",
+         ],
+         help="Run the specified experiment",
+     )
+     run_parser.add_argument("--quick", "-q", action="store_true", help="Quick mode")
+     run_parser.add_argument("--skip-llm", action="store_true", help="Skip LLM-based methods")
+     run_parser.add_argument("--generate-paper", action="store_true", help="Generate paper materials")
+     run_parser.add_argument("--output", "-o", type=str, help="Output directory")
+     run_parser.set_defaults(func=cmd_run)
+
+     # =========================================================================
+     # eval subcommand
+     # =========================================================================
+     eval_parser = subparsers.add_parser("eval", help="Tool selection evaluation")
+     eval_parser.add_argument(
+         "--dataset",
+         "-d",
+         default="sage",
+         choices=["sage", "acebench", "apibank", "toolalpaca", "bfcl", "all"],
+         help="Evaluation dataset",
+     )
+     eval_parser.add_argument("--samples", "-n", type=int, default=100, help="Maximum number of samples")
+     eval_parser.add_argument("--top-k", "-k", type=int, default=5, help="Top-K for evaluation")
+     eval_parser.set_defaults(func=cmd_eval)
+
+     # =========================================================================
+     # train subcommand
+     # =========================================================================
+     train_parser = subparsers.add_parser("train", help="Training method comparison")
+     train_parser.add_argument(
+         "--methods",
+         "-m",
+         default="A_baseline,D_combined",
+         help="Training methods, comma-separated",
+     )
+     train_parser.add_argument("--model", default="Qwen/Qwen2.5-1.5B-Instruct", help="Base model")
+     train_parser.add_argument("--quick", "-q", action="store_true", help="Quick mode")
+     train_parser.add_argument("--dry-run", action="store_true", help="Dry run (simulate only)")
+     train_parser.set_defaults(func=cmd_train)
+
+     # =========================================================================
+     # llm subcommand
+     # =========================================================================
+     llm_parser = subparsers.add_parser("llm", help="LLM service management")
+     llm_parser.add_argument(
+         "llm_action",
+         choices=["start", "stop", "status"],
+         help="Action: start/stop/status",
+     )
+     llm_parser.add_argument("--model", default="Qwen/Qwen2.5-0.5B-Instruct", help="LLM model")
+     llm_parser.add_argument("--port", type=int, default=8901, help="Port")
+     llm_parser.add_argument("--gpu-memory", type=float, default=0.5, help="GPU memory fraction")
+     llm_parser.set_defaults(func=cmd_llm)
+
+     # =========================================================================
+     # list subcommand
+     # =========================================================================
+     list_parser = subparsers.add_parser("list", help="List available resources")
+     list_parser.add_argument(
+         "resource",
+         choices=["datasets", "methods", "experiments"],
+         help="Resource type",
+     )
+     list_parser.set_defaults(func=cmd_list)
+
+     # =========================================================================
+     # Parse and dispatch
+     # =========================================================================
+     args = parser.parse_args()
+
+     if args.command is None:
+         print_banner()
+         parser.print_help()
+         return 0
+
+     return args.func(args)
+
+
+ if __name__ == "__main__":
+     sys.exit(main())
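
For reference, a minimal sketch of driving this CLI from Python rather than through the console script. This is not part of the package; it assumes the wheel is installed so that the module path shown in the RECORD above is importable. It mirrors what the script itself does: argparse reads sys.argv and dispatches to the matching cmd_* handler.

    # Hypothetical usage sketch: call the CLI's main() directly by populating
    # sys.argv, equivalent to running `sage-agent-bench list methods` in a shell.
    import sys

    from sage.benchmark.benchmark_agent.scripts.experiments import sage_bench_cli

    sys.argv = ["sage-agent-bench", "list", "methods"]  # argparse parses sys.argv[1:]
    exit_code = sage_bench_cli.main()                    # dispatches to cmd_list, returns 0
    print("exit code:", exit_code)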