isage-benchmark-agent 0.1.0.1__cp311-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isage_benchmark_agent-0.1.0.1.dist-info/METADATA +91 -0
- isage_benchmark_agent-0.1.0.1.dist-info/RECORD +51 -0
- isage_benchmark_agent-0.1.0.1.dist-info/WHEEL +5 -0
- isage_benchmark_agent-0.1.0.1.dist-info/entry_points.txt +2 -0
- isage_benchmark_agent-0.1.0.1.dist-info/licenses/LICENSE +21 -0
- isage_benchmark_agent-0.1.0.1.dist-info/top_level.txt +1 -0
- sage/__init__.py +0 -0
- sage/benchmark/__init__.py +0 -0
- sage/benchmark/benchmark_agent/__init__.py +108 -0
- sage/benchmark/benchmark_agent/__main__.py +177 -0
- sage/benchmark/benchmark_agent/acebench_loader.py +369 -0
- sage/benchmark/benchmark_agent/adapter_registry.py +3036 -0
- sage/benchmark/benchmark_agent/config/config_loader.py +176 -0
- sage/benchmark/benchmark_agent/config/default_config.yaml +24 -0
- sage/benchmark/benchmark_agent/config/planning_exp.yaml +34 -0
- sage/benchmark/benchmark_agent/config/timing_detection_exp.yaml +34 -0
- sage/benchmark/benchmark_agent/config/tool_selection_exp.yaml +32 -0
- sage/benchmark/benchmark_agent/data_paths.py +332 -0
- sage/benchmark/benchmark_agent/evaluation/__init__.py +217 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/__init__.py +11 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/planning_analyzer.py +111 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/timing_analyzer.py +135 -0
- sage/benchmark/benchmark_agent/evaluation/analyzers/tool_selection_analyzer.py +124 -0
- sage/benchmark/benchmark_agent/evaluation/evaluator.py +228 -0
- sage/benchmark/benchmark_agent/evaluation/metrics.py +650 -0
- sage/benchmark/benchmark_agent/evaluation/report_builder.py +217 -0
- sage/benchmark/benchmark_agent/evaluation/unified_tool_selection.py +602 -0
- sage/benchmark/benchmark_agent/experiments/__init__.py +63 -0
- sage/benchmark/benchmark_agent/experiments/base_experiment.py +263 -0
- sage/benchmark/benchmark_agent/experiments/method_comparison.py +742 -0
- sage/benchmark/benchmark_agent/experiments/planning_exp.py +262 -0
- sage/benchmark/benchmark_agent/experiments/timing_detection_exp.py +198 -0
- sage/benchmark/benchmark_agent/experiments/tool_selection_exp.py +250 -0
- sage/benchmark/benchmark_agent/scripts/__init__.py +26 -0
- sage/benchmark/benchmark_agent/scripts/experiments/__init__.py +40 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_ablation.py +425 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_error.py +400 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_robustness.py +439 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_analysis_scaling.py +565 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_cross_dataset.py +406 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_planning.py +315 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_selection.py +344 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_main_timing.py +270 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_training_comparison.py +620 -0
- sage/benchmark/benchmark_agent/scripts/experiments/exp_utils.py +427 -0
- sage/benchmark/benchmark_agent/scripts/experiments/figure_generator.py +677 -0
- sage/benchmark/benchmark_agent/scripts/experiments/llm_service.py +332 -0
- sage/benchmark/benchmark_agent/scripts/experiments/run_paper1_experiments.py +627 -0
- sage/benchmark/benchmark_agent/scripts/experiments/sage_bench_cli.py +422 -0
- sage/benchmark/benchmark_agent/scripts/experiments/table_generator.py +430 -0
- sage/benchmark/benchmark_agent/tools_loader.py +212 -0
sage/benchmark/benchmark_agent/scripts/experiments/llm_service.py

@@ -0,0 +1,332 @@
+"""
+LLM Service Manager - LLM service management module
+
+Provides unified LLM service management:
+- start/stop the vLLM service
+- check service status
+- multi-port management
+"""
+
+from __future__ import annotations
+
+import os
+import signal
+import subprocess
+import sys
+import time
+from pathlib import Path
+from typing import Any
+
+# Port configuration
+try:
+    from sage.common.config.ports import SagePorts
+
+    DEFAULT_LLM_PORT = SagePorts.BENCHMARK_LLM
+except ImportError:
+    DEFAULT_LLM_PORT = 8901
+
+DEFAULT_LLM_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
+LLM_PID_FILE = Path.home() / ".sage" / "benchmark_llm.pid"
+
+
+def check_llm_service(port: int = DEFAULT_LLM_PORT) -> dict[str, Any]:
+    """
+    Check the LLM service status.
+
+    Args:
+        port: service port
+
+    Returns:
+        Status dict {"running": bool, "port": int, "model": str, "error": str}
+    """
+    try:
+        import httpx
+    except ImportError:
+        return {"running": False, "port": port, "model": None, "error": "httpx not installed"}
+
+    result: dict[str, Any] = {"running": False, "port": port, "model": None, "error": None}
+
+    try:
+        response = httpx.get(f"http://localhost:{port}/v1/models", timeout=5.0)
+        if response.status_code == 200:
+            data = response.json()
+            models = data.get("data", [])
+            if models:
+                result["running"] = True
+                result["model"] = models[0].get("id", "unknown")
+        else:
+            result["error"] = f"HTTP {response.status_code}"
+    except httpx.ConnectError:
+        result["error"] = "Connection refused"
+    except httpx.TimeoutException:
+        result["error"] = "Timeout"
+    except Exception as e:
+        result["error"] = str(e)
+
+    return result
+
+
+def check_all_llm_services() -> dict[int, dict]:
+    """
+    Check all likely LLM service ports.
+
+    Returns:
+        {port: status_dict, ...}
+    """
+    try:
+        from sage.common.config.ports import SagePorts
+
+        ports = [SagePorts.BENCHMARK_LLM] + SagePorts.get_llm_ports()
+    except ImportError:
+        ports = [DEFAULT_LLM_PORT, 8001, 8000]
+
+    # Deduplicate while preserving order
+    seen = set()
+    unique_ports = []
+    for port in ports:
+        if port not in seen:
+            seen.add(port)
+            unique_ports.append(port)
+
+    return {port: check_llm_service(port) for port in unique_ports}
+
+
+def start_llm_service(
+    model: str = DEFAULT_LLM_MODEL,
+    port: int = DEFAULT_LLM_PORT,
+    gpu_memory: float = 0.5,
+    timeout: int = 120,
+) -> bool:
+    """
+    Start the vLLM service.
+
+    Args:
+        model: model ID
+        port: service port
+        gpu_memory: fraction of GPU memory to use
+        timeout: seconds to wait for startup
+
+    Returns:
+        Whether the service started successfully
+    """
+    # Check whether it is already running
+    status = check_llm_service(port)
+    if status["running"]:
+        print(f"✅ LLM service already running (port={port}, model={status['model']})")
+        return True
+
+    print("🚀 Starting LLM service...")
+    print(f"   Model: {model}")
+    print(f"   Port: {port}")
+    print(f"   GPU memory: {gpu_memory * 100:.0f}%")
+
+    # Ensure the PID file directory exists
+    LLM_PID_FILE.parent.mkdir(parents=True, exist_ok=True)
+
+    # Build the command
+    cmd = [
+        sys.executable,
+        "-m",
+        "vllm.entrypoints.openai.api_server",
+        "--model",
+        model,
+        "--port",
+        str(port),
+        "--gpu-memory-utilization",
+        str(gpu_memory),
+        "--trust-remote-code",
+    ]
+
+    try:
+        # Launch the background process
+        process = subprocess.Popen(
+            cmd,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            start_new_session=True,
+        )
+
+        # Save the PID
+        with open(LLM_PID_FILE, "w") as f:
+            f.write(str(process.pid))
+
+        print(f"   PID: {process.pid}")
+        print("   Waiting for the service to start...")
+
+        # Wait until the service is ready
+        for i in range(timeout):
+            time.sleep(1)
+            if check_llm_service(port)["running"]:
+                print(f"\n✅ LLM service started (took {i + 1}s)")
+                return True
+            if i % 10 == 9:
+                print(f"   Waited {i + 1}s...")
+
+        print("\n❌ Service startup timed out")
+        return False
+
+    except Exception as e:
+        print(f"❌ Startup failed: {e}")
+        return False
+
+
+def stop_llm_service() -> bool:
+    """
+    Stop the LLM service.
+
+    Returns:
+        Whether the service stopped successfully
+    """
+    if not LLM_PID_FILE.exists():
+        print("ℹ️ No running LLM service found")
+        return True
+
+    try:
+        with open(LLM_PID_FILE) as f:
+            pid = int(f.read().strip())
+
+        print(f"🛑 Stopping LLM service (PID={pid})...")
+        os.kill(pid, signal.SIGTERM)
+
+        # Wait for the process to exit
+        for _ in range(10):
+            try:
+                os.kill(pid, 0)  # check whether the process still exists
+                time.sleep(0.5)
+            except OSError:
+                break
+
+        LLM_PID_FILE.unlink(missing_ok=True)
+        print("✅ LLM service stopped")
+        return True
+
+    except ProcessLookupError:
+        # Process no longer exists
+        LLM_PID_FILE.unlink(missing_ok=True)
+        print("✅ LLM service stopped")
+        return True
+    except Exception as e:
+        print(f"❌ Stop failed: {e}")
+        return False
+
+
+def print_llm_status():
+    """Print the LLM service status."""
+    print("\n📡 LLM service status")
+    print("=" * 50)
+
+    statuses = check_all_llm_services()
+
+    for port, status in statuses.items():
+        if status["running"]:
+            print(f"   ✅ Port {port}: running")
+            print(f"      Model: {status['model']}")
+        else:
+            print(f"   ❌ Port {port}: {status['error'] or 'not running'}")
+
+
+def ensure_llm_available(
+    port: int = DEFAULT_LLM_PORT,
+    model: str = DEFAULT_LLM_MODEL,
+    auto_start: bool = True,
+    allow_cloud: bool = True,
+) -> bool:
+    """
+    Ensure an LLM service is available.
+
+    If no service is running and auto_start=True, tries to start one.
+
+    Args:
+        port: service port
+        model: model ID
+        auto_start: whether to start the service automatically
+        allow_cloud: whether a cloud API may be used (default True)
+
+    Returns:
+        Whether a service is available
+    """
+    # First check the requested port
+    print(f"   🔍 Checking LLM service on port {port}...")
+    status = check_llm_service(port)
+    if status["running"]:
+        print(f"   ✅ Found running service on port {port}")
+        # Export environment variables for later use
+        os.environ["SAGE_LLM_PORT"] = str(port)
+        os.environ["SAGE_CHAT_BASE_URL"] = f"http://localhost:{port}/v1"
+        return True
+
+    # Check other ports
+    print("   🔍 Checking other common ports...")
+    all_statuses = check_all_llm_services()
+    for p, s in all_statuses.items():
+        if s["running"]:
+            print(f"   ℹ️ Found running service on port {p}")
+            # Export environment variables for later use
+            os.environ["SAGE_LLM_PORT"] = str(p)
+            os.environ["SAGE_CHAT_BASE_URL"] = f"http://localhost:{p}/v1"
+            return True
+
+    # Check for cloud API configuration
+    if allow_cloud and (os.environ.get("SAGE_CHAT_API_KEY") or os.environ.get("OPENAI_API_KEY")):
+        print("   ℹ️ Cloud API configuration detected")
+        return True
+
+    # Try to start the service automatically
+    if auto_start:
+        print("   ⚠️ No available LLM service detected, attempting auto-start...")
+        return start_llm_service(model=model, port=port)

+    return False
+
+
+# =============================================================================
+# CLI entry point
+# =============================================================================
+
+
+def main():
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="LLM Service Manager",
+    )
+
+    subparsers = parser.add_subparsers(dest="action", help="Action")
+
+    # start
+    start_parser = subparsers.add_parser("start", help="Start the LLM service")
+    start_parser.add_argument("--model", default=DEFAULT_LLM_MODEL, help="Model ID")
+    start_parser.add_argument("--port", type=int, default=DEFAULT_LLM_PORT, help="Port")
+    start_parser.add_argument("--gpu-memory", type=float, default=0.5, help="GPU memory fraction")
+
+    # stop
+    subparsers.add_parser("stop", help="Stop the LLM service")
+
+    # status
+    subparsers.add_parser("status", help="Show service status")
+
+    args = parser.parse_args()
+
+    if args.action == "start":
+        success = start_llm_service(
+            model=args.model,
+            port=args.port,
+            gpu_memory=args.gpu_memory,
+        )
+        return 0 if success else 1
+
+    elif args.action == "stop":
+        success = stop_llm_service()
+        return 0 if success else 1
+
+    elif args.action == "status":
+        print_llm_status()
+        return 0
+
+    else:
+        parser.print_help()
+        return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())