py-adtools 0.1.6__py3-none-any.whl → 0.1.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- adtools/evaluator.py +32 -8
- adtools/evaluator_pool.py +9 -7
- adtools/lm/__init__.py +6 -0
- adtools/lm/lm_base.py +39 -0
- adtools/lm/openai_api.py +72 -0
- adtools/lm/vllm_server.py +331 -0
- {py_adtools-0.1.6.dist-info → py_adtools-0.1.8.dist-info}/METADATA +1 -1
- py_adtools-0.1.8.dist-info/RECORD +13 -0
- py_adtools-0.1.6.dist-info/RECORD +0 -9
- {py_adtools-0.1.6.dist-info → py_adtools-0.1.8.dist-info}/WHEEL +0 -0
- {py_adtools-0.1.6.dist-info → py_adtools-0.1.8.dist-info}/licenses/LICENSE +0 -0
- {py_adtools-0.1.6.dist-info → py_adtools-0.1.8.dist-info}/top_level.txt +0 -0
adtools/evaluator.py
CHANGED
@@ -8,9 +8,10 @@ Commercial use of this software or its derivatives requires prior written permis
 import multiprocessing
 import os
 import sys
+import time
 from abc import ABC, abstractmethod
 from queue import Empty
-from typing import Any, Literal, Dict, Callable, List
+from typing import Any, Literal, Dict, Callable, List, Tuple
 import psutil
 import traceback
 
@@ -22,6 +23,7 @@ class PyEvaluator(ABC):
     def __init__(
             self,
             exec_code: bool = True,
+            find_and_kill_children_evaluation_process: bool = False,
             debug_mode: bool = False,
             *,
             join_timeout_seconds: int = 10
@@ -33,11 +35,15 @@ class PyEvaluator(ABC):
                 which will be passed to 'self.evaluate_program()'. Set this parameter to 'False' if you are going to
                 evaluate a Python scripy. Note that if the parameter is set to 'False', the arguments 'callable_...'
                 in 'self.evaluate_program()' will no longer be affective.
+            find_and_kill_children_evaluation_process: If using 'self.secure_evaluate', kill children processes
+                when they are terminated. Note that it is suggested to set to 'False' if the evaluation process
+                does not start new processes.
             debug_mode: Debug mode.
             join_timeout_seconds: Timeout in seconds to wait for the process to finish. Kill the process if timeout.
         """
         self.debug_mode = debug_mode
         self.exec_code = exec_code
+        self.find_and_kill_children_evaluation_process = find_and_kill_children_evaluation_process
         self.join_timeout_seconds = join_timeout_seconds
 
     @abstractmethod
@@ -66,11 +72,14 @@ class PyEvaluator(ABC):
         )
 
     def _kill_process_and_its_children(self, process: multiprocessing.Process):
-        [5 removed lines; their content is not captured in this diff view]
+        if self.find_and_kill_children_evaluation_process:
+            # Find all children processes
+            try:
+                parent = psutil.Process(process.pid)
+                children_processes = parent.children(recursive=True)
+            except psutil.NoSuchProcess:
+                children_processes = []
+        else:
             children_processes = []
         # Terminate parent process
         process.terminate()
@@ -151,8 +160,9 @@ class PyEvaluator(ABC):
             timeout_seconds: int | float = None,
             redirect_to_devnull: bool = False,
             multiprocessing_start_method: Literal['default', 'auto', 'fork', 'spawn'] = 'auto',
+            get_evaluate_time=False,
             **kwargs
-    ):
+    ) -> Any | Tuple[Any, float]:
         """Evaluate program in a new process. This enables timeout restriction and output redirection.
         Args:
             program: the program to be evaluated.
@@ -161,7 +171,11 @@ class PyEvaluator(ABC):
             multiprocessing_start_method: start a process using 'fork' or 'spawn'. If set to 'auto',
                 the process will be started using 'fork' with Linux/macOS and 'spawn' with Windows.
                 If set to 'default', there will be no changes to system default.
+            get_evaluate_time: get evaluation time for this program.
             **kwargs: additional keyword arguments to pass to 'evaluate_program'.
+        Returns:
+            Returns the evaluation results. If the 'get_evaluate_time' is True,
+            the return value will be (Results, Time).
         """
         if multiprocessing_start_method == 'auto':
             # Force macOS and Linux use 'fork' to generate new process
@@ -180,21 +194,28 @@ class PyEvaluator(ABC):
                 args=(str(program), result_queue, redirect_to_devnull),
                 kwargs=kwargs,
             )
+            evaluate_start_time = time.time()
             process.start()
 
             if timeout_seconds is not None:
                 try:
                     # Get the result in timeout seconds
                     result = result_queue.get(timeout=timeout_seconds)
+                    # Calculate the evaluate time
+                    eval_time = time.time() - evaluate_start_time
                     # After getting the result, terminate/kill the process
                     self._kill_process_and_its_children(process)
                 except Empty:  # The queue is empty indicates a timeout
+                    # Calculate the evaluate time
+                    eval_time = time.time() - evaluate_start_time
                     if self.debug_mode:
                         print(f'DEBUG: the evaluation time exceeds {timeout_seconds}s.')
                     # Terminate/kill all processes if timeout happens
                     self._kill_process_and_its_children(process)
                     result = None
                 except Exception as e:
+                    # Calculate the evaluate time
+                    eval_time = time.time() - evaluate_start_time
                     if self.debug_mode:
                         print(f'DEBUG: evaluation failed with exception:\n{traceback.format_exc()}')
                     # Terminate/kill all processes if meet exceptions
@@ -203,9 +224,12 @@ class PyEvaluator(ABC):
             else:
                 # If there is no timeout limit, wait execution to finish
                 result = result_queue.get()
+                # Calculate the evaluate time
+                eval_time = time.time() - evaluate_start_time
                 # Terminate/kill all processes after evaluation
                 self._kill_process_and_its_children(process)
-
+
+            return (result, eval_time) if get_evaluate_time else result
         except Exception as e:
             if self.debug_mode:
                 print(traceback.format_exc())
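Taken together, the 0.1.8 changes to evaluator.py add wall-clock timing to secure_evaluate and an opt-in cleanup of child processes. A minimal usage sketch follows; the subclass, its evaluate_program signature, and the placeholder score are illustrative assumptions, not part of this diff:

    from adtools.evaluator import PyEvaluator

    class MyEvaluator(PyEvaluator):
        # Hypothetical subclass: the exact evaluate_program signature is not
        # shown in this diff, so *args/**kwargs is used defensively here.
        def evaluate_program(self, *args, **kwargs):
            return 1.0  # placeholder score

    evaluator = MyEvaluator(find_and_kill_children_evaluation_process=False)

    # New in 0.1.8: ask secure_evaluate to also return the evaluation time.
    result, seconds = evaluator.secure_evaluate(
        'def f(x):\n    return x + 1\n',
        timeout_seconds=10,
        get_evaluate_time=True,
    )
    print(result, seconds)

With get_evaluate_time left at its default of False, the return value is just the result, as in 0.1.6.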
adtools/evaluator_pool.py
CHANGED
@@ -62,21 +62,23 @@ class EvaluatorExecutorPool:
             program: the program to be evaluated.
             timeout_seconds: return 'None' if the execution time exceeds 'timeout_seconds'.
             redirect_to_devnull: redirect any output to '/dev/null'.
-            multiprocessing_start_method: start a process using 'fork' or 'spawn'.
+            multiprocessing_start_method: start a process using 'fork' or 'spawn'. If set to 'auto',
+                the process will be started using 'fork' with Linux/macOS and 'spawn' with Windows.
+                If set to 'default', there will be no changes to system default.
+            return_time: get evaluation time for this program.
             **kwargs: additional keyword arguments to pass to 'evaluate_program'.
+        Returns:
+            Returns the evaluation results. If the 'get_evaluate_time' is True,
+            the return value will be (Results, Time).
         """
-        start_time = time.time()
         future = self.pool.submit(
             self.evaluator.secure_evaluate,
             program,
             timeout_seconds,
             redirect_to_devnull,
             multiprocessing_start_method,
+            return_time,
             **kwargs
         )
         res = future.result()
-
-        if return_time:
-            return res, duration
-        else:
-            return res
+        return res
adtools/lm/__init__.py
ADDED
adtools/lm/lm_base.py
ADDED
@@ -0,0 +1,39 @@
+"""
+Copyright (c) 2025 Rui Zhang <rzhang.cs@gmail.com>
+
+NOTICE: This code is under MIT license. This code is intended for academic/research purposes only.
+Commercial use of this software or its derivatives requires prior written permission.
+"""
+
+from abc import abstractmethod
+from typing import List
+
+import openai.types.chat
+
+
+class LanguageModel:
+    """Base class for language model interface."""
+
+    @abstractmethod
+    def chat_completion(
+            self,
+            message: str | List[openai.types.chat.ChatCompletionMessageParam],
+            max_tokens: int,
+            timeout_seconds: float,
+            *args,
+            **kwargs
+    ):
+        """Send a chat completion query with OpenAI format to the vLLM server. Return the response content.
+        Args:
+            message: The message in str or openai format.
+            max_tokens: The maximum number of tokens to generate.
+            timeout_seconds: The timeout seconds.
+        """
+        pass
+
+    def close(self):
+        """Release resources (if necessary)."""
+        pass
+
+    def __del__(self):
+        self.close()
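The new base class only fixes the chat_completion/close contract, so other backends can be dropped in beside the built-in ones. A minimal sketch of a custom subclass; EchoLM and its behaviour are hypothetical and only illustrate the interface:

    from typing import List

    import openai.types.chat

    from adtools.lm.lm_base import LanguageModel

    class EchoLM(LanguageModel):
        """Toy backend that echoes the last user message."""

        def chat_completion(
                self,
                message: str | List[openai.types.chat.ChatCompletionMessageParam],
                max_tokens: int = 256,
                timeout_seconds: float = 30.0,
                *args,
                **kwargs
        ):
            # Normalise the plain-string form to the OpenAI message format,
            # as the built-in backends do, then return a response string.
            if isinstance(message, str):
                message = [{'role': 'user', 'content': message.strip()}]
            return message[-1]['content']

    lm = EchoLM()
    print(lm.chat_completion('hello'))
    lm.close()  # no-op here; VLLMServer overrides close() to shut the server down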
adtools/lm/openai_api.py
ADDED
@@ -0,0 +1,72 @@
+"""
+Copyright (c) 2025 Rui Zhang <rzhang.cs@gmail.com>
+
+NOTICE: This code is under MIT license. This code is intended for academic/research purposes only.
+Commercial use of this software or its derivatives requires prior written permission.
+"""
+
+import os
+from typing import List
+
+import openai.types.chat
+
+from .lm_base import LanguageModel
+
+
+class OpenAIAPI(LanguageModel):
+    def __init__(
+            self,
+            model: str,
+            base_url: str = None,
+            api_key: str = None,
+            **openai_init_kwargs
+    ):
+        super().__init__()
+        # If base_url is set to None, find 'OPENAI_BASE_URL' in environment variables
+        if base_url is None:
+            if 'OPENAI_BASE_URL' not in os.environ:
+                raise RuntimeError('If "base_url" is None, the environment variable OPENAI_BASE_URL must be set.')
+            else:
+                base_url = os.environ['OPENAI_BASE_URL']
+
+        # If api_key is set to None, find 'OPENAI_API_KEY' in environment variables
+        if api_key is None:
+            if 'OPENAI_API_KEY' not in os.environ:
+                raise RuntimeError('If "api_key" is None, OPENAI_API_KEY must be set.')
+            else:
+                api_key = os.environ['OPENAI_API_KEY']
+
+        self._model = model
+        self._client = openai.OpenAI(
+            api_key=api_key,
+            base_url=base_url,
+            **openai_init_kwargs
+        )
+
+    def chat_completion(
+            self,
+            message: str | List[openai.types.chat.ChatCompletionMessageParam],
+            max_tokens: int,
+            timeout_seconds: float,
+            *args,
+            **kwargs
+    ):
+        """Send a chat completion query with OpenAI format to the vLLM server. Return the response content.
+        Args:
+            message: The message in str or openai format.
+            max_tokens: The maximum number of tokens to generate.
+            timeout_seconds: The timeout seconds.
+        """
+        if isinstance(message, str):
+            message = [{'role': 'user', 'content': message.strip()}]
+
+        response = self._client.chat.completions.create(
+            model=self._model,
+            messages=message,
+            stream=False,
+            max_tokens=max_tokens,
+            timeout=timeout_seconds,
+            *args,
+            **kwargs,
+        )
+        return response.choices[0].message.content
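A usage sketch for the new OpenAIAPI wrapper. The model name, endpoint, and key below are placeholders; as the constructor above shows, base_url and api_key may instead come from the OPENAI_BASE_URL and OPENAI_API_KEY environment variables:

    from adtools.lm.openai_api import OpenAIAPI

    lm = OpenAIAPI(
        model='gpt-4o-mini',                   # placeholder model name
        base_url='https://api.openai.com/v1',  # or rely on OPENAI_BASE_URL
        api_key='sk-...',                      # or rely on OPENAI_API_KEY
    )
    reply = lm.chat_completion('Say hello in one word.', max_tokens=16, timeout_seconds=30)
    print(reply)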
adtools/lm/vllm_server.py
ADDED
@@ -0,0 +1,331 @@
+"""
+Copyright (c) 2025 Rui Zhang <rzhang.cs@gmail.com>
+
+NOTICE: This code is under MIT license. This code is intended for academic/research purposes only.
+Commercial use of this software or its derivatives requires prior written permission.
+"""
+
+try:
+    import vllm
+except ImportError:
+    raise ImportError('Python package "vllm" is not installed.')
+
+try:
+    import requests
+except ImportError:
+    raise ImportError('Python package "requests" is not installed.')
+
+from typing import Optional, List, Literal, Dict, Any
+import os
+import subprocess
+import sys
+from pathlib import Path
+import psutil
+import time
+
+import openai.types.chat
+
+from .lm_base import LanguageModel
+
+
+def _print_cmd_list(cmd_list, gpus, host, port):
+    print('\n' + '=' * 80)
+    print(f'[vLLM] Launching vLLM on GPU:{gpus}; URL: https://{host}:{port}')
+    print('=' * 80)
+    cmd = cmd_list[0] + ' \\\n'
+    for c in cmd_list[1:]:
+        cmd += ' ' + c + ' \\\n'
+    print(cmd.strip())
+    print('=' * 80 + '\n', flush=True)
+
+
+class VLLMServer(LanguageModel):
+    def __init__(
+            self,
+            model_path: str,
+            port: int,
+            gpus: int | list[int],
+            tokenizer_path: Optional[str] = None,
+            max_model_len: int = 16384,
+            max_lora_rank: Optional[int] = None,
+            host: str = '0.0.0.0',
+            mem_util: float = 0.85,
+            deploy_timeout_seconds: int = 600,
+            enforce_eager: bool = False,
+            vllm_log_level: Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] = 'INFO',
+            silent_mode: bool = False,
+            env_variable_dict: Optional[Dict[str, str]] = None,
+            vllm_serve_args: Optional[List[str]] = None,
+            vllm_serve_kwargs: Optional[Dict[str, str]] = None,
+            chat_template_kwargs: Optional[Dict[str, Any]] = None
+    ):
+        """Deploy an LLM on specified GPUs.
+        Args:
+            model_path: Path to the model to deploy.
+            tokenizer_path: Path to the tokenizer to use.
+            port: List of ports to deploy.
+            gpus: List of GPUs to deploy.
+            max_lora_rank: Max rank of LoRA adapter. Defaults to `None` which disables LoRA adapter.
+            host: Host address for vLLM server.
+            mem_util: Memory utility for each vLLM deployment.
+            deploy_timeout_seconds: Timeout to deploy (in seconds).
+            enforce_eager: Enforce eager mode.
+            vllm_log_level: Log level of vLLM server.
+            silent_mode: Silent mode.
+            env_variable_dict: Environment variables to use for vLLM server, e.g., {'KEY': 'VALUE'}.
+            vllm_serve_args: Arguments to pass to vLLM server, e.g., ['--enable-reasoning'].
+            vllm_serve_kwargs: Keyword arguments to pass to vLLM server, e.g., {'--reasoning-parser': 'deepseek-r1'}.
+
+        Example:
+            # deploy a model on GPU 0 and 1
+            llm = VLLMServer(
+                model_path='path/to/model',
+                tokenizer_path='path/to/tokenizer',
+                gpus=[0, 1],  # set gpus=0 or gpus=[0] if you only use one GPU
+                port=12001,
+                mem_util=0.8
+            )
+            # draw sample using base model
+            llm.draw_sample('hello')
+
+            # load adapter and draw sample
+            llm.load_lora_adapter('adapter_1', '/path/to/adapter')
+            llm.draw_sample('hello', lora_name='adapter_1')
+
+            # unload adapter
+            llm.unload_lora_adapter('adapter_1')
+
+            # release resources
+            llm.close()
+        """
+        self._model_path = model_path
+        self._port = port
+        self._gpus = gpus
+        self._tokenizer_path = tokenizer_path if tokenizer_path is not None else model_path
+        self._max_model_len = max_model_len
+        self._max_lora_rank = max_lora_rank
+        self._host = host
+        self._mem_util = mem_util
+        self._deploy_timeout_seconds = deploy_timeout_seconds
+        self._enforce_eager = enforce_eager
+        self._vllm_log_level = vllm_log_level
+        self._silent_mode = silent_mode
+        self._env_variable_dict = env_variable_dict
+        self._vllm_serve_args = vllm_serve_args
+        self._vllm_serve_kwargs = vllm_serve_kwargs
+        self._chat_template_kwargs = chat_template_kwargs
+
+        # Deploy vLLMs
+        self._process = self._launch_vllm()
+        self._wait_for_vllm()
+
+    def _launch_vllm(self):
+        """Launch a vLLM server and return the subprocess.
+        """
+        if isinstance(self._gpus, int):
+            gpus = str(self._gpus)
+        else:
+            gpus = ','.join([str(g) for g in self._gpus])
+
+        executable_path = sys.executable
+        cmd = [
+            executable_path, '-m',
+            'vllm.entrypoints.openai.api_server',
+            '--model', self._model_path,
+            '--tokenizer', self._tokenizer_path,
+            '--max_model_len', str(self._max_model_len),
+            '--host', self._host,
+            '--port', str(self._port),
+            '--gpu-memory-utilization', str(self._mem_util),
+            '--tensor-parallel-size', str(len(self._gpus)) if isinstance(self._gpus, list) else '1',
+            '--trust-remote-code',
+            '--chat-template-content-format', 'string',
+        ]
+
+        if self._enforce_eager:
+            cmd.append('--enforce_eager')
+
+        # Other args for vllm serve
+        if self._vllm_serve_args is not None:
+            for arg in self._vllm_serve_args:
+                cmd.append(arg)
+
+        # Other kwargs for vllm serve
+        if self._vllm_serve_kwargs is not None:
+            for kwarg, value in self._vllm_serve_kwargs.items():
+                cmd.extend([kwarg, value])
+
+        # Environmental variables
+        env = os.environ.copy()
+        env['CUDA_VISIBLE_DEVICES'] = gpus
+        env['VLLM_LOGGING_LEVEL'] = self._vllm_log_level
+
+        # FIXME: These code are required for my machine :(
+        # FIXME: This may due to the bad NCCL configuration :(
+        if isinstance(self._gpus, list) and len(self._gpus) > 1:
+            # set NCCL environment variable
+            env['NCCL_P2P_DISABLE'] = '1'
+            # disable custom all reduce
+            cmd.append('--disable-custom-all-reduce')
+
+        # Enable LoRA dynamic loading
+        if self._max_lora_rank is not None:
+            cmd.extend([
+                '--enable-lora',
+                '--max-lora-rank', str(self._max_lora_rank),
+            ])
+            env['VLLM_ALLOW_RUNTIME_LORA_UPDATING'] = 'True'
+
+        # Other env variables
+        if self._env_variable_dict is not None:
+            for k, v in self._env_variable_dict.items():
+                env[k] = v
+
+        _print_cmd_list(cmd, gpus=self._gpus, host=self._host, port=self._port)
+
+        # Launch vllm using subprocess
+        stdout = Path(os.devnull).open('w') if self._silent_mode else None
+        proc = subprocess.Popen(cmd, env=env, stdout=stdout, stderr=subprocess.STDOUT)
+        return proc
+
+    def _kill_vllm_process(self):
+        try:
+            # Get child processes before terminating parent
+            try:
+                parent = psutil.Process(self._process.pid)
+                children = parent.children(recursive=True)
+            except psutil.NoSuchProcess:
+                children = []
+
+            # Terminate parent process
+            self._process.terminate()
+            self._process.wait(timeout=5)
+            print(f'[vLLM] terminated process: {self._process.pid}')
+
+            # Kill any remaining children
+            for child in children:
+                try:
+                    child.terminate()
+                    child.wait(timeout=2)
+                except (psutil.NoSuchProcess, psutil.TimeoutExpired):
+                    try:
+                        child.kill()
+                    except psutil.NoSuchProcess:
+                        pass
+        except subprocess.TimeoutExpired:
+            self._process.kill()
+            print(f'[vLLM] killed process: {self._process.pid}')
+
+    def _wait_for_vllm(self):
+        """Check each vLLM server's state and check /health. Kill all vLLM server processes if timeout.
+        """
+        for _ in range(self._deploy_timeout_seconds):
+            # check process status
+            if self._process.poll() is not None:
+                sys.exit(f'[vLLM] crashed (exit {self._process.returncode})')
+
+            # check server status
+            health = f'http://{self._host}:{self._port}/health'
+            try:
+                if requests.get(health, timeout=1).status_code == 200:
+                    return
+            except Exception:
+                pass
+            time.sleep(1)
+
+        # Servers fail to initialize
+        print('[vLLM] failed to start within timeout')
+        self._kill_vllm_process()
+        sys.exit('[vLLM] failed to start within timeout')
+
+    def unload_lora_adapter(self, lora_name: str):
+        """Unload lora adapter given the lora name.
+        Args:
+            lora_name: Lora adapter name.
+        """
+        lora_api_url = f'http://{self._host}:{self._port}/v1/unload_lora_adapter'
+        headers = {'Content-Type': 'application/json'}
+        try:
+            payload = {'lora_name': lora_name}
+            requests.post(lora_api_url, json=payload, headers=headers, timeout=10)
+        except requests.exceptions.RequestException:
+            pass
+
+    def load_lora_adapter(self, lora_name: str, new_adapter_path: str, num_trails: int = 5):
+        """Dynamically load a LoRA adapter.
+        Args:
+            lora_name: LoRA adapter name.
+            new_adapter_path: Path to the new LoRA adapter weights.
+        """
+        # First unload lora adapter
+        self.unload_lora_adapter(lora_name)
+
+        if self._max_lora_rank is None:
+            raise ValueError('LoRA is not enabled for this VLLMServer instance, since "max_lora_rank" is not set.')
+
+        # Prepare the payload for LoRA update
+        payload = {'lora_name': lora_name, 'lora_path': new_adapter_path}
+        headers = {'Content-Type': 'application/json'}
+        lora_api_url = f'http://{self._host}:{self._port}/v1/load_lora_adapter'
+
+        # Repeatedly trying to load lora adapters
+        for i in range(num_trails):
+            try:
+                response = requests.post(lora_api_url, json=payload, headers=headers, timeout=60)
+                if response.status_code == 200:
+                    print(f'[vLLM] Successfully load LoRA adapter: {lora_name} from {new_adapter_path}')
+                else:
+                    print(f'[vLLM] Failed to load LoRA adapter. '
+                          f'Status code: {response.status_code}, Response: {response.text}')
+                return True
+            except requests.exceptions.RequestException:
+                continue
+
+        print(f'[vLLM] Error loading LoRA adapter.')
+        return False
+
+    def close(self):
+        """Shut down vLLM server and kill all vLLM processes."""
+        self._kill_vllm_process()
+
+    def chat_completion(
+            self,
+            message: str | List[openai.types.chat.ChatCompletionMessageParam],
+            max_tokens: Optional[int] = None,
+            timeout_seconds: Optional[int] = None,
+            lora_name: Optional[str] = None,
+            temperature: float = 0.9,
+            top_p: float = 0.9,
+            chat_template_kwargs: Optional[Dict[str, Any]] = None
+    ) -> str:
+        """Send a chat completion query with OpenAI format to the vLLM server. Return the response content.
+        Args:
+            message: The message in str or openai format.
+            max_tokens: The maximum number of tokens to generate.
+            timeout_seconds: The timeout seconds.
+            lora_name: Lora adapter name. Defaults to None which uses base model.
+            temperature: The temperature parameter.
+            top_p: The top p parameter.
+            chat_template_kwargs: The chat template kwargs, e.g., {'enable_thinking': False}.
+        """
+        data = {
+            'messages': [
+                {'role': 'user', 'content': message.strip()} if isinstance(message, str) else message
+            ],
+            'temperature': temperature,
+            'top_p': top_p,
+            'max_tokens': max_tokens,
+        }
+        # Use the specified lora adapter
+        if lora_name is not None:
+            data['model'] = lora_name
+        # Chat template keyword args
+        if self._chat_template_kwargs is not None:
+            data['chat_template_kwargs'] = self._chat_template_kwargs
+        elif chat_template_kwargs is not None:
+            data['chat_template_kwargs'] = chat_template_kwargs
+        # Request
+        url = f'http://{self._host}:{self._port}/v1/chat/completions'
+        headers = {'Content-Type': 'application/json'}
+        response = requests.post(url, headers=headers, json=data, timeout=timeout_seconds)
+        return response.json()['choices'][0]['message']['content']
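A usage sketch for the methods defined above (chat_completion, load_lora_adapter, close); the model path, adapter path, port, and GPU ids are placeholders:

    from adtools.lm.vllm_server import VLLMServer

    llm = VLLMServer(
        model_path='/path/to/model',  # placeholder path
        port=12001,
        gpus=[0],
        max_lora_rank=64,             # enables runtime LoRA loading
    )

    # Query the base model.
    print(llm.chat_completion('hello', max_tokens=128, timeout_seconds=120))

    # Load an adapter and query it by name.
    llm.load_lora_adapter('adapter_1', '/path/to/adapter')  # placeholder path
    print(llm.chat_completion('hello', max_tokens=128, timeout_seconds=120, lora_name='adapter_1'))

    llm.close()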
py_adtools-0.1.8.dist-info/RECORD
ADDED
@@ -0,0 +1,13 @@
+adtools/__init__.py,sha256=kbxntZFeCcURiIypNOdMWyeKPdlzRsWOB-K7z6HNCsc,150
+adtools/evaluator.py,sha256=A33E0HmLsIMtMbdjRmfPzck9gDodJc96gvXR0cL_SR0,10577
+adtools/evaluator_pool.py,sha256=zfQ7DgAjpByqPM5SE1tRQT_HGXU5uwNntvguzrXaPEk,3258
+adtools/py_code.py,sha256=FZfkp-IZ4zpOjrWe6svKNJsQhVANaTTkE0l0mc4aMW8,14277
+adtools/lm/__init__.py,sha256=PZf5Lraly9rAWz-cxOSLCvZ9OZ4EI8aQEluetvNX8LM,146
+adtools/lm/lm_base.py,sha256=KtO7KTrrMW7oWN-BhncoIOsbOVQsSc-0gNCYtvR6Sog,1105
+adtools/lm/openai_api.py,sha256=LcfLkNOBrJTdsp0zcUjaCelIcQK5XknpHWrlB0S67_k,2390
+adtools/lm/vllm_server.py,sha256=BPZoTS77wNJDcJ_0FO2QFyZTf6WR0isYKMuTctqKEU8,12942
+py_adtools-0.1.8.dist-info/licenses/LICENSE,sha256=E5GGyecx3y5h2gcEGQloF-rDY9wbaef5IHjRsvtFbt8,1065
+py_adtools-0.1.8.dist-info/METADATA,sha256=22hW8kcx1OxxlcedTDYIX1EzfB79pRYEtU3cXptveE8,6386
+py_adtools-0.1.8.dist-info/WHEEL,sha256=lTU6B6eIfYoiQJTZNc-fyaR6BpL6ehTzU3xGYxn2n8k,91
+py_adtools-0.1.8.dist-info/top_level.txt,sha256=X2kKzmJFDAKR2FWCij5pfMG9pVVjVUomyl4e-1VLXIk,8
+py_adtools-0.1.8.dist-info/RECORD,,
py_adtools-0.1.6.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-adtools/__init__.py,sha256=kbxntZFeCcURiIypNOdMWyeKPdlzRsWOB-K7z6HNCsc,150
-adtools/evaluator.py,sha256=weA6zR1WyUE3f5pt7wQYF1ukmkA-e2kDLaogbDmG_Ig,9154
-adtools/evaluator_pool.py,sha256=v_NZibN4VI3STVUZt6ARdyoB4Z061xAefZlH8lkWsjE,2972
-adtools/py_code.py,sha256=FZfkp-IZ4zpOjrWe6svKNJsQhVANaTTkE0l0mc4aMW8,14277
-py_adtools-0.1.6.dist-info/licenses/LICENSE,sha256=E5GGyecx3y5h2gcEGQloF-rDY9wbaef5IHjRsvtFbt8,1065
-py_adtools-0.1.6.dist-info/METADATA,sha256=4Se0O0Fvi1xAhbYJPP8oBP6Wq9y6LJFH-JJeV8H-AVw,6386
-py_adtools-0.1.6.dist-info/WHEEL,sha256=lTU6B6eIfYoiQJTZNc-fyaR6BpL6ehTzU3xGYxn2n8k,91
-py_adtools-0.1.6.dist-info/top_level.txt,sha256=X2kKzmJFDAKR2FWCij5pfMG9pVVjVUomyl4e-1VLXIk,8
-py_adtools-0.1.6.dist-info/RECORD,,
{py_adtools-0.1.6.dist-info → py_adtools-0.1.8.dist-info}/WHEEL
File without changes
{py_adtools-0.1.6.dist-info → py_adtools-0.1.8.dist-info}/licenses/LICENSE
File without changes
{py_adtools-0.1.6.dist-info → py_adtools-0.1.8.dist-info}/top_level.txt
File without changes