speedy-utils 1.0.5__py3-none-any.whl → 1.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
--- llm_utils/scripts/vllm_serve.py
+++ llm_utils/scripts/vllm_serve.py
@@ -9,19 +9,17 @@ Serve a base model:
     svllm serve --model MODEL_NAME --gpus GPU_GROUPS

 Add a LoRA to a served model:
-    svllm add-lora --lora LORA_NAME LORA_PATH --host_port host:port (if add then the port must be specify)
+    svllm add-lora --lora LORA_NAME LORA_PATH --host_port host:port
+    (if add then the port must be specify)
 """

-from glob import glob
 import os
 import subprocess
-import time
-from typing import List, Literal, Optional
-from fastcore.script import call_parse
-from loguru import logger
+from typing import List, Optional
 import argparse
 import requests
 import openai
+from loguru import logger

 from speedy_utils.common.utils_io import load_by_ext

@@ -32,63 +30,22 @@ HF_HOME: str = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingfac
 logger.info(f"LORA_DIR: {LORA_DIR}")


-def model_list(host_port, api_key="abc"):
+def model_list(host_port: str, api_key: str = "abc") -> None:
+    """List models from the vLLM server."""
     client = openai.OpenAI(base_url=f"http://{host_port}/v1", api_key=api_key)
     models = client.models.list()
     for model in models:
         print(f"Model ID: {model.id}")


-def kill_existing_vllm(vllm_binary: Optional[str] = None) -> None:
-    """Kill selected vLLM processes using fzf."""
-    if not vllm_binary:
-        vllm_binary = get_vllm()
-
-    # List running vLLM processes
-    result = subprocess.run(
-        f"ps aux | grep {vllm_binary} | grep -v grep",
-        shell=True,
-        capture_output=True,
-        text=True,
-    )
-    processes = result.stdout.strip().split("\n")
-
-    if not processes or processes == [""]:
-        print("No running vLLM processes found.")
-        return
-
-    # Use fzf to select processes to kill
-    fzf = subprocess.Popen(
-        ["fzf", "--multi"],
-        stdin=subprocess.PIPE,
-        stdout=subprocess.PIPE,
-        text=True,
-    )
-    selected, _ = fzf.communicate("\n".join(processes))
-
-    if not selected:
-        print("No processes selected.")
-        return
-
-    # Extract PIDs and kill selected processes
-    pids = [line.split()[1] for line in selected.strip().split("\n")]
-    for pid in pids:
-        subprocess.run(
-            f"kill -9 {pid}",
-            shell=True,
-            stdout=subprocess.DEVNULL,
-            stderr=subprocess.DEVNULL,
-        )
-    print(f"Killed processes: {', '.join(pids)}")
-
-
 def add_lora(
     lora_name_or_path: str,
     host_port: str,
     url: str = "http://HOST:PORT/v1/load_lora_adapter",
     served_model_name: Optional[str] = None,
-    lora_module: Optional[str] = None,  # Added parameter
+    lora_module: Optional[str] = None,
 ) -> dict:
+    """Add a LoRA adapter to a running vLLM server."""
     url = url.replace("HOST:PORT", host_port)
     headers = {"Content-Type": "application/json"}

@@ -96,15 +53,12 @@ def add_lora(
         "lora_name": served_model_name,
         "lora_path": os.path.abspath(lora_name_or_path),
     }
-    if lora_module:  # Include lora_module if provided
+    if lora_module:
         data["lora_module"] = lora_module
     logger.info(f"{data=}, {headers}, {url=}")
-    # logger.warning(f"Failed to unload LoRA adapter: {str(e)}")
     try:
-        response = requests.post(url, headers=headers, json=data)
+        response = requests.post(url, headers=headers, json=data, timeout=10)
         response.raise_for_status()
-
-        # Handle potential non-JSON responses
         try:
             return response.json()
         except ValueError:
@@ -116,113 +70,100 @@ def add_lora(
                     else "Request completed with empty response"
                 ),
             }
-
     except requests.exceptions.RequestException as e:
         logger.error(f"Request failed: {str(e)}")
         return {"error": f"Request failed: {str(e)}"}


-def unload_lora(lora_name, host_port):
+def unload_lora(lora_name: str, host_port: str) -> Optional[dict]:
+    """Unload a LoRA adapter from a running vLLM server."""
     try:
         url = f"http://{host_port}/v1/unload_lora_adapter"
         logger.info(f"{url=}")
         headers = {"Content-Type": "application/json"}
         data = {"lora_name": lora_name}
         logger.info(f"Unloading LoRA adapter: {data=}")
-        response = requests.post(url, headers=headers, json=data)
+        response = requests.post(url, headers=headers, json=data, timeout=10)
         response.raise_for_status()
         logger.success(f"Unloaded LoRA adapter: {lora_name}")
     except requests.exceptions.RequestException as e:
         return {"error": f"Request failed: {str(e)}"}


-def serve(
-    model: str,
-    gpu_groups: str,
-    served_model_name: Optional[str] = None,
-    port_start: int = 8155,
-    gpu_memory_utilization: float = 0.93,
-    dtype: str = "bfloat16",
-    max_model_len: int = 8192,
-    enable_lora: bool = False,
-    is_bnb: bool = False,
-    eager: bool = False,
-    lora_modules: Optional[List[str]] = None,  # Updated type
-) -> None:
-    """Main function to start or kill vLLM containers."""
-
+def serve(args) -> None:
     """Start vLLM containers with dynamic args."""
     print("Starting vLLM containers...,")
-    gpu_groups_arr: List[str] = gpu_groups.split(",")
-    VLLM_BINARY: str = get_vllm()
-    if enable_lora:
-        VLLM_BINARY = "VLLM_ALLOW_RUNTIME_LORA_UPDATING=True " + VLLM_BINARY
-
-    # Auto-detect quantization based on model name if not explicitly set
-    if not is_bnb and model and ("bnb" in model.lower() or "4bit" in model.lower()):
-        is_bnb = True
-        print(f"Auto-detected quantization for model: {model}")
-
-    # Set environment variables for LoRA if needed
-    if enable_lora:
+    gpu_groups_arr: List[str] = args.gpu_groups.split(",")
+    vllm_binary: str = get_vllm()
+    if args.enable_lora:
+        vllm_binary = "VLLM_ALLOW_RUNTIME_LORA_UPDATING=True " + vllm_binary
+
+    if (
+        not args.bnb
+        and args.model
+        and ("bnb" in args.model.lower() or "4bit" in args.model.lower())
+    ):
+        args.bnb = True
+        print(f"Auto-detected quantization for model: {args.model}")
+
+    if args.enable_lora:
         os.environ["VLLM_ALLOW_RUNTIME_LORA_UPDATING"] = "True"
         print("Enabled runtime LoRA updating")

     for i, gpu_group in enumerate(gpu_groups_arr):
-        port = port_start + i
+        port = int(args.host_port.split(":")[-1]) + i
         gpu_group = ",".join([str(x) for x in gpu_group])
         tensor_parallel = len(gpu_group.split(","))

         cmd = [
             f"CUDA_VISIBLE_DEVICES={gpu_group}",
-            VLLM_BINARY,
+            vllm_binary,
             "serve",
-            model,
+            args.model,
             "--port",
             str(port),
             "--tensor-parallel",
             str(tensor_parallel),
             "--gpu-memory-utilization",
-            str(gpu_memory_utilization),
+            str(args.gpu_memory_utilization),
             "--dtype",
-            dtype,
+            args.dtype,
             "--max-model-len",
-            str(max_model_len),
+            str(args.max_model_len),
             "--enable-prefix-caching",
             "--disable-log-requests",
             "--uvicorn-log-level critical",
         ]
         if HF_HOME:
-            # insert
             cmd.insert(0, f"HF_HOME={HF_HOME}")
-        if eager:
+        if args.eager:
             cmd.append("--enforce-eager")

-        if served_model_name:
-            cmd.extend(["--served-model-name", served_model_name])
+        if args.served_model_name:
+            cmd.extend(["--served-model-name", args.served_model_name])

-        if is_bnb:
+        if args.bnb:
             cmd.extend(
                 ["--quantization", "bitsandbytes", "--load-format", "bitsandbytes"]
             )

-        if enable_lora:
+        if args.enable_lora:
             cmd.extend(["--fully-sharded-loras", "--enable-lora"])

-        if lora_modules:
-            # for lora_module in lora_modules:
-            # len must be even and we will join tuple with `=`
-            assert len(lora_modules) % 2 == 0, "lora_modules must be even"
-            # lora_modulle = [f'{name}={module}' for name, module in zip(lora_module[::2], lora_module[1::2])]
-            # import ipdb;ipdb.set_trace()
+        if args.lora_modules:
+            assert len(args.lora_modules) % 2 == 0, "lora_modules must be even"
             s = ""
-            for i in range(0, len(lora_modules), 2):
-                name = lora_modules[i]
-                module = lora_modules[i + 1]
+            for i in range(0, len(args.lora_modules), 2):
+                name = args.lora_modules[i]
+                module = args.lora_modules[i + 1]
                 s += f"{name}={module} "
-
             cmd.extend(["--lora-modules", s])
-            # add kwargs
+
+        if hasattr(args, "enable_reasoning") and args.enable_reasoning:
+            cmd.extend(["--enable-reasoning", "--reasoning-parser", "deepseek_r1"])
+            # Add VLLM_USE_V1=0 to the environment for reasoning mode
+            cmd.insert(0, "VLLM_USE_V1=0")
+
         final_cmd = " ".join(cmd)
         log_file = f"/tmp/vllm_{port}.txt"
         final_cmd_with_log = f'"{final_cmd} 2>&1 | tee {log_file}"'
@@ -235,14 +176,15 @@ def serve(
         os.system(run_in_tmux)


-def get_vllm():
-    VLLM_BINARY = subprocess.check_output("which vllm", shell=True, text=True).strip()
-    VLLM_BINARY = os.getenv("VLLM_BINARY", VLLM_BINARY)
-    logger.info(f"vLLM binary: {VLLM_BINARY}")
+def get_vllm() -> str:
+    """Get the vLLM binary path."""
+    vllm_binary = subprocess.check_output("which vllm", shell=True, text=True).strip()
+    vllm_binary = os.getenv("VLLM_BINARY", vllm_binary)
+    logger.info(f"vLLM binary: {vllm_binary}")
     assert os.path.exists(
-        VLLM_BINARY
-    ), f"vLLM binary not found at {VLLM_BINARY}, please set VLLM_BINARY env variable"
-    return VLLM_BINARY
+        vllm_binary
+    ), f"vLLM binary not found at {vllm_binary}, please set VLLM_BINARY env variable"
+    return vllm_binary


 def get_args():
@@ -330,6 +272,9 @@ def get_args():
         type=str,
         help="List of LoRA modules in the format lora_name lora_module",
     )
+    parser.add_argument(
+        "--enable-reasoning", action="store_true", help="Enable reasoning"
+    )
     return parser.parse_args()

@@ -371,23 +316,8 @@ def main():
         logger.info(f"Model name from LoRA config: {model_name}")
         args.model = model_name
         # port_start from hostport
-        port_start = int(args.host_port.split(":")[-1])
-        serve(
-            args.model,
-            args.gpu_groups,
-            args.served_model_name,
-            port_start,
-            args.gpu_memory_utilization,
-            args.dtype,
-            args.max_model_len,
-            args.enable_lora,
-            args.bnb,
-            args.eager,
-            args.lora_modules,
-        )
+        serve(args)

-    elif args.mode == "kill":
-        kill_existing_vllm(args.vllm_binary)
     elif args.mode == "add_lora":
         if args.lora:
             lora_name, lora_path = args.lora
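The LoRA helpers above boil down to two POST requests against vLLM's runtime-adapter endpoints, which are only enabled when the server is started with VLLM_ALLOW_RUNTIME_LORA_UPDATING=True (as `svllm serve --lora` does). A minimal sketch of the equivalent calls; `localhost:8155`, the adapter name, and the adapter path are placeholders:

```bash
# Roughly what add_lora() sends: register an adapter on the running server
curl -X POST http://localhost:8155/v1/load_lora_adapter \
  -H "Content-Type: application/json" \
  -d '{"lora_name": "my-adapter", "lora_path": "/abs/path/to/adapter"}'

# Roughly what unload_lora() sends: remove the adapter again
curl -X POST http://localhost:8155/v1/unload_lora_adapter \
  -H "Content-Type: application/json" \
  -d '{"lora_name": "my-adapter"}'
```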
--- speedy_utils-1.0.5.dist-info/METADATA
+++ speedy_utils-1.0.11.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: speedy-utils
-Version: 1.0.5
+Version: 1.0.11
 Summary: Fast and easy-to-use package for data science
 Author: AnhVTH
 Author-email: anhvth.226@gmail.com
@@ -54,9 +54,6 @@ Description-Content-Type: text/markdown
 - [Data Manipulation](#data-manipulation)
 - [Utility Functions](#utility-functions)
 - [Testing](#testing)
-- [Deployment](#deployment)
-- [Contributing](#contributing)
-- [License](#license)

 ## Features

@@ -84,6 +81,18 @@ cd speedy-utils
 pip install .
 ```

+## Updating from previous versions
+
+To update from previous versions or switch to v1.x, first uninstall any old
+packages, then install the latest version:
+
+```bash
+pip uninstall speedy_llm_utils speedy_utils
+pip install -e ./ # for local development
+# or
+pip install speedy_utils -U # for PyPI upgrade
+```
+
 ## Usage

 Below are examples demonstrating how to utilize various features of **Speedy Utils**.
@@ -262,7 +271,6 @@ Ensure all dependencies are installed before running tests:
 pip install -r requirements.txt
 ```

-
 Run the script to parse and display the arguments:

 ```bash
--- speedy_utils-1.0.5.dist-info/RECORD
+++ speedy_utils-1.0.11.dist-info/RECORD
@@ -1,11 +1,14 @@
-llm_utils/__init__.py,sha256=2g0XXQLj9WsGmWE4UQ9YaaYGfF5ZUyvc1hfR2OQIVSo,679
-llm_utils/chat_format.py,sha256=ZY2HYv3FPL2xiMxbbO-huIwT5LZrcJm_if_us-2eSZ4,15094
+llm_utils/__init__.py,sha256=ibEVUPkL11M4htL-3uXkSyyUZiIO-TZD6IzWVmi8QYw,697
+llm_utils/chat_format/__init__.py,sha256=8dBIUqFJvkgQYedxBtcyxt-4tt8JxAKVap2JlTXmgaM,737
+llm_utils/chat_format/display.py,sha256=a3zWzo47SUf4i-uic-dwf-vxtu6gZWLbnJrszjjZjQ8,9801
+llm_utils/chat_format/transform.py,sha256=328V18FOgRQzljAl9Mh8NF4Tl-N3cZZIPmAwHQspXCY,5461
+llm_utils/chat_format/utils.py,sha256=xTxN4HrLHcRO2PfCTR43nH1M5zCa7v0kTTdzAcGkZg0,1229
 llm_utils/group_messages.py,sha256=wyiZzs7O8yK2lyIakV2x-1CrrWVT12sjnP1vVnmPet4,3606
-llm_utils/lm/__init__.py,sha256=a4N_hh0JuB2FEEzxNiE8GwTIbd8enm6mj_lw4TBxreI,76
-llm_utils/lm/base_lm.py,sha256=ZbuLagAPZsA4Oa2eIkoDnsldqMf4pKtz442LtWgMrMk,10704
+llm_utils/lm/__init__.py,sha256=vXFILZLBmmpg39cy5XniQPSMzoFQCE3wdfz39EtqDKU,71
+llm_utils/lm/lm.py,sha256=4bEo4nnyCi_ybTOYfzrJz9AwpxJNkzRFAUPq7KpBklw,16695
 llm_utils/lm/utils.py,sha256=-fDNueiXKQI6RDoNHJYNyORomf2XlCf2doJZ3GEV2Io,4762
-llm_utils/scripts/vllm_load_balancer.py,sha256=uSjGd_jOmI9W9eVOhiOXbeUnZkQq9xG4bCVzhmpupcA,16096
-llm_utils/scripts/vllm_serve.py,sha256=uFS5kNXZ7kZ9rQms63LnliGEVV3rATT6dEppGTgoR0s,13910
+llm_utils/scripts/vllm_load_balancer.py,sha256=MgMnnoKWJQc-l2fspUSkyA9wxL1RkXd7wdBLJNQBlr4,17384
+llm_utils/scripts/vllm_serve.py,sha256=LlrkwfWLxdMDhfOJ-eL1VJnA4AY1Beh_cI8U6l9Xl-A,11975
 speedy_utils/__init__.py,sha256=I2bSfDIE9yRF77tnHW0vqfExDA2m1gUx4AH8C9XmGtg,1707
 speedy_utils/all.py,sha256=A9jiKGjo950eg1pscS9x38OWAjKGyusoAN5mrfweY4E,3090
 speedy_utils/common/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -21,7 +24,7 @@ speedy_utils/multi_worker/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJW
 speedy_utils/multi_worker/process.py,sha256=XwQlffxzRFnCVeKjDNBZDwFfUQHiJiuFA12MRGJVru8,6708
 speedy_utils/multi_worker/thread.py,sha256=9pXjvgjD0s0Hp0cZ6I3M0ndp1OlYZ1yvqbs_bcun_Kw,12775
 speedy_utils/scripts/mpython.py,sha256=ZzkBWI5Xw3vPoMx8xQt2x4mOFRjtwWqfvAJ5_ngyWgw,3816
-speedy_utils-1.0.5.dist-info/METADATA,sha256=QAwtVoX05-q-aPwtJN7fg6AvFRF9MF47ycEwsq-7z-0,7165
-speedy_utils-1.0.5.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
-speedy_utils-1.0.5.dist-info/entry_points.txt,sha256=fsv8_lMg62BeswoUHrqfj2u6q2l4YcDCw7AgQFg6GRw,61
-speedy_utils-1.0.5.dist-info/RECORD,,
+speedy_utils-1.0.11.dist-info/METADATA,sha256=F48tr0hmL3k-r9O2tPbUdfbBU5JHnwxVGB547eQXElU,7392
+speedy_utils-1.0.11.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
+speedy_utils-1.0.11.dist-info/entry_points.txt,sha256=rP43satgw1uHcKUAlmVxS-MTAQImL-03-WwLIB5a300,165
+speedy_utils-1.0.11.dist-info/RECORD,,
@@ -0,0 +1,5 @@
+[console_scripts]
+mpython=speedy_utils.scripts.mpython:main
+svllm=llm_utils.scripts.vllm_serve:main
+svllm-lb=llm_utils.scripts.vllm_load_balancer:run_load_balancer
+
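For reference, the console scripts declared above expose the workflow described in the module docstring of vllm_serve.py; MODEL_NAME, GPU_GROUPS, LORA_NAME, LORA_PATH, and host:port are placeholders:

```bash
# Serve a base model (svllm -> llm_utils.scripts.vllm_serve:main)
svllm serve --model MODEL_NAME --gpus GPU_GROUPS
# Attach a LoRA adapter to the already running server
svllm add-lora --lora LORA_NAME LORA_PATH --host_port host:port
```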