vec-inf 0.3.1__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vec_inf/README.md CHANGED
@@ -3,6 +3,6 @@
  * `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server, `--json-mode` supported. Check [`here`](./models/README.md) for complete list of available options.
  * `list`: List all available model names, `--json-mode` supported.
  * `status`: Check the model status by providing its Slurm job ID, `--json-mode` supported.
- * `shutdown`: Shutdown a model by providing its Slurm job ID.
+ * `shutdown`: Shutdown a model by providing its Slurm job ID.

- Use `--help` to see all available options
+ Use `--help` to see all available options
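Since every command supports `--json-mode`, the CLI can also be driven programmatically. A small illustrative sketch (the model name is an example; the `--json-mode` output is a Python-dict-style string, so it is parsed with `ast.literal_eval` rather than `json.loads`):

```python
# Illustrative: drive the CLI from a script via --json-mode (model name is an example).
import ast
import subprocess

result = subprocess.run(
    ["vec-inf", "launch", "Meta-Llama-3.1-8B-Instruct", "--json-mode"],
    capture_output=True,
    text=True,
    check=True,
)
# The CLI echoes a Python-dict-style string, so parse it with ast.literal_eval.
job_info = ast.literal_eval(result.stdout.strip())
print(job_info["slurm_job_id"])
```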
vec_inf/cli/_cli.py CHANGED
@@ -1,12 +1,12 @@
  import os
+ from typing import Optional

  import click
- from rich.console import Console
  from rich.columns import Columns
+ from rich.console import Console
  from rich.panel import Panel

- from ._utils import *
-
+ import vec_inf.cli._utils as utils

  CONSOLE = Console()

@@ -18,122 +18,107 @@ def cli():


  @cli.command("launch")
- @click.argument(
- "model-name",
- type=str,
- nargs=1
- )
- @click.option(
- "--model-family",
- type=str,
- help='The model family'
- )
- @click.option(
- "--model-variant",
- type=str,
- help='The model variant'
- )
+ @click.argument("model-name", type=str, nargs=1)
+ @click.option("--model-family", type=str, help="The model family")
+ @click.option("--model-variant", type=str, help="The model variant")
  @click.option(
  "--max-model-len",
  type=int,
- help='Model context length. If unspecified, will be automatically derived from the model config.'
- )
- @click.option(
- "--partition",
- type=str,
- help='Type of compute partition, default to a40'
+ help="Model context length. If unspecified, will be automatically derived from the model config.",
  )
+ @click.option("--partition", type=str, help="Type of compute partition, default to a40")
  @click.option(
  "--num-nodes",
  type=int,
- help='Number of nodes to use, default to suggested resource allocation for model'
+ help="Number of nodes to use, default to suggested resource allocation for model",
  )
  @click.option(
  "--num-gpus",
  type=int,
- help='Number of GPUs/node to use, default to suggested resource allocation for model'
+ help="Number of GPUs/node to use, default to suggested resource allocation for model",
  )
  @click.option(
  "--qos",
  type=str,
- help='Quality of service, default depends on suggested resource allocation required for the model'
+ help="Quality of service, default depends on suggested resource allocation required for the model",
  )
  @click.option(
  "--time",
  type=str,
- help='Time limit for job, this should comply with QoS, default to max walltime of the chosen QoS'
+ help="Time limit for job, this should comply with QoS, default to max walltime of the chosen QoS",
  )
  @click.option(
- "--data-type",
- type=str,
- help='Model data type, default to auto'
- )
- @click.option(
- "--venv",
- type=str,
- help='Path to virtual environment'
+ "--vocab-size",
+ type=int,
+ help="Vocabulary size, this option is intended for custom models",
  )
+ @click.option("--data-type", type=str, help="Model data type, default to auto")
+ @click.option("--venv", type=str, help="Path to virtual environment")
  @click.option(
  "--log-dir",
  type=str,
- help='Path to slurm log directory, default to .vec-inf-logs in home directory'
+ help="Path to slurm log directory, default to .vec-inf-logs in home directory",
  )
  @click.option(
  "--json-mode",
  is_flag=True,
- help='Output in JSON string',
+ help="Output in JSON string",
  )
  def launch(
  model_name: str,
- model_family: str=None,
- model_variant: str=None,
- max_model_len: int=None,
- partition: str=None,
- num_nodes: int=None,
- num_gpus: int=None,
- qos: str=None,
- time: str=None,
- data_type: str=None,
- venv: str=None,
- log_dir: str=None,
- json_mode: bool=False
+ model_family: Optional[str] = None,
+ model_variant: Optional[str] = None,
+ max_model_len: Optional[int] = None,
+ partition: Optional[str] = None,
+ num_nodes: Optional[int] = None,
+ num_gpus: Optional[int] = None,
+ qos: Optional[str] = None,
+ time: Optional[str] = None,
+ vocab_size: Optional[int] = None,
+ data_type: Optional[str] = None,
+ venv: Optional[str] = None,
+ log_dir: Optional[str] = None,
+ json_mode: bool = False,
  ) -> None:
  """
  Launch a model on the cluster
  """
  launch_script_path = os.path.join(
- os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
- "launch_server.sh"
+ os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "launch_server.sh"
  )
- launch_cmd = f"bash {launch_script_path}"
-
- models_df = load_models_df()
+ launch_cmd = f"bash {launch_script_path}"

- if model_name not in models_df['model_name'].values:
- raise ValueError(f"Model name {model_name} not found in available models")
+ models_df = utils.load_models_df()

- default_args = load_default_args(models_df, model_name)
+ if model_name in models_df["model_name"].values:
+ default_args = utils.load_default_args(models_df, model_name)
+ for arg in default_args:
+ if arg in locals() and locals()[arg] is not None:
+ default_args[arg] = locals()[arg]
+ renamed_arg = arg.replace("_", "-")
+ launch_cmd += f" --{renamed_arg} {default_args[arg]}"
+ else:
+ model_args = models_df.columns.tolist()
+ excluded_keys = ["model_name", "pipeline_parallelism"]
+ for arg in model_args:
+ if arg not in excluded_keys and locals()[arg] is not None:
+ renamed_arg = arg.replace("_", "-")
+ launch_cmd += f" --{renamed_arg} {locals()[arg]}"

- for arg in default_args:
- if arg in locals() and locals()[arg] is not None:
- default_args[arg] = locals()[arg]
- renamed_arg = arg.replace("_", "-")
- launch_cmd += f" --{renamed_arg} {default_args[arg]}"
-
- output = run_bash_command(launch_cmd)
+ output = utils.run_bash_command(launch_cmd)

  slurm_job_id = output.split(" ")[-1].strip().strip("\n")
  output_lines = output.split("\n")[:-2]

- table = create_table(key_title="Job Config", value_title="Value")
+ table = utils.create_table(key_title="Job Config", value_title="Value")
  table.add_row("Slurm Job ID", slurm_job_id, style="blue")
  output_dict = {"slurm_job_id": slurm_job_id}
-
+
  for line in output_lines:
  key, value = line.split(": ")
  table.add_row(key, value)
  output_dict[key.lower().replace(" ", "_")] = value
-
+
  if json_mode:
  click.echo(output_dict)
  else:
@@ -141,27 +126,25 @@ def launch(


  @cli.command("status")
- @click.argument(
- "slurm_job_id",
- type=int,
- nargs=1
- )
+ @click.argument("slurm_job_id", type=int, nargs=1)
  @click.option(
  "--log-dir",
  type=str,
- help='Path to slurm log directory. This is required if --log-dir was set in model launch'
+ help="Path to slurm log directory. This is required if --log-dir was set in model launch",
  )
  @click.option(
  "--json-mode",
  is_flag=True,
- help='Output in JSON string',
+ help="Output in JSON string",
  )
- def status(slurm_job_id: int, log_dir: str=None, json_mode: bool=False) -> None:
+ def status(
+ slurm_job_id: int, log_dir: Optional[str] = None, json_mode: bool = False
+ ) -> None:
  """
  Get the status of a running model on the cluster
  """
  status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
- output = run_bash_command(status_cmd)
+ output = utils.run_bash_command(status_cmd)

  slurm_job_name = "UNAVAILABLE"
  status = "SHUTDOWN"
@@ -181,36 +164,39 @@ def status(slurm_job_id: int, log_dir: str=None, json_mode: bool=False) -> None:
  # If Slurm job is currently RUNNING
  elif slurm_job_state == "RUNNING":
  # Check whether the server is ready, if yes, run model health check to further determine status
- server_status = is_server_running(slurm_job_name, slurm_job_id, log_dir)
+ server_status = utils.is_server_running(slurm_job_name, slurm_job_id, log_dir)
  # If server status is a tuple, then server status is "FAILED"
- if type(server_status) is tuple:
+ if isinstance(server_status, tuple):
  status = server_status[0]
  slurm_job_failed_reason = server_status[1]
  elif server_status == "RUNNING":
- status = model_health_check(slurm_job_name, slurm_job_id, log_dir)
- if status == "READY":
+ model_status = utils.model_health_check(
+ slurm_job_name, slurm_job_id, log_dir
+ )
+ if model_status == "READY":
  # Only set base_url if model is ready to serve requests
- base_url = get_base_url(slurm_job_name, slurm_job_id, log_dir)
+ base_url = utils.get_base_url(slurm_job_name, slurm_job_id, log_dir)
+ status = "READY"
  else:
  # If model is not ready, then status must be "FAILED"
- status = status[0]
- slurm_job_failed_reason = status[1]
+ status = model_status[0]
+ slurm_job_failed_reason = str(model_status[1])
  else:
  status = server_status

  if json_mode:
  status_dict = {
- "model_name": slurm_job_name,
- "model_status": status,
- "base_url": base_url
+ "model_name": slurm_job_name,
+ "model_status": status,
+ "base_url": base_url,
  }
  if "slurm_job_pending_reason" in locals():
  status_dict["pending_reason"] = slurm_job_pending_reason
  if "slurm_job_failed_reason" in locals():
  status_dict["failed_reason"] = slurm_job_failed_reason
- click.echo(f'{status_dict}')
+ click.echo(f"{status_dict}")
  else:
- table = create_table(key_title="Job Status", value_title="Value")
+ table = utils.create_table(key_title="Job Status", value_title="Value")
  table.add_row("Model Name", slurm_job_name)
  table.add_row("Model Status", status, style="blue")
  if "slurm_job_pending_reason" in locals():
@@ -219,60 +205,54 @@ def status(slurm_job_id: int, log_dir: str=None, json_mode: bool=False) -> None:
  table.add_row("Reason", slurm_job_failed_reason)
  table.add_row("Base URL", base_url)
  CONSOLE.print(table)
-
+

  @cli.command("shutdown")
- @click.argument(
- "slurm_job_id",
- type=int,
- nargs=1
- )
+ @click.argument("slurm_job_id", type=int, nargs=1)
  def shutdown(slurm_job_id: int) -> None:
  """
  Shutdown a running model on the cluster
  """
  shutdown_cmd = f"scancel {slurm_job_id}"
- run_bash_command(shutdown_cmd)
+ utils.run_bash_command(shutdown_cmd)
  click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")


  @cli.command("list")
- @click.argument(
- "model-name",
- required=False)
+ @click.argument("model-name", required=False)
  @click.option(
  "--json-mode",
  is_flag=True,
- help='Output in JSON string',
+ help="Output in JSON string",
  )
- def list(model_name: str=None, json_mode: bool=False) -> None:
+ def list(model_name: Optional[str] = None, json_mode: bool = False) -> None:
  """
  List all available models, or get default setup of a specific model
  """
- models_df = load_models_df()
+ models_df = utils.load_models_df()

  if model_name:
- if model_name not in models_df['model_name'].values:
+ if model_name not in models_df["model_name"].values:
  raise ValueError(f"Model name {model_name} not found in available models")
-
- excluded_keys = {'venv', 'log_dir', 'pipeline_parallelism'}
- model_row = models_df.loc[models_df['model_name'] == model_name]
+
+ excluded_keys = {"venv", "log_dir", "pipeline_parallelism"}
+ model_row = models_df.loc[models_df["model_name"] == model_name]

  if json_mode:
  # click.echo(model_row.to_json(orient='records'))
- filtered_model_row = model_row.drop(columns=excluded_keys, errors='ignore')
- click.echo(filtered_model_row.to_json(orient='records'))
+ filtered_model_row = model_row.drop(columns=excluded_keys, errors="ignore")
+ click.echo(filtered_model_row.to_json(orient="records"))
  return
- table = create_table(key_title="Model Config", value_title="Value")
+ table = utils.create_table(key_title="Model Config", value_title="Value")
  for _, row in model_row.iterrows():
  for key, value in row.items():
  if key not in excluded_keys:
  table.add_row(key, str(value))
  CONSOLE.print(table)
  return
-
+
  if json_mode:
- click.echo(models_df['model_name'].to_json(orient='records'))
+ click.echo(models_df["model_name"].to_json(orient="records"))
  return
  panels = []
  for _, row in models_df.iterrows():
@@ -281,5 +261,5 @@ def list(model_name: str=None, json_mode: bool=False) -> None:
  CONSOLE.print(Columns(panels, equal=True))


- if __name__ == '__main__':
- cli()
+ if __name__ == "__main__":
+ cli()
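For readers skimming the diff: the reworked `launch` command now merges user-supplied CLI options over the per-model defaults loaded from `models.csv`, and for models not listed there it forwards only the explicitly provided options. A minimal, standalone sketch of that merge (illustrative names; not the packaged implementation):

```python
# Sketch of the merge the new `launch` command performs (illustrative, standalone;
# not the packaged implementation): per-model defaults come from models.csv,
# and any CLI option the user actually passed wins over the default.
def build_launch_cmd(defaults: dict, overrides: dict, script: str = "launch_server.sh") -> str:
    cmd = f"bash {script}"
    for key, default_value in defaults.items():
        value = overrides.get(key)
        chosen = value if value is not None else default_value
        cmd += f" --{key.replace('_', '-')} {chosen}"
    return cmd


if __name__ == "__main__":
    defaults = {"partition": "a40", "num_nodes": 1, "num_gpus": 1, "max_model_len": 8192}
    overrides = {"num_gpus": 4}  # e.g. --num-gpus 4 on the command line
    print(build_launch_cmd(defaults, overrides))
    # bash launch_server.sh --partition a40 --num-nodes 1 --num-gpus 4 --max-model-len 8192
```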
vec_inf/cli/_utils.py CHANGED
@@ -1,11 +1,10 @@
- import subprocess
  import os
- from typing import Union
+ import subprocess
+ from typing import Optional, Union

+ import pandas as pd
  import requests
  from rich.table import Table
- import pandas as pd
-

  MODEL_READY_SIGNATURE = "INFO: Uvicorn running on http://0.0.0.0:"
  SERVER_ADDRESS_SIGNATURE = "Server address: "
@@ -15,45 +14,50 @@ def run_bash_command(command: str) -> str:
  """
  Run a bash command and return the output
  """
- process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+ process = subprocess.Popen(
+ command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
+ )
  stdout, _ = process.communicate()
  return stdout


  def read_slurm_log(
- slurm_job_name: str,
- slurm_job_id: int,
- slurm_log_type: str,
- log_dir: str
- ) -> Union[list, str]:
+ slurm_job_name: str, slurm_job_id: int, slurm_log_type: str, log_dir: Optional[str]
+ ) -> Union[list[str], str]:
  """
  Get the directory of a model
  """
  if not log_dir:
  models_dir = os.path.join(os.path.expanduser("~"), ".vec-inf-logs")
-
+
  for dir in sorted(os.listdir(models_dir), key=len, reverse=True):
  if dir in slurm_job_name:
  log_dir = os.path.join(models_dir, dir)
  break
-
+
  try:
- file_path = os.path.join(log_dir, f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}")
- with open(file_path, 'r') as file:
+ file_path = os.path.join(
+ log_dir, # type: ignore
+ f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}",
+ )
+ with open(file_path, "r") as file:
  lines = file.readlines()
  except FileNotFoundError:
  print(f"Could not find file: {file_path}")
  return "LOG_FILE_NOT_FOUND"
  return lines

- def is_server_running(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> Union[str, tuple]:
+
+ def is_server_running(
+ slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]
+ ) -> Union[str, tuple[str, str]]:
  """
  Check if a model is ready to serve requests
  """
  log_content = read_slurm_log(slurm_job_name, slurm_job_id, "err", log_dir)
- if type(log_content) is str:
+ if isinstance(log_content, str):
  return log_content
-
+
  for line in log_content:
  if "error" in line.lower():
  return ("FAILED", line.strip("\n"))
@@ -62,21 +66,23 @@ is_server_running(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> U
  return "LAUNCHING"


- def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> str:
+ def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]) -> str:
  """
  Get the base URL of a model
  """
  log_content = read_slurm_log(slurm_job_name, slurm_job_id, "out", log_dir)
- if type(log_content) is str:
+ if isinstance(log_content, str):
  return log_content
-
+
  for line in log_content:
  if SERVER_ADDRESS_SIGNATURE in line:
  return line.split(SERVER_ADDRESS_SIGNATURE)[1].strip("\n")
  return "URL_NOT_FOUND"


- def model_health_check(slurm_job_name: str, slurm_job_id: int, log_dir: str) -> Union[str, tuple]:
+ def model_health_check(
+ slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]
+ ) -> Union[str, tuple[str, Union[str, int]]]:
  """
  Check the health of a running model on the cluster
  """
@@ -94,9 +100,11 @@ def model_health_check(slurm_job_name: str, slurm_job_id: int, log_dir: str) ->
  return ("FAILED", response.status_code)
  except requests.exceptions.RequestException as e:
  return ("FAILED", str(e))
-

- def create_table(key_title: str = "", value_title: str = "", show_header: bool = True) -> Table:
+
+ def create_table(
+ key_title: str = "", value_title: str = "", show_header: bool = True
+ ) -> Table:
  """
  Create a table for displaying model status
  """
@@ -113,7 +121,7 @@ def load_models_df() -> pd.DataFrame:
  models_df = pd.read_csv(
  os.path.join(
  os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
- "models/models.csv"
+ "models/models.csv",
  )
  )
  return models_df
@@ -126,4 +134,4 @@ def load_default_args(models_df: pd.DataFrame, model_name: str) -> dict:
  row_data = models_df.loc[models_df["model_name"] == model_name]
  default_args = row_data.iloc[0].to_dict()
  default_args.pop("model_name")
- return default_args
+ return default_args
vec_inf/launch_server.sh CHANGED
@@ -76,7 +76,7 @@ mkdir -p $LOG_DIR
  export SRC_DIR="$(dirname "$0")"
  export MODEL_DIR="${SRC_DIR}/models/${MODEL_FAMILY}"
  export VLLM_BASE_URL_FILENAME="${MODEL_DIR}/.${JOB_NAME}_url"
-
+
  # Variables specific to your working environment, below are examples for the Vector cluster
  export VLLM_MODEL_WEIGHTS="/model-weights/$JOB_NAME"
  export LD_LIBRARY_PATH="/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/"
@@ -119,4 +119,4 @@ sbatch --job-name $JOB_NAME \
  --time $WALLTIME \
  --output $LOG_DIR/$JOB_NAME.%j.out \
  --error $LOG_DIR/$JOB_NAME.%j.err \
- $SRC_DIR/${is_special}vllm.slurm
+ $SRC_DIR/${is_special}vllm.slurm
vec_inf/models/README.md CHANGED
@@ -1,13 +1,13 @@
  # Available Models
  More profiling metrics coming soon!

- ## [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
+ ## [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
  |[`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus)| 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |

- ## [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
+ ## [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
@@ -20,13 +20,13 @@ More profiling metrics coming soon!
  | [`CodeLlama-70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
  | [`CodeLlama-70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |

- ## [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
+ ## [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
  |[`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct)| 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |

- ## [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
+ ## [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
@@ -35,32 +35,32 @@ More profiling metrics coming soon!
  | [`gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
  | [`gemma-2-27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |

- ## [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
+ ## [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
  |[`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
  |[`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf)| 1x a40 | - tokens/s | - tokens/s |

- ## [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
+ ## [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
  |[`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
  |[`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)| 2x a40 | - tokens/s | - tokens/s |

- ## [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
+ ## [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)

  | Variant | Suggested resource allocation |
  |:----------:|:----------:|
- | [`Llama-2-7b-hf`](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 1x a40 |
- | [`Llama-2-7b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1x a40 |
- | [`Llama-2-13b-hf`](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 1x a40 |
+ | [`Llama-2-7b-hf`](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 1x a40 |
+ | [`Llama-2-7b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1x a40 |
+ | [`Llama-2-13b-hf`](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 1x a40 |
  | [`Llama-2-13b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 1x a40 |
  | [`Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
  | [`Llama-2-70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |

- ## [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
+ ## [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
@@ -69,7 +69,7 @@ More profiling metrics coming soon!
  | [`Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
  | [`Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |

- ## [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
+ ## [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
@@ -79,7 +79,7 @@ More profiling metrics coming soon!
  | [`Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
  | [`Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 32x a40 (8 nodes, 4 a40/node) | - tokens/s | - tokens/s |

- ## [Mistral AI: Mistral](https://huggingface.co/mistralai)
+ ## [Mistral AI: Mistral](https://huggingface.co/mistralai)

  | Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
@@ -90,7 +90,7 @@ More profiling metrics coming soon!
  |[`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)| 1x a40 | - tokens/s | - tokens/s|
  |[`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407)| 4x a40 | - tokens/s | - tokens/s|

- ## [Mistral AI: Mixtral](https://huggingface.co/mistralai)
+ ## [Mistral AI: Mixtral](https://huggingface.co/mistralai)

  | Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
@@ -98,9 +98,9 @@ More profiling metrics coming soon!
  |[`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
  |[`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|

- ## [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
+ ## [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)

  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
  |:----------:|:----------:|:----------:|:----------:|
  | [`Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
- | [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
+ | [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
vec_inf/models/models.csv CHANGED
@@ -42,4 +42,5 @@ Mixtral-8x7B-Instruct-v0.1,Mixtral,8x7B-Instruct-v0.1,a40,m2,08:00:00,4,1,32000,
  Mixtral-8x22B-v0.1,Mixtral,8x22B-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
  Mixtral-8x22B-Instruct-v0.1,Mixtral,8x22B-Instruct-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
  Phi-3-medium-128k-instruct,Phi-3,medium-128k-instruct,a40,m2,08:00:00,2,1,32064,131072,auto,singularity,default,false
- Phi-3-vision-128k-instruct,Phi-3,vision-128k-instruct,a40,m2,08:00:00,2,1,32064,65536,auto,singularity,default,false
+ Phi-3-vision-128k-instruct,Phi-3,vision-128k-instruct,a40,m2,08:00:00,2,1,32064,65536,auto,singularity,default,false
+ Llama3-OpenBioLLM-70B,Llama3-OpenBioLLM,70B,a40,m2,08:00:00,4,1,128256,8192,auto,singularity,default,false
vec_inf/multinode_vllm.slurm CHANGED
@@ -64,7 +64,7 @@ for ((i = 1; i <= worker_num; i++)); do
  ray start --address "$ip_head" \
  --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${NUM_GPUS}" --block &
  fi
-
+
  sleep 5
  done

@@ -83,7 +83,7 @@ else
  fi

  # Activate vllm venv
- if [ "$VENV_BASE" = "singularity" ]; then
+ if [ "$VENV_BASE" = "singularity" ]; then
  singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
  python3.10 -m vllm.entrypoints.openai.api_server \
  --model ${VLLM_MODEL_WEIGHTS} \
@@ -93,7 +93,6 @@ if [ "$VENV_BASE" = "singularity" ]; then
  --pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \
  --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
  --dtype ${VLLM_DATA_TYPE} \
- --load-format safetensors \
  --trust-remote-code \
  --max-logprobs ${VLLM_MAX_LOGPROBS} \
  --max-model-len ${VLLM_MAX_MODEL_LEN}
@@ -107,8 +106,7 @@ else
  --pipeline-parallel-size ${PIPELINE_PARALLEL_SIZE} \
  --tensor-parallel-size ${TENSOR_PARALLEL_SIZE} \
  --dtype ${VLLM_DATA_TYPE} \
- --load-format safetensors \
  --trust-remote-code \
  --max-logprobs ${VLLM_MAX_LOGPROBS} \
  --max-model-len ${VLLM_MAX_MODEL_LEN}
- fi
+ fi
vec_inf/vllm.slurm CHANGED
@@ -41,7 +41,7 @@ else
  --port ${vllm_port_number} \
  --tensor-parallel-size ${NUM_GPUS} \
  --dtype ${VLLM_DATA_TYPE} \
- --max-logprobs ${VLLM_MAX_LOGPROBS} \
+ --max-logprobs ${VLLM_MAX_LOGPROBS} \
  --trust-remote-code \
  --max-model-len ${VLLM_MAX_MODEL_LEN}
- fi
+ fi
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: vec-inf
- Version: 0.3.1
+ Version: 0.3.3
  Summary: Efficient LLM inference on Slurm clusters using vLLM.
  License: MIT
  Author: Marshall Wang
@@ -14,6 +14,7 @@ Classifier: Programming Language :: Python :: 3.12
  Provides-Extra: dev
  Requires-Dist: click (>=8.1.0,<9.0.0)
  Requires-Dist: cupy-cuda12x (==12.1.0) ; extra == "dev"
+ Requires-Dist: pandas (>=2.2.2,<3.0.0)
  Requires-Dist: ray (>=2.9.3,<3.0.0) ; extra == "dev"
  Requires-Dist: requests (>=2.31.0,<3.0.0)
  Requires-Dist: rich (>=13.7.0,<14.0.0)
@@ -22,7 +23,7 @@ Requires-Dist: vllm-nccl-cu12 (>=2.18,<2.19) ; extra == "dev"
  Description-Content-Type: text/markdown

  # Vector Inference: Easy inference on Slurm clusters
- This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update [`launch_server.sh`](vec-inf/launch_server.sh), [`vllm.slurm`](vec-inf/vllm.slurm), [`multinode_vllm.slurm`](vec-inf/multinode_vllm.slurm) and [`models.csv`](vec-inf/models/models.csv) accordingly.
+ This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update [`launch_server.sh`](vec-inf/launch_server.sh), [`vllm.slurm`](vec-inf/vllm.slurm), [`multinode_vllm.slurm`](vec-inf/multinode_vllm.slurm) and [`models.csv`](vec-inf/models/models.csv) accordingly.

  ## Installation
  If you are using the Vector cluster environment, and you don't need any customization to the inference server environment, run the following to install package:
@@ -40,7 +41,7 @@ You should see an output like the following:

  <img width="400" alt="launch_img" src="https://github.com/user-attachments/assets/557eb421-47db-4810-bccd-c49c526b1b43">

- The model would be launched using the [default parameters](vec-inf/models/models.csv), you can override these values by providing additional options, use `--help` to see the full list. You can also launch your own customized model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html), you'll need to specify all model launching related options to run a successful run.
+ The model would be launched using the [default parameters](vec-inf/models/models.csv), you can override these values by providing additional options, use `--help` to see the full list. You can also launch your own customized model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html), you'll need to specify all model launching related options to run a successful run.

  You can check the inference server status by providing the Slurm job ID to the `status` command:
  ```bash
@@ -55,7 +56,7 @@ There are 5 possible states:

  * **PENDING**: Job submitted to Slurm, but not executed yet. Job pending reason will be shown.
  * **LAUNCHING**: Job is running but the server is not ready yet.
- * **READY**: Inference server running and ready to take requests.
+ * **READY**: Inference server running and ready to take requests.
  * **FAILED**: Inference server in an unhealthy state. Job failed reason will be shown.
  * **SHUTDOWN**: Inference server is shutdown/cancelled.

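For reference, the `status --json-mode` output mirrors the `status_dict` built in `_cli.py`; a hypothetical example of its shape (values are illustrative):

```python
# Hypothetical `vec-inf status <job id> --json-mode` output; the keys mirror
# status_dict in _cli.py, the values here are made up for illustration.
example_status = {
    "model_name": "Meta-Llama-3.1-8B-Instruct",
    "model_status": "READY",
    "base_url": "http://gpu123:8080/v1",  # placeholder; only set once the server is READY
}
# A PENDING job carries an extra "pending_reason" key, and a FAILED one a "failed_reason" key.
```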
84
85
 
85
86
  ## Send inference requests
86
87
  Once the inference server is ready, you can start sending in inference requests. We provide example scripts for sending inference requests in [`examples`](examples) folder. Make sure to update the model server URL and the model weights location in the scripts. For example, you can run `python examples/inference/llm/completions.py`, and you should expect to see an output like the following:
87
- > {"id":"cmpl-bdf43763adf242588af07af88b070b62","object":"text_completion","created":2983960,"model":"/model-weights/Llama-2-7b-hf","choices":[{"index":0,"text":"\nCanada is close to the actual continent of North America. Aside from the Arctic islands","logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":8,"total_tokens":28,"completion_tokens":20}}
88
+ > {"id":"cmpl-c08d8946224747af9cce9f4d9f36ceb3","object":"text_completion","created":1725394970,"model":"Meta-Llama-3.1-8B-Instruct","choices":[{"index":0,"text":" is a question that many people may wonder. The answer is, of course, Ottawa. But if","logprobs":null,"finish_reason":"length","stop_reason":null}],"usage":{"prompt_tokens":8,"total_tokens":28,"completion_tokens":20}}
88
89
 
89
90
  **NOTE**: For multimodal models, currently only `ChatCompletion` is available, and only one image can be provided for each prompt.
90
91
 
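As a complement to the bundled example scripts, a minimal request sketch against the OpenAI-compatible endpoint; the base URL is a placeholder and is assumed here to already include the `/v1` prefix:

```python
# Minimal completion request to a launched server (sketch; the URL and model
# name are placeholders, use the base_url reported by `vec-inf status`).
import requests

BASE_URL = "http://gpu123:8080/v1"  # placeholder, assumed to include the /v1 prefix

response = requests.post(
    f"{BASE_URL}/completions",
    json={
        "model": "Meta-Llama-3.1-8B-Instruct",  # the model name used at launch
        "prompt": "What is the capital of Canada?",
        "max_tokens": 20,
    },
    timeout=60,
)
print(response.json())  # same shape as the example output shown above
```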
@@ -0,0 +1,15 @@
+ vec_inf/README.md,sha256=ny3ffk6FeRwk_nERimK-JQwEuysvBe5eKpNyLk_A-8k,499
+ vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ vec_inf/cli/_cli.py,sha256=XwCBkwFrN06T_o1CkUKD2nWT6P4bwOfDpVPoM3AUyUA,8984
+ vec_inf/cli/_utils.py,sha256=n37X0AcgXNEi3wOEqQFA4_iHHeGclHew6NyQaML6q7s,4034
+ vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
+ vec_inf/launch_server.sh,sha256=-efoTEIDKlJD7YhbYMgq4fFRV7H_1okjT5uKhfQAGUg,3998
+ vec_inf/models/README.md,sha256=7Vz-AMValcic5Mpi9i5FshhRUV9K8nwSnItN4O1TSvI,8124
+ vec_inf/models/models.csv,sha256=dOthlc04TyTQTin_fyt-PFDqg-lARScI9i0-tUkIgQ8,4828
+ vec_inf/multinode_vllm.slurm,sha256=KbxsKD9kV8wsB_jCEqh63BHq8h2DLmYMV46z5h2wAe0,3867
+ vec_inf/vllm.slurm,sha256=wRBkDunb0Oc1d8ESl_Dn9wRs_kIKvN_J39pL8dWAbV0,1608
+ vec_inf-0.3.3.dist-info/METADATA,sha256=IefFGb9Gb7bOwI3RjNTbTlTCL6AImzx5XBSJjCp4y8c,5751
+ vec_inf-0.3.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ vec_inf-0.3.3.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
+ vec_inf-0.3.3.dist-info/RECORD,,
@@ -1,15 +0,0 @@
- vec_inf/README.md,sha256=jtvslzw1MjTFFIXwzlrb0NstUyTEDL0S_k27K5bLl34,499
- vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- vec_inf/cli/_cli.py,sha256=8UHNFitbmq1OTNO1cLM_LVuHFndnvNyQSezGs1oT3tc,8346
- vec_inf/cli/_utils.py,sha256=2Grz-bX_mGjzxXUBdrX7MbNfXUM7JQ3399GKe-N74FE,3910
- vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
- vec_inf/launch_server.sh,sha256=BW5oK_10OjfHXhIsdf9vPsEBlCXh8j2lOV7qvSlPcZU,3998
- vec_inf/models/README.md,sha256=y_Cr1ZAkqIw1vIEOZMEp4FsyLGVijDoIoqwxn6aeQwo,8138
- vec_inf/models/models.csv,sha256=JFGMhT9o7Pf0tkY-w2GRQG5MxdYK2V5T8s6bk166MpM,4720
- vec_inf/multinode_vllm.slurm,sha256=pedYWIzPN-BKtL6ezoZSKJ3DO7RduDyAR4_cxZD4KyY,3938
- vec_inf/vllm.slurm,sha256=6Nx14qyAwHlbweCbFMUcMV2jaZSv41ghkyx2MiHJY8Y,1608
- vec_inf-0.3.1.dist-info/METADATA,sha256=xRhpXmFmMv5A77xdJaKBo_m7UXC13CkBmzegnQzQnPg,5701
- vec_inf-0.3.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- vec_inf-0.3.1.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
- vec_inf-0.3.1.dist-info/RECORD,,