vec-inf 0.3.3__py3-none-any.whl → 0.4.0.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vec_inf/README.md CHANGED
@@ -1,7 +1,8 @@
1
1
  # `vec-inf` Commands
2
2
 
3
3
  * `launch`: Specify a model family and other optional parameters to launch an OpenAI-compatible inference server, `--json-mode` supported. Check [`here`](./models/README.md) for the complete list of available options.
4
- * `list`: List all available model names, `--json-mode` supported.
4
+ * `list`: List all available model names, or append a supported model name to view the default configuration, `--json-mode` supported.
5
+ * `metrics`: Stream performance metrics to the console by providing the model's Slurm job ID.
5
6
  * `status`: Check the model status by providing its Slurm job ID, `--json-mode` supported.
6
7
  * `shutdown`: Shutdown a model by providing its Slurm job ID.
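
A quick illustrative session tying these commands together (the model name is one of the supported models listed in [`models/README.md`](./models/README.md); the Slurm job ID is hypothetical):

```bash
# Hypothetical walkthrough; 14933053 stands in for the job ID printed by `launch`.
vec-inf list                                   # browse all available models
vec-inf list Meta-Llama-3.1-8B-Instruct        # show a model's default configuration
vec-inf launch Meta-Llama-3.1-8B-Instruct      # submit the inference server job
vec-inf status 14933053                        # check whether the server is ready
vec-inf metrics 14933053                       # stream throughput metrics
vec-inf shutdown 14933053                      # cancel the job
```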
7
8
 
vec_inf/cli/_cli.py CHANGED
@@ -1,9 +1,13 @@
1
1
  import os
2
- from typing import Optional
2
+ import time
3
+ from typing import Optional, cast
3
4
 
4
5
  import click
6
+
7
+ import polars as pl
5
8
  from rich.columns import Columns
6
9
  from rich.console import Console
10
+ from rich.live import Live
7
11
  from rich.panel import Panel
8
12
 
9
13
  import vec_inf.cli._utils as utils
@@ -24,9 +28,19 @@ def cli():
24
28
  @click.option(
25
29
  "--max-model-len",
26
30
  type=int,
27
- help="Model context length. If unspecified, will be automatically derived from the model config.",
31
+ help="Model context length. Default value set based on suggested resource allocation.",
32
+ )
33
+ @click.option(
34
+ "--max-num-seqs",
35
+ type=int,
36
+ help="Maximum number of sequences to process in a single request",
37
+ )
38
+ @click.option(
39
+ "--partition",
40
+ type=str,
41
+ default="a40",
42
+ help="Type of compute partition, default to a40",
28
43
  )
29
- @click.option("--partition", type=str, help="Type of compute partition, default to a40")
30
44
  @click.option(
31
45
  "--num-nodes",
32
46
  type=int,
@@ -40,24 +54,48 @@ def cli():
40
54
  @click.option(
41
55
  "--qos",
42
56
  type=str,
43
- help="Quality of service, default depends on suggested resource allocation required for the model",
57
+ help="Quality of service",
44
58
  )
45
59
  @click.option(
46
60
  "--time",
47
61
  type=str,
48
- help="Time limit for job, this should comply with QoS, default to max walltime of the chosen QoS",
62
+ help="Time limit for job, this should comply with QoS limits",
49
63
  )
50
64
  @click.option(
51
65
  "--vocab-size",
52
66
  type=int,
53
67
  help="Vocabulary size, this option is intended for custom models",
54
68
  )
55
- @click.option("--data-type", type=str, help="Model data type, default to auto")
56
- @click.option("--venv", type=str, help="Path to virtual environment")
69
+ @click.option(
70
+ "--data-type", type=str, default="auto", help="Model data type, default to auto"
71
+ )
72
+ @click.option(
73
+ "--venv",
74
+ type=str,
75
+ default="singularity",
76
+ help="Path to virtual environment, default to preconfigured singularity container",
77
+ )
57
78
  @click.option(
58
79
  "--log-dir",
59
80
  type=str,
60
- help="Path to slurm log directory, default to .vec-inf-logs in home directory",
81
+ default="default",
82
+ help="Path to slurm log directory, default to .vec-inf-logs in user home directory",
83
+ )
84
+ @click.option(
85
+ "--model-weights-parent-dir",
86
+ type=str,
87
+ default="/model-weights",
88
+ help="Path to parent directory containing model weights, default to '/model-weights' for supported models",
89
+ )
90
+ @click.option(
91
+ "--pipeline-parallelism",
92
+ type=str,
93
+ help="Enable pipeline parallelism, accepts 'True' or 'False', default to 'True' for supported models",
94
+ )
95
+ @click.option(
96
+ "--enforce-eager",
97
+ type=str,
98
+ help="Always use eager-mode PyTorch, accepts 'True' or 'False', default to 'False' for custom models if not set",
61
99
  )
62
100
  @click.option(
63
101
  "--json-mode",
@@ -69,6 +107,7 @@ def launch(
69
107
  model_family: Optional[str] = None,
70
108
  model_variant: Optional[str] = None,
71
109
  max_model_len: Optional[int] = None,
110
+ max_num_seqs: Optional[int] = None,
72
111
  partition: Optional[str] = None,
73
112
  num_nodes: Optional[int] = None,
74
113
  num_gpus: Optional[int] = None,
@@ -78,11 +117,20 @@ def launch(
78
117
  data_type: Optional[str] = None,
79
118
  venv: Optional[str] = None,
80
119
  log_dir: Optional[str] = None,
120
+ model_weights_parent_dir: Optional[str] = None,
121
+ pipeline_parallelism: Optional[str] = None,
122
+ enforce_eager: Optional[str] = None,
81
123
  json_mode: bool = False,
82
124
  ) -> None:
83
125
  """
84
126
  Launch a model on the cluster
85
127
  """
128
+
129
+ if isinstance(pipeline_parallelism, str):
130
+ pipeline_parallelism = (
131
+ "True" if pipeline_parallelism.lower() == "true" else "False"
132
+ )
133
+
86
134
  launch_script_path = os.path.join(
87
135
  os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "launch_server.sh"
88
136
  )
@@ -90,7 +138,7 @@ def launch(
90
138
 
91
139
  models_df = utils.load_models_df()
92
140
 
93
- if model_name in models_df["model_name"].values:
141
+ if model_name in models_df["model_name"].to_list():
94
142
  default_args = utils.load_default_args(models_df, model_name)
95
143
  for arg in default_args:
96
144
  if arg in locals() and locals()[arg] is not None:
@@ -98,10 +146,11 @@ def launch(
98
146
  renamed_arg = arg.replace("_", "-")
99
147
  launch_cmd += f" --{renamed_arg} {default_args[arg]}"
100
148
  else:
101
- model_args = models_df.columns.tolist()
102
- excluded_keys = ["model_name", "pipeline_parallelism"]
149
+ model_args = models_df.columns
150
+ model_args.remove("model_name")
151
+ model_args.remove("model_type")
103
152
  for arg in model_args:
104
- if arg not in excluded_keys and locals()[arg] is not None:
153
+ if locals()[arg] is not None:
105
154
  renamed_arg = arg.replace("_", "-")
106
155
  launch_cmd += f" --{renamed_arg} {locals()[arg]}"
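
In the launch flow above, options the user passes explicitly take precedence over the per-model defaults loaded from `models.csv`. A minimal sketch of that precedence rule (`build_launch_cmd` is a hypothetical helper, not part of the package):

```python
# Hypothetical helper illustrating the merge done in launch(): CLI options that
# were actually provided override the models.csv defaults for that model.
def build_launch_cmd(base_cmd: str, defaults: dict, cli_args: dict) -> str:
    merged = dict(defaults)
    merged.update({k: v for k, v in cli_args.items() if v is not None})
    for arg, value in merged.items():
        base_cmd += f" --{arg.replace('_', '-')} {value}"
    return base_cmd

# The user only overrides --max-num-seqs; every other flag keeps its default.
cmd = build_launch_cmd(
    "bash launch_server.sh",
    {"partition": "a40", "max_num_seqs": 256},
    {"partition": None, "max_num_seqs": 64},
)
# cmd == "bash launch_server.sh --partition a40 --max-num-seqs 64"
```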
107
156
 
@@ -225,40 +274,111 @@ def shutdown(slurm_job_id: int) -> None:
225
274
  is_flag=True,
226
275
  help="Output in JSON string",
227
276
  )
228
- def list(model_name: Optional[str] = None, json_mode: bool = False) -> None:
277
+ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None:
229
278
  """
230
279
  List all available models, or get default setup of a specific model
231
280
  """
232
- models_df = utils.load_models_df()
233
281
 
234
- if model_name:
235
- if model_name not in models_df["model_name"].values:
282
+ def list_model(model_name: str, models_df: pl.DataFrame, json_mode: bool):
283
+ if model_name not in models_df["model_name"].to_list():
236
284
  raise ValueError(f"Model name {model_name} not found in available models")
237
285
 
238
- excluded_keys = {"venv", "log_dir", "pipeline_parallelism"}
239
- model_row = models_df.loc[models_df["model_name"] == model_name]
286
+ excluded_keys = {"venv", "log_dir"}
287
+ model_row = models_df.filter(models_df["model_name"] == model_name)
240
288
 
241
289
  if json_mode:
242
- # click.echo(model_row.to_json(orient='records'))
243
- filtered_model_row = model_row.drop(columns=excluded_keys, errors="ignore")
244
- click.echo(filtered_model_row.to_json(orient="records"))
290
+ filtered_model_row = model_row.drop(excluded_keys, strict=False)
291
+ click.echo(filtered_model_row.to_dicts()[0])
245
292
  return
246
293
  table = utils.create_table(key_title="Model Config", value_title="Value")
247
- for _, row in model_row.iterrows():
294
+ for row in model_row.to_dicts():
248
295
  for key, value in row.items():
249
296
  if key not in excluded_keys:
250
297
  table.add_row(key, str(value))
251
298
  CONSOLE.print(table)
252
- return
253
299
 
254
- if json_mode:
255
- click.echo(models_df["model_name"].to_json(orient="records"))
256
- return
257
- panels = []
258
- for _, row in models_df.iterrows():
259
- styled_text = f"[magenta]{row['model_family']}[/magenta]-{row['model_variant']}"
260
- panels.append(Panel(styled_text, expand=True))
261
- CONSOLE.print(Columns(panels, equal=True))
300
+ def list_all(models_df: pl.DataFrame, json_mode: bool):
301
+ if json_mode:
302
+ click.echo(models_df["model_name"].to_list())
303
+ return
304
+ panels = []
305
+ model_type_colors = {
306
+ "LLM": "cyan",
307
+ "VLM": "bright_blue",
308
+ "Text Embedding": "purple",
309
+ "Reward Modeling": "bright_magenta",
310
+ }
311
+
312
+ models_df = models_df.with_columns(
313
+ pl.when(pl.col("model_type") == "LLM")
314
+ .then(0)
315
+ .when(pl.col("model_type") == "VLM")
316
+ .then(1)
317
+ .when(pl.col("model_type") == "Text Embedding")
318
+ .then(2)
319
+ .when(pl.col("model_type") == "Reward Modeling")
320
+ .then(3)
321
+ .otherwise(-1)
322
+ .alias("model_type_order")
323
+ )
324
+
325
+ models_df = models_df.sort("model_type_order")
326
+ models_df = models_df.drop("model_type_order")
327
+
328
+ for row in models_df.to_dicts():
329
+ panel_color = model_type_colors.get(row["model_type"], "white")
330
+ styled_text = (
331
+ f"[magenta]{row['model_family']}[/magenta]-{row['model_variant']}"
332
+ )
333
+ panels.append(Panel(styled_text, expand=True, border_style=panel_color))
334
+ CONSOLE.print(Columns(panels, equal=True))
335
+
336
+ models_df = utils.load_models_df()
337
+
338
+ if model_name:
339
+ list_model(model_name, models_df, json_mode)
340
+ else:
341
+ list_all(models_df, json_mode)
342
+
343
+
344
+ @cli.command("metrics")
345
+ @click.argument("slurm_job_id", type=int, nargs=1)
346
+ @click.option(
347
+ "--log-dir",
348
+ type=str,
349
+ help="Path to slurm log directory. This is required if --log-dir was set in model launch",
350
+ )
351
+ def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
352
+ """
353
+ Stream performance metrics to the console
354
+ """
355
+ status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
356
+ output = utils.run_bash_command(status_cmd)
357
+ slurm_job_name = output.split(" ")[1].split("=")[1]
358
+
359
+ with Live(refresh_per_second=1, console=CONSOLE) as live:
360
+ while True:
361
+ out_logs = utils.read_slurm_log(
362
+ slurm_job_name, slurm_job_id, "out", log_dir
363
+ )
364
+ # if out_logs is a string, then it is an error message
365
+ if isinstance(out_logs, str):
366
+ live.update(out_logs)
367
+ break
368
+ out_logs = cast(list, out_logs)
369
+ latest_metrics = utils.get_latest_metric(out_logs)
370
+ # if latest_metrics is a string, then it is an error message
371
+ if isinstance(latest_metrics, str):
372
+ live.update(latest_metrics)
373
+ break
374
+ latest_metrics = cast(dict, latest_metrics)
375
+ table = utils.create_table(key_title="Metric", value_title="Value")
376
+ for key, value in latest_metrics.items():
377
+ table.add_row(key, value)
378
+
379
+ live.update(table)
380
+
381
+ time.sleep(2)
262
382
 
263
383
 
264
384
  if __name__ == "__main__":
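
For the new `metrics` command, a typical invocation looks like this (the job ID is hypothetical, and `--log-dir` is only needed when a custom log directory was passed to `launch`):

```bash
vec-inf metrics 14933053
# Only if the server was launched with a custom --log-dir (path shown is hypothetical):
vec-inf metrics 14933053 --log-dir /scratch/$USER/vec-inf-logs
```

The loop above re-reads the job's `.out` log every 2 seconds and stops as soon as an error message is returned instead of a metrics entry.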
vec_inf/cli/_utils.py CHANGED
@@ -1,12 +1,12 @@
1
1
  import os
2
2
  import subprocess
3
- from typing import Optional, Union
3
+ from typing import Optional, Union, cast
4
4
 
5
- import pandas as pd
5
+ import polars as pl
6
6
  import requests
7
7
  from rich.table import Table
8
8
 
9
- MODEL_READY_SIGNATURE = "INFO: Uvicorn running on http://0.0.0.0:"
9
+ MODEL_READY_SIGNATURE = "INFO: Application startup complete."
10
10
  SERVER_ADDRESS_SIGNATURE = "Server address: "
11
11
 
12
12
 
@@ -25,7 +25,7 @@ def read_slurm_log(
25
25
  slurm_job_name: str, slurm_job_id: int, slurm_log_type: str, log_dir: Optional[str]
26
26
  ) -> Union[list[str], str]:
27
27
  """
28
- Get the directory of a model
28
+ Read the slurm log file
29
29
  """
30
30
  if not log_dir:
31
31
  models_dir = os.path.join(os.path.expanduser("~"), ".vec-inf-logs")
@@ -35,9 +35,11 @@ def read_slurm_log(
35
35
  log_dir = os.path.join(models_dir, dir)
36
36
  break
37
37
 
38
+ log_dir = cast(str, log_dir)
39
+
38
40
  try:
39
41
  file_path = os.path.join(
40
- log_dir, # type: ignore
42
+ log_dir,
41
43
  f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}",
42
44
  )
43
45
  with open(file_path, "r") as file:
@@ -58,12 +60,15 @@ def is_server_running(
58
60
  if isinstance(log_content, str):
59
61
  return log_content
60
62
 
63
+ status: Union[str, tuple[str, str]] = "LAUNCHING"
64
+
61
65
  for line in log_content:
62
66
  if "error" in line.lower():
63
- return ("FAILED", line.strip("\n"))
67
+ status = ("FAILED", line.strip("\n"))
64
68
  if MODEL_READY_SIGNATURE in line:
65
- return "RUNNING"
66
- return "LAUNCHING"
69
+ status = "RUNNING"
70
+
71
+ return status
67
72
 
68
73
 
69
74
  def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]) -> str:
@@ -114,11 +119,11 @@ def create_table(
114
119
  return table
115
120
 
116
121
 
117
- def load_models_df() -> pd.DataFrame:
122
+ def load_models_df() -> pl.DataFrame:
118
123
  """
119
124
  Load the models dataframe
120
125
  """
121
- models_df = pd.read_csv(
126
+ models_df = pl.read_csv(
122
127
  os.path.join(
123
128
  os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
124
129
  "models/models.csv",
@@ -127,11 +132,32 @@ def load_models_df() -> pd.DataFrame:
127
132
  return models_df
128
133
 
129
134
 
130
- def load_default_args(models_df: pd.DataFrame, model_name: str) -> dict:
135
+ def load_default_args(models_df: pl.DataFrame, model_name: str) -> dict:
131
136
  """
132
137
  Load the default arguments for a model
133
138
  """
134
- row_data = models_df.loc[models_df["model_name"] == model_name]
135
- default_args = row_data.iloc[0].to_dict()
136
- default_args.pop("model_name")
139
+ row_data = models_df.filter(models_df["model_name"] == model_name)
140
+ default_args = row_data.to_dicts()[0]
141
+ default_args.pop("model_name", None)
142
+ default_args.pop("model_type", None)
137
143
  return default_args
144
+
145
+
146
+ def get_latest_metric(log_lines: list[str]) -> dict | str:
147
+ """Read the latest metric entry from the log file."""
148
+ latest_metric = {}
149
+
150
+ try:
151
+ for line in reversed(log_lines):
152
+ if "Avg prompt throughput" in line:
153
+ # Parse the metric values from the line
154
+ metrics_str = line.split("] ")[1].strip().strip(".")
155
+ metrics_list = metrics_str.split(", ")
156
+ for metric in metrics_list:
157
+ key, value = metric.split(": ")
158
+ latest_metric[key] = value
159
+ break
160
+ except Exception as e:
161
+ return f"[red]Error reading log file: {e}[/red]"
162
+
163
+ return latest_metric
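
`get_latest_metric` walks the `.out` log backwards until it finds vLLM's periodic throughput line and splits it into key/value pairs. A self-contained sketch of that parsing on a made-up log line (the exact wording and fields vary between vLLM versions):

```python
# Illustrative only: a fabricated log line in the general shape vLLM prints;
# the parsing below mirrors get_latest_metric.
sample = (
    "INFO 12-01 10:15:30 metrics.py:341] Avg prompt throughput: 410.2 tokens/s, "
    "Avg generation throughput: 38.7 tokens/s, Running: 4 reqs, Pending: 0 reqs."
)

metrics = {}
payload = sample.split("] ")[1].strip().strip(".")  # drop the "INFO ...]" prefix and trailing period
for item in payload.split(", "):
    key, value = item.split(": ")
    metrics[key] = value

print(metrics["Avg prompt throughput"])  # -> 410.2 tokens/s
```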
vec_inf/launch_server.sh CHANGED
@@ -12,21 +12,24 @@ while [[ "$#" -gt 0 ]]; do
12
12
  --num-nodes) num_nodes="$2"; shift ;;
13
13
  --num-gpus) num_gpus="$2"; shift ;;
14
14
  --max-model-len) max_model_len="$2"; shift ;;
15
+ --max-num-seqs) max_num_seqs="$2"; shift ;;
15
16
  --vocab-size) vocab_size="$2"; shift ;;
16
17
  --data-type) data_type="$2"; shift ;;
17
- --venv) virtual_env="$2"; shift ;;
18
+ --venv) venv="$2"; shift ;;
18
19
  --log-dir) log_dir="$2"; shift ;;
20
+ --model-weights-parent-dir) model_weights_parent_dir="$2"; shift ;;
19
21
  --pipeline-parallelism) pipeline_parallelism="$2"; shift ;;
22
+ --enforce-eager) enforce_eager="$2"; shift ;;
20
23
  *) echo "Unknown parameter passed: $1"; exit 1 ;;
21
24
  esac
22
25
  shift
23
26
  done
24
27
 
25
- required_vars=(model_family model_variant partition qos walltime num_nodes num_gpus max_model_len vocab_size)
28
+ required_vars=(model_family model_variant partition qos walltime num_nodes num_gpus max_model_len vocab_size data_type venv log_dir model_weights_parent_dir)
26
29
 
27
30
  for var in "${required_vars[@]}"; do
28
31
  if [ -z "${!var}" ]; then
29
- echo "Error: Missing required --$var//_/- argument."
32
+ echo "Error: Missing required --$var argument."
30
33
  exit 1
31
34
  fi
32
35
  done
@@ -40,27 +43,27 @@ export NUM_NODES=$num_nodes
40
43
  export NUM_GPUS=$num_gpus
41
44
  export VLLM_MAX_MODEL_LEN=$max_model_len
42
45
  export VLLM_MAX_LOGPROBS=$vocab_size
43
- # For custom models, the following are set to default if not specified
44
- export VLLM_DATA_TYPE="auto"
45
- export VENV_BASE="singularity"
46
- export LOG_DIR="default"
47
- # Pipeline parallelism is disabled and can only be enabled if specified in models.csv as this is an experimental feature
48
- export PIPELINE_PARALLELISM="false"
49
-
50
- if [ -n "$data_type" ]; then
51
- export VLLM_DATA_TYPE=$data_type
52
- fi
53
-
54
- if [ -n "$virtual_env" ]; then
55
- export VENV_BASE=$virtual_env
56
- fi
57
-
58
- if [ -n "$log_dir" ]; then
59
- export LOG_DIR=$log_dir
46
+ export VLLM_DATA_TYPE=$data_type
47
+ export VENV_BASE=$venv
48
+ export LOG_DIR=$log_dir
49
+ export MODEL_WEIGHTS_PARENT_DIR=$model_weights_parent_dir
50
+
51
+ if [ -n "$max_num_seqs" ]; then
52
+ export VLLM_MAX_NUM_SEQS=$max_num_seqs
53
+ else
54
+ export VLLM_MAX_NUM_SEQS=256
60
55
  fi
61
56
 
62
57
  if [ -n "$pipeline_parallelism" ]; then
63
58
  export PIPELINE_PARALLELISM=$pipeline_parallelism
59
+ else
60
+ export PIPELINE_PARALLELISM="False"
61
+ fi
62
+
63
+ if [ -n "$enforce_eager" ]; then
64
+ export ENFORCE_EAGER=$enforce_eager
65
+ else
66
+ export ENFORCE_EAGER="False"
64
67
  fi
65
68
 
66
69
  # ================================= Set default environment variables ======================================
@@ -72,13 +75,12 @@ fi
72
75
  mkdir -p $LOG_DIR
73
76
 
74
77
  # Model and entrypoint configuration. API Server URL (host, port) are set automatically based on the
75
- # SLURM job and are written to the file specified at VLLM_BASE_URL_FILENAME
78
+ # SLURM job
76
79
  export SRC_DIR="$(dirname "$0")"
77
80
  export MODEL_DIR="${SRC_DIR}/models/${MODEL_FAMILY}"
78
- export VLLM_BASE_URL_FILENAME="${MODEL_DIR}/.${JOB_NAME}_url"
79
81
 
80
82
  # Variables specific to your working environment, below are examples for the Vector cluster
81
- export VLLM_MODEL_WEIGHTS="/model-weights/$JOB_NAME"
83
+ export VLLM_MODEL_WEIGHTS="${MODEL_WEIGHTS_PARENT_DIR}/${JOB_NAME}"
82
84
  export LD_LIBRARY_PATH="/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/"
83
85
 
84
86
 
@@ -93,11 +95,6 @@ if [[ $fp16_partitions =~ $JOB_PARTITION ]]; then
93
95
  echo "Data type set to due to non-Ampere GPUs used: $VLLM_DATA_TYPE"
94
96
  fi
95
97
 
96
- # Create a file to store the API server URL if it doesn't exist
97
- if [ -f $VLLM_BASE_URL_FILENAME ]; then
98
- touch $VLLM_BASE_URL_FILENAME
99
- fi
100
-
101
98
  echo Job Name: $JOB_NAME
102
99
  echo Partition: $JOB_PARTITION
103
100
  echo Num Nodes: $NUM_NODES
@@ -105,6 +102,13 @@ echo GPUs per Node: $NUM_GPUS
105
102
  echo QOS: $QOS
106
103
  echo Walltime: $WALLTIME
107
104
  echo Data Type: $VLLM_DATA_TYPE
105
+ echo Max Model Length: $VLLM_MAX_MODEL_LEN
106
+ echo Max Num Seqs: $VLLM_MAX_NUM_SEQS
107
+ echo Vocabulary Size: $VLLM_MAX_LOGPROBS
108
+ echo Pipeline Parallelism: $PIPELINE_PARALLELISM
109
+ echo Enforce Eager: $ENFORCE_EAGER
110
+ echo Log Directory: $LOG_DIR
111
+ echo Model Weights Parent Directory: $MODEL_WEIGHTS_PARENT_DIR
108
112
 
109
113
  is_special=""
110
114
  if [ "$NUM_NODES" -gt 1 ]; then
vec_inf/models/README.md CHANGED
@@ -1,13 +1,17 @@
1
1
  # Available Models
2
2
  More profiling metrics coming soon!
3
3
 
4
- ## [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
4
+ ## Text Generation Models
5
+
6
+ ### [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
5
7
 
6
8
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
7
9
  |:----------:|:----------:|:----------:|:----------:|
8
- |[`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus)| 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
10
+ | [`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus) | 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
11
+ | [`c4ai-command-r-plus-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
12
+ | [`c4ai-command-r-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
9
13
 
10
- ## [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
14
+ ### [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
11
15
 
12
16
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
13
17
  |:----------:|:----------:|:----------:|:----------:|
@@ -20,13 +24,13 @@ More profiling metrics coming soon!
20
24
  | [`CodeLlama-70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
21
25
  | [`CodeLlama-70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
22
26
 
23
- ## [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
27
+ ### [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
24
28
 
25
29
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
26
30
  |:----------:|:----------:|:----------:|:----------:|
27
- |[`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct)| 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
31
+ | [`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct) | 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
28
32
 
29
- ## [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
33
+ ### [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
30
34
 
31
35
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
32
36
  |:----------:|:----------:|:----------:|:----------:|
@@ -35,21 +39,7 @@ More profiling metrics coming soon!
35
39
  | [`gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
36
40
  | [`gemma-2-27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |
37
41
 
38
- ## [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
39
-
40
- | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
41
- |:----------:|:----------:|:----------:|:----------:|
42
- |[`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
43
- |[`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf)| 1x a40 | - tokens/s | - tokens/s |
44
-
45
- ## [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
46
-
47
- | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
48
- |:----------:|:----------:|:----------:|:----------:|
49
- |[`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
50
- |[`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)| 2x a40 | - tokens/s | - tokens/s |
51
-
52
- ## [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
42
+ ### [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
53
43
 
54
44
  | Variant | Suggested resource allocation |
55
45
  |:----------:|:----------:|
@@ -60,7 +50,7 @@ More profiling metrics coming soon!
60
50
  | [`Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
61
51
  | [`Llama-2-70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
62
52
 
63
- ## [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
53
+ ### [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
64
54
 
65
55
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
66
56
  |:----------:|:----------:|:----------:|:----------:|
@@ -69,7 +59,7 @@ More profiling metrics coming soon!
69
59
  | [`Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
70
60
  | [`Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |
71
61
 
72
- ## [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
62
+ ### [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
73
63
 
74
64
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
75
65
  |:----------:|:----------:|:----------:|:----------:|
@@ -79,28 +69,135 @@ More profiling metrics coming soon!
79
69
  | [`Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
80
70
  | [`Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 32x a40 (8 nodes, 4 a40/node) | - tokens/s | - tokens/s |
81
71
 
82
- ## [Mistral AI: Mistral](https://huggingface.co/mistralai)
72
+ ### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
73
+
74
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
75
+ |:----------:|:----------:|:----------:|:----------:|
76
+ | [`Llama-3.2-1B`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 1x a40 | - tokens/s | - tokens/s |
77
+ | [`Llama-3.2-1B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
78
+ | [`Llama-3.2-3B`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 1x a40 | - tokens/s | - tokens/s |
79
+ | [`Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
80
+
81
+ ### [Mistral AI: Mistral](https://huggingface.co/mistralai)
83
82
 
84
83
  | Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
85
84
  |:----------:|:----------:|:----------:|:----------:|
86
- |[`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1)| 1x a40 | - tokens/s | - tokens/s|
87
- |[`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)| 1x a40 | - tokens/s | - tokens/s|
88
- |[`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2)| 1x a40 | - tokens/s | - tokens/s|
89
- |[`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3)| 1x a40 | - tokens/s | - tokens/s |
90
- |[`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)| 1x a40 | - tokens/s | - tokens/s|
91
- |[`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407)| 4x a40 | - tokens/s | - tokens/s|
85
+ | [`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1) | 1x a40 | - tokens/s | - tokens/s|
86
+ | [`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) | 1x a40 | - tokens/s | - tokens/s|
87
+ | [`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2) | 1x a40 | - tokens/s | - tokens/s|
88
+ | [`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3) | 1x a40 | - tokens/s | - tokens/s |
89
+ | [`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | 1x a40 | - tokens/s | - tokens/s|
90
+ | [`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
91
+ | [`Mistral-Large-Instruct-2411`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2411) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
92
92
 
93
- ## [Mistral AI: Mixtral](https://huggingface.co/mistralai)
93
+ ### [Mistral AI: Mixtral](https://huggingface.co/mistralai)
94
94
 
95
95
  | Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
96
96
  |:----------:|:----------:|:----------:|:----------:|
97
- |[`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)| 4x a40 | 222 tokens/s | 1543 tokens/s |
98
- |[`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
99
- |[`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
97
+ | [`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 4x a40 | 222 tokens/s | 1543 tokens/s |
98
+ | [`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
99
+ | [`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
100
100
 
101
- ## [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
101
+ ### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
102
102
 
103
103
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
104
104
  |:----------:|:----------:|:----------:|:----------:|
105
105
  | [`Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
106
+
107
+ ### [Aaditya Ura: Llama3-OpenBioLLM](https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B)
108
+
109
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
110
+ |:----------:|:----------:|:----------:|:----------:|
111
+ | [`Llama3-OpenBioLLM-70B`](https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B) | 4x a40 | - tokens/s | - tokens/s |
112
+
113
+ ### [Nvidia: Llama-3.1-Nemotron](https://huggingface.co/collections/nvidia/llama-31-nemotron-70b-670e93cd366feea16abc13d8)
114
+
115
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
116
+ |:----------:|:----------:|:----------:|:----------:|
117
+ | [`Llama-3.1-Nemotron-70B-Instruct-HF`](https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF) | 4x a40 | - tokens/s | - tokens/s |
118
+
119
+ ### [Qwen: Qwen2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e)
120
+
121
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
122
+ |:----------:|:----------:|:----------:|:----------:|
123
+ | [`Qwen2.5-0.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
124
+ | [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
125
+ | [`Qwen2.5-3B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
126
+ | [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
127
+ | [`Qwen2.5-14B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
128
+ | [`Qwen2.5-32B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
129
+ | [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
130
+
131
+ ### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
132
+
133
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
134
+ |:----------:|:----------:|:----------:|:----------:|
135
+ | [`Qwen2.5-Math-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
136
+ | [`Qwen2.5-Math-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
137
+ | [`Qwen2.5-Math-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
138
+
139
+ ### [Qwen: Qwen2.5-Coder](https://huggingface.co/collections/Qwen/qwen25-coder-66eaa22e6f99801bf65b0c2f)
140
+
141
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
142
+ |:----------:|:----------:|:----------:|:----------:|
143
+ | [`Qwen2.5-Coder-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
144
+
145
+ ### [Qwen: QwQ](https://huggingface.co/collections/Qwen/qwq-674762b79b75eac01735070a)
146
+
147
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
148
+ |:----------:|:----------:|:----------:|:----------:|
149
+ | [`QwQ-32B-Preview`](https://huggingface.co/Qwen/QwQ-32B-Preview) | 2x a40 | - tokens/s | - tokens/s |
150
+
151
+ ## Vision Language Models
152
+
153
+ ### [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
154
+
155
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
156
+ |:----------:|:----------:|:----------:|:----------:|
157
+ | [`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
158
+ | [`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
159
+
160
+ ### [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
161
+
162
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
163
+ |:----------:|:----------:|:----------:|:----------:|
164
+ | [`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
165
+ | [`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
166
+
167
+ ### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
168
+
169
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
170
+ |:----------:|:----------:|:----------:|:----------:|
106
171
  | [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
172
+
173
+ ### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
174
+
175
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
176
+ |:----------:|:----------:|:----------:|:----------:|
177
+ | [`Llama-3.2-11B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision) | 2x a40 | - tokens/s | - tokens/s |
178
+ | [`Llama-3.2-11B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) | 2x a40 | - tokens/s | - tokens/s |
179
+ | [`Llama-3.2-90B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
180
+ | [`Llama-3.2-90B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
181
+
182
+ **NOTE**: `MllamaForConditionalGeneration` currently doesn't support pipeline parallelism; to save memory, the maximum number of sequences is reduced and eager mode is enforced.
183
+
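
The reduced defaults referred to above come straight from `models.csv` (see the rows for the `*-Vision*` variants). Spelled out as explicit launch flags they would look like this (illustrative only; the same values are applied automatically for supported models):

```bash
# Normally unnecessary: these values are already the models.csv defaults
# for the Mllama vision variants.
vec-inf launch Llama-3.2-11B-Vision-Instruct \
    --max-num-seqs 64 \
    --pipeline-parallelism False \
    --enforce-eager True
```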
184
+ ### [Mistral: Pixtral](https://huggingface.co/mistralai)
185
+
186
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
187
+ |:----------:|:----------:|:----------:|:----------:|
188
+ | [`Pixtral-12B-2409`](https://huggingface.co/mistralai/Pixtral-12B-2409) | 1x a40 | - tokens/s | - tokens/s |
189
+
190
+ ## Text Embedding Models
191
+
192
+ ### [Liang Wang: e5](https://huggingface.co/intfloat)
193
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
194
+ |:----------:|:----------:|:----------:|:----------:|
195
+ | [`e5-mistral-7b-instruct`](https://huggingface.co/intfloat/e5-mistral-7b-instruct) | 1x a40 | - tokens/s | - tokens/s |
196
+
197
+ ## Reward Modeling Models
198
+
199
+ ### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
200
+
201
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
202
+ |:----------:|:----------:|:----------:|:----------:|
203
+ | [`Qwen2.5-Math-RM-72B`](https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B) | 4x a40 | - tokens/s | - tokens/s |
vec_inf/models/models.csv CHANGED
@@ -1,46 +1,73 @@
1
- model_name,model_family,model_variant,partition,qos,time,num_gpus,num_nodes,vocab_size,max_model_len,data_type,venv,log_dir,pipeline_parallelism
2
- c4ai-command-r-plus,c4ai-command-r,plus,a40,m2,08:00:00,4,2,256000,8192,auto,singularity,default,false
3
- CodeLlama-7b-hf,CodeLlama,7b-hf,a40,m2,08:00:00,1,1,32000,16384,auto,singularity,default,false
4
- CodeLlama-7b-Instruct-hf,CodeLlama,7b-Instruct-hf,a40,m2,08:00:00,1,1,32000,16384,auto,singularity,default,false
5
- CodeLlama-13b-hf,CodeLlama,13b-hf,a40,m2,08:00:00,1,1,32000,16384,auto,singularity,default,false
6
- CodeLlama-13b-Instruct-hf,CodeLlama,13b-Instruct-hf,a40,m2,08:00:00,1,1,32000,16384,auto,singularity,default,false
7
- CodeLlama-34b-hf,CodeLlama,34b-hf,a40,m2,08:00:00,2,1,32000,16384,auto,singularity,default,false
8
- CodeLlama-34b-Instruct-hf,CodeLlama,34b-Instruct-hf,a40,m2,08:00:00,2,1,32000,16384,auto,singularity,default,false
9
- CodeLlama-70b-hf,CodeLlama,70b-hf,a40,m2,08:00:00,4,1,32000,4096,auto,singularity,default,false
10
- CodeLlama-70b-Instruct-hf,CodeLlama,70b-Instruct-hf,a40,m2,08:00:00,4,1,32000,4096,auto,singularity,default,false
11
- dbrx-instruct,dbrx,instruct,a40,m2,08:00:00,4,2,100352,32000,auto,singularity,default,false
12
- gemma-2-9b,gemma-2,9b,a40,m2,08:00:00,1,1,256000,4096,auto,singularity,default,false
13
- gemma-2-9b-it,gemma-2,9b-it,a40,m2,08:00:00,1,1,256000,4096,auto,singularity,default,false
14
- gemma-2-27b,gemma-2,27b,a40,m2,08:00:00,2,1,256000,4096,auto,singularity,default,false
15
- gemma-2-27b-it,gemma-2,27b-it,a40,m2,08:00:00,2,1,256000,4096,auto,singularity,default,false
16
- Llama-2-7b-hf,Llama-2,7b-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
17
- Llama-2-7b-chat-hf,Llama-2,7b-chat-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
18
- Llama-2-13b-hf,Llama-2,13b-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
19
- Llama-2-13b-chat-hf,Llama-2,13b-chat-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
20
- Llama-2-70b-hf,Llama-2,70b-hf,a40,m2,08:00:00,4,1,32000,4096,auto,singularity,default,false
21
- Llama-2-70b-chat-hf,Llama-2,70b-chat-hf,a40,m2,08:00:00,4,1,32000,4096,auto,singularity,default,false
22
- llava-1.5-7b-hf,llava-1.5,7b-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
23
- llava-1.5-13b-hf,llava-1.5,13b-hf,a40,m2,08:00:00,1,1,32000,4096,auto,singularity,default,false
24
- llava-v1.6-mistral-7b-hf,llava-v1.6,mistral-7b-hf,a40,m2,08:00:00,1,1,32064,32768,auto,singularity,default,false
25
- llava-v1.6-34b-hf,llava-v1.6,34b-hf,a40,m2,08:00:00,2,1,64064,4096,auto,singularity,default,false
26
- Meta-Llama-3-8B,Meta-Llama-3,8B,a40,m2,08:00:00,1,1,128256,8192,auto,singularity,default,false
27
- Meta-Llama-3-8B-Instruct,Meta-Llama-3,8B-Instruct,a40,m2,08:00:00,1,1,128256,8192,auto,singularity,default,false
28
- Meta-Llama-3-70B,Meta-Llama-3,70B,a40,m2,08:00:00,4,1,128256,8192,auto,singularity,default,false
29
- Meta-Llama-3-70B-Instruct,Meta-Llama-3,70B-Instruct,a40,m2,08:00:00,4,1,128256,8192,auto,singularity,default,false
30
- Meta-Llama-3.1-8B,Meta-Llama-3.1,8B,a40,m2,08:00:00,1,1,128256,131072,auto,singularity,default,false
31
- Meta-Llama-3.1-8B-Instruct,Meta-Llama-3.1,8B-Instruct,a40,m2,08:00:00,1,1,128256,131072,auto,singularity,default,false
32
- Meta-Llama-3.1-70B,Meta-Llama-3.1,70B,a40,m2,08:00:00,4,1,128256,65536,auto,singularity,default,false
33
- Meta-Llama-3.1-70B-Instruct,Meta-Llama-3.1,70B-Instruct,a40,m2,08:00:00,4,1,128256,65536,auto,singularity,default,false
34
- Meta-Llama-3.1-405B-Instruct,Meta-Llama-3.1,405B-Instruct,a40,m4,02:00:00,4,8,128256,16384,auto,singularity,default,true
35
- Mistral-7B-v0.1,Mistral,7B-v0.1,a40,m2,08:00:00,1,1,32000,32768,auto,singularity,default,false
36
- Mistral-7B-Instruct-v0.1,Mistral,7B-Instruct-v0.1,a40,m2,08:00:00,1,1,32000,32768,auto,singularity,default,false
37
- Mistral-7B-Instruct-v0.2,Mistral,7B-Instruct-v0.2,a40,m2,08:00:00,1,1,32000,32768,auto,singularity,default,false
38
- Mistral-7B-v0.3,Mistral,7B-v0.3,a40,m2,08:00:00,1,1,32768,32768,auto,singularity,default,false
39
- Mistral-7B-Instruct-v0.3,Mistral,7B-Instruct-v0.3,a40,m2,08:00:00,1,1,32768,32768,auto,singularity,default,false
40
- Mistral-Large-Instruct-2407,Mistral,Large-Instruct-2407,a40,m2,08:00:00,4,1,32768,131072,auto,singularity,default,false
41
- Mixtral-8x7B-Instruct-v0.1,Mixtral,8x7B-Instruct-v0.1,a40,m2,08:00:00,4,1,32000,32768,auto,singularity,default,false
42
- Mixtral-8x22B-v0.1,Mixtral,8x22B-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
43
- Mixtral-8x22B-Instruct-v0.1,Mixtral,8x22B-Instruct-v0.1,a40,m2,08:00:00,4,2,32768,65536,auto,singularity,default,false
44
- Phi-3-medium-128k-instruct,Phi-3,medium-128k-instruct,a40,m2,08:00:00,2,1,32064,131072,auto,singularity,default,false
45
- Phi-3-vision-128k-instruct,Phi-3,vision-128k-instruct,a40,m2,08:00:00,2,1,32064,65536,auto,singularity,default,false
46
- Llama3-OpenBioLLM-70B,Llama3-OpenBioLLM,70B,a40,m2,08:00:00,4,1,128256,8192,auto,singularity,default,false
1
+ model_name,model_family,model_variant,model_type,num_gpus,num_nodes,vocab_size,max_model_len,max_num_seqs,pipeline_parallelism,enforce_eager,qos,time,partition,data_type,venv,log_dir,model_weights_parent_dir
2
+ c4ai-command-r-plus,c4ai-command-r,plus,LLM,4,2,256000,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
3
+ c4ai-command-r-plus-08-2024,c4ai-command-r,plus-08-2024,LLM,4,2,256000,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
4
+ c4ai-command-r-08-2024,c4ai-command-r,08-2024,LLM,2,1,256000,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
5
+ CodeLlama-7b-hf,CodeLlama,7b-hf,LLM,1,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
6
+ CodeLlama-7b-Instruct-hf,CodeLlama,7b-Instruct-hf,LLM,1,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
7
+ CodeLlama-13b-hf,CodeLlama,13b-hf,LLM,1,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
8
+ CodeLlama-13b-Instruct-hf,CodeLlama,13b-Instruct-hf,LLM,1,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
9
+ CodeLlama-34b-hf,CodeLlama,34b-hf,LLM,2,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
10
+ CodeLlama-34b-Instruct-hf,CodeLlama,34b-Instruct-hf,LLM,2,1,32000,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
11
+ CodeLlama-70b-hf,CodeLlama,70b-hf,LLM,4,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
12
+ CodeLlama-70b-Instruct-hf,CodeLlama,70b-Instruct-hf,LLM,4,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
13
+ dbrx-instruct,dbrx,instruct,LLM,4,2,100352,32000,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
14
+ gemma-2-9b,gemma-2,9b,LLM,1,1,256000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
15
+ gemma-2-9b-it,gemma-2,9b-it,LLM,1,1,256000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
16
+ gemma-2-27b,gemma-2,27b,LLM,2,1,256000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
17
+ gemma-2-27b-it,gemma-2,27b-it,LLM,2,1,256000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
18
+ Llama-2-7b-hf,Llama-2,7b-hf,LLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
19
+ Llama-2-7b-chat-hf,Llama-2,7b-chat-hf,LLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
20
+ Llama-2-13b-hf,Llama-2,13b-hf,LLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
21
+ Llama-2-13b-chat-hf,Llama-2,13b-chat-hf,LLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
22
+ Llama-2-70b-hf,Llama-2,70b-hf,LLM,4,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
23
+ Llama-2-70b-chat-hf,Llama-2,70b-chat-hf,LLM,4,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
24
+ llava-1.5-7b-hf,llava-1.5,7b-hf,VLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
25
+ llava-1.5-13b-hf,llava-1.5,13b-hf,VLM,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
26
+ llava-v1.6-mistral-7b-hf,llava-v1.6,mistral-7b-hf,VLM,1,1,32064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
27
+ llava-v1.6-34b-hf,llava-v1.6,34b-hf,VLM,2,1,64064,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
28
+ Meta-Llama-3-8B,Meta-Llama-3,8B,LLM,1,1,128256,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
29
+ Meta-Llama-3-8B-Instruct,Meta-Llama-3,8B-Instruct,LLM,1,1,128256,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
30
+ Meta-Llama-3-70B,Meta-Llama-3,70B,LLM,4,1,128256,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
31
+ Meta-Llama-3-70B-Instruct,Meta-Llama-3,70B-Instruct,LLM,4,1,128256,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
32
+ Meta-Llama-3.1-8B,Meta-Llama-3.1,8B,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
33
+ Meta-Llama-3.1-8B-Instruct,Meta-Llama-3.1,8B-Instruct,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
34
+ Meta-Llama-3.1-70B,Meta-Llama-3.1,70B,LLM,4,1,128256,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
35
+ Meta-Llama-3.1-70B-Instruct,Meta-Llama-3.1,70B-Instruct,LLM,4,1,128256,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
36
+ Meta-Llama-3.1-405B-Instruct,Meta-Llama-3.1,405B-Instruct,LLM,4,8,128256,16384,256,true,false,m4,02:00:00,a40,auto,singularity,default,/model-weights
37
+ Mistral-7B-v0.1,Mistral,7B-v0.1,LLM,1,1,32000,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
38
+ Mistral-7B-Instruct-v0.1,Mistral,7B-Instruct-v0.1,LLM,1,1,32000,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
39
+ Mistral-7B-Instruct-v0.2,Mistral,7B-Instruct-v0.2,LLM,1,1,32000,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
40
+ Mistral-7B-v0.3,Mistral,7B-v0.3,LLM,1,1,32768,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
41
+ Mistral-7B-Instruct-v0.3,Mistral,7B-Instruct-v0.3,LLM,1,1,32768,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
42
+ Mistral-Large-Instruct-2407,Mistral,Large-Instruct-2407,LLM,4,2,32768,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
43
+ Mistral-Large-Instruct-2411,Mistral,Large-Instruct-2411,LLM,4,2,32768,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
44
+ Mixtral-8x7B-Instruct-v0.1,Mixtral,8x7B-Instruct-v0.1,LLM,4,1,32000,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
45
+ Mixtral-8x22B-v0.1,Mixtral,8x22B-v0.1,LLM,4,2,32768,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
46
+ Mixtral-8x22B-Instruct-v0.1,Mixtral,8x22B-Instruct-v0.1,LLM,4,2,32768,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
47
+ Phi-3-medium-128k-instruct,Phi-3,medium-128k-instruct,LLM,2,1,32064,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
48
+ Phi-3-vision-128k-instruct,Phi-3,vision-128k-instruct,VLM,2,1,32064,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
49
+ Llama3-OpenBioLLM-70B,Llama3-OpenBioLLM,70B,LLM,4,1,128256,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
50
+ Llama-3.1-Nemotron-70B-Instruct-HF,Llama-3.1-Nemotron,70B-Instruct-HF,LLM,4,1,128256,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
51
+ Llama-3.2-1B,Llama-3.2,1B,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
52
+ Llama-3.2-1B-Instruct,Llama-3.2,1B-Instruct,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
53
+ Llama-3.2-3B,Llama-3.2,3B,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
54
+ Llama-3.2-3B-Instruct,Llama-3.2,3B-Instruct,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
55
+ Llama-3.2-11B-Vision,Llama-3.2,11B-Vision,VLM,2,1,128256,4096,64,false,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
56
+ Llama-3.2-11B-Vision-Instruct,Llama-3.2,11B-Vision-Instruct,VLM,2,1,128256,4096,64,false,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
57
+ Llama-3.2-90B-Vision,Llama-3.2,90B-Vision,VLM,4,2,128256,4096,32,false,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
58
+ Llama-3.2-90B-Vision-Instruct,Llama-3.2,90B-Vision-Instruct,VLM,4,2,128256,4096,32,false,true,m2,08:00:00,a40,auto,singularity,default,/model-weights
59
+ Qwen2.5-0.5B-Instruct,Qwen2.5,0.5B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
60
+ Qwen2.5-1.5B-Instruct,Qwen2.5,1.5B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
61
+ Qwen2.5-3B-Instruct,Qwen2.5,3B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
62
+ Qwen2.5-7B-Instruct,Qwen2.5,7B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
63
+ Qwen2.5-14B-Instruct,Qwen2.5,14B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
64
+ Qwen2.5-32B-Instruct,Qwen2.5,32B-Instruct,LLM,2,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
65
+ Qwen2.5-72B-Instruct,Qwen2.5,72B-Instruct,LLM,4,1,152064,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
66
+ Qwen2.5-Math-1.5B-Instruct,Qwen2.5,Math-1.5B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
67
+ Qwen2.5-Math-7B-Instruct,Qwen2.5,Math-7B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
68
+ Qwen2.5-Math-72B-Instruct,Qwen2.5,Math-72B-Instruct,LLM,4,1,152064,16384,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
69
+ Qwen2.5-Coder-7B-Instruct,Qwen2.5,Coder-7B-Instruct,LLM,1,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
70
+ Qwen2.5-Math-RM-72B,Qwen2.5,Math-RM-72B,Reward Modeling,4,1,152064,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
71
+ QwQ-32B-Preview,QwQ,32B-Preview,LLM,2,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
72
+ Pixtral-12B-2409,Pixtral,12B-2409,VLM,1,1,131072,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
73
+ e5-mistral-7b-instruct,e5,mistral-7b-instruct,Text Embedding,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
vec_inf/multinode_vllm.slurm CHANGED
@@ -5,13 +5,14 @@
5
5
  #SBATCH --tasks-per-node=1
6
6
 
7
7
  # Load CUDA, change to the cuda version on your environment if different
8
+ source /opt/lmod/lmod/init/profile
8
9
  module load cuda-12.3
9
10
  nvidia-smi
10
11
 
11
12
  source ${SRC_DIR}/find_port.sh
12
13
 
13
14
  if [ "$VENV_BASE" = "singularity" ]; then
14
- export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.3.0.sif
15
+ export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.6.4.post1.sif
15
16
  export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
16
17
  module load singularity-ce/3.8.2
17
18
  singularity exec $SINGULARITY_IMAGE ray stop
@@ -35,7 +36,7 @@ echo "IP Head: $ip_head"
35
36
  echo "Starting HEAD at $head_node"
36
37
  if [ "$VENV_BASE" = "singularity" ]; then
37
38
  srun --nodes=1 --ntasks=1 -w "$head_node" \
38
- singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
39
+ singularity exec --nv --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} $SINGULARITY_IMAGE \
39
40
  ray start --head --node-ip-address="$head_node_ip" --port=$head_node_port \
40
41
  --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${NUM_GPUS}" --block &
41
42
  else
@@ -56,7 +57,7 @@ for ((i = 1; i <= worker_num; i++)); do
56
57
  echo "Starting WORKER $i at $node_i"
57
58
  if [ "$VENV_BASE" = "singularity" ]; then
58
59
  srun --nodes=1 --ntasks=1 -w "$node_i" \
59
- singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
60
+ singularity exec --nv --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} $SINGULARITY_IMAGE \
60
61
  ray start --address "$ip_head" \
61
62
  --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${NUM_GPUS}" --block &
62
63
  else
@@ -72,9 +73,8 @@ done
72
73
  vllm_port_number=$(find_available_port $head_node_ip 8080 65535)
73
74
 
74
75
  echo "Server address: http://${head_node_ip}:${vllm_port_number}/v1"
75
- echo "http://${head_node_ip}:${vllm_port_number}/v1" > ${VLLM_BASE_URL_FILENAME}
76
76
 
77
- if [ "$PIPELINE_PARALLELISM" = "true" ]; then
77
+ if [ "$PIPELINE_PARALLELISM" = "True" ]; then
78
78
  export PIPELINE_PARALLEL_SIZE=$NUM_NODES
79
79
  export TENSOR_PARALLEL_SIZE=$NUM_GPUS
80
80
  else
@@ -82,9 +82,15 @@ else
82
82
  export TENSOR_PARALLEL_SIZE=$((NUM_NODES*NUM_GPUS))
83
83
  fi
84
84
 
85
+ if [ "$ENFORCE_EAGER" = "True" ]; then
86
+ export ENFORCE_EAGER="--enforce-eager"
87
+ else
88
+ export ENFORCE_EAGER=""
89
+ fi
90
+
85
91
  # Activate vllm venv
86
92
  if [ "$VENV_BASE" = "singularity" ]; then
87
- singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
93
+ singularity exec --nv --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} $SINGULARITY_IMAGE \
88
94
  python3.10 -m vllm.entrypoints.openai.api_server \
89
95
  --model ${VLLM_MODEL_WEIGHTS} \
90
96
  --served-model-name ${JOB_NAME} \
@@ -95,7 +101,9 @@ if [ "$VENV_BASE" = "singularity" ]; then
95
101
  --dtype ${VLLM_DATA_TYPE} \
96
102
  --trust-remote-code \
97
103
  --max-logprobs ${VLLM_MAX_LOGPROBS} \
98
- --max-model-len ${VLLM_MAX_MODEL_LEN}
104
+ --max-model-len ${VLLM_MAX_MODEL_LEN} \
105
+ --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
106
+ ${ENFORCE_EAGER}
99
107
  else
100
108
  source ${VENV_BASE}/bin/activate
101
109
  python3 -m vllm.entrypoints.openai.api_server \
@@ -108,5 +116,7 @@ else
108
116
  --dtype ${VLLM_DATA_TYPE} \
109
117
  --trust-remote-code \
110
118
  --max-logprobs ${VLLM_MAX_LOGPROBS} \
111
- --max-model-len ${VLLM_MAX_MODEL_LEN}
119
+ --max-model-len ${VLLM_MAX_MODEL_LEN} \
120
+ --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
121
+ ${ENFORCE_EAGER}
112
122
  fi
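
The `PIPELINE_PARALLELISM` branch earlier in this script sizes the two parallelism dimensions directly from the Slurm allocation; for example, for an 8-GPU job split across two nodes:

```bash
# Illustrative sizing for NUM_NODES=2, NUM_GPUS=4 (8 GPUs total):
# PIPELINE_PARALLELISM="True"  -> PIPELINE_PARALLEL_SIZE=2, TENSOR_PARALLEL_SIZE=4
# PIPELINE_PARALLELISM="False" -> TENSOR_PARALLEL_SIZE=8 (pure tensor parallelism across all GPUs)
```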
vec_inf/vllm.slurm CHANGED
@@ -3,6 +3,7 @@
3
3
  #SBATCH --mem=64G
4
4
 
5
5
  # Load CUDA, change to the cuda version on your environment if different
6
+ source /opt/lmod/lmod/init/profile
6
7
  module load cuda-12.3
7
8
  nvidia-smi
8
9
 
@@ -13,15 +14,20 @@ hostname=${SLURMD_NODENAME}
13
14
  vllm_port_number=$(find_available_port $hostname 8080 65535)
14
15
 
15
16
  echo "Server address: http://${hostname}:${vllm_port_number}/v1"
16
- echo "http://${hostname}:${vllm_port_number}/v1" > ${VLLM_BASE_URL_FILENAME}
17
+
18
+ if [ "$ENFORCE_EAGER" = "True" ]; then
19
+ export ENFORCE_EAGER="--enforce-eager"
20
+ else
21
+ export ENFORCE_EAGER=""
22
+ fi
17
23
 
18
24
  # Activate vllm venv
19
25
  if [ "$VENV_BASE" = "singularity" ]; then
20
- export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.3.0.sif
26
+ export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.6.4.post1.sif
21
27
  export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
22
28
  module load singularity-ce/3.8.2
23
29
  singularity exec $SINGULARITY_IMAGE ray stop
24
- singularity exec --nv --bind /model-weights:/model-weights $SINGULARITY_IMAGE \
30
+ singularity exec --nv --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} $SINGULARITY_IMAGE \
25
31
  python3.10 -m vllm.entrypoints.openai.api_server \
26
32
  --model ${VLLM_MODEL_WEIGHTS} \
27
33
  --served-model-name ${JOB_NAME} \
@@ -31,7 +37,9 @@ if [ "$VENV_BASE" = "singularity" ]; then
31
37
  --dtype ${VLLM_DATA_TYPE} \
32
38
  --max-logprobs ${VLLM_MAX_LOGPROBS} \
33
39
  --trust-remote-code \
34
- --max-model-len ${VLLM_MAX_MODEL_LEN}
40
+ --max-model-len ${VLLM_MAX_MODEL_LEN} \
41
+ --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
42
+ ${ENFORCE_EAGER}
35
43
  else
36
44
  source ${VENV_BASE}/bin/activate
37
45
  python3 -m vllm.entrypoints.openai.api_server \
@@ -43,5 +51,7 @@ else
43
51
  --dtype ${VLLM_DATA_TYPE} \
44
52
  --max-logprobs ${VLLM_MAX_LOGPROBS} \
45
53
  --trust-remote-code \
46
- --max-model-len ${VLLM_MAX_MODEL_LEN}
54
+ --max-model-len ${VLLM_MAX_MODEL_LEN} \
55
+ --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
56
+ ${ENFORCE_EAGER}
47
57
  fi
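In both Slurm scripts the hard-coded `/model-weights` bind mount is replaced by `MODEL_WEIGHTS_PARENT_DIR`. A minimal sketch of the intent, assuming the launch script exports the variables shown (values hypothetical):

```bash
# Hypothetical values; the launch script is expected to export both variables.
export MODEL_WEIGHTS_PARENT_DIR=/h/user_name/my_weights          # was previously fixed to /model-weights
export VLLM_MODEL_WEIGHTS=${MODEL_WEIGHTS_PARENT_DIR}/Meta-Llama-3.1-8B-Instruct

# Binding the parent directory to the same path inside the container means
# ${VLLM_MODEL_WEIGHTS} resolves identically on the host and inside singularity.
singularity exec --nv --bind ${MODEL_WEIGHTS_PARENT_DIR}:${MODEL_WEIGHTS_PARENT_DIR} $SINGULARITY_IMAGE \
    ls ${VLLM_MODEL_WEIGHTS}
```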
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Vector Institute
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: vec-inf
3
- Version: 0.3.3
3
+ Version: 0.4.0.post1
4
4
  Summary: Efficient LLM inference on Slurm clusters using vLLM.
5
5
  License: MIT
6
6
  Author: Marshall Wang
@@ -11,19 +11,21 @@ Classifier: Programming Language :: Python :: 3
11
11
  Classifier: Programming Language :: Python :: 3.10
12
12
  Classifier: Programming Language :: Python :: 3.11
13
13
  Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
14
15
  Provides-Extra: dev
15
16
  Requires-Dist: click (>=8.1.0,<9.0.0)
16
17
  Requires-Dist: cupy-cuda12x (==12.1.0) ; extra == "dev"
17
- Requires-Dist: pandas (>=2.2.2,<3.0.0)
18
+ Requires-Dist: numpy (>=1.24.0,<2.0.0)
19
+ Requires-Dist: polars (>=1.15.0,<2.0.0)
18
20
  Requires-Dist: ray (>=2.9.3,<3.0.0) ; extra == "dev"
19
21
  Requires-Dist: requests (>=2.31.0,<3.0.0)
20
22
  Requires-Dist: rich (>=13.7.0,<14.0.0)
21
- Requires-Dist: vllm (>=0.5.0,<0.6.0) ; extra == "dev"
23
+ Requires-Dist: vllm (>=0.6.0,<0.7.0) ; extra == "dev"
22
24
  Requires-Dist: vllm-nccl-cu12 (>=2.18,<2.19) ; extra == "dev"
23
25
  Description-Content-Type: text/markdown
24
26
 
25
27
  # Vector Inference: Easy inference on Slurm clusters
26
- This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update [`launch_server.sh`](vec-inf/launch_server.sh), [`vllm.slurm`](vec-inf/vllm.slurm), [`multinode_vllm.slurm`](vec-inf/multinode_vllm.slurm) and [`models.csv`](vec-inf/models/models.csv) accordingly.
28
+ This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository run natively on the Vector Institute cluster environment**. To adapt to other environments, update [`launch_server.sh`](vec_inf/launch_server.sh), [`vllm.slurm`](vec_inf/vllm.slurm), [`multinode_vllm.slurm`](vec_inf/multinode_vllm.slurm) and [`models.csv`](vec_inf/models/models.csv) accordingly.
27
29
 
28
30
  ## Installation
29
31
  If you are using the Vector cluster environment and don't need any customization to the inference server environment, run the following to install the package:
@@ -33,16 +35,23 @@ pip install vec-inf
33
35
  Otherwise, we recommend using the provided [`Dockerfile`](Dockerfile) to set up your own environment with the package.
34
36
 
35
37
  ## Launch an inference server
38
+ ### `launch` command
36
39
  We will use the Llama 3.1 model as an example. To launch an OpenAI-compatible inference server for Meta-Llama-3.1-8B-Instruct, run:
37
40
  ```bash
38
41
  vec-inf launch Meta-Llama-3.1-8B-Instruct
39
42
  ```
40
43
  You should see an output like the following:
41
44
 
42
- <img width="400" alt="launch_img" src="https://github.com/user-attachments/assets/557eb421-47db-4810-bccd-c49c526b1b43">
45
+ <img width="700" alt="launch_img" src="https://github.com/user-attachments/assets/ab658552-18b2-47e0-bf70-e539c3b898d5">
43
46
 
44
- The model would be launched using the [default parameters](vec-inf/models/models.csv), you can override these values by providing additional options, use `--help` to see the full list. You can also launch your own customized model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html), you'll need to specify all model launching related options to run a successful run.
47
+ The model will be launched using the [default parameters](vec_inf/models/models.csv); you can override these values by providing additional parameters (use `--help` to see the full list). You can also launch your own customized model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html), and make sure to follow the instructions below:
48
+ * Your model weights directory naming convention should follow `$MODEL_FAMILY-$MODEL_VARIANT`.
49
+ * Your model weights directory should contain HF format weights.
50
+ * The following launch parameters fall back to their default values if not specified: `--max-num-seqs`, `--partition`, `--data-type`, `--venv`, `--log-dir`, `--model-weights-parent-dir`, `--pipeline-parallelism`, `--enforce-eager`. All other launch parameters need to be specified for custom models.
51
+ * Example for setting the model weights parent directory: `--model-weights-parent-dir /h/user_name/my_weights`.
52
+ * For other model launch parameters, you can reference the default values of similar models using the [`list` command](#list-command); a hypothetical custom launch is sketched below.
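A minimal sketch of a custom-model launch. The model name, paths, resource values, QoS, and time limit are all placeholders, and the exact set of required parameters depends on your model:

```bash
# Hypothetical custom model: weights stored in /h/user_name/my_weights/MyFamily-8b
# (the directory name follows the $MODEL_FAMILY-$MODEL_VARIANT convention).
# Resource values, QoS, and time limit are placeholders for your cluster.
vec-inf launch MyFamily-8b \
    --model-weights-parent-dir /h/user_name/my_weights \
    --max-model-len 8192 \
    --vocab-size 32000 \
    --num-nodes 1 \
    --qos normal \
    --time 08:00:00
```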
45
53
 
54
+ ### `status` command
46
55
  You can check the inference server status by providing the Slurm job ID to the `status` command:
47
56
  ```bash
48
57
  vec-inf status 13014393
@@ -62,6 +71,17 @@ There are 5 possible states:
62
71
 
63
72
  Note that the base URL is only available when the model is in the `READY` state; if you've changed the Slurm log directory path, you also need to specify it when using the `status` command.
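For illustration, once the model is `READY` the server behaves like any OpenAI-compatible endpoint; the host and port below are placeholders for the base URL reported by `status`:

```bash
# Placeholder base URL; copy the one reported by `vec-inf status`.
export BASE_URL=http://gpu001:8080/v1

curl ${BASE_URL}/completions \
    -H "Content-Type: application/json" \
    -d '{"model": "Meta-Llama-3.1-8B-Instruct", "prompt": "The capital of France is", "max_tokens": 10}'
```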
64
73
 
74
+ ### `metrics` command
75
+ Once your server is ready, you can check performance metrics by providing the Slurm job ID to the `metrics` command:
76
+ ```bash
77
+ vec-inf metrics 13014393
78
+ ```
79
+
80
+ The performance metrics are then streamed to your console; note that the metrics are updated at a 10-second interval.
81
+
82
+ <img width="400" alt="metrics_img" src="https://github.com/user-attachments/assets/e5ff2cd5-659b-4c88-8ebc-d8f3fdc023a4">
83
+
84
+ ### `shutdown` command
65
85
  Finally, when you're finished using a model, you can shut it down by providing the Slurm job ID:
66
86
  ```bash
67
87
  vec-inf shutdown 13014393
@@ -69,17 +89,19 @@ vec-inf shutdown 13014393
69
89
  > Shutting down model with Slurm Job ID: 13014393
70
90
  ```
71
91
 
92
+ ### `list` command
72
93
  You can view the full list of available models by running the `list` command:
73
94
  ```bash
74
95
  vec-inf list
75
96
  ```
76
- <img width="1200" alt="list_img" src="https://github.com/user-attachments/assets/a4f0d896-989d-43bf-82a2-6a6e5d0d288f">
97
+ <img width="940" alt="list_img" src="https://github.com/user-attachments/assets/8cf901c4-404c-4398-a52f-0486f00747a3">
98
+
77
99
 
78
100
  You can also view the default setup for a specific supported model by providing the model name, for example `Meta-Llama-3.1-70B-Instruct`:
79
101
  ```bash
80
102
  vec-inf list Meta-Llama-3.1-70B-Instruct
81
103
  ```
82
- <img width="400" alt="list_model_img" src="https://github.com/user-attachments/assets/5dec7a33-ba6b-490d-af47-4cf7341d0b42">
104
+ <img width="400" alt="list_model_img" src="https://github.com/user-attachments/assets/30e42ab7-dde2-4d20-85f0-187adffefc3d">
83
105
 
84
106
  The `launch`, `list`, and `status` commands support `--json-mode`, where the command output is structured as a JSON string.
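For example, a sketch of scripted use; the JSON fields shown are illustrative, not a guaranteed schema:

```bash
# --json-mode prints a JSON string instead of the rich table/panel output.
vec-inf status 13014393 --json-mode
# e.g. {"model_name": "Meta-Llama-3.1-8B-Instruct", "model_status": "READY", ...}
```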
85
107
 
@@ -0,0 +1,16 @@
1
+ vec_inf/README.md,sha256=dxX0xKfwLioG0mJ2YFv5JJ5q1m5NlWBrVBOap1wuHfQ,624
2
+ vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ vec_inf/cli/_cli.py,sha256=TRaY-QSBQ_do9b4R6Pl7fyDlrfuMN8Z8HH_xOCKkVJA,12585
5
+ vec_inf/cli/_utils.py,sha256=sQqi7JdPOb7gfW4EVsXY2yhLUo8xWqxoY1spQ53bag4,4845
6
+ vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
7
+ vec_inf/launch_server.sh,sha256=gFovqXuYiQ8bEc6O31WTMDuBoNj7opB5iVfnCDhz2Nw,4165
8
+ vec_inf/models/README.md,sha256=YNEVTWliHehCpJTq2SXAidqgFl6CWL6GUOnAPksDYFE,14844
9
+ vec_inf/models/models.csv,sha256=f_cNeM7L0-4pgZqYfWilQd12-WVec2IVk6dRq5BE4mE,9875
10
+ vec_inf/multinode_vllm.slurm,sha256=tg0WgLRdpRFD-oT05aucOpe6h2TZiTyYJFTMqSIj-HQ,4154
11
+ vec_inf/vllm.slurm,sha256=lMgBI7r9jUVVhSIdrUH2DdC-Bxz0eyQ8vuB5uwOzWt0,1847
12
+ vec_inf-0.4.0.post1.dist-info/LICENSE,sha256=mq8zeqpvVSF1EsxmydeXcokt8XnEIfSofYn66S2-cJI,1073
13
+ vec_inf-0.4.0.post1.dist-info/METADATA,sha256=Q6KhU-ggnR9FB5YUjWrPwy2MSd_c9GCFXAQqT9YXZOw,7032
14
+ vec_inf-0.4.0.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
15
+ vec_inf-0.4.0.post1.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
16
+ vec_inf-0.4.0.post1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 1.9.0
2
+ Generator: poetry-core 1.9.1
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,15 +0,0 @@
1
- vec_inf/README.md,sha256=ny3ffk6FeRwk_nERimK-JQwEuysvBe5eKpNyLk_A-8k,499
2
- vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- vec_inf/cli/_cli.py,sha256=XwCBkwFrN06T_o1CkUKD2nWT6P4bwOfDpVPoM3AUyUA,8984
5
- vec_inf/cli/_utils.py,sha256=n37X0AcgXNEi3wOEqQFA4_iHHeGclHew6NyQaML6q7s,4034
6
- vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
7
- vec_inf/launch_server.sh,sha256=-efoTEIDKlJD7YhbYMgq4fFRV7H_1okjT5uKhfQAGUg,3998
8
- vec_inf/models/README.md,sha256=7Vz-AMValcic5Mpi9i5FshhRUV9K8nwSnItN4O1TSvI,8124
9
- vec_inf/models/models.csv,sha256=dOthlc04TyTQTin_fyt-PFDqg-lARScI9i0-tUkIgQ8,4828
10
- vec_inf/multinode_vllm.slurm,sha256=KbxsKD9kV8wsB_jCEqh63BHq8h2DLmYMV46z5h2wAe0,3867
11
- vec_inf/vllm.slurm,sha256=wRBkDunb0Oc1d8ESl_Dn9wRs_kIKvN_J39pL8dWAbV0,1608
12
- vec_inf-0.3.3.dist-info/METADATA,sha256=IefFGb9Gb7bOwI3RjNTbTlTCL6AImzx5XBSJjCp4y8c,5751
13
- vec_inf-0.3.3.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
14
- vec_inf-0.3.3.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
15
- vec_inf-0.3.3.dist-info/RECORD,,