vec-inf 0.4.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vec_inf/__init__.py CHANGED
@@ -0,0 +1 @@
1
+ """vec_inf package."""
vec_inf/cli/__init__.py CHANGED
@@ -0,0 +1 @@
1
+ """vec_inf cli package."""
vec_inf/cli/_cli.py CHANGED
@@ -1,9 +1,10 @@
1
+ """Command line interface for Vector Inference."""
2
+
1
3
  import os
2
4
  import time
3
- from typing import Optional, cast
5
+ from typing import Any, Dict, Optional
4
6
 
5
7
  import click
6
-
7
8
  import polars as pl
8
9
  from rich.columns import Columns
9
10
  from rich.console import Console
@@ -12,12 +13,13 @@ from rich.panel import Panel
12
13
 
13
14
  import vec_inf.cli._utils as utils
14
15
 
16
+
15
17
  CONSOLE = Console()
16
18
 
17
19
 
18
20
  @click.group()
19
- def cli():
20
- """Vector Inference CLI"""
21
+ def cli() -> None:
22
+ """Vector Inference CLI."""
21
23
  pass
22
24
 
23
25
 
@@ -122,10 +124,7 @@ def launch(
122
124
  enforce_eager: Optional[str] = None,
123
125
  json_mode: bool = False,
124
126
  ) -> None:
125
- """
126
- Launch a model on the cluster
127
- """
128
-
127
+ """Launch a model on the cluster."""
129
128
  if isinstance(pipeline_parallelism, str):
130
129
  pipeline_parallelism = (
131
130
  "True" if pipeline_parallelism.lower() == "true" else "False"
@@ -138,6 +137,13 @@ def launch(
138
137
 
139
138
  models_df = utils.load_models_df()
140
139
 
140
+ models_df = models_df.with_columns(
141
+ pl.col("model_type").replace("Reward Modeling", "Reward_Modeling")
142
+ )
143
+ models_df = models_df.with_columns(
144
+ pl.col("model_type").replace("Text Embedding", "Text_Embedding")
145
+ )
146
+
141
147
  if model_name in models_df["model_name"].to_list():
142
148
  default_args = utils.load_default_args(models_df, model_name)
143
149
  for arg in default_args:
@@ -148,7 +154,6 @@ def launch(
148
154
  else:
149
155
  model_args = models_df.columns
150
156
  model_args.remove("model_name")
151
- model_args.remove("model_type")
152
157
  for arg in model_args:
153
158
  if locals()[arg] is not None:
154
159
  renamed_arg = arg.replace("_", "-")
@@ -189,79 +194,130 @@ def launch(
189
194
  def status(
190
195
  slurm_job_id: int, log_dir: Optional[str] = None, json_mode: bool = False
191
196
  ) -> None:
192
- """
193
- Get the status of a running model on the cluster
194
- """
197
+ """Get the status of a running model on the cluster."""
195
198
  status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
196
199
  output = utils.run_bash_command(status_cmd)
197
200
 
198
- slurm_job_name = "UNAVAILABLE"
199
- status = "SHUTDOWN"
200
- base_url = "UNAVAILABLE"
201
+ base_data = _get_base_status_data(output)
202
+ status_info = _process_job_state(output, base_data, slurm_job_id, log_dir)
203
+ _display_status(status_info, json_mode)
201
204
 
205
+
206
+ def _get_base_status_data(output: str) -> Dict[str, Any]:
207
+ """Extract basic job status information from scontrol output."""
202
208
  try:
203
- slurm_job_name = output.split(" ")[1].split("=")[1]
204
- slurm_job_state = output.split(" ")[9].split("=")[1]
209
+ job_name = output.split(" ")[1].split("=")[1]
210
+ job_state = output.split(" ")[9].split("=")[1]
205
211
  except IndexError:
206
- # Job ID not found
207
- slurm_job_state = "UNAVAILABLE"
208
-
209
- # If Slurm job is currently PENDING
210
- if slurm_job_state == "PENDING":
211
- slurm_job_pending_reason = output.split(" ")[10].split("=")[1]
212
- status = "PENDING"
213
- # If Slurm job is currently RUNNING
214
- elif slurm_job_state == "RUNNING":
215
- # Check whether the server is ready, if yes, run model health check to further determine status
216
- server_status = utils.is_server_running(slurm_job_name, slurm_job_id, log_dir)
217
- # If server status is a tuple, then server status is "FAILED"
218
- if isinstance(server_status, tuple):
219
- status = server_status[0]
220
- slurm_job_failed_reason = server_status[1]
221
- elif server_status == "RUNNING":
222
- model_status = utils.model_health_check(
223
- slurm_job_name, slurm_job_id, log_dir
224
- )
225
- if model_status == "READY":
226
- # Only set base_url if model is ready to serve requests
227
- base_url = utils.get_base_url(slurm_job_name, slurm_job_id, log_dir)
228
- status = "READY"
229
- else:
230
- # If model is not ready, then status must be "FAILED"
231
- status = model_status[0]
232
- slurm_job_failed_reason = str(model_status[1])
233
- else:
234
- status = server_status
212
+ job_name = "UNAVAILABLE"
213
+ job_state = "UNAVAILABLE"
214
+
215
+ return {
216
+ "model_name": job_name,
217
+ "status": "SHUTDOWN",
218
+ "base_url": "UNAVAILABLE",
219
+ "state": job_state,
220
+ "pending_reason": None,
221
+ "failed_reason": None,
222
+ }
223
+
224
+
225
+ def _process_job_state(
226
+ output: str, status_info: Dict[str, Any], slurm_job_id: int, log_dir: Optional[str]
227
+ ) -> Dict[str, Any]:
228
+ """Process different job states and update status information."""
229
+ if status_info["state"] == "PENDING":
230
+ _process_pending_state(output, status_info)
231
+ elif status_info["state"] == "RUNNING":
232
+ _handle_running_state(status_info, slurm_job_id, log_dir)
233
+ return status_info
234
+
235
+
236
+ def _process_pending_state(output: str, status_info: Dict[str, Any]) -> None:
237
+ """Handle PENDING job state."""
238
+ try:
239
+ status_info["pending_reason"] = output.split(" ")[10].split("=")[1]
240
+ status_info["status"] = "PENDING"
241
+ except IndexError:
242
+ status_info["pending_reason"] = "Unknown pending reason"
243
+
244
+
245
+ def _handle_running_state(
246
+ status_info: Dict[str, Any], slurm_job_id: int, log_dir: Optional[str]
247
+ ) -> None:
248
+ """Handle RUNNING job state and check server status."""
249
+ server_status = utils.is_server_running(
250
+ status_info["model_name"], slurm_job_id, log_dir
251
+ )
252
+
253
+ if isinstance(server_status, tuple):
254
+ status_info["status"], status_info["failed_reason"] = server_status
255
+ return
256
+
257
+ if server_status == "RUNNING":
258
+ _check_model_health(status_info, slurm_job_id, log_dir)
259
+ else:
260
+ status_info["status"] = server_status
261
+
235
262
 
263
+ def _check_model_health(
264
+ status_info: Dict[str, Any], slurm_job_id: int, log_dir: Optional[str]
265
+ ) -> None:
266
+ """Check model health and update status accordingly."""
267
+ model_status = utils.model_health_check(
268
+ status_info["model_name"], slurm_job_id, log_dir
269
+ )
270
+ status, failed_reason = model_status
271
+ if status == "READY":
272
+ status_info["base_url"] = utils.get_base_url(
273
+ status_info["model_name"], slurm_job_id, log_dir
274
+ )
275
+ status_info["status"] = status
276
+ else:
277
+ status_info["status"], status_info["failed_reason"] = status, failed_reason
278
+
279
+
280
+ def _display_status(status_info: Dict[str, Any], json_mode: bool) -> None:
281
+ """Display the status information in appropriate format."""
236
282
  if json_mode:
237
- status_dict = {
238
- "model_name": slurm_job_name,
239
- "model_status": status,
240
- "base_url": base_url,
241
- }
242
- if "slurm_job_pending_reason" in locals():
243
- status_dict["pending_reason"] = slurm_job_pending_reason
244
- if "slurm_job_failed_reason" in locals():
245
- status_dict["failed_reason"] = slurm_job_failed_reason
246
- click.echo(f"{status_dict}")
283
+ _output_json(status_info)
247
284
  else:
248
- table = utils.create_table(key_title="Job Status", value_title="Value")
249
- table.add_row("Model Name", slurm_job_name)
250
- table.add_row("Model Status", status, style="blue")
251
- if "slurm_job_pending_reason" in locals():
252
- table.add_row("Reason", slurm_job_pending_reason)
253
- if "slurm_job_failed_reason" in locals():
254
- table.add_row("Reason", slurm_job_failed_reason)
255
- table.add_row("Base URL", base_url)
256
- CONSOLE.print(table)
285
+ _output_table(status_info)
286
+
287
+
288
+ def _output_json(status_info: Dict[str, Any]) -> None:
289
+ """Format and output JSON data."""
290
+ json_data = {
291
+ "model_name": status_info["model_name"],
292
+ "model_status": status_info["status"],
293
+ "base_url": status_info["base_url"],
294
+ }
295
+ if status_info["pending_reason"]:
296
+ json_data["pending_reason"] = status_info["pending_reason"]
297
+ if status_info["failed_reason"]:
298
+ json_data["failed_reason"] = status_info["failed_reason"]
299
+ click.echo(json_data)
300
+
301
+
302
+ def _output_table(status_info: Dict[str, Any]) -> None:
303
+ """Create and display rich table."""
304
+ table = utils.create_table(key_title="Job Status", value_title="Value")
305
+ table.add_row("Model Name", status_info["model_name"])
306
+ table.add_row("Model Status", status_info["status"], style="blue")
307
+
308
+ if status_info["pending_reason"]:
309
+ table.add_row("Pending Reason", status_info["pending_reason"])
310
+ if status_info["failed_reason"]:
311
+ table.add_row("Failed Reason", status_info["failed_reason"])
312
+
313
+ table.add_row("Base URL", status_info["base_url"])
314
+ CONSOLE.print(table)
257
315
 
258
316
 
259
317
  @cli.command("shutdown")
260
318
  @click.argument("slurm_job_id", type=int, nargs=1)
261
319
  def shutdown(slurm_job_id: int) -> None:
262
- """
263
- Shutdown a running model on the cluster
264
- """
320
+ """Shutdown a running model on the cluster."""
265
321
  shutdown_cmd = f"scancel {slurm_job_id}"
266
322
  utils.run_bash_command(shutdown_cmd)
267
323
  click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")
@@ -275,11 +331,9 @@ def shutdown(slurm_job_id: int) -> None:
275
331
  help="Output in JSON string",
276
332
  )
277
333
  def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None:
278
- """
279
- List all available models, or get default setup of a specific model
280
- """
334
+ """List all available models, or get default setup of a specific model."""
281
335
 
282
- def list_model(model_name: str, models_df: pl.DataFrame, json_mode: bool):
336
+ def list_model(model_name: str, models_df: pl.DataFrame, json_mode: bool) -> None:
283
337
  if model_name not in models_df["model_name"].to_list():
284
338
  raise ValueError(f"Model name {model_name} not found in available models")
285
339
 
@@ -297,7 +351,7 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> No
297
351
  table.add_row(key, str(value))
298
352
  CONSOLE.print(table)
299
353
 
300
- def list_all(models_df: pl.DataFrame, json_mode: bool):
354
+ def list_all(models_df: pl.DataFrame, json_mode: bool) -> None:
301
355
  if json_mode:
302
356
  click.echo(models_df["model_name"].to_list())
303
357
  return
@@ -327,9 +381,12 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> No
327
381
 
328
382
  for row in models_df.to_dicts():
329
383
  panel_color = model_type_colors.get(row["model_type"], "white")
330
- styled_text = (
331
- f"[magenta]{row['model_family']}[/magenta]-{row['model_variant']}"
332
- )
384
+ if row["model_variant"] == "None":
385
+ styled_text = f"[magenta]{row['model_family']}[/magenta]"
386
+ else:
387
+ styled_text = (
388
+ f"[magenta]{row['model_family']}[/magenta]-{row['model_variant']}"
389
+ )
333
390
  panels.append(Panel(styled_text, expand=True, border_style=panel_color))
334
391
  CONSOLE.print(Columns(panels, equal=True))
335
392
 
@@ -349,9 +406,7 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> No
349
406
  help="Path to slurm log directory. This is required if --log-dir was set in model launch",
350
407
  )
351
408
  def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
352
- """
353
- Stream performance metrics to the console
354
- """
409
+ """Stream performance metrics to the console."""
355
410
  status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
356
411
  output = utils.run_bash_command(status_cmd)
357
412
  slurm_job_name = output.split(" ")[1].split("=")[1]
@@ -365,13 +420,11 @@ def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
365
420
  if isinstance(out_logs, str):
366
421
  live.update(out_logs)
367
422
  break
368
- out_logs = cast(list, out_logs)
369
423
  latest_metrics = utils.get_latest_metric(out_logs)
370
424
  # if latest_metrics is a string, then it is an error message
371
425
  if isinstance(latest_metrics, str):
372
426
  live.update(latest_metrics)
373
427
  break
374
- latest_metrics = cast(dict, latest_metrics)
375
428
  table = utils.create_table(key_title="Metric", value_title="Value")
376
429
  for key, value in latest_metrics.items():
377
430
  table.add_row(key, value)
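For readers skimming the refactor above: the `status` command now builds a single dictionary in `_get_base_status_data` and threads it through `_process_job_state` and `_display_status` instead of juggling loose locals. A minimal sketch of that dictionary (the keys come from the diff above; the sample values are illustrative only):

```python
# Illustrative shape of the status_info dict shared by the new helpers.
# Only the keys are taken from _get_base_status_data; the values are made up.
status_info = {
    "model_name": "Meta-Llama-3.1-8B-Instruct",  # Slurm job name parsed from scontrol
    "status": "SHUTDOWN",       # later overwritten with PENDING / READY / FAILED / ...
    "base_url": "UNAVAILABLE",  # set only once the model reports READY
    "state": "PENDING",         # raw Slurm JobState
    "pending_reason": None,     # filled in by _process_pending_state
    "failed_reason": None,      # filled in when the server or health check fails
}
```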
vec_inf/cli/_utils.py CHANGED
@@ -1,19 +1,20 @@
1
+ """Utility functions for the CLI."""
2
+
1
3
  import os
2
4
  import subprocess
3
- from typing import Optional, Union, cast
5
+ from typing import Dict, List, Optional, Tuple, Union, cast
4
6
 
5
7
  import polars as pl
6
8
  import requests
7
9
  from rich.table import Table
8
10
 
11
+
9
12
  MODEL_READY_SIGNATURE = "INFO: Application startup complete."
10
13
  SERVER_ADDRESS_SIGNATURE = "Server address: "
11
14
 
12
15
 
13
16
  def run_bash_command(command: str) -> str:
14
- """
15
- Run a bash command and return the output
16
- """
17
+ """Run a bash command and return the output."""
17
18
  process = subprocess.Popen(
18
19
  command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
19
20
  )
@@ -24,15 +25,13 @@ def run_bash_command(command: str) -> str:
24
25
  def read_slurm_log(
25
26
  slurm_job_name: str, slurm_job_id: int, slurm_log_type: str, log_dir: Optional[str]
26
27
  ) -> Union[list[str], str]:
27
- """
28
- Read the slurm log file
29
- """
28
+ """Read the slurm log file."""
30
29
  if not log_dir:
31
30
  models_dir = os.path.join(os.path.expanduser("~"), ".vec-inf-logs")
32
31
 
33
- for dir in sorted(os.listdir(models_dir), key=len, reverse=True):
34
- if dir in slurm_job_name:
35
- log_dir = os.path.join(models_dir, dir)
32
+ for directory in sorted(os.listdir(models_dir), key=len, reverse=True):
33
+ if directory in slurm_job_name:
34
+ log_dir = os.path.join(models_dir, directory)
36
35
  break
37
36
 
38
37
  log_dir = cast(str, log_dir)
@@ -53,9 +52,7 @@ def read_slurm_log(
53
52
  def is_server_running(
54
53
  slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]
55
54
  ) -> Union[str, tuple[str, str]]:
56
- """
57
- Check if a model is ready to serve requests
58
- """
55
+ """Check if a model is ready to serve requests."""
59
56
  log_content = read_slurm_log(slurm_job_name, slurm_job_id, "err", log_dir)
60
57
  if isinstance(log_content, str):
61
58
  return log_content
@@ -72,9 +69,7 @@ def is_server_running(
72
69
 
73
70
 
74
71
  def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]) -> str:
75
- """
76
- Get the base URL of a model
77
- """
72
+ """Get the base URL of a model."""
78
73
  log_content = read_slurm_log(slurm_job_name, slurm_job_id, "out", log_dir)
79
74
  if isinstance(log_content, str):
80
75
  return log_content
@@ -87,10 +82,8 @@ def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str])
87
82
 
88
83
  def model_health_check(
89
84
  slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]
90
- ) -> Union[str, tuple[str, Union[str, int]]]:
91
- """
92
- Check the health of a running model on the cluster
93
- """
85
+ ) -> Tuple[str, Union[str, int]]:
86
+ """Check the health of a running model on the cluster."""
94
87
  base_url = get_base_url(slurm_job_name, slurm_job_id, log_dir)
95
88
  if not base_url.startswith("http"):
96
89
  return ("FAILED", base_url)
@@ -100,9 +93,8 @@ def model_health_check(
100
93
  response = requests.get(health_check_url)
101
94
  # Check if the request was successful
102
95
  if response.status_code == 200:
103
- return "READY"
104
- else:
105
- return ("FAILED", response.status_code)
96
+ return ("READY", response.status_code)
97
+ return ("FAILED", response.status_code)
106
98
  except requests.exceptions.RequestException as e:
107
99
  return ("FAILED", str(e))
108
100
 
@@ -110,9 +102,7 @@ def model_health_check(
110
102
  def create_table(
111
103
  key_title: str = "", value_title: str = "", show_header: bool = True
112
104
  ) -> Table:
113
- """
114
- Create a table for displaying model status
115
- """
105
+ """Create a table for displaying model status."""
116
106
  table = Table(show_header=show_header, header_style="bold magenta")
117
107
  table.add_column(key_title, style="dim")
118
108
  table.add_column(value_title)
@@ -120,30 +110,24 @@ def create_table(
120
110
 
121
111
 
122
112
  def load_models_df() -> pl.DataFrame:
123
- """
124
- Load the models dataframe
125
- """
126
- models_df = pl.read_csv(
113
+ """Load the models dataframe."""
114
+ return pl.read_csv(
127
115
  os.path.join(
128
116
  os.path.dirname(os.path.dirname(os.path.realpath(__file__))),
129
117
  "models/models.csv",
130
118
  )
131
119
  )
132
- return models_df
133
120
 
134
121
 
135
- def load_default_args(models_df: pl.DataFrame, model_name: str) -> dict:
136
- """
137
- Load the default arguments for a model
138
- """
122
+ def load_default_args(models_df: pl.DataFrame, model_name: str) -> Dict[str, str]:
123
+ """Load the default arguments for a model."""
139
124
  row_data = models_df.filter(models_df["model_name"] == model_name)
140
125
  default_args = row_data.to_dicts()[0]
141
126
  default_args.pop("model_name", None)
142
- default_args.pop("model_type", None)
143
127
  return default_args
144
128
 
145
129
 
146
- def get_latest_metric(log_lines: list[str]) -> dict | str:
130
+ def get_latest_metric(log_lines: List[str]) -> Union[str, Dict[str, str]]:
147
131
  """Read the latest metric entry from the log file."""
148
132
  latest_metric = {}
149
133
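One behavioural change worth noting from the `_utils.py` hunks above: `model_health_check` now always returns a `(status, detail)` tuple, including on success, so callers no longer need the old string-vs-tuple check. A minimal usage sketch (the job name and Slurm job ID are placeholders, and this assumes the package and a matching log directory are available):

```python
# Minimal sketch: unpack the (status, detail) tuple now returned by model_health_check.
from vec_inf.cli._utils import model_health_check

status, detail = model_health_check("Meta-Llama-3.1-8B-Instruct", 12345, None)
if status == "READY":
    print(f"Model healthy (HTTP {detail})")   # detail is the HTTP status code, e.g. 200
else:
    print(f"Health check failed: {detail}")   # detail is a status code or an error string
```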
 
vec_inf/launch_server.sh CHANGED
@@ -6,6 +6,7 @@ while [[ "$#" -gt 0 ]]; do
6
6
  case $1 in
7
7
  --model-family) model_family="$2"; shift ;;
8
8
  --model-variant) model_variant="$2"; shift ;;
9
+ --model-type) model_type="$2"; shift ;;
9
10
  --partition) partition="$2"; shift ;;
10
11
  --qos) qos="$2"; shift ;;
11
12
  --time) walltime="$2"; shift ;;
@@ -25,7 +26,7 @@ while [[ "$#" -gt 0 ]]; do
25
26
  shift
26
27
  done
27
28
 
28
- required_vars=(model_family model_variant partition qos walltime num_nodes num_gpus max_model_len vocab_size data_type venv log_dir model_weights_parent_dir)
29
+ required_vars=(model_family model_variant model_type partition qos walltime num_nodes num_gpus max_model_len vocab_size data_type venv log_dir model_weights_parent_dir)
29
30
 
30
31
  for var in "${required_vars[@]}"; do
31
32
  if [ -z "${!var}" ]; then
@@ -36,6 +37,7 @@ done
36
37
 
37
38
  export MODEL_FAMILY=$model_family
38
39
  export MODEL_VARIANT=$model_variant
40
+ export MODEL_TYPE=$model_type
39
41
  export JOB_PARTITION=$partition
40
42
  export QOS=$qos
41
43
  export WALLTIME=$walltime
@@ -48,9 +50,20 @@ export VENV_BASE=$venv
48
50
  export LOG_DIR=$log_dir
49
51
  export MODEL_WEIGHTS_PARENT_DIR=$model_weights_parent_dir
50
52
 
53
+ if [[ "$model_type" == "LLM" || "$model_type" == "VLM" ]]; then
54
+ export VLLM_TASK="generate"
55
+ elif [ "$model_type" == "Reward_Modeling" ]; then
56
+ export VLLM_TASK="reward"
57
+ elif [ "$model_type" == "Text_Embedding" ]; then
58
+ export VLLM_TASK="embed"
59
+ else
60
+ echo "Error: Unknown model_type: $model_type"
61
+ exit 1
62
+ fi
63
+
51
64
  if [ -n "$max_num_seqs" ]; then
52
65
  export VLLM_MAX_NUM_SEQS=$max_num_seqs
53
- else
66
+ else
54
67
  export VLLM_MAX_NUM_SEQS=256
55
68
  fi
56
69
 
@@ -69,13 +82,17 @@ fi
69
82
  # ================================= Set default environment variables ======================================
70
83
  # Slurm job configuration
71
84
  export JOB_NAME="$MODEL_FAMILY-$MODEL_VARIANT"
85
+ if [ "$JOB_NAME" == "DeepSeek-R1-None" ]; then
86
+ export JOB_NAME=$MODEL_FAMILY
87
+ fi
88
+
72
89
  if [ "$LOG_DIR" = "default" ]; then
73
90
  export LOG_DIR="$HOME/.vec-inf-logs/$MODEL_FAMILY"
74
91
  fi
75
92
  mkdir -p $LOG_DIR
76
93
 
77
94
  # Model and entrypoint configuration. API Server URL (host, port) are set automatically based on the
78
- # SLURM job
95
+ # SLURM job
79
96
  export SRC_DIR="$(dirname "$0")"
80
97
  export MODEL_DIR="${SRC_DIR}/models/${MODEL_FAMILY}"
81
98
 
@@ -101,6 +118,8 @@ echo Num Nodes: $NUM_NODES
101
118
  echo GPUs per Node: $NUM_GPUS
102
119
  echo QOS: $QOS
103
120
  echo Walltime: $WALLTIME
121
+ echo Model Type: $MODEL_TYPE
122
+ echo Task: $VLLM_TASK
104
123
  echo Data Type: $VLLM_DATA_TYPE
105
124
  echo Max Model Length: $VLLM_MAX_MODEL_LEN
106
125
  echo Max Num Seqs: $VLLM_MAX_NUM_SEQS
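To recap the new plumbing in this script: the CLI now forwards `--model-type`, and the block above maps it to the vLLM task that the Slurm scripts pass on via `--task`. A Python restatement of that mapping, for reference only (the dictionary names are mine; the script itself does this in bash):

```python
# Reference sketch of the model_type -> vLLM task mapping added to launch_server.sh,
# together with the label normalization the CLI applies to values read from models.csv.
CSV_LABEL_TO_MODEL_TYPE = {
    "Reward Modeling": "Reward_Modeling",
    "Text Embedding": "Text_Embedding",
}
VLLM_TASK_BY_MODEL_TYPE = {
    "LLM": "generate",
    "VLM": "generate",
    "Reward_Modeling": "reward",
    "Text_Embedding": "embed",
}

csv_label = "Text Embedding"  # as stored in models.csv
model_type = CSV_LABEL_TO_MODEL_TYPE.get(csv_label, csv_label)
print(VLLM_TASK_BY_MODEL_TYPE[model_type])  # -> embed
```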
vec_inf/models/README.md CHANGED
@@ -1,13 +1,17 @@
1
1
  # Available Models
2
2
  More profiling metrics coming soon!
3
3
 
4
- ## [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
4
+ ## Text Generation Models
5
+
6
+ ### [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
5
7
 
6
8
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
7
9
  |:----------:|:----------:|:----------:|:----------:|
8
- |[`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus)| 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
10
+ | [`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus) | 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
11
+ | [`c4ai-command-r-plus-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
12
+ | [`c4ai-command-r-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
9
13
 
10
- ## [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
14
+ ### [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
11
15
 
12
16
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
13
17
  |:----------:|:----------:|:----------:|:----------:|
@@ -20,13 +24,13 @@ More profiling metrics coming soon!
20
24
  | [`CodeLlama-70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
21
25
  | [`CodeLlama-70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
22
26
 
23
- ## [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
27
+ ### [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
24
28
 
25
29
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
26
30
  |:----------:|:----------:|:----------:|:----------:|
27
- |[`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct)| 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
31
+ | [`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct) | 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
28
32
 
29
- ## [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
33
+ ### [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
30
34
 
31
35
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
32
36
  |:----------:|:----------:|:----------:|:----------:|
@@ -35,21 +39,7 @@ More profiling metrics coming soon!
35
39
  | [`gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
36
40
  | [`gemma-2-27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |
37
41
 
38
- ## [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
39
-
40
- | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
41
- |:----------:|:----------:|:----------:|:----------:|
42
- |[`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
43
- |[`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf)| 1x a40 | - tokens/s | - tokens/s |
44
-
45
- ## [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
46
-
47
- | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
48
- |:----------:|:----------:|:----------:|:----------:|
49
- |[`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
50
- |[`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)| 2x a40 | - tokens/s | - tokens/s |
51
-
52
- ## [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
42
+ ### [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
53
43
 
54
44
  | Variant | Suggested resource allocation |
55
45
  |:----------:|:----------:|
@@ -60,7 +50,7 @@ More profiling metrics coming soon!
60
50
  | [`Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
61
51
  | [`Llama-2-70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
62
52
 
63
- ## [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
53
+ ### [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
64
54
 
65
55
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
66
56
  |:----------:|:----------:|:----------:|:----------:|
@@ -69,7 +59,7 @@ More profiling metrics coming soon!
69
59
  | [`Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
70
60
  | [`Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |
71
61
 
72
- ## [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
62
+ ### [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
73
63
 
74
64
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
75
65
  |:----------:|:----------:|:----------:|:----------:|
@@ -79,28 +69,159 @@ More profiling metrics coming soon!
79
69
  | [`Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
80
70
  | [`Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 32x a40 (8 nodes, 4 a40/node) | - tokens/s | - tokens/s |
81
71
 
82
- ## [Mistral AI: Mistral](https://huggingface.co/mistralai)
72
+ ### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
73
+
74
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
75
+ |:----------:|:----------:|:----------:|:----------:|
76
+ | [`Llama-3.2-1B`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 1x a40 | - tokens/s | - tokens/s |
77
+ | [`Llama-3.2-1B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
78
+ | [`Llama-3.2-3B`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 1x a40 | - tokens/s | - tokens/s |
79
+ | [`Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
80
+
81
+ ### [Mistral AI: Mistral](https://huggingface.co/mistralai)
83
82
 
84
83
  | Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
85
84
  |:----------:|:----------:|:----------:|:----------:|
86
- |[`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1)| 1x a40 | - tokens/s | - tokens/s|
87
- |[`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)| 1x a40 | - tokens/s | - tokens/s|
88
- |[`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2)| 1x a40 | - tokens/s | - tokens/s|
89
- |[`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3)| 1x a40 | - tokens/s | - tokens/s |
90
- |[`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)| 1x a40 | - tokens/s | - tokens/s|
91
- |[`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407)| 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
85
+ | [`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1) | 1x a40 | - tokens/s | - tokens/s|
86
+ | [`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) | 1x a40 | - tokens/s | - tokens/s|
87
+ | [`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 1x a40 | - tokens/s | - tokens/s|
88
+ | [`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3) | 1x a40 | - tokens/s | - tokens/s |
89
+ | [`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | 1x a40 | - tokens/s | - tokens/s|
90
+ | [`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
91
+ | [`Mistral-Large-Instruct-2411`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2411) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
92
92
 
93
- ## [Mistral AI: Mixtral](https://huggingface.co/mistralai)
93
+ ### [Mistral AI: Mixtral](https://huggingface.co/mistralai)
94
94
 
95
95
  | Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
96
96
  |:----------:|:----------:|:----------:|:----------:|
97
- |[`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)| 4x a40 | 222 tokens/s | 1543 tokens/s |
98
- |[`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
99
- |[`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
97
+ | [`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 4x a40 | 222 tokens/s | 1543 tokens/s |
98
+ | [`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
99
+ | [`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
100
100
 
101
- ## [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
101
+ ### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
102
102
 
103
103
  | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
104
104
  |:----------:|:----------:|:----------:|:----------:|
105
105
  | [`Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
106
+
107
+ ### [Aaditya Ura: Llama3-OpenBioLLM](https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B)
108
+
109
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
110
+ |:----------:|:----------:|:----------:|:----------:|
111
+ | [`Llama3-OpenBioLLM-70B`](https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B) | 4x a40 | - tokens/s | - tokens/s |
112
+
113
+ ### [Nvidia: Llama-3.1-Nemotron](https://huggingface.co/collections/nvidia/llama-31-nemotron-70b-670e93cd366feea16abc13d8)
114
+
115
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
116
+ |:----------:|:----------:|:----------:|:----------:|
117
+ | [`Llama-3.1-Nemotron-70B-Instruct-HF`](https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF) | 4x a40 | - tokens/s | - tokens/s |
118
+
119
+ ### [Qwen: Qwen2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e)
120
+
121
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
122
+ |:----------:|:----------:|:----------:|:----------:|
123
+ | [`Qwen2.5-0.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
124
+ | [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
125
+ | [`Qwen2.5-3B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
126
+ | [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
127
+ | [`Qwen2.5-14B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
128
+ | [`Qwen2.5-32B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
129
+ | [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
130
+
131
+ ### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
132
+
133
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
134
+ |:----------:|:----------:|:----------:|:----------:|
135
+ | [`Qwen2.5-Math-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
136
+ | [`Qwen2.5-Math-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
137
+ | [`Qwen2.5-Math-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
138
+
139
+ ### [Qwen: Qwen2.5-Coder](https://huggingface.co/collections/Qwen/qwen25-coder-66eaa22e6f99801bf65b0c2f)
140
+
141
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
142
+ |:----------:|:----------:|:----------:|:----------:|
143
+ | [`Qwen2.5-Coder-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
144
+
145
+ ### [Qwen: QwQ](https://huggingface.co/collections/Qwen/qwq-674762b79b75eac01735070a)
146
+
147
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
148
+ |:----------:|:----------:|:----------:|:----------:|
149
+ | [`QwQ-32B-Preview`](https://huggingface.co/Qwen/QwQ-32B-Preview) | 2x a40 | - tokens/s | - tokens/s |
150
+
151
+ ### [DeepSeek-R1: Distilled Models](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d)
152
+
153
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
154
+ |:----------:|:----------:|:----------:|:----------:|
155
+ | [`DeepSeek-R1-Distill-Llama-8B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) | 1x a40 | - tokens/s | - tokens/s |
156
+ | [`DeepSeek-R1-Distill-Llama-70B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) | 4x a40 | - tokens/s | - tokens/s |
157
+ | [`DeepSeek-R1-Distill-Qwen-1.5B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) | 1x a40 | - tokens/s | - tokens/s |
158
+ | [`DeepSeek-R1-Distill-Qwen-7B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | 1x a40 | - tokens/s | - tokens/s |
159
+ | [`DeepSeek-R1-Distill-Qwen-14B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) | 2x a40 | - tokens/s | - tokens/s |
160
+ | [`DeepSeek-R1-Distill-Qwen-32B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) | 4x a40 | - tokens/s | - tokens/s |
161
+
162
+
163
+ ## Vision Language Models
164
+
165
+ ### [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
166
+
167
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
168
+ |:----------:|:----------:|:----------:|:----------:|
169
+ | [`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
170
+ | [`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
171
+
172
+ ### [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
173
+
174
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
175
+ |:----------:|:----------:|:----------:|:----------:|
176
+ | [`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
177
+ | [`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
178
+
179
+ ### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
180
+
181
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
182
+ |:----------:|:----------:|:----------:|:----------:|
106
183
  | [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
184
+
185
+ ### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
186
+
187
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
188
+ |:----------:|:----------:|:----------:|:----------:|
189
+ | [`Llama-3.2-11B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision) | 2x a40 | - tokens/s | - tokens/s |
190
+ | [`Llama-3.2-11B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) | 2x a40 | - tokens/s | - tokens/s |
191
+ | [`Llama-3.2-90B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
192
+ | [`Llama-3.2-90B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
193
+
194
+ **NOTE**: `MllamaForConditionalGeneration` currently doesn't support pipeline parallelism; to save memory, the maximum number of requests is reduced and enforce-eager mode is on.
195
+
196
+ ### [Mistral: Pixtral](https://huggingface.co/mistralai)
197
+
198
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
199
+ |:----------:|:----------:|:----------:|:----------:|
200
+ | [`Pixtral-12B-2409`](https://huggingface.co/mistralai/Pixtral-12B-2409) | 1x a40 | - tokens/s | - tokens/s |
201
+
202
+ ## Text Embedding Models
203
+
204
+ ### [Liang Wang: e5](https://huggingface.co/intfloat)
205
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
206
+ |:----------:|:----------:|:----------:|:----------:|
207
+ | [`e5-mistral-7b-instruct`](https://huggingface.co/intfloat/e5-mistral-7b-instruct) | 1x a40 | - tokens/s | - tokens/s |
208
+
209
+ ### [BAAI: bge](https://huggingface.co/BAAI)
210
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
211
+ |:----------:|:----------:|:----------:|:----------:|
212
+ | [`bge-base-en-v1.5`](https://huggingface.co/BAAI/bge-base-en-v1.5) | 1x A40 | - tokens/s | - tokens/s |
213
+
214
+ ### [Sentence Transformers: MiniLM](https://huggingface.co/sentence-transformers)
215
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
216
+ |:----------:|:----------:|:----------:|:----------:|
217
+ | [`all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | 1x A40 | - tokens/s | - tokens/s |
218
+
219
+
220
+
221
+ ## Reward Modeling Models
222
+
223
+ ### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
224
+
225
+ | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
226
+ |:----------:|:----------:|:----------:|:----------:|
227
+ | [`Qwen2.5-Math-RM-72B`](https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B) | 4x a40 | - tokens/s | - tokens/s |
vec_inf/models/models.csv CHANGED
@@ -70,5 +70,16 @@ Qwen2.5-Coder-7B-Instruct,Qwen2.5,Coder-7B-Instruct,LLM,1,1,152064,32768,256,tru
70
70
  Qwen2.5-Math-RM-72B,Qwen2.5,Math-RM-72B,Reward Modeling,4,1,152064,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
71
71
  QwQ-32B-Preview,QwQ,32B-Preview,LLM,2,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
72
72
  Pixtral-12B-2409,Pixtral,12B-2409,VLM,1,1,131072,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
73
- bge-multilingual-gemma2,bge,multilingual-gemma2,Text Embedding,1,1,256002,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
74
73
  e5-mistral-7b-instruct,e5,mistral-7b-instruct,Text Embedding,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
74
+ bge-base-en-v1.5,bge,base-en-v1.5,Text Embedding,1,1,30522,512,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
75
+ all-MiniLM-L6-v2,all-MiniLM,L6-v2,Text Embedding,1,1,30522,512,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
76
+ Llama-3.3-70B-Instruct,Llama-3.3,70B-Instruct,LLM,4,1,128256,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
77
+ InternVL2_5-26B,InternVL2_5,26B,VLM,2,1,92553,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
78
+ InternVL2_5-38B,InternVL2_5,38B,VLM,4,1,92553,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
79
+ Aya-Expanse-32B,Aya-Expanse,32B,LLM,2,1,256000,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
80
+ DeepSeek-R1-Distill-Llama-70B,DeepSeek-R1,Distill-Llama-70B,LLM,4,1,128256,65536,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
81
+ DeepSeek-R1-Distill-Llama-8B,DeepSeek-R1,Distill-Llama-8B,LLM,1,1,128256,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
82
+ DeepSeek-R1-Distill-Qwen-32B,DeepSeek-R1,Distill-Qwen-32B,LLM,4,1,152064,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
83
+ DeepSeek-R1-Distill-Qwen-14B,DeepSeek-R1,Distill-Qwen-14B,LLM,2,1,152064,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
84
+ DeepSeek-R1-Distill-Qwen-7B,DeepSeek-R1,Distill-Qwen-7B,LLM,1,1,152064,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
85
+ DeepSeek-R1-Distill-Qwen-1.5B,DeepSeek-R1,Distill-Qwen-1.5B,LLM,1,1,152064,131072,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
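The rows added above can be inspected with the helpers from `vec_inf/cli/_utils.py` shown earlier in this diff. A minimal sketch (assumes vec-inf 0.4.1 is installed so the bundled `models/models.csv` resolves; the model name is one of the new rows):

```python
# Minimal sketch: load the bundled models.csv and inspect the newly added rows.
from vec_inf.cli._utils import load_default_args, load_models_df

models_df = load_models_df()

# Names of the text embedding models (model_type as stored in the CSV).
print(models_df.filter(models_df["model_type"] == "Text Embedding")["model_name"].to_list())

# Default launch arguments for one of the new DeepSeek-R1 distills.
print(load_default_args(models_df, "DeepSeek-R1-Distill-Qwen-7B"))
```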
vec_inf/multinode_vllm.slurm CHANGED
@@ -12,7 +12,7 @@ nvidia-smi
12
12
  source ${SRC_DIR}/find_port.sh
13
13
 
14
14
  if [ "$VENV_BASE" = "singularity" ]; then
15
- export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.6.4.post1.sif
15
+ export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_latest.sif
16
16
  export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
17
17
  module load singularity-ce/3.8.2
18
18
  singularity exec $SINGULARITY_IMAGE ray stop
@@ -103,6 +103,7 @@ if [ "$VENV_BASE" = "singularity" ]; then
103
103
  --max-logprobs ${VLLM_MAX_LOGPROBS} \
104
104
  --max-model-len ${VLLM_MAX_MODEL_LEN} \
105
105
  --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
106
+ --task ${VLLM_TASK} \
106
107
  ${ENFORCE_EAGER}
107
108
  else
108
109
  source ${VENV_BASE}/bin/activate
@@ -118,5 +119,6 @@ else
118
119
  --max-logprobs ${VLLM_MAX_LOGPROBS} \
119
120
  --max-model-len ${VLLM_MAX_MODEL_LEN} \
120
121
  --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
122
+ --task ${VLLM_TASK} \
121
123
  ${ENFORCE_EAGER}
122
124
  fi
vec_inf/vllm.slurm CHANGED
@@ -23,7 +23,7 @@ fi
23
23
 
24
24
  # Activate vllm venv
25
25
  if [ "$VENV_BASE" = "singularity" ]; then
26
- export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_0.6.4.post1.sif
26
+ export SINGULARITY_IMAGE=/projects/aieng/public/vector-inference_latest.sif
27
27
  export VLLM_NCCL_SO_PATH=/vec-inf/nccl/libnccl.so.2.18.1
28
28
  module load singularity-ce/3.8.2
29
29
  singularity exec $SINGULARITY_IMAGE ray stop
@@ -39,6 +39,7 @@ if [ "$VENV_BASE" = "singularity" ]; then
39
39
  --trust-remote-code \
40
40
  --max-model-len ${VLLM_MAX_MODEL_LEN} \
41
41
  --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
42
+ --task ${VLLM_TASK} \
42
43
  ${ENFORCE_EAGER}
43
44
  else
44
45
  source ${VENV_BASE}/bin/activate
@@ -53,5 +54,6 @@ else
53
54
  --trust-remote-code \
54
55
  --max-model-len ${VLLM_MAX_MODEL_LEN} \
55
56
  --max-num-seqs ${VLLM_MAX_NUM_SEQS} \
57
+ --task ${VLLM_TASK} \
56
58
  ${ENFORCE_EAGER}
57
59
  fi
@@ -1,30 +1,32 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: vec-inf
3
- Version: 0.4.0
3
+ Version: 0.4.1
4
4
  Summary: Efficient LLM inference on Slurm clusters using vLLM.
5
- License: MIT
6
- Author: Marshall Wang
7
- Author-email: marshall.wang@vectorinstitute.ai
8
- Requires-Python: >=3.10,<4.0
9
- Classifier: License :: OSI Approved :: MIT License
10
- Classifier: Programming Language :: Python :: 3
11
- Classifier: Programming Language :: Python :: 3.10
12
- Classifier: Programming Language :: Python :: 3.11
13
- Classifier: Programming Language :: Python :: 3.12
14
- Classifier: Programming Language :: Python :: 3.13
5
+ Author-email: Marshall Wang <marshall.wang@vectorinstitute.ai>
6
+ License-Expression: MIT
7
+ License-File: LICENSE
8
+ Requires-Python: <3.11,>=3.10
9
+ Requires-Dist: click>=8.1.0
10
+ Requires-Dist: numpy>=1.24.0
11
+ Requires-Dist: polars>=1.15.0
12
+ Requires-Dist: requests>=2.31.0
13
+ Requires-Dist: rich>=13.7.0
15
14
  Provides-Extra: dev
16
- Requires-Dist: click (>=8.1.0,<9.0.0)
17
- Requires-Dist: cupy-cuda12x (==12.1.0) ; extra == "dev"
18
- Requires-Dist: numpy (>=1.24.0,<2.0.0)
19
- Requires-Dist: pandas (>=1.15.0,<2.0.0)
20
- Requires-Dist: ray (>=2.9.3,<3.0.0) ; extra == "dev"
21
- Requires-Dist: requests (>=2.31.0,<3.0.0)
22
- Requires-Dist: rich (>=13.7.0,<14.0.0)
23
- Requires-Dist: vllm (>=0.6.0,<0.7.0) ; extra == "dev"
24
- Requires-Dist: vllm-nccl-cu12 (>=2.18,<2.19) ; extra == "dev"
15
+ Requires-Dist: cupy-cuda12x==12.1.0; extra == 'dev'
16
+ Requires-Dist: ray>=2.40.0; extra == 'dev'
17
+ Requires-Dist: vllm-nccl-cu12<2.19,>=2.18; extra == 'dev'
18
+ Requires-Dist: vllm>=0.7.2; extra == 'dev'
25
19
  Description-Content-Type: text/markdown
26
20
 
27
21
  # Vector Inference: Easy inference on Slurm clusters
22
+
23
+ ----------------------------------------------------
24
+
25
+ [![code checks](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/code_checks.yml)
26
+ [![docs](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs_build.yml/badge.svg)](https://github.com/VectorInstitute/vector-inference/actions/workflows/docs_build.yml)
27
+ [![codecov](https://codecov.io/github/VectorInstitute/vector-inference/graph/badge.svg?token=83MYFZ3UPA)](https://codecov.io/github/VectorInstitute/vector-inference)
28
+ ![GitHub License](https://img.shields.io/github/license/VectorInstitute/vector-inference)
29
+
28
30
  This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository run natively on the Vector Institute cluster environment**. To adapt to other environments, update [`launch_server.sh`](vec_inf/launch_server.sh), [`vllm.slurm`](vec_inf/vllm.slurm), [`multinode_vllm.slurm`](vec_inf/multinode_vllm.slurm) and [`models.csv`](vec_inf/models/models.csv) accordingly.
29
31
 
30
32
  ## Installation
@@ -42,7 +44,7 @@ vec-inf launch Meta-Llama-3.1-8B-Instruct
42
44
  ```
43
45
  You should see an output like the following:
44
46
 
45
- <img width="700" alt="launch_img" src="https://github.com/user-attachments/assets/ab658552-18b2-47e0-bf70-e539c3b898d5">
47
+ <img width="600" alt="launch_img" src="https://github.com/user-attachments/assets/ab658552-18b2-47e0-bf70-e539c3b898d5">
46
48
 
47
49
  The model will be launched using the [default parameters](vec_inf/models/models.csv); you can override these values by providing additional parameters (use `--help` to see the full list). You can also launch your own customized model as long as the model architecture is [supported by vLLM](https://docs.vllm.ai/en/stable/models/supported_models.html); make sure to follow the instructions below:
48
50
  * Your model weights directory naming convention should follow `$MODEL_FAMILY-$MODEL_VARIANT`.
@@ -94,7 +96,8 @@ You can view the full list of available models by running the `list` command:
94
96
  ```bash
95
97
  vec-inf list
96
98
  ```
97
- <img width="900" alt="list_img" src="https://github.com/user-attachments/assets/7cb2b2ac-d30c-48a8-b773-f648c27d9de2">
99
+ <img width="940" alt="list_img" src="https://github.com/user-attachments/assets/8cf901c4-404c-4398-a52f-0486f00747a3">
100
+
98
101
 
99
102
  You can also view the default setup for a specific supported model by providing the model name, for example `Meta-Llama-3.1-70B-Instruct`:
100
103
  ```bash
@@ -116,4 +119,3 @@ If you want to run inference from your local device, you can open a SSH tunnel t
116
119
  ssh -L 8081:172.17.8.29:8081 username@v.vectorinstitute.ai -N
117
120
  ```
118
121
  Where the last number in the URL is the GPU number (gpu029 in this case). The example provided above is for the Vector cluster; change the variables accordingly for your environment.
119
-
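As a quick end-to-end check from the local side of that tunnel, something like the following can be used. This is a minimal sketch: the local port matches the tunnel example above, and the `/health` route is an assumption about the vLLM OpenAI-compatible server rather than something this diff pins down.

```python
# Minimal sketch: poll the tunnelled inference server until it reports healthy.
# Assumes the SSH tunnel above is forwarding localhost:8081 to the GPU node.
import time

import requests

BASE_URL = "http://localhost:8081"  # placeholder: must match your tunnel's local port

for _ in range(20):
    try:
        if requests.get(f"{BASE_URL}/health", timeout=5).status_code == 200:
            print("Server is ready")
            break
    except requests.exceptions.RequestException:
        pass
    time.sleep(15)
else:
    print("Server did not become ready in time")
```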
@@ -0,0 +1,16 @@
1
+ vec_inf/README.md,sha256=dxX0xKfwLioG0mJ2YFv5JJ5q1m5NlWBrVBOap1wuHfQ,624
2
+ vec_inf/__init__.py,sha256=bHwSIz9lebYuxIemni-lP0h3gwJHVbJnwExQKGJWw_Q,23
3
+ vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
4
+ vec_inf/launch_server.sh,sha256=WJ7HyOEbknxe7zjF388qgnTqoapl90cUrjsIJQChidc,4714
5
+ vec_inf/multinode_vllm.slurm,sha256=ymyteZWWspNDL0yBjhPNMZRd18Jepbw28HRw0EDuXYY,4201
6
+ vec_inf/vllm.slurm,sha256=64jg8t9FHp4IH5Jc_Vrk0XwSSIrpN4Xjwko6GO7cDXQ,1894
7
+ vec_inf/cli/__init__.py,sha256=5XIvGQCOnaGl73XMkwetjC-Ul3xuXGrWDXdYJ3aUzvU,27
8
+ vec_inf/cli/_cli.py,sha256=3LZ7RbJsQ3mxHWTtt-34uQNCZ7G9HaJifyfTQw33zuI,14330
9
+ vec_inf/cli/_utils.py,sha256=t_zFDEomSP9eDvad85GlJIFQ7Kl5ZXOxbgbkfMZ3DwA,4802
10
+ vec_inf/models/README.md,sha256=JpQCg5taBuQp4sLmasK7YPjFMZritOAKlfPpEJsOpeQ,16602
11
+ vec_inf/models/models.csv,sha256=xYrNykRu5HabsUjj4bdRI63YuGgCJSZ-ti_nIjuGPCY,11557
12
+ vec_inf-0.4.1.dist-info/METADATA,sha256=yFvkCgCVpYzuZZJmD22BlTYQeTMk8gD6gmYagyTUyog,7375
13
+ vec_inf-0.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
14
+ vec_inf-0.4.1.dist-info/entry_points.txt,sha256=uNRXjCuJSR2nveEqD3IeMznI9oVI9YLZh5a24cZg6B0,49
15
+ vec_inf-0.4.1.dist-info/licenses/LICENSE,sha256=mq8zeqpvVSF1EsxmydeXcokt8XnEIfSofYn66S2-cJI,1073
16
+ vec_inf-0.4.1.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: poetry-core 1.9.1
2
+ Generator: hatchling 1.27.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ vec-inf = vec_inf.cli._cli:cli
@@ -1,16 +0,0 @@
1
- vec_inf/README.md,sha256=dxX0xKfwLioG0mJ2YFv5JJ5q1m5NlWBrVBOap1wuHfQ,624
2
- vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- vec_inf/cli/_cli.py,sha256=TRaY-QSBQ_do9b4R6Pl7fyDlrfuMN8Z8HH_xOCKkVJA,12585
5
- vec_inf/cli/_utils.py,sha256=sQqi7JdPOb7gfW4EVsXY2yhLUo8xWqxoY1spQ53bag4,4845
6
- vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
7
- vec_inf/launch_server.sh,sha256=3-esdDzfuG0qSOPhrZHgx2nQ9GEiaI2tjTPw7VrdMuQ,4167
8
- vec_inf/models/README.md,sha256=n9I8HsIHCafz0G9k1OFwkraK9J-OY92v6M3z42a-Nho,8146
9
- vec_inf/models/models.csv,sha256=CK2NDHgdkwx5qpaduuYy9KhcHhS0z60quSeV_KtWx9c,10025
10
- vec_inf/multinode_vllm.slurm,sha256=tg0WgLRdpRFD-oT05aucOpe6h2TZiTyYJFTMqSIj-HQ,4154
11
- vec_inf/vllm.slurm,sha256=lMgBI7r9jUVVhSIdrUH2DdC-Bxz0eyQ8vuB5uwOzWt0,1847
12
- vec_inf-0.4.0.dist-info/LICENSE,sha256=mq8zeqpvVSF1EsxmydeXcokt8XnEIfSofYn66S2-cJI,1073
13
- vec_inf-0.4.0.dist-info/METADATA,sha256=X-zLib_6dTZT9ZvrIBoQThImgpJSkgTFBL12oi-Dt1A,7025
14
- vec_inf-0.4.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
15
- vec_inf-0.4.0.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
16
- vec_inf-0.4.0.dist-info/RECORD,,
@@ -1,3 +0,0 @@
1
- [console_scripts]
2
- vec-inf=vec_inf.cli._cli:cli
3
-