vec-inf 0.7.1__py3-none-any.whl → 0.7.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vec_inf/cli/_cli.py CHANGED
@@ -69,6 +69,16 @@ def cli() -> None:
     type=int,
     help="Number of GPUs/node to use, default to suggested resource allocation for model",
 )
+@click.option(
+    "--cpus-per-task",
+    type=int,
+    help="Number of CPU cores per task",
+)
+@click.option(
+    "--mem-per-node",
+    type=str,
+    help="Memory allocation per node in GB format (e.g., '32G')",
+)
 @click.option(
     "--account",
     "-A",
@@ -165,6 +175,10 @@ def launch(
         Number of nodes to use
     - gpus_per_node : int, optional
        Number of GPUs per node
+    - cpus_per_task : int, optional
+        Number of CPU cores per task
+    - mem_per_node : str, optional
+        Memory allocation per node in GB format (e.g., '32G')
     - account : str, optional
         Charge resources used by this job to specified account
     - work_dir : str, optional
@@ -447,7 +461,7 @@ def metrics(slurm_job_id: str) -> None:
             metrics_formatter.format_metrics()
 
             live.update(metrics_formatter.table)
-            time.sleep(2)
+            time.sleep(1)
     except click.ClickException as e:
         raise e
     except Exception as e:
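
Note on the two new options: `--cpus-per-task` and `--mem-per-node` follow the usual Slurm naming, so they presumably feed the `#SBATCH --cpus-per-task` and `#SBATCH --mem` directives. A minimal illustrative sketch of that mapping (hypothetical helper, not vec-inf code):

# Hypothetical helper, not vec-inf code: shows the conventional mapping of
# cpus_per_task -> '#SBATCH --cpus-per-task' and mem_per_node -> '#SBATCH --mem'.
def build_sbatch_header(params: dict) -> str:
    lines = ["#!/bin/bash"]
    if params.get("cpus_per_task"):
        lines.append(f"#SBATCH --cpus-per-task={params['cpus_per_task']}")
    if params.get("mem_per_node"):
        lines.append(f"#SBATCH --mem={params['mem_per_node']}")  # e.g. '32G'
    return "\n".join(lines)

print(build_sbatch_header({"cpus_per_task": 8, "mem_per_node": "32G"}))
# #!/bin/bash
# #SBATCH --cpus-per-task=8
# #SBATCH --mem=32G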
vec_inf/cli/_helper.py CHANGED
@@ -36,6 +36,43 @@ class LaunchResponseFormatter:
         self.model_name = model_name
         self.params = params
 
+    def _add_resource_allocation_details(self, table: Table) -> None:
+        """Add resource allocation details to the table."""
+        optional_fields = [
+            ("account", "Account"),
+            ("work_dir", "Working Directory"),
+            ("resource_type", "Resource Type"),
+            ("partition", "Partition"),
+            ("qos", "QoS"),
+        ]
+        for key, label in optional_fields:
+            if self.params.get(key):
+                table.add_row(label, self.params[key])
+
+    def _add_vllm_config(self, table: Table) -> None:
+        """Add vLLM configuration details to the table."""
+        if self.params.get("vllm_args"):
+            table.add_row("vLLM Arguments:", style="magenta")
+            for arg, value in self.params["vllm_args"].items():
+                table.add_row(f" {arg}:", str(value))
+
+    def _add_env_vars(self, table: Table) -> None:
+        """Add environment variable configuration details to the table."""
+        if self.params.get("env"):
+            table.add_row("Environment Variables", style="magenta")
+            for arg, value in self.params["env"].items():
+                table.add_row(f" {arg}:", str(value))
+
+    def _add_bind_paths(self, table: Table) -> None:
+        """Add bind path configuration details to the table."""
+        if self.params.get("bind"):
+            table.add_row("Bind Paths", style="magenta")
+            for path in self.params["bind"].split(","):
+                host = target = path
+                if ":" in path:
+                    host, target = path.split(":")
+                table.add_row(f" {host}:", target)
+
     def format_table_output(self) -> Table:
         """Format output as rich Table.
 
@@ -59,16 +96,7 @@ class LaunchResponseFormatter:
         table.add_row("Vocabulary Size", self.params["vocab_size"])
 
         # Add resource allocation details
-        if self.params.get("account"):
-            table.add_row("Account", self.params["account"])
-        if self.params.get("work_dir"):
-            table.add_row("Working Directory", self.params["work_dir"])
-        if self.params.get("resource_type"):
-            table.add_row("Resource Type", self.params["resource_type"])
-        if self.params.get("partition"):
-            table.add_row("Partition", self.params["partition"])
-        if self.params.get("qos"):
-            table.add_row("QoS", self.params["qos"])
+        self._add_resource_allocation_details(table)
         table.add_row("Time Limit", self.params["time"])
         table.add_row("Num Nodes", self.params["num_nodes"])
         table.add_row("GPUs/Node", self.params["gpus_per_node"])
@@ -76,21 +104,18 @@ class LaunchResponseFormatter:
         table.add_row("Memory/Node", self.params["mem_per_node"])
 
         # Add job config details
+        if self.params.get("venv"):
+            table.add_row("Virtual Environment", self.params["venv"])
         table.add_row(
             "Model Weights Directory",
             str(Path(self.params["model_weights_parent_dir"], self.model_name)),
         )
         table.add_row("Log Directory", self.params["log_dir"])
 
-        # Add vLLM configuration details
-        table.add_row("vLLM Arguments:", style="magenta")
-        for arg, value in self.params["vllm_args"].items():
-            table.add_row(f" {arg}:", str(value))
-
-        # Add Environment Variable Configuration Details
-        table.add_row("Environment Variables", style="magenta")
-        for arg, value in self.params["env"].items():
-            table.add_row(f" {arg}:", str(value))
+        # Add configuration details
+        self._add_vllm_config(table)
+        self._add_env_vars(table)
+        self._add_bind_paths(table)
 
         return table
 
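The new `_add_bind_paths` helper splits the comma-separated `bind` parameter into `host:target` pairs, reusing the same value for both sides when no colon is present. The same parsing, restated as a standalone function for clarity (illustrative only):

# Illustrative restatement of the _add_bind_paths parsing above.
def parse_bind_paths(bind: str) -> list[tuple[str, str]]:
    pairs = []
    for path in bind.split(","):
        host = target = path
        if ":" in path:
            host, target = path.split(":")  # assumes at most one ':' per entry
        pairs.append((host, target))
    return pairs

print(parse_bind_paths("/scratch,/data:/mnt/data"))
# [('/scratch', '/scratch'), ('/data', '/mnt/data')]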
vec_inf/client/_helper.py CHANGED
@@ -31,6 +31,7 @@ from vec_inf.client._slurm_script_generator import (
     BatchSlurmScriptGenerator,
     SlurmScriptGenerator,
 )
+from vec_inf.client._slurm_vars import CONTAINER_MODULE_NAME, IMAGE_PATH
 from vec_inf.client.config import ModelConfig
 from vec_inf.client.models import (
     BatchLaunchResponse,
@@ -195,23 +196,14 @@ class ModelLauncher:
             print(f"WARNING: Could not parse env var: {line}")
         return env_vars
 
-    def _get_launch_params(self) -> dict[str, Any]:
-        """Prepare launch parameters, set log dir, and validate required fields.
-
-        Returns
-        -------
-        dict[str, Any]
-            Dictionary of prepared launch parameters
+    def _apply_cli_overrides(self, params: dict[str, Any]) -> None:
+        """Apply CLI argument overrides to params.
 
-        Raises
-        ------
-        MissingRequiredFieldsError
-            If required fields are missing or tensor parallel size is not specified
-            when using multiple GPUs
+        Parameters
+        ----------
+        params : dict[str, Any]
+            Dictionary of launch parameters to override
         """
-        params = self.model_config.model_dump(exclude_none=True)
-
-        # Override config defaults with CLI arguments
         if self.kwargs.get("vllm_args"):
             vllm_args = self._process_vllm_args(self.kwargs["vllm_args"])
             for key, value in vllm_args.items():
@@ -224,13 +216,29 @@ class ModelLauncher:
                 params["env"][key] = str(value)
             del self.kwargs["env"]
 
+        if self.kwargs.get("bind") and params.get("bind"):
+            params["bind"] = f"{params['bind']},{self.kwargs['bind']}"
+            del self.kwargs["bind"]
+
         for key, value in self.kwargs.items():
             params[key] = value
 
-        # Check for required fields without default vals, will raise an error if missing
-        utils.check_required_fields(params)
+    def _validate_resource_allocation(self, params: dict[str, Any]) -> None:
+        """Validate resource allocation and parallelization settings.
 
-        # Validate resource allocation and parallelization settings
+        Parameters
+        ----------
+        params : dict[str, Any]
+            Dictionary of launch parameters to validate
+
+        Raises
+        ------
+        MissingRequiredFieldsError
+            If tensor parallel size is not specified when using multiple GPUs
+        ValueError
+            If total # of GPUs requested is not a power of two
+            If mismatch between total # of GPUs requested and parallelization settings
+        """
         if (
             int(params["gpus_per_node"]) > 1
             and params["vllm_args"].get("--tensor-parallel-size") is None
@@ -251,19 +259,18 @@ class ModelLauncher:
                 "Mismatch between total number of GPUs requested and parallelization settings"
             )
 
-        # Convert gpus_per_node and resource_type to gres
-        resource_type = params.get("resource_type")
-        if resource_type:
-            params["gres"] = f"gpu:{resource_type}:{params['gpus_per_node']}"
-        else:
-            params["gres"] = f"gpu:{params['gpus_per_node']}"
+    def _setup_log_files(self, params: dict[str, Any]) -> None:
+        """Set up log directory and file paths.
 
-        # Create log directory
+        Parameters
+        ----------
+        params : dict[str, Any]
+            Dictionary of launch parameters to set up log files
+        """
         params["log_dir"] = Path(params["log_dir"], params["model_family"]).expanduser()
         params["log_dir"].mkdir(parents=True, exist_ok=True)
         params["src_dir"] = SRC_DIR
 
-        # Construct slurm log file paths
         params["out_file"] = (
            f"{params['log_dir']}/{self.model_name}.%j/{self.model_name}.%j.out"
        )
@@ -274,6 +281,35 @@ class ModelLauncher:
            f"{params['log_dir']}/{self.model_name}.$SLURM_JOB_ID/{self.model_name}.$SLURM_JOB_ID.json"
        )
 
+    def _get_launch_params(self) -> dict[str, Any]:
+        """Prepare launch parameters, set log dir, and validate required fields.
+
+        Returns
+        -------
+        dict[str, Any]
+            Dictionary of prepared launch parameters
+        """
+        params = self.model_config.model_dump(exclude_none=True)
+
+        # Override config defaults with CLI arguments
+        self._apply_cli_overrides(params)
+
+        # Check for required fields without default vals, will raise an error if missing
+        utils.check_required_fields(params)
+
+        # Validate resource allocation and parallelization settings
+        self._validate_resource_allocation(params)
+
+        # Convert gpus_per_node and resource_type to gres
+        resource_type = params.get("resource_type")
+        if resource_type:
+            params["gres"] = f"gpu:{resource_type}:{params['gpus_per_node']}"
+        else:
+            params["gres"] = f"gpu:{params['gpus_per_node']}"
+
+        # Setup log files
+        self._setup_log_files(params)
+
         # Convert path to string for JSON serialization
         for field in params:
             if field in ["vllm_args", "env"]:
@@ -332,6 +368,10 @@ class ModelLauncher:
             job_log_dir / f"{self.model_name}.{self.slurm_job_id}.sbatch"
         )
 
+        # Replace venv with image path if using container
+        if self.params["venv"] == CONTAINER_MODULE_NAME:
+            self.params["venv"] = IMAGE_PATH
+
         with job_json.open("w") as file:
             json.dump(self.params, file, indent=4)
 
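The diff shows the docstring and error messages of `_validate_resource_allocation` but not its arithmetic. A sketch consistent with those messages, assuming the usual rule that tensor-parallel size times pipeline-parallel size must equal num_nodes times gpus_per_node (the released checks may differ in detail):

# Sketch only: conditions inferred from the error messages in the diff,
# not copied from the released implementation.
def validate_gpu_allocation(
    num_nodes: int, gpus_per_node: int, tp_size: int, pp_size: int = 1
) -> None:
    total_gpus = num_nodes * gpus_per_node
    if total_gpus & (total_gpus - 1) != 0:  # power-of-two test
        raise ValueError("Total number of GPUs requested is not a power of two")
    if tp_size * pp_size != total_gpus:
        raise ValueError(
            "Mismatch between total number of GPUs requested and parallelization settings"
        )

# Consistent with the gpt-oss-120b entry added later in this diff:
# 2 nodes x 4 GPUs = 8 = tensor-parallel 4 x pipeline-parallel 2.
validate_gpu_allocation(num_nodes=2, gpus_per_node=4, tp_size=4, pp_size=2)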
vec_inf/client/_slurm_script_generator.py CHANGED
@@ -14,6 +14,7 @@ from vec_inf.client._slurm_templates import (
     BATCH_SLURM_SCRIPT_TEMPLATE,
     SLURM_SCRIPT_TEMPLATE,
 )
+from vec_inf.client._slurm_vars import CONTAINER_MODULE_NAME
 
 
 class SlurmScriptGenerator:
@@ -32,24 +33,35 @@ class SlurmScriptGenerator:
     def __init__(self, params: dict[str, Any]):
         self.params = params
         self.is_multinode = int(self.params["num_nodes"]) > 1
-        self.use_container = (
-            self.params["venv"] == "singularity" or self.params["venv"] == "apptainer"
-        )
+        self.use_container = self.params["venv"] == CONTAINER_MODULE_NAME
         self.additional_binds = self.params.get("bind", "")
         if self.additional_binds:
             self.additional_binds = f" --bind {self.additional_binds}"
         self.model_weights_path = str(
             Path(self.params["model_weights_parent_dir"], self.params["model_name"])
         )
+        self.env_str = self._generate_env_str()
+
+    def _generate_env_str(self) -> str:
+        """Generate the environment variables string for the Slurm script.
+
+        Returns
+        -------
+        str
+            Formatted env vars string for container or shell export commands.
+        """
         env_dict: dict[str, str] = self.params.get("env", {})
-        # Create string of environment variables
-        self.env_str = ""
-        for key, val in env_dict.items():
-            if len(self.env_str) == 0:
-                self.env_str = "--env "
-            else:
-                self.env_str += ","
-            self.env_str += key + "=" + val
+
+        if not env_dict:
+            return ""
+
+        if self.use_container:
+            # Format for container: --env KEY1=VAL1,KEY2=VAL2
+            env_pairs = [f"{key}={val}" for key, val in env_dict.items()]
+            return f"--env {','.join(env_pairs)}"
+        # Format for shell: export KEY1=VAL1\nexport KEY2=VAL2
+        export_lines = [f"export {key}={val}" for key, val in env_dict.items()]
+        return "\n".join(export_lines)
 
     def _generate_script_content(self) -> str:
         """Generate the complete Slurm script content.
@@ -95,7 +107,12 @@ class SlurmScriptGenerator:
         server_script = ["\n"]
         if self.use_container:
             server_script.append("\n".join(SLURM_SCRIPT_TEMPLATE["container_setup"]))
-            server_script.append("\n".join(SLURM_SCRIPT_TEMPLATE["env_vars"]))
+            server_script.append("\n".join(SLURM_SCRIPT_TEMPLATE["container_env_vars"]))
+        else:
+            server_script.append(
+                SLURM_SCRIPT_TEMPLATE["activate_venv"].format(venv=self.params["venv"])
+            )
+        server_script.append(self.env_str)
         server_script.append(
             SLURM_SCRIPT_TEMPLATE["imports"].format(src_dir=self.params["src_dir"])
         )
@@ -112,6 +129,11 @@ class SlurmScriptGenerator:
                         env_str=self.env_str,
                     ),
                 )
+            else:
+                server_setup_str = server_setup_str.replace(
+                    "CONTAINER_PLACEHOLDER",
+                    "\\",
+                )
         else:
             server_setup_str = "\n".join(
                 SLURM_SCRIPT_TEMPLATE["server_setup"]["single_node"]
@@ -145,10 +167,7 @@ class SlurmScriptGenerator:
                     env_str=self.env_str,
                 )
             )
-        else:
-            launcher_script.append(
-                SLURM_SCRIPT_TEMPLATE["activate_venv"].format(venv=self.params["venv"])
-            )
+
         launcher_script.append(
             "\n".join(SLURM_SCRIPT_TEMPLATE["launch_cmd"]).format(
                 model_weights_path=self.model_weights_path,
@@ -194,9 +213,7 @@ class BatchSlurmScriptGenerator:
     def __init__(self, params: dict[str, Any]):
         self.params = params
         self.script_paths: list[Path] = []
-        self.use_container = (
-            self.params["venv"] == "singularity" or self.params["venv"] == "apptainer"
-        )
+        self.use_container = self.params["venv"] == CONTAINER_MODULE_NAME
         for model_name in self.params["models"]:
             self.params["models"][model_name]["additional_binds"] = ""
             if self.params["models"][model_name].get("bind"):
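
The refactored `_generate_env_str` now produces two distinct shapes depending on `use_container`. A standalone copy of that logic, with `use_container` passed as a parameter, makes the two outputs easy to compare (the env var names below are example values only):

# Standalone copy of the _generate_env_str logic above.
def generate_env_str(env_dict: dict[str, str], use_container: bool) -> str:
    if not env_dict:
        return ""
    if use_container:
        # Container form: one --env flag with comma-separated pairs
        return "--env " + ",".join(f"{k}={v}" for k, v in env_dict.items())
    # Shell form: one export statement per variable
    return "\n".join(f"export {k}={v}" for k, v in env_dict.items())

env = {"HF_HOME": "/scratch/hf", "NCCL_DEBUG": "INFO"}
print(generate_env_str(env, use_container=True))
# --env HF_HOME=/scratch/hf,NCCL_DEBUG=INFO
print(generate_env_str(env, use_container=False))
# export HF_HOME=/scratch/hf
# export NCCL_DEBUG=INFO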
vec_inf/client/_slurm_templates.py CHANGED
@@ -74,7 +74,7 @@ class SlurmScriptTemplate(TypedDict):
     shebang: ShebangConfig
     container_setup: list[str]
     imports: str
-    env_vars: list[str]
+    container_env_vars: list[str]
     container_command: str
     activate_venv: str
     server_setup: ServerSetupConfig
@@ -96,8 +96,8 @@ SLURM_SCRIPT_TEMPLATE: SlurmScriptTemplate = {
         f"{CONTAINER_MODULE_NAME} exec {IMAGE_PATH} ray stop",
     ],
     "imports": "source {src_dir}/find_port.sh",
-    "env_vars": [
-        f"export {CONTAINER_MODULE_NAME}_BINDPATH=${CONTAINER_MODULE_NAME}_BINDPATH,$(echo /dev/infiniband* | sed -e 's/ /,/g')"
+    "container_env_vars": [
+        f"export {CONTAINER_MODULE_NAME.upper()}_BINDPATH=${CONTAINER_MODULE_NAME.upper()}_BINDPATH,/dev,/tmp"
     ],
     "container_command": f"{CONTAINER_MODULE_NAME} exec --nv {{env_str}} --bind {{model_weights_path}}{{additional_binds}} --containall {IMAGE_PATH} \\",
     "activate_venv": "source {venv}/bin/activate",
@@ -112,6 +112,23 @@ SLURM_SCRIPT_TEMPLATE: SlurmScriptTemplate = {
         "nodes_array=($nodes)",
         "head_node=${{nodes_array[0]}}",
         'head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)',
+        "\n# Check for RDMA devices and set environment variable accordingly",
+        "if ! command -v ibv_devices >/dev/null 2>&1; then",
+        '    echo "ibv_devices not found; forcing TCP. (No RDMA userland on host?)"',
+        "    export NCCL_IB_DISABLE=1",
+        '    export NCCL_ENV_ARG="--env NCCL_IB_DISABLE=1"',
+        "else",
+        "    # Pick GID index based on link layer (IB vs RoCE)",
+        '    if ibv_devinfo 2>/dev/null | grep -q "link_layer:.*Ethernet"; then',
+        "        # RoCEv2 typically needs a nonzero GID index; 3 is common, try 2 if your fabric uses it",
+        "        export NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-3}}",
+        '        export NCCL_ENV_ARG="--env NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-3}}"',
+        "    else",
+        "        # Native InfiniBand => GID 0",
+        "        export NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-0}}",
+        '        export NCCL_ENV_ARG="--env NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-0}}"',
+        "    fi",
+        "fi",
         "\n# Start Ray head node",
         "head_node_port=$(find_available_port $head_node_ip 8080 65535)",
        "ray_head=$head_node_ip:$head_node_port",
vec_inf/client/_utils.py CHANGED
@@ -108,15 +108,64 @@ def is_server_running(
     if isinstance(log_content, str):
         return log_content
 
-    status: Union[str, tuple[ModelStatus, str]] = ModelStatus.LAUNCHING
+    # Patterns that indicate fatal errors (not just warnings)
+    fatal_error_patterns = [
+        "traceback",
+        "exception",
+        "fatal error",
+        "critical error",
+        "failed to",
+        "could not",
+        "unable to",
+        "error:",
+    ]
+
+    # Patterns to ignore (non-fatal warnings/info messages)
+    ignore_patterns = [
+        "deprecated",
+        "futurewarning",
+        "userwarning",
+        "deprecationwarning",
+        "slurmstepd: error:",  # SLURM cancellation messages (often after server started)
+    ]
+
+    ready_signature_found = False
+    fatal_error_line = None
 
     for line in log_content:
-        if "error" in line.lower():
-            status = (ModelStatus.FAILED, line.strip("\n"))
+        line_lower = line.lower()
+
+        # Check for ready signature first - if found, server is running
         if MODEL_READY_SIGNATURE in line:
-            status = "RUNNING"
+            ready_signature_found = True
+            # Continue checking to see if there are errors after startup
+
+        # Check for fatal errors (only if we haven't seen ready signature yet)
+        if not ready_signature_found:
+            # Skip lines that match ignore patterns
+            if any(ignore_pattern in line_lower for ignore_pattern in ignore_patterns):
+                continue
 
-    return status
+            # Check for fatal error patterns
+            for pattern in fatal_error_patterns:
+                if pattern in line_lower:
+                    # Additional check: skip if it's part of a warning message
+                    # (warnings often contain "error:" but aren't fatal)
+                    if "warning" in line_lower and "error:" in line_lower:
+                        continue
+                    fatal_error_line = line.strip("\n")
+                    break
+
+    # If we found a fatal error, mark as failed
+    if fatal_error_line:
+        return (ModelStatus.FAILED, fatal_error_line)
+
+    # If ready signature was found and no fatal errors, server is running
+    if ready_signature_found:
+        return "RUNNING"
+
+    # Otherwise, still launching
+    return ModelStatus.LAUNCHING
 
 
 def get_base_url(slurm_job_name: str, slurm_job_id: str, log_dir: str) -> str:
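
The rewritten `is_server_running` scan, condensed into a standalone function with hypothetical log lines so the three outcomes are visible (`READY` stands in for `MODEL_READY_SIGNATURE`, and plain strings replace the `ModelStatus` enum):

# Condensed restatement of the scan above; the sample log lines are invented.
FATAL = ["traceback", "exception", "fatal error", "critical error",
         "failed to", "could not", "unable to", "error:"]
IGNORE = ["deprecated", "futurewarning", "userwarning",
          "deprecationwarning", "slurmstepd: error:"]

def classify(log_lines: list[str], ready_sig: str = "READY") -> str:
    ready, fatal = False, None
    for line in log_lines:
        low = line.lower()
        if ready_sig in line:
            ready = True
        if not ready and not any(p in low for p in IGNORE):
            for pat in FATAL:
                if pat in low and not ("warning" in low and "error:" in low):
                    fatal = line.strip()
                    break
    if fatal:
        return f"FAILED: {fatal}"
    return "RUNNING" if ready else "LAUNCHING"

print(classify(["DeprecationWarning: old API", "server READY"]))  # RUNNING
print(classify(["RuntimeError: CUDA out of memory"]))             # FAILED: ...
print(classify(["loading model weights..."]))                     # LAUNCHING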
vec_inf/client/api.py CHANGED
@@ -81,7 +81,7 @@ class VecInfClient:
 
     def __init__(self) -> None:
         """Initialize the Vector Inference client."""
-        pass
+        self._metrics_collectors: dict[str, PerformanceMetricsCollector] = {}
 
     def list_models(self) -> list[ModelInfo]:
         """List all available models.
@@ -218,7 +218,13 @@ class VecInfClient:
         - Performance metrics or error message
         - Timestamp of collection
         """
-        performance_metrics_collector = PerformanceMetricsCollector(slurm_job_id)
+        # Use cached collector to preserve state between calls to compute throughput
+        if slurm_job_id not in self._metrics_collectors:
+            self._metrics_collectors[slurm_job_id] = PerformanceMetricsCollector(
+                slurm_job_id
+            )
+
+        performance_metrics_collector = self._metrics_collectors[slurm_job_id]
 
         metrics: Union[dict[str, float], str]
         if not performance_metrics_collector.metrics_url.startswith("http"):
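
The api.py change replaces a throwaway per-call `PerformanceMetricsCollector` with a per-job cache, so consecutive `get_metrics` calls can compare counters over time and derive throughput. The memoization pattern in isolation, with a stub standing in for the real collector:

# Stub stands in for PerformanceMetricsCollector; job id is illustrative.
class StubCollector:
    def __init__(self, job_id: str) -> None:
        self.job_id = job_id
        self.prev_counters: dict[str, float] = {}  # state that must persist

_collectors: dict[str, StubCollector] = {}

def get_collector(job_id: str) -> StubCollector:
    # Reuse one instance per job id so counter state survives between calls.
    if job_id not in _collectors:
        _collectors[job_id] = StubCollector(job_id)
    return _collectors[job_id]

assert get_collector("14500000") is get_collector("14500000")  # same instance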
vec_inf/client/models.py CHANGED
@@ -194,6 +194,10 @@ class LaunchOptions:
         Number of nodes to allocate
     gpus_per_node : int, optional
         Number of GPUs per node
+    cpus_per_task : int, optional
+        Number of CPUs per task
+    mem_per_node : str, optional
+        Memory per node
     account : str, optional
         Account name for job scheduling
     work_dir : str, optional
@@ -232,6 +236,8 @@ class LaunchOptions:
     resource_type: Optional[str] = None
     num_nodes: Optional[int] = None
     gpus_per_node: Optional[int] = None
+    cpus_per_task: Optional[int] = None
+    mem_per_node: Optional[str] = None
     account: Optional[str] = None
     work_dir: Optional[str] = None
     qos: Optional[str] = None
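
With the two new fields, a `LaunchOptions` that pins CPU and memory alongside GPUs could be constructed as below (field names from the dataclass above; the values are illustrative):

from vec_inf.client.models import LaunchOptions

# Illustrative values; cpus_per_task and mem_per_node are the fields
# added in this release.
options = LaunchOptions(
    num_nodes=1,
    gpus_per_node=2,
    cpus_per_task=8,
    mem_per_node="64G",
)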
vec_inf/config/models.yaml CHANGED
@@ -12,7 +12,6 @@ models:
       --pipeline-parallel-size: 2
       --tensor-parallel-size: 4
       --max-model-len: 65536
-      --max-num-seqs: 256
   c4ai-command-r-08-2024:
     model_family: c4ai-command-r
     model_variant: 08-2024
@@ -25,7 +24,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 2
       --max-model-len: 32768
-      --max-num-seqs: 256
   CodeLlama-7b-hf:
     model_family: CodeLlama
     model_variant: 7b-hf
@@ -37,7 +35,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 16384
-      --max-num-seqs: 256
   CodeLlama-7b-Instruct-hf:
     model_family: CodeLlama
     model_variant: 7b-Instruct-hf
@@ -49,7 +46,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 16384
-      --max-num-seqs: 256
   CodeLlama-13b-hf:
     model_family: CodeLlama
     model_variant: 13b-hf
@@ -61,7 +57,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 16384
-      --max-num-seqs: 256
   CodeLlama-13b-Instruct-hf:
     model_family: CodeLlama
     model_variant: 13b-Instruct-hf
@@ -73,7 +68,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 16384
-      --max-num-seqs: 256
   CodeLlama-34b-hf:
     model_family: CodeLlama
     model_variant: 34b-hf
@@ -86,7 +80,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 2
       --max-model-len: 16384
-      --max-num-seqs: 256
   CodeLlama-34b-Instruct-hf:
     model_family: CodeLlama
     model_variant: 34b-Instruct-hf
@@ -99,7 +92,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 2
       --max-model-len: 16384
-      --max-num-seqs: 256
   CodeLlama-70b-hf:
     model_family: CodeLlama
     model_variant: 70b-hf
@@ -112,7 +104,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 4
       --max-model-len: 4096
-      --max-num-seqs: 256
   CodeLlama-70b-Instruct-hf:
     model_family: CodeLlama
     model_variant: 70b-Instruct-hf
@@ -125,7 +116,17 @@ models:
     vllm_args:
       --tensor-parallel-size: 4
       --max-model-len: 4096
-      --max-num-seqs: 256
+  gemma-2-2b-it:
+    model_family: gemma-2
+    model_variant: 2b-it
+    model_type: LLM
+    gpus_per_node: 1
+    num_nodes: 1
+    vocab_size: 256000
+    time: 08:00:00
+    resource_type: l40s
+    vllm_args:
+      --max-model-len: 4096
   gemma-2-9b:
     model_family: gemma-2
     model_variant: 9b
@@ -137,7 +138,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 4096
-      --max-num-seqs: 256
   gemma-2-9b-it:
     model_family: gemma-2
     model_variant: 9b-it
@@ -149,7 +149,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 4096
-      --max-num-seqs: 256
   gemma-2-27b:
     model_family: gemma-2
     model_variant: 27b
@@ -162,7 +161,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 2
       --max-model-len: 4096
-      --max-num-seqs: 256
   gemma-2-27b-it:
     model_family: gemma-2
     model_variant: 27b-it
@@ -175,7 +173,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 2
       --max-model-len: 4096
-      --max-num-seqs: 256
   Llama-2-7b-hf:
     model_family: Llama-2
     model_variant: 7b-hf
@@ -187,7 +184,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 4096
-      --max-num-seqs: 256
   Llama-2-7b-chat-hf:
     model_family: Llama-2
     model_variant: 7b-chat-hf
@@ -199,7 +195,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 4096
-      --max-num-seqs: 256
   Llama-2-13b-hf:
     model_family: Llama-2
     model_variant: 13b-hf
@@ -211,7 +206,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 4096
-      --max-num-seqs: 256
   Llama-2-13b-chat-hf:
     model_family: Llama-2
     model_variant: 13b-chat-hf
@@ -223,7 +217,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 4096
-      --max-num-seqs: 256
   Llama-2-70b-hf:
     model_family: Llama-2
     model_variant: 70b-hf
@@ -236,7 +229,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 4
       --max-model-len: 4096
-      --max-num-seqs: 256
   Llama-2-70b-chat-hf:
     model_family: Llama-2
     model_variant: 70b-chat-hf
@@ -249,7 +241,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 4
       --max-model-len: 4096
-      --max-num-seqs: 256
   llava-1.5-7b-hf:
     model_family: llava-1.5
     model_variant: 7b-hf
@@ -261,7 +252,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 4096
-      --max-num-seqs: 256
   llava-1.5-13b-hf:
     model_family: llava-1.5
     model_variant: 13b-hf
@@ -273,7 +263,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 4096
-      --max-num-seqs: 256
   llava-v1.6-mistral-7b-hf:
     model_family: llava-v1.6
     model_variant: mistral-7b-hf
@@ -285,7 +274,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 32768
-      --max-num-seqs: 256
   llava-v1.6-34b-hf:
     model_family: llava-v1.6
     model_variant: 34b-hf
@@ -298,7 +286,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 2
       --max-model-len: 4096
-      --max-num-seqs: 256
   Meta-Llama-3-8B:
     model_family: Meta-Llama-3
     model_variant: 8B
@@ -310,7 +297,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 8192
-      --max-num-seqs: 256
   Meta-Llama-3-8B-Instruct:
     model_family: Meta-Llama-3
     model_variant: 8B-Instruct
@@ -322,7 +308,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 8192
-      --max-num-seqs: 256
   Meta-Llama-3-70B:
     model_family: Meta-Llama-3
     model_variant: 70B
@@ -335,7 +320,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 4
       --max-model-len: 8192
-      --max-num-seqs: 256
   Meta-Llama-3-70B-Instruct:
     model_family: Meta-Llama-3
     model_variant: 70B-Instruct
@@ -348,7 +332,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 4
       --max-model-len: 8192
-      --max-num-seqs: 256
   Meta-Llama-3.1-8B:
     model_family: Meta-Llama-3.1
     model_variant: 8B
@@ -360,7 +343,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 131072
-      --max-num-seqs: 256
   Meta-Llama-3.1-8B-Instruct:
     model_family: Meta-Llama-3.1
     model_variant: 8B-Instruct
@@ -372,7 +354,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 131072
-      --max-num-seqs: 256
   Meta-Llama-3.1-70B:
     model_family: Meta-Llama-3.1
     model_variant: 70B
@@ -385,7 +366,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 4
       --max-model-len: 65536
-      --max-num-seqs: 256
   Meta-Llama-3.1-70B-Instruct:
     model_family: Meta-Llama-3.1
     model_variant: 70B-Instruct
@@ -398,7 +378,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 4
       --max-model-len: 65536
-      --max-num-seqs: 256
   Meta-Llama-3.1-405B-Instruct:
     model_family: Meta-Llama-3.1
     model_variant: 405B-Instruct
@@ -406,14 +385,12 @@ models:
     gpus_per_node: 4
     num_nodes: 8
     vocab_size: 128256
-    qos: m4
-    time: 02:00:00
+    time: 08:00:00
     resource_type: l40s
     vllm_args:
       --pipeline-parallel-size: 8
       --tensor-parallel-size: 4
       --max-model-len: 16384
-      --max-num-seqs: 256
   Mistral-7B-Instruct-v0.1:
     model_family: Mistral
     model_variant: 7B-Instruct-v0.1
@@ -425,7 +402,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 32768
-      --max-num-seqs: 256
   Mistral-7B-Instruct-v0.2:
     model_family: Mistral
     model_variant: 7B-Instruct-v0.2
@@ -437,7 +413,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 32768
-      --max-num-seqs: 256
   Mistral-7B-v0.3:
     model_family: Mistral
     model_variant: 7B-v0.3
@@ -449,7 +424,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 32768
-      --max-num-seqs: 256
   Mistral-7B-Instruct-v0.3:
     model_family: Mistral
     model_variant: 7B-Instruct-v0.3
@@ -461,7 +435,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 32768
-      --max-num-seqs: 256
   Mistral-Large-Instruct-2407:
     model_family: Mistral
     model_variant: Large-Instruct-2407
@@ -475,7 +448,6 @@ models:
       --pipeline-parallel-size: 2
       --tensor-parallel-size: 4
       --max-model-len: 32768
-      --max-num-seqs: 256
   Mistral-Large-Instruct-2411:
     model_family: Mistral
     model_variant: Large-Instruct-2411
@@ -489,7 +461,6 @@ models:
       --pipeline-parallel-size: 2
       --tensor-parallel-size: 4
       --max-model-len: 32768
-      --max-num-seqs: 256
   Mixtral-8x7B-Instruct-v0.1:
     model_family: Mixtral
     model_variant: 8x7B-Instruct-v0.1
@@ -502,7 +473,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 4
       --max-model-len: 32768
-      --max-num-seqs: 256
   Mixtral-8x22B-v0.1:
     model_family: Mixtral
     model_variant: 8x22B-v0.1
@@ -516,7 +486,6 @@ models:
       --pipeline-parallel-size: 2
       --tensor-parallel-size: 4
       --max-model-len: 65536
-      --max-num-seqs: 256
   Mixtral-8x22B-Instruct-v0.1:
     model_family: Mixtral
     model_variant: 8x22B-Instruct-v0.1
@@ -530,7 +499,6 @@ models:
       --pipeline-parallel-size: 2
       --tensor-parallel-size: 4
       --max-model-len: 65536
-      --max-num-seqs: 256
   Phi-3-medium-128k-instruct:
     model_family: Phi-3
     model_variant: medium-128k-instruct
@@ -543,7 +511,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 2
       --max-model-len: 131072
-      --max-num-seqs: 256
   Phi-3-vision-128k-instruct:
     model_family: Phi-3-vision
     model_variant: 128k-instruct
@@ -556,20 +523,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 2
       --max-model-len: 65536
-      --max-num-seqs: 256
-  Llama3-OpenBioLLM-70B:
-    model_family: Llama3-OpenBioLLM
-    model_variant: 70B
-    model_type: LLM
-    gpus_per_node: 4
-    num_nodes: 1
-    vocab_size: 128256
-    time: 08:00:00
-    resource_type: l40s
-    vllm_args:
-      --tensor-parallel-size: 4
-      --max-model-len: 8192
-      --max-num-seqs: 256
   Llama-3.1-Nemotron-70B-Instruct-HF:
     model_family: Llama-3.1-Nemotron
     model_variant: 70B-Instruct-HF
@@ -582,7 +535,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 4
       --max-model-len: 65536
-      --max-num-seqs: 256
   Llama-3.2-1B:
     model_family: Llama-3.2
     model_variant: 1B
@@ -594,7 +546,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 131072
-      --max-num-seqs: 256
   Llama-3.2-1B-Instruct:
     model_family: Llama-3.2
     model_variant: 1B-Instruct
@@ -606,7 +557,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 131072
-      --max-num-seqs: 256
   Llama-3.2-3B:
     model_family: Llama-3.2
     model_variant: 3B
@@ -618,7 +568,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 131072
-      --max-num-seqs: 256
   Llama-3.2-3B-Instruct:
     model_family: Llama-3.2
     model_variant: 3B-Instruct
@@ -630,7 +579,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 131072
-      --max-num-seqs: 256
   Llama-3.2-11B-Vision:
     model_family: Llama-3.2
     model_variant: 11B-Vision
@@ -698,7 +646,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 32768
-      --max-num-seqs: 256
   Qwen2.5-1.5B-Instruct:
     model_family: Qwen2.5
     model_variant: 1.5B-Instruct
@@ -710,7 +657,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 32768
-      --max-num-seqs: 256
   Qwen2.5-3B-Instruct:
     model_family: Qwen2.5
     model_variant: 3B-Instruct
@@ -722,7 +668,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 32768
-      --max-num-seqs: 256
   Qwen2.5-7B-Instruct:
     model_family: Qwen2.5
     model_variant: 7B-Instruct
@@ -734,7 +679,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 32768
-      --max-num-seqs: 256
   Qwen2.5-14B-Instruct:
     model_family: Qwen2.5
     model_variant: 14B-Instruct
@@ -746,7 +690,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 32768
-      --max-num-seqs: 256
   Qwen2.5-32B-Instruct:
     model_family: Qwen2.5
     model_variant: 32B-Instruct
@@ -759,7 +702,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 2
       --max-model-len: 32768
-      --max-num-seqs: 256
   Qwen2.5-72B-Instruct:
     model_family: Qwen2.5
     model_variant: 72B-Instruct
@@ -772,7 +714,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 4
       --max-model-len: 16384
-      --max-num-seqs: 256
   Qwen2.5-Math-1.5B-Instruct:
     model_family: Qwen2.5
     model_variant: Math-1.5B-Instruct
@@ -784,7 +725,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 4096
-      --max-num-seqs: 256
   Qwen2.5-Math-7B-Instruct:
     model_family: Qwen2.5
     model_variant: Math-7B-Instruct
@@ -796,7 +736,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 4096
-      --max-num-seqs: 256
   Qwen2.5-Math-72B-Instruct:
     model_family: Qwen2.5
     model_variant: Math-72B-Instruct
@@ -809,7 +748,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 4
       --max-model-len: 4096
-      --max-num-seqs: 256
   Qwen2.5-Coder-7B-Instruct:
     model_family: Qwen2.5
     model_variant: Coder-7B-Instruct
@@ -821,7 +759,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 32768
-      --max-num-seqs: 256
   Qwen2.5-Math-RM-72B:
     model_family: Qwen2.5
     model_variant: Math-RM-72B
@@ -834,7 +771,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 4
       --max-model-len: 4096
-      --max-num-seqs: 256
   Qwen2.5-Math-PRM-7B:
     model_family: Qwen2.5
     model_variant: Math-PRM-7B
@@ -846,7 +782,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 4096
-      --max-num-seqs: 256
   QwQ-32B:
     model_family: QwQ
     model_variant: 32B
@@ -859,7 +794,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 2
       --max-model-len: 32768
-      --max-num-seqs: 256
   Pixtral-12B-2409:
     model_family: Pixtral
     model_variant: 12B-2409
@@ -871,7 +805,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 8192
-      --max-num-seqs: 256
   e5-mistral-7b-instruct:
     model_family: e5
     model_variant: mistral-7b-instruct
@@ -883,7 +816,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 4096
-      --max-num-seqs: 256
   bge-base-en-v1.5:
     model_family: bge
     model_variant: base-en-v1.5
@@ -895,7 +827,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 512
-      --max-num-seqs: 256
   all-MiniLM-L6-v2:
     model_family: all-MiniLM
     model_variant: L6-v2
@@ -907,7 +838,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 512
-      --max-num-seqs: 256
   Llama-3.3-70B-Instruct:
     model_family: Llama-3.3
     model_variant: 70B-Instruct
@@ -920,7 +850,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 4
       --max-model-len: 65536
-      --max-num-seqs: 256
   InternVL2_5-26B:
     model_family: InternVL2_5
     model_variant: 26B
@@ -933,7 +862,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 2
       --max-model-len: 32768
-      --max-num-seqs: 256
   InternVL2_5-38B:
     model_family: InternVL2_5
     model_variant: 38B
@@ -946,7 +874,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 4
       --max-model-len: 32768
-      --max-num-seqs: 256
   Aya-Expanse-32B:
     model_family: Aya-Expanse
     model_variant: 32B
@@ -959,7 +886,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 2
       --max-model-len: 8192
-      --max-num-seqs: 256
   DeepSeek-R1-Distill-Llama-70B:
     model_family: DeepSeek-R1
     model_variant: Distill-Llama-70B
@@ -972,7 +898,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 4
       --max-model-len: 65536
-      --max-num-seqs: 256
   DeepSeek-R1-Distill-Llama-8B:
     model_family: DeepSeek-R1
     model_variant: Distill-Llama-8B
@@ -984,7 +909,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 131072
-      --max-num-seqs: 256
   DeepSeek-R1-Distill-Qwen-32B:
     model_family: DeepSeek-R1
     model_variant: Distill-Qwen-32B
@@ -997,7 +921,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 2
       --max-model-len: 65536
-      --max-num-seqs: 256
   DeepSeek-R1-Distill-Qwen-14B:
     model_family: DeepSeek-R1
     model_variant: Distill-Qwen-14B
@@ -1009,7 +932,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 65536
-      --max-num-seqs: 256
   DeepSeek-R1-Distill-Qwen-7B:
     model_family: DeepSeek-R1
     model_variant: Distill-Qwen-7B
@@ -1021,7 +943,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 131072
-      --max-num-seqs: 256
   DeepSeek-R1-Distill-Qwen-1.5B:
     model_family: DeepSeek-R1
     model_variant: Distill-Qwen-1.5B
@@ -1033,7 +954,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 131072
-      --max-num-seqs: 256
   Phi-3.5-vision-instruct:
     model_family: Phi-3.5-vision
     model_variant: instruct
@@ -1046,7 +966,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 2
       --max-model-len: 65536
-      --max-num-seqs: 256
   InternVL2_5-8B:
     model_family: InternVL2_5
     model_variant: 8B
@@ -1058,7 +977,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 32768
-      --max-num-seqs: 256
   glm-4v-9b:
     model_family: glm-4v
     model_variant: 9b
@@ -1070,7 +988,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 8192
-      --max-num-seqs: 256
   Molmo-7B-D-0924:
     model_family: Molmo
     model_variant: 7B-D-0924
@@ -1082,7 +999,6 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 4096
-      --max-num-seqs: 256
   deepseek-vl2:
     model_family: deepseek-vl2
     model_type: VLM
@@ -1094,7 +1010,6 @@ models:
     vllm_args:
       --tensor-parallel-size: 2
       --max-model-len: 4096
-      --max-num-seqs: 256
   deepseek-vl2-small:
     model_family: deepseek-vl2
     model_variant: small
@@ -1106,7 +1021,17 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 4096
-      --max-num-seqs: 256
+  Qwen3-8B:
+    model_family: Qwen3
+    model_variant: 8B
+    model_type: LLM
+    gpus_per_node: 1
+    num_nodes: 1
+    vocab_size: 151936
+    time: 08:00:00
+    resource_type: l40s
+    vllm_args:
+      --max-model-len: 40960
   Qwen3-14B:
     model_family: Qwen3
     model_variant: 14B
@@ -1118,4 +1043,28 @@ models:
     resource_type: l40s
     vllm_args:
       --max-model-len: 40960
-      --max-num-seqs: 256
+  Qwen3-32B:
+    model_family: Qwen3
+    model_variant: 32B
+    model_type: LLM
+    gpus_per_node: 2
+    num_nodes: 1
+    vocab_size: 151936
+    time: 08:00:00
+    resource_type: l40s
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 40960
+  gpt-oss-120b:
+    model_family: gpt-oss
+    model_variant: 120b
+    model_type: LLM
+    gpus_per_node: 4
+    num_nodes: 2
+    vocab_size: 201088
+    time: 08:00:00
+    resource_type: l40s
+    vllm_args:
+      --tensor-parallel-size: 4
+      --pipeline-parallel-size: 2
+      --max-model-len: 40960
vec_inf-0.7.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: vec-inf
-Version: 0.7.1
+Version: 0.7.2
 Summary: Efficient LLM inference on Slurm clusters using vLLM.
 Author-email: Marshall Wang <marshall.wang@vectorinstitute.ai>
 License-Expression: MIT
@@ -13,9 +13,10 @@ Requires-Dist: requests>=2.31.0
 Requires-Dist: rich>=13.7.0
 Provides-Extra: dev
 Requires-Dist: cupy-cuda12x==12.1.0; extra == 'dev'
-Requires-Dist: ray>=2.40.0; extra == 'dev'
+Requires-Dist: flashinfer-python>=0.4.0; extra == 'dev'
+Requires-Dist: ray[default]>=2.50.0; extra == 'dev'
+Requires-Dist: sglang>=0.5.0; extra == 'dev'
 Requires-Dist: torch>=2.7.0; extra == 'dev'
-Requires-Dist: vllm-nccl-cu12<2.19,>=2.18; extra == 'dev'
 Requires-Dist: vllm>=0.10.0; extra == 'dev'
 Requires-Dist: xgrammar>=0.1.11; extra == 'dev'
 Description-Content-Type: text/markdown
vec_inf-0.7.2.dist-info/RECORD CHANGED
@@ -2,26 +2,26 @@ vec_inf/README.md,sha256=WyvjbSs5Eh5fp8u66bgOaO3FQKP2U7m_HbLgqTHs_ng,1322
 vec_inf/__init__.py,sha256=bHwSIz9lebYuxIemni-lP0h3gwJHVbJnwExQKGJWw_Q,23
 vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
 vec_inf/cli/__init__.py,sha256=5XIvGQCOnaGl73XMkwetjC-Ul3xuXGrWDXdYJ3aUzvU,27
-vec_inf/cli/_cli.py,sha256=xrYce8iP2Wo5dNflvUO2gIfkyjA4V_V8mpiaxnMDwkk,15813
-vec_inf/cli/_helper.py,sha256=Jr9NnMhGflkx3YEfYCN1rMHQgUzMAAwlSx_BLH92tVM,16511
+vec_inf/cli/_cli.py,sha256=0YfxtPT_Nq5gvIol9eWmw5yW9AT1ghf_E49R9pD7UG4,16213
+vec_inf/cli/_helper.py,sha256=0_onclvxxpDTp33ODYc19RbZ2aIhXuMTC9v19q8ZhIo,17473
 vec_inf/cli/_utils.py,sha256=23vSbmvNOWY1-W1aOAwYqNDkDDmx-5UVlCiXAtxUZ8A,1057
 vec_inf/cli/_vars.py,sha256=V6DrJs_BuUa4yNcbBSSnMwpcyXwEBsizy3D0ubIg2fA,777
 vec_inf/client/__init__.py,sha256=OLlUJ4kL1R-Kh-nXNbvKlAZ3mtHcnozHprVufkVCNWk,739
 vec_inf/client/_client_vars.py,sha256=1D-bX9dS0-pFImLvgWt2hUnwJiz-VaxuLb2HIfPML8I,2408
 vec_inf/client/_exceptions.py,sha256=94Nx_5k1SriJNXzbdnwyXFZolyMutydU08Gsikawzzo,749
-vec_inf/client/_helper.py,sha256=P8A9JHRMzxJRl0dgTuv9xfOluEV3BthUM1KzQlWkR7E,35752
-vec_inf/client/_slurm_script_generator.py,sha256=d2NowdKMQR1lsVI_hw9ObKC3uSk8YJr75ZYRMkvp0RA,13354
-vec_inf/client/_slurm_templates.py,sha256=TAH-wQV4gP2CCwxP3BmShebohtSmlMstlJT9QK6n4Dc,8277
+vec_inf/client/_helper.py,sha256=hb6m5TLwcGE0grCu5-UCUkWbByV-G5h8gA87Yzct6rk,37170
+vec_inf/client/_slurm_script_generator.py,sha256=L6tqn71kNJ2I0xYipFh_ZxIAG8znpXhTpUxTU8LJIa4,13988
+vec_inf/client/_slurm_templates.py,sha256=GxVNClkgggoJN2pT1AjK7CQCAErfKRMIs97Vlhxs9u8,9349
 vec_inf/client/_slurm_vars.py,sha256=sgP__XhpE1K7pvOzVFmotUXmINYPcOuFP-zGaePT5Iw,2910
-vec_inf/client/_utils.py,sha256=XamAz8-AJELgkXHrR082ptTsbHSiWI47SY6MlXA44rU,12593
-vec_inf/client/api.py,sha256=pkgNE37r7LzYBDjRGAKAh7rhOUMKHGwghJh6Hfb45TI,11681
+vec_inf/client/_utils.py,sha256=_ZBmic0XvJ4vpdIuXDi6KO5iL2rbhIpFQT01EWGItN4,14296
+vec_inf/client/api.py,sha256=lkVWCme-HmMJMqp8JbtjkBVL_MSPsCC_IBL9FBw3Um8,12011
 vec_inf/client/config.py,sha256=VU4h2iqL0rxYAqGw2HBF_l6QvvSDJy5M79IgX5G2PW4,5830
-vec_inf/client/models.py,sha256=qxLxsVoEhxNkuCmtABqs8In5erkwTZDK0wih7U2_U38,7296
+vec_inf/client/models.py,sha256=jGNPOj1uPPBV7xdGy3HFv2ZwpJOGCsU8qm7pE2Rnnes,7498
 vec_inf/config/README.md,sha256=TvZOqZyTUaAFr71hC7GVgg6QUw80AXREyq8wS4D-F30,528
 vec_inf/config/environment.yaml,sha256=oEDp85hUERJO9NNn4wYhcgunnmkln50GNHDzG_3isMw,678
-vec_inf/config/models.yaml,sha256=vzAOqEu6M_lXput83MAhNzj-aNGSBzjbC6LydOmNqxk,26248
-vec_inf-0.7.1.dist-info/METADATA,sha256=CJEnzc3VLXxJ_00I1ubtwNNZQjvafddxlJyoi_bSwpo,10047
-vec_inf-0.7.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-vec_inf-0.7.1.dist-info/entry_points.txt,sha256=uNRXjCuJSR2nveEqD3IeMznI9oVI9YLZh5a24cZg6B0,49
-vec_inf-0.7.1.dist-info/licenses/LICENSE,sha256=mq8zeqpvVSF1EsxmydeXcokt8XnEIfSofYn66S2-cJI,1073
-vec_inf-0.7.1.dist-info/RECORD,,
+vec_inf/config/models.yaml,sha256=PSDR29zI8xld32Vm6dhgCIRHPEkBhwQx7-d_uFlEAM8,24764
+vec_inf-0.7.2.dist-info/METADATA,sha256=ljs9hao8q4igLERrjGL5u1vZ_n7DMrr8XnBHzybPE2Y,10099
+vec_inf-0.7.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+vec_inf-0.7.2.dist-info/entry_points.txt,sha256=uNRXjCuJSR2nveEqD3IeMznI9oVI9YLZh5a24cZg6B0,49
+vec_inf-0.7.2.dist-info/licenses/LICENSE,sha256=mq8zeqpvVSF1EsxmydeXcokt8XnEIfSofYn66S2-cJI,1073
+vec_inf-0.7.2.dist-info/RECORD,,