vec-inf 0.7.2__py3-none-any.whl → 0.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vec_inf/README.md CHANGED
@@ -2,7 +2,7 @@
 
 * `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server.
 * `batch-launch`: Specify a list of models to launch multiple OpenAI compatible inference servers at the same time.
-* `status`: Check the model status by providing its Slurm job ID.
+* `status`: Check the status of all `vec-inf` jobs, or a specific job by providing its job ID.
 * `metrics`: Streams performance metrics to the console.
 * `shutdown`: Shutdown a model by providing its Slurm job ID.
 * `list`: List all available model names, or view the default/cached configuration of a specific model.
@@ -14,6 +14,7 @@ Use `--help` to see all available options
 
 * `launch_model`: Launch an OpenAI compatible inference server.
 * `batch_launch_models`: Launch multiple OpenAI compatible inference servers.
+* `fetch_running_jobs`: Get the running `vec-inf` job IDs.
 * `get_status`: Get the status of a running model.
 * `get_metrics`: Get the performance metrics of a running model.
 * `shutdown_model`: Shutdown a running model.
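The new `fetch_running_jobs` API composes directly with `get_status`. A minimal sketch of the programmatic equivalent of the updated `status` command, assuming `VecInfClient` is importable from `vec_inf.client` like the other client classes in this diff:

```python
from vec_inf.client import VecInfClient

# Poll every running vec-inf job, mirroring what the updated `status`
# command does when no Slurm job ID is provided.
client = VecInfClient()
for job_id in client.fetch_running_jobs():
    status = client.get_status(job_id)
    print(job_id, status.model_name, status.server_status, status.base_url)
```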
vec_inf/cli/_cli.py CHANGED
@@ -30,6 +30,7 @@ from vec_inf.cli._helper import (
     BatchLaunchResponseFormatter,
     LaunchResponseFormatter,
     ListCmdDisplay,
+    ListStatusDisplay,
     MetricsResponseFormatter,
     StatusResponseFormatter,
 )
@@ -131,10 +132,20 @@ def cli() -> None:
     type=str,
     help="Path to parent directory containing model weights",
 )
+@click.option(
+    "--engine",
+    type=str,
+    help="Inference engine to use, supports 'vllm' and 'sglang'",
+)
 @click.option(
     "--vllm-args",
     type=str,
-    help="vLLM engine arguments to be set, use the format as specified in vLLM documentation and separate arguments with commas, e.g. --vllm-args '--max-model-len=8192,--max-num-seqs=256,--enable-prefix-caching'",
+    help="vLLM engine arguments to be set, use the format as specified in vLLM serve documentation and separate arguments with commas, e.g. --vllm-args '--max-model-len=8192,--max-num-seqs=256,--enable-prefix-caching'",
+)
+@click.option(
+    "--sglang-args",
+    type=str,
+    help="SGLang engine arguments to be set, use the format as specified in SGLang Server Arguments documentation and separate arguments with commas, e.g. --sglang-args '--context-length=8192,--mem-fraction-static=0.85'",
 )
 @click.option(
     "--json-mode",
@@ -149,7 +160,7 @@ def cli() -> None:
 @click.option(
     "--config",
     type=str,
-    help="Path to a model config yaml file to use in place of the default",
+    help="Path to a model config yaml file to use in place of the default; you can also set VEC_INF_MODEL_CONFIG to the path of the model config file",
 )
 def launch(
     model_name: str,
@@ -200,7 +211,9 @@ def launch(
     - model_weights_parent_dir : str, optional
         Path to model weights directory
     - vllm_args : str, optional
-        vLLM engine arguments
+        vllm engine arguments
+    - sglang_args : str, optional
+        sglang engine arguments
     - env : str, optional
         Environment variables
     - config : str, optional
@@ -228,6 +241,10 @@ def launch(
         if json_mode:
             click.echo(json.dumps(launch_response.config))
         else:
+            if launch_response.config.get("engine_inferred"):
+                CONSOLE.print(
+                    "Warning: Inference engine inferred from engine-specific args"
+                )
             launch_formatter = LaunchResponseFormatter(
                 model_name, launch_response.config
             )
@@ -313,14 +330,14 @@ def batch_launch(
         raise click.ClickException(f"Batch launch failed: {str(e)}") from e
 
 
-@cli.command("status", help="Check the status of a running model on the cluster.")
-@click.argument("slurm_job_id", type=str, nargs=1)
+@cli.command("status", help="Check the status of running vec-inf jobs on the cluster.")
+@click.argument("slurm_job_id", required=False)
 @click.option(
     "--json-mode",
     is_flag=True,
     help="Output in JSON string",
 )
-def status(slurm_job_id: str, json_mode: bool = False) -> None:
+def status(slurm_job_id: Optional[str] = None, json_mode: bool = False) -> None:
     """Get the status of a running model on the cluster.
 
     Parameters
@@ -338,14 +355,28 @@ def status(slurm_job_id: str, json_mode: bool = False) -> None:
     try:
         # Start the client and get model inference server status
        client = VecInfClient()
-        status_response = client.get_status(slurm_job_id)
+        if not slurm_job_id:
+            slurm_job_ids = client.fetch_running_jobs()
+            if not slurm_job_ids:
+                click.echo("No running jobs found.")
+                return
+        else:
+            slurm_job_ids = [slurm_job_id]
+        responses = []
+        for job_id in slurm_job_ids:
+            responses.append(client.get_status(job_id))
+
         # Display status information
-        status_formatter = StatusResponseFormatter(status_response)
-        if json_mode:
-            status_formatter.output_json()
+        if slurm_job_id:
+            status_formatter = StatusResponseFormatter(responses[0])
+            if json_mode:
+                status_formatter.output_json()
+            else:
+                status_info_table = status_formatter.output_table()
+                CONSOLE.print(status_info_table)
         else:
-            status_info_table = status_formatter.output_table()
-            CONSOLE.print(status_info_table)
+            list_status_display = ListStatusDisplay(slurm_job_ids, responses, json_mode)
+            list_status_display.display_multiple_status_output(CONSOLE)
 
     except click.ClickException as e:
         raise e
vec_inf/cli/_helper.py CHANGED
@@ -15,7 +15,7 @@ from rich.panel import Panel
 from rich.table import Table
 
 from vec_inf.cli._utils import create_table
-from vec_inf.cli._vars import MODEL_TYPE_COLORS, MODEL_TYPE_PRIORITY
+from vec_inf.cli._vars import ENGINE_NAME_MAP, MODEL_TYPE_COLORS, MODEL_TYPE_PRIORITY
 from vec_inf.client import ModelConfig, ModelInfo, StatusResponse
 
 
@@ -49,11 +49,12 @@ class LaunchResponseFormatter:
             if self.params.get(key):
                 table.add_row(label, self.params[key])
 
-    def _add_vllm_config(self, table: Table) -> None:
-        """Add vLLM configuration details to the table."""
-        if self.params.get("vllm_args"):
-            table.add_row("vLLM Arguments:", style="magenta")
-            for arg, value in self.params["vllm_args"].items():
+    def _add_engine_config(self, table: Table) -> None:
+        """Add inference engine configuration details to the table."""
+        if self.params.get("engine_args"):
+            engine_name = ENGINE_NAME_MAP[self.params["engine"]]
+            table.add_row(f"{engine_name} Arguments:", style="magenta")
+            for arg, value in self.params["engine_args"].items():
                 table.add_row(f" {arg}:", str(value))
 
     def _add_env_vars(self, table: Table) -> None:
@@ -111,9 +112,10 @@ class LaunchResponseFormatter:
             str(Path(self.params["model_weights_parent_dir"], self.model_name)),
         )
         table.add_row("Log Directory", self.params["log_dir"])
+        table.add_row("Inference Engine", ENGINE_NAME_MAP[self.params["engine"]])
 
         # Add configuration details
-        self._add_vllm_config(table)
+        self._add_engine_config(table)
         self._add_env_vars(table)
         self._add_bind_paths(table)
 
@@ -185,6 +187,10 @@ class BatchLaunchResponseFormatter:
         table.add_row(
             "Memory/Node", f" {self.params['models'][model_name]['mem_per_node']}"
         )
+        table.add_row(
+            "Inference Engine",
+            f" {ENGINE_NAME_MAP[self.params['models'][model_name]['engine']]}",
+        )
 
         return table
 
@@ -251,6 +257,66 @@
         return table
 
 
+class ListStatusDisplay:
+    """CLI Helper class for formatting a list of StatusResponse.
+
+    A formatter class that handles the presentation of multiple job statuses
+    in a table format.
+
+    Parameters
+    ----------
+    job_ids : list[str]
+        List of Slurm job IDs for the running models
+    statuses : list[StatusResponse]
+        List of model status information
+    json_mode : bool, optional
+        Whether to output the statuses as a JSON string instead of a table
+    """
+
+    def __init__(
+        self,
+        job_ids: list[str],
+        statuses: list[StatusResponse],
+        json_mode: bool = False,
+    ):
+        self.job_ids = job_ids
+        self.statuses = statuses
+        self.json_mode = json_mode
+
+        self.table = Table(show_header=True, header_style="bold magenta")
+        self.table.add_column("Job ID")
+        self.table.add_column("Model Name")
+        self.table.add_column("Status", style="blue")
+        self.table.add_column("Base URL")
+
+    def display_multiple_status_output(self, console: Console) -> None:
+        """Format and display all model statuses.
+
+        Formats each model's status and adds it to the table.
+        """
+        if self.json_mode:
+            json_data = [
+                {
+                    "job_id": job_id,
+                    "model_name": status.model_name,
+                    "model_status": status.server_status,
+                    "base_url": status.base_url,
+                }
+                for job_id, status in zip(self.job_ids, self.statuses)
+            ]
+            click.echo(json.dumps(json_data, indent=4))
+            return
+
+        for i, status in enumerate(self.statuses):
+            self.table.add_row(
+                self.job_ids[i],
+                status.model_name,
+                status.server_status,
+                status.base_url,
+            )
+        console.print(self.table)
+
+
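Given the JSON branch above, `vec-inf status --json-mode` with no job ID emits a list shaped as follows (the values here are illustrative, not taken from a real cluster):

```python
[
    {
        "job_id": "1234567",
        "model_name": "Meta-Llama-3.1-8B-Instruct",
        "model_status": "READY",
        "base_url": "http://gpu001:8080/v1",
    }
]
```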
 class MetricsResponseFormatter:
     """CLI Helper class for formatting MetricsResponse.
 
@@ -423,14 +485,19 @@ class ListCmdDisplay:
             )
             return json.dumps(config_dict, indent=4)
 
+        excluded_list = ["venv", "log_dir"]
+
         table = create_table(key_title="Model Config", value_title="Value")
         for field, value in config.model_dump().items():
-            if field not in {"venv", "log_dir", "vllm_args"} and value:
+            if "args" in field:
+                if not value:
+                    continue
+                engine_name = ENGINE_NAME_MAP[field.split("_")[0]]
+                table.add_row(f"{engine_name} Arguments:", style="magenta")
+                for engine_arg, engine_value in value.items():
+                    table.add_row(f" {engine_arg}:", str(engine_value))
+            elif field not in excluded_list and value:
                 table.add_row(field, str(value))
-            if field == "vllm_args":
-                table.add_row("vLLM Arguments:", style="magenta")
-                for vllm_arg, vllm_value in value.items():
-                    table.add_row(f" {vllm_arg}:", str(vllm_value))
         return table
 
     def _format_all_models_output(
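The `field.split("_")[0]` lookup above reduces a config field such as `vllm_args` or `sglang_args` to its engine prefix before mapping it to a display name:

```python
ENGINE_NAME_MAP = {"vllm": "vLLM", "sglang": "SGLang"}  # as defined in _vars.py below

for field in ("vllm_args", "sglang_args"):
    engine_name = ENGINE_NAME_MAP[field.split("_")[0]]
    print(f"{engine_name} Arguments:")  # "vLLM Arguments:" / "SGLang Arguments:"
```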
vec_inf/cli/_vars.py CHANGED
@@ -1,32 +1,47 @@
 """Constants for CLI rendering.
 
-This module defines constant mappings for model type priorities and colors
+This module defines mappings for model type priorities, colors, and engine names
 used in the CLI display formatting.
+"""
 
-Constants
----------
-MODEL_TYPE_PRIORITY : dict
-    Mapping of model types to their display priority (lower numbers shown first)
+from typing import get_args
 
-MODEL_TYPE_COLORS : dict
-    Mapping of model types to their display colors in Rich
+from vec_inf.client._slurm_vars import MODEL_TYPES
 
-Notes
------
-These constants are used primarily by the ListCmdDisplay class to ensure
-consistent sorting and color coding of different model types in the CLI output.
-"""
 
-MODEL_TYPE_PRIORITY = {
-    "LLM": 0,
-    "VLM": 1,
-    "Text_Embedding": 2,
-    "Reward_Modeling": 3,
-}
+# Extract model type values from the Literal type
+_MODEL_TYPES = get_args(MODEL_TYPES)
+
+# Rich color options (prioritizing current colors, with fallbacks for additional types)
+_RICH_COLORS = [
+    "cyan",
+    "bright_blue",
+    "purple",
+    "bright_magenta",
+    "green",
+    "yellow",
+    "bright_green",
+    "bright_yellow",
+    "red",
+    "bright_red",
+    "blue",
+    "magenta",
+    "bright_cyan",
+    "white",
+    "bright_white",
+]
 
+# Mapping of model types to their display priority (lower numbers shown first)
+MODEL_TYPE_PRIORITY = {model_type: idx for idx, model_type in enumerate(_MODEL_TYPES)}
+
+# Mapping of model types to their display colors in Rich
 MODEL_TYPE_COLORS = {
-    "LLM": "cyan",
-    "VLM": "bright_blue",
-    "Text_Embedding": "purple",
-    "Reward_Modeling": "bright_magenta",
+    model_type: _RICH_COLORS[idx % len(_RICH_COLORS)]
+    for idx, model_type in enumerate(_MODEL_TYPES)
+}
+
+# Inference engine choice and name mapping
+ENGINE_NAME_MAP = {
+    "vllm": "vLLM",
+    "sglang": "SGLang",
 }
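Both tables are now derived from the `MODEL_TYPES` Literal, so any model type added there picks up a priority and a color automatically. A self-contained illustration with a stand-in Literal (the real one lives in `vec_inf.client._slurm_vars`; its members here match the values hard-coded in the removed 0.7.2 dicts):

```python
from typing import Literal, get_args

# Stand-in for vec_inf.client._slurm_vars.MODEL_TYPES
MODEL_TYPES = Literal["LLM", "VLM", "Text_Embedding", "Reward_Modeling"]

_MODEL_TYPES = get_args(MODEL_TYPES)
priority = {model_type: idx for idx, model_type in enumerate(_MODEL_TYPES)}
print(priority)  # {'LLM': 0, 'VLM': 1, 'Text_Embedding': 2, 'Reward_Modeling': 3}
```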
vec_inf/client/_vars.py CHANGED
@@ -49,7 +49,7 @@ SLURM_JOB_CONFIG_ARGS = {
     "time": "time",
     "nodes": "num_nodes",
     "exclude": "exclude",
-    "nodelist": "node_list",
+    "nodelist": "nodelist",
     "gres": "gres",
     "cpus-per-task": "cpus_per_task",
     "mem": "mem_per_node",
@@ -61,13 +61,43 @@
 VLLM_SHORT_TO_LONG_MAP = {
     "-tp": "--tensor-parallel-size",
     "-pp": "--pipeline-parallel-size",
+    "-n": "--nnodes",
+    "-r": "--node-rank",
+    "-dcp": "--decode-context-parallel-size",
+    "-pcp": "--prefill-context-parallel-size",
     "-dp": "--data-parallel-size",
+    "-dpn": "--data-parallel-rank",
+    "-dpr": "--data-parallel-start-rank",
     "-dpl": "--data-parallel-size-local",
     "-dpa": "--data-parallel-address",
     "-dpp": "--data-parallel-rpc-port",
+    "-dpb": "--data-parallel-backend",
+    "-dph": "--data-parallel-hybrid-lb",
+    "-dpe": "--data-parallel-external-lb",
     "-O": "--compilation-config",
     "-q": "--quantization",
 }
 
+# SGLang engine args mapping between short and long names
+SGLANG_SHORT_TO_LONG_MAP = {
+    "--tp": "--tensor-parallel-size",
+    "--tp-size": "--tensor-parallel-size",
+    "--pp": "--pipeline-parallel-size",
+    "--pp-size": "--pipeline-parallel-size",
+    "--dp": "--data-parallel-size",
+    "--dp-size": "--data-parallel-size",
+ "--ep": "--expert-parallel-size",
90
+ "--ep-size": "--expert-parallel-expert-size",
91
+}
+
+# Mapping of engine short names to their argument mappings
+ENGINE_SHORT_TO_LONG_MAP = {
+    "vllm": VLLM_SHORT_TO_LONG_MAP,
+    "sglang": SGLANG_SHORT_TO_LONG_MAP,
+}
+
 # Required matching arguments for batch mode
 BATCH_MODE_REQUIRED_MATCHING_ARGS = ["venv", "log_dir"]
+
+# Supported engines
+SUPPORTED_ENGINES = ["vllm", "sglang"]
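A minimal sketch of applying these maps to normalize user-supplied flags; a small subset of the table is re-declared so the snippet stands alone, and `normalize_flag` is an illustrative name, not the call site vec-inf actually uses:

```python
# Subset of ENGINE_SHORT_TO_LONG_MAP above, re-declared for a standalone example.
ENGINE_SHORT_TO_LONG_MAP = {
    "vllm": {"-tp": "--tensor-parallel-size", "-q": "--quantization"},
    "sglang": {"--tp": "--tensor-parallel-size", "--ep-size": "--expert-parallel-size"},
}

def normalize_flag(engine: str, flag: str) -> str:
    """Map a short flag to its long form; unknown flags pass through unchanged."""
    return ENGINE_SHORT_TO_LONG_MAP[engine].get(flag, flag)

assert normalize_flag("vllm", "-tp") == "--tensor-parallel-size"
assert normalize_flag("sglang", "--ep-size") == "--expert-parallel-size"
assert normalize_flag("vllm", "--port") == "--port"  # not a short form
```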