vec-inf 0.5.0-py3-none-any.whl → 0.6.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vec_inf/README.md CHANGED
@@ -1,9 +1,9 @@
  # `vec-inf` Commands

- * `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server, `--json-mode` supported. Check [`here`](./models/README.md) for complete list of available options.
- * `list`: List all available model names, or append a supported model name to view the default configuration, `--json-mode` supported.
- * `metrics`: Streams performance metrics to the console.
+ * `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server, `--json-mode` supported.
  * `status`: Check the model status by providing its Slurm job ID, `--json-mode` supported.
+ * `metrics`: Streams performance metrics to the console.
  * `shutdown`: Shutdown a model by providing its Slurm job ID.
+ * `list`: List all available model names, or view the default/cached configuration of a specific model, `--json-mode` supported.

  Use `--help` to see all available options
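Every command above now delegates to the programmatic client introduced in 0.6.x rather than shelling out to Slurm, as the `vec_inf/cli/_cli.py` diff below shows. Here is a minimal sketch of driving the same commands from Python. Only `VecInfClient`, `LaunchOptions`, and the method names are taken from the diff; the model name, the `slurm_job_id` response field, and the single-argument calls (treating `log_dir` as optional) are illustrative assumptions:

```python
# Minimal sketch, not the documented API surface: the model name, the
# slurm_job_id field on the launch response, and single-argument calls
# (log_dir assumed optional) are assumptions for illustration.
from vec_inf.client import LaunchOptions, VecInfClient

client = VecInfClient()

# launch: equivalent of `vec-inf launch <model-name>`
options = LaunchOptions(gpus_per_node=1, time="01:00:00")
response = client.launch_model("Meta-Llama-3.1-8B-Instruct", options)
job_id = response.slurm_job_id  # assumed field; the CLI reads response.config

# status / metrics / shutdown / list: the remaining commands
print(client.get_status(job_id))
print(client.get_metrics(job_id).metrics)  # a str here means "not ready"
client.shutdown_model(job_id)
print(client.list_models())
```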
vec_inf/cli/_cli.py CHANGED
@@ -1,5 +1,24 @@
- """Command line interface for Vector Inference."""
+ """Command line interface for Vector Inference.

+ This module provides the command-line interface for interacting with Vector
+ Inference services, including model launching, status checking, metrics
+ monitoring, and shutdown operations.
+
+ Commands
+ --------
+ launch
+     Launch a model on the cluster
+ status
+     Check the status of a running model
+ shutdown
+     Stop a running model
+ list
+     List available models or get specific model configuration
+ metrics
+     Stream real-time performance metrics
+ """
+
+ import json
  import time
  from typing import Optional, Union

@@ -7,8 +26,13 @@ import click
  from rich.console import Console
  from rich.live import Live

- import vec_inf.cli._utils as utils
- from vec_inf.cli._helper import LaunchHelper, ListHelper, MetricsHelper, StatusHelper
+ from vec_inf.cli._helper import (
+     LaunchResponseFormatter,
+     ListCmdDisplay,
+     MetricsResponseFormatter,
+     StatusResponseFormatter,
+ )
+ from vec_inf.client import LaunchOptions, VecInfClient


  CONSOLE = Console()
@@ -24,36 +48,6 @@ def cli() -> None:
  @click.argument("model-name", type=str, nargs=1)
  @click.option("--model-family", type=str, help="The model family")
  @click.option("--model-variant", type=str, help="The model variant")
- @click.option(
-     "--max-model-len",
-     type=int,
-     help="Model context length. Default value set based on suggested resource allocation.",
- )
- @click.option(
-     "--max-num-seqs",
-     type=int,
-     help="Maximum number of sequences to process in a single request",
- )
- @click.option(
-     "--gpu-memory-utilization",
-     type=float,
-     help="GPU memory utilization, default to 0.9",
- )
- @click.option(
-     "--enable-prefix-caching",
-     is_flag=True,
-     help="Enables automatic prefix caching",
- )
- @click.option(
-     "--enable-chunked-prefill",
-     is_flag=True,
-     help="Enable chunked prefill, enabled by default if max number of sequences > 32k",
- )
- @click.option(
-     "--max-num-batched-tokens",
-     type=int,
-     help="Maximum number of batched tokens per iteration, defaults to 2048 if --enable-chunked-prefill is set, else None",
- )
  @click.option(
      "--partition",
      type=str,
@@ -69,22 +63,36 @@ def cli() -> None:
      type=int,
      help="Number of GPUs/node to use, default to suggested resource allocation for model",
  )
+ @click.option(
+     "--account",
+     type=str,
+     help="Charge resources used by this job to specified account.",
+ )
  @click.option(
      "--qos",
      type=str,
      help="Quality of service",
  )
  @click.option(
-     "--time",
+     "--exclude",
      type=str,
-     help="Time limit for job, this should comply with QoS limits",
+     help="Exclude certain nodes from the resources granted to the job",
  )
  @click.option(
-     "--vocab-size",
-     type=int,
-     help="Vocabulary size, this option is intended for custom models",
+     "--node-list",
+     type=str,
+     help="Request a specific list of nodes for deployment",
+ )
+ @click.option(
+     "--bind",
+     type=str,
+     help="Additional binds for the singularity container as a comma separated list of bind paths",
+ )
+ @click.option(
+     "--time",
+     type=str,
+     help="Time limit for job, this should comply with QoS limits",
  )
- @click.option("--data-type", type=str, help="Model data type")
  @click.option(
      "--venv",
      type=str,
@@ -101,19 +109,9 @@ def cli() -> None:
      help="Path to parent directory containing model weights",
  )
  @click.option(
-     "--pipeline-parallelism",
-     is_flag=True,
-     help="Enable pipeline parallelism, enabled by default for supported models",
- )
- @click.option(
-     "--compilation-config",
-     type=click.Choice(["0", "3"]),
-     help="torch.compile optimization level, accepts '0' or '3', default to '0', which means no optimization is applied",
- )
- @click.option(
-     "--enforce-eager",
-     is_flag=True,
-     help="Always use eager-mode PyTorch",
+     "--vllm-args",
+     type=str,
+     help="vLLM engine arguments to be set, use the format as specified in vLLM documentation and separate arguments with commas, e.g. --vllm-args '--max-model-len=8192,--max-num-seqs=256,--enable-prefix-caching'",
  )
  @click.option(
      "--json-mode",
@@ -122,18 +120,73 @@ def cli() -> None:
  )
  def launch(
      model_name: str,
-     **cli_kwargs: Optional[Union[str, int, bool]],
+     **cli_kwargs: Optional[Union[str, int, float, bool]],
  ) -> None:
-     """Launch a model on the cluster."""
+     """Launch a model on the cluster.
+
+     Parameters
+     ----------
+     model_name : str
+         Name of the model to launch
+     **cli_kwargs : dict
+         Additional launch options including:
+         - model_family : str, optional
+             Family/architecture of the model
+         - model_variant : str, optional
+             Specific variant of the model
+         - partition : str, optional
+             Type of compute partition
+         - num_nodes : int, optional
+             Number of nodes to use
+         - gpus_per_node : int, optional
+             Number of GPUs per node
+         - account : str, optional
+             Charge resources used by this job to specified account
+         - qos : str, optional
+             Quality of service tier
+         - exclude : str, optional
+             Exclude certain nodes from the resources granted to the job
+         - node_list : str, optional
+             Request a specific list of nodes for deployment
+         - bind : str, optional
+             Additional binds for the singularity container
+         - time : str, optional
+             Time limit for job
+         - venv : str, optional
+             Path to virtual environment
+         - log_dir : str, optional
+             Path to SLURM log directory
+         - model_weights_parent_dir : str, optional
+             Path to model weights directory
+         - vllm_args : str, optional
+             vLLM engine arguments
+         - json_mode : bool, optional
+             Output in JSON format
+
+     Raises
+     ------
+     click.ClickException
+         If launch fails for any reason
+     """
      try:
-         launch_helper = LaunchHelper(model_name, cli_kwargs)
+         # Convert cli_kwargs to LaunchOptions
+         json_mode = cli_kwargs["json_mode"]
+         del cli_kwargs["json_mode"]
+
+         launch_options = LaunchOptions(**cli_kwargs)  # type: ignore
+
+         # Start the client and launch model inference server
+         client = VecInfClient()
+         launch_response = client.launch_model(model_name, launch_options)

-         launch_helper.set_env_vars()
-         launch_command = launch_helper.build_launch_command()
-         command_output, stderr = utils.run_bash_command(launch_command)
-         if stderr:
-             raise click.ClickException(f"Error: {stderr}")
-         launch_helper.post_launch_processing(command_output, CONSOLE)
+         # Display launch information
+         launch_formatter = LaunchResponseFormatter(model_name, launch_response.config)
+
+         if json_mode:
+             click.echo(json.dumps(launch_response.config))
+         else:
+             launch_info_table = launch_formatter.format_table_output()
+             CONSOLE.print(launch_info_table)

      except click.ClickException as e:
          raise e
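This rewritten `launch` body is the core of the 0.5 → 0.6 change: the display-only `json_mode` flag is stripped out of the CLI kwargs, the rest are poured into a `LaunchOptions` object, and the formerly dedicated engine options (`--max-model-len`, `--enforce-eager`, ...) now travel as the single comma-separated `--vllm-args` string. A hedged sketch of that handoff; the dict values are made up, and it assumes `LaunchOptions` defaults any fields left unset:

```python
# Illustrative handoff from CLI kwargs to LaunchOptions, mirroring the
# hunk above. Values are invented; assumes unset LaunchOptions fields
# have defaults rather than being required.
from vec_inf.client import LaunchOptions, VecInfClient

cli_kwargs = {
    "model_family": "llama",
    "gpus_per_node": 2,
    # engine flags ride along as one string, per the --vllm-args help text
    "vllm_args": "--max-model-len=8192,--max-num-seqs=256,--enable-prefix-caching",
    "json_mode": False,
}

json_mode = cli_kwargs.pop("json_mode")  # same effect as the del above
launch_options = LaunchOptions(**cli_kwargs)

# "llama-3" is a placeholder model name
launch_response = VecInfClient().launch_model("llama-3", launch_options)
print(launch_response.config)
```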
@@ -156,28 +209,61 @@ def launch(
  def status(
      slurm_job_id: int, log_dir: Optional[str] = None, json_mode: bool = False
  ) -> None:
-     """Get the status of a running model on the cluster."""
-     status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
-     output, stderr = utils.run_bash_command(status_cmd)
-     if stderr:
-         raise click.ClickException(f"Error: {stderr}")
+     """Get the status of a running model on the cluster.

-     status_helper = StatusHelper(slurm_job_id, output, log_dir)
+     Parameters
+     ----------
+     slurm_job_id : int
+         ID of the SLURM job to check
+     log_dir : str, optional
+         Path to SLURM log directory
+     json_mode : bool, default=False
+         Whether to output in JSON format

-     status_helper.process_job_state()
-     if json_mode:
-         status_helper.output_json()
-     else:
-         status_helper.output_table(CONSOLE)
+     Raises
+     ------
+     click.ClickException
+         If status check fails
+     """
+     try:
+         # Start the client and get model inference server status
+         client = VecInfClient()
+         status_response = client.get_status(slurm_job_id, log_dir)
+         # Display status information
+         status_formatter = StatusResponseFormatter(status_response)
+         if json_mode:
+             status_formatter.output_json()
+         else:
+             status_info_table = status_formatter.output_table()
+             CONSOLE.print(status_info_table)
+
+     except click.ClickException as e:
+         raise e
+     except Exception as e:
+         raise click.ClickException(f"Status check failed: {str(e)}") from e


  @cli.command("shutdown")
  @click.argument("slurm_job_id", type=int, nargs=1)
  def shutdown(slurm_job_id: int) -> None:
-     """Shutdown a running model on the cluster."""
-     shutdown_cmd = f"scancel {slurm_job_id}"
-     utils.run_bash_command(shutdown_cmd)
-     click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")
+     """Shutdown a running model on the cluster.
+
+     Parameters
+     ----------
+     slurm_job_id : int
+         ID of the SLURM job to shut down
+
+     Raises
+     ------
+     click.ClickException
+         If shutdown operation fails
+     """
+     try:
+         client = VecInfClient()
+         client.shutdown_model(slurm_job_id)
+         click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")
+     except Exception as e:
+         raise click.ClickException(f"Shutdown failed: {str(e)}") from e


  @cli.command("list")
@@ -188,9 +274,34 @@ def shutdown(slurm_job_id: int) -> None:
      help="Output in JSON string",
  )
  def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None:
-     """List all available models, or get default setup of a specific model."""
-     list_helper = ListHelper(model_name, json_mode)
-     list_helper.process_list_command(CONSOLE)
+     """List all available models, or get default setup of a specific model.
+
+     Parameters
+     ----------
+     model_name : str, optional
+         Name of specific model to get information for
+     json_mode : bool, default=False
+         Whether to output in JSON format
+
+     Raises
+     ------
+     click.ClickException
+         If list operation fails
+     """
+     try:
+         # Start the client
+         client = VecInfClient()
+         list_display = ListCmdDisplay(CONSOLE, json_mode)
+         if model_name:
+             model_config = client.get_model_config(model_name)
+             list_display.display_single_model_output(model_config)
+         else:
+             model_infos = client.list_models()
+             list_display.display_all_models_output(model_infos)
+     except click.ClickException as e:
+         raise e
+     except Exception as e:
+         raise click.ClickException(f"List models failed: {str(e)}") from e


  @cli.command("metrics")
@@ -199,31 +310,55 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> No
      "--log-dir", type=str, help="Path to slurm log directory (if used during launch)"
  )
  def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
-     """Stream real-time performance metrics from the model endpoint."""
-     helper = MetricsHelper(slurm_job_id, log_dir)
-
-     # Check if metrics URL is ready
-     if not helper.metrics_url.startswith("http"):
-         table = utils.create_table("Metric", "Value")
-         helper.display_failed_metrics(
-             table, f"Metrics endpoint unavailable - {helper.metrics_url}"
-         )
-         CONSOLE.print(table)
-         return
-
-     with Live(refresh_per_second=1, console=CONSOLE) as live:
-         while True:
-             metrics = helper.fetch_metrics()
-             table = utils.create_table("Metric", "Value")
-
-             if isinstance(metrics, str):
-                 # Show status information if metrics aren't available
-                 helper.display_failed_metrics(table, metrics)
-             else:
-                 helper.display_metrics(table, metrics)
-
-             live.update(table)
-             time.sleep(2)
+     """Stream real-time performance metrics from the model endpoint.
+
+     Parameters
+     ----------
+     slurm_job_id : int
+         ID of the SLURM job to monitor
+     log_dir : str, optional
+         Path to SLURM log directory
+
+     Raises
+     ------
+     click.ClickException
+         If metrics collection fails
+
+     Notes
+     -----
+     This command continuously streams metrics with a 2-second refresh interval
+     until interrupted. If metrics are not available, it will display status
+     information instead.
+     """
+     try:
+         # Start the client and get inference server metrics
+         client = VecInfClient()
+         metrics_response = client.get_metrics(slurm_job_id, log_dir)
+         metrics_formatter = MetricsResponseFormatter(metrics_response.metrics)
+
+         # Check if metrics response is ready
+         if isinstance(metrics_response.metrics, str):
+             metrics_formatter.format_failed_metrics(metrics_response.metrics)
+             CONSOLE.print(metrics_formatter.table)
+             return
+
+         with Live(refresh_per_second=1, console=CONSOLE) as live:
+             while True:
+                 metrics_response = client.get_metrics(slurm_job_id, log_dir)
+                 metrics_formatter = MetricsResponseFormatter(metrics_response.metrics)
+
+                 if isinstance(metrics_response.metrics, str):
+                     # Show status information if metrics aren't available
+                     metrics_formatter.format_failed_metrics(metrics_response.metrics)
+                 else:
+                     metrics_formatter.format_metrics()
+
+                 live.update(metrics_formatter.table)
+                 time.sleep(2)
+     except click.ClickException as e:
+         raise e
+     except Exception as e:
+         raise click.ClickException(f"Metrics check failed: {str(e)}") from e


  if __name__ == "__main__":
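The new `metrics` command above reduces to a simple polling contract: ask the client for metrics every two seconds, and treat a string payload as "not available yet" (the `isinstance` checks in the hunk). The same loop condensed without the Rich `Live` table; it assumes `log_dir` is optional on `get_metrics` and that a non-string payload prints usefully (the CLI hands it to `MetricsResponseFormatter` instead):

```python
# Condensed form of the polling loop in the metrics command, minus Rich.
# Assumptions: log_dir is optional on get_metrics, and a non-string
# metrics payload is printable as-is.
import time

from vec_inf.client import VecInfClient


def poll_metrics(slurm_job_id: int, interval: float = 2.0) -> None:
    client = VecInfClient()
    while True:
        metrics_response = client.get_metrics(slurm_job_id)
        if isinstance(metrics_response.metrics, str):
            # a plain string signals that metrics aren't available yet
            print(f"Metrics unavailable: {metrics_response.metrics}")
        else:
            print(metrics_response.metrics)
        time.sleep(interval)
```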