vec-inf 0.5.0-py3-none-any.whl → 0.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vec_inf/README.md CHANGED
@@ -1,9 +1,9 @@
 # `vec-inf` Commands
 
-* `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server, `--json-mode` supported. Check [`here`](./models/README.md) for complete list of available options.
-* `list`: List all available model names, or append a supported model name to view the default configuration, `--json-mode` supported.
-* `metrics`: Streams performance metrics to the console.
+* `launch`: Specify a model family and other optional parameters to launch an OpenAI compatible inference server, `--json-mode` supported.
 * `status`: Check the model status by providing its Slurm job ID, `--json-mode` supported.
+* `metrics`: Streams performance metrics to the console.
 * `shutdown`: Shutdown a model by providing its Slurm job ID.
+* `list`: List all available model names, or view the default/cached configuration of a specific model, `--json-mode` supported.
 
 Use `--help` to see all available options
vec_inf/cli/_cli.py CHANGED
@@ -1,4 +1,22 @@
-"""Command line interface for Vector Inference."""
+"""Command line interface for Vector Inference.
+
+This module provides the command-line interface for interacting with Vector
+Inference services, including model launching, status checking, metrics
+monitoring, and shutdown operations.
+
+Commands
+--------
+launch
+    Launch a model on the cluster
+status
+    Check the status of a running model
+shutdown
+    Stop a running model
+list
+    List available models or get specific model configuration
+metrics
+    Stream real-time performance metrics
+"""
 
 import time
 from typing import Optional, Union
@@ -7,8 +25,13 @@ import click
 from rich.console import Console
 from rich.live import Live
 
-import vec_inf.cli._utils as utils
-from vec_inf.cli._helper import LaunchHelper, ListHelper, MetricsHelper, StatusHelper
+from vec_inf.cli._helper import (
+    LaunchResponseFormatter,
+    ListCmdDisplay,
+    MetricsResponseFormatter,
+    StatusResponseFormatter,
+)
+from vec_inf.client import LaunchOptions, VecInfClient
 
 
 CONSOLE = Console()
@@ -24,36 +47,6 @@ def cli() -> None:
 @click.argument("model-name", type=str, nargs=1)
 @click.option("--model-family", type=str, help="The model family")
 @click.option("--model-variant", type=str, help="The model variant")
-@click.option(
-    "--max-model-len",
-    type=int,
-    help="Model context length. Default value set based on suggested resource allocation.",
-)
-@click.option(
-    "--max-num-seqs",
-    type=int,
-    help="Maximum number of sequences to process in a single request",
-)
-@click.option(
-    "--gpu-memory-utilization",
-    type=float,
-    help="GPU memory utilization, default to 0.9",
-)
-@click.option(
-    "--enable-prefix-caching",
-    is_flag=True,
-    help="Enables automatic prefix caching",
-)
-@click.option(
-    "--enable-chunked-prefill",
-    is_flag=True,
-    help="Enable chunked prefill, enabled by default if max number of sequences > 32k",
-)
-@click.option(
-    "--max-num-batched-tokens",
-    type=int,
-    help="Maximum number of batched tokens per iteration, defaults to 2048 if --enable-chunked-prefill is set, else None",
-)
 @click.option(
     "--partition",
     type=str,
@@ -69,6 +62,11 @@ def cli() -> None:
     type=int,
     help="Number of GPUs/node to use, default to suggested resource allocation for model",
 )
+@click.option(
+    "--account",
+    type=str,
+    help="Charge resources used by this job to specified account.",
+)
 @click.option(
     "--qos",
     type=str,
@@ -79,12 +77,6 @@ def cli() -> None:
     type=str,
     help="Time limit for job, this should comply with QoS limits",
 )
-@click.option(
-    "--vocab-size",
-    type=int,
-    help="Vocabulary size, this option is intended for custom models",
-)
-@click.option("--data-type", type=str, help="Model data type")
 @click.option(
     "--venv",
     type=str,
@@ -101,19 +93,9 @@ def cli() -> None:
     help="Path to parent directory containing model weights",
 )
 @click.option(
-    "--pipeline-parallelism",
-    is_flag=True,
-    help="Enable pipeline parallelism, enabled by default for supported models",
-)
-@click.option(
-    "--compilation-config",
-    type=click.Choice(["0", "3"]),
-    help="torch.compile optimization level, accepts '0' or '3', default to '0', which means no optimization is applied",
-)
-@click.option(
-    "--enforce-eager",
-    is_flag=True,
-    help="Always use eager-mode PyTorch",
+    "--vllm-args",
+    type=str,
+    help="vLLM engine arguments to be set, use the format as specified in vLLM documentation and separate arguments with commas, e.g. --vllm-args '--max-model-len=8192,--max-num-seqs=256,--enable-prefix-caching'",
 )
 @click.option(
     "--json-mode",
@@ -122,18 +104,64 @@ def cli() -> None:
 )
 def launch(
     model_name: str,
-    **cli_kwargs: Optional[Union[str, int, bool]],
+    **cli_kwargs: Optional[Union[str, int, float, bool]],
 ) -> None:
-    """Launch a model on the cluster."""
+    """Launch a model on the cluster.
+
+    Parameters
+    ----------
+    model_name : str
+        Name of the model to launch
+    **cli_kwargs : dict
+        Additional launch options including:
+        - model_family : str, optional
+            Family/architecture of the model
+        - model_variant : str, optional
+            Specific variant of the model
+        - partition : str, optional
+            Type of compute partition
+        - num_nodes : int, optional
+            Number of nodes to use
+        - gpus_per_node : int, optional
+            Number of GPUs per node
+        - qos : str, optional
+            Quality of service tier
+        - time : str, optional
+            Time limit for job
+        - venv : str, optional
+            Path to virtual environment
+        - log_dir : str, optional
+            Path to SLURM log directory
+        - model_weights_parent_dir : str, optional
+            Path to model weights directory
+        - vllm_args : str, optional
+            vLLM engine arguments
+        - json_mode : bool, optional
+            Output in JSON format
+
+    Raises
+    ------
+    click.ClickException
+        If launch fails for any reason
+    """
     try:
-        launch_helper = LaunchHelper(model_name, cli_kwargs)
+        # Convert cli_kwargs to LaunchOptions
+        json_mode = cli_kwargs["json_mode"]
+        del cli_kwargs["json_mode"]
+
+        launch_options = LaunchOptions(**cli_kwargs)  # type: ignore
 
-        launch_helper.set_env_vars()
-        launch_command = launch_helper.build_launch_command()
-        command_output, stderr = utils.run_bash_command(launch_command)
-        if stderr:
-            raise click.ClickException(f"Error: {stderr}")
-        launch_helper.post_launch_processing(command_output, CONSOLE)
+        # Start the client and launch model inference server
+        client = VecInfClient()
+        launch_response = client.launch_model(model_name, launch_options)
+
+        # Display launch information
+        launch_formatter = LaunchResponseFormatter(model_name, launch_response.config)
+        if json_mode:
+            click.echo(launch_response.config)
+        else:
+            launch_info_table = launch_formatter.format_table_output()
+            CONSOLE.print(launch_info_table)
 
     except click.ClickException as e:
         raise e
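The rewritten `launch` body above no longer shells out through helper commands; it wraps the public client API. A minimal sketch of the same flow used programmatically, assuming `LaunchOptions` accepts the keyword fields enumerated in the docstring; the model name and option values below are placeholders:

```python
from vec_inf.client import LaunchOptions, VecInfClient

client = VecInfClient()

# Placeholder values; any of the launch options listed in the docstring
# (model_family, partition, num_nodes, gpus_per_node, vllm_args, ...)
# should be accepted as keyword arguments.
options = LaunchOptions(
    num_nodes=1,
    gpus_per_node=4,
    vllm_args="--max-model-len=8192,--enable-prefix-caching",
)
launch_response = client.launch_model("example-model-name", options)

# The CLI prints launch_response.config either as raw JSON or as a table.
print(launch_response.config)
```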
@@ -156,28 +184,61 @@ def launch(
 def status(
     slurm_job_id: int, log_dir: Optional[str] = None, json_mode: bool = False
 ) -> None:
-    """Get the status of a running model on the cluster."""
-    status_cmd = f"scontrol show job {slurm_job_id} --oneliner"
-    output, stderr = utils.run_bash_command(status_cmd)
-    if stderr:
-        raise click.ClickException(f"Error: {stderr}")
+    """Get the status of a running model on the cluster.
+
+    Parameters
+    ----------
+    slurm_job_id : int
+        ID of the SLURM job to check
+    log_dir : str, optional
+        Path to SLURM log directory
+    json_mode : bool, default=False
+        Whether to output in JSON format
 
-    status_helper = StatusHelper(slurm_job_id, output, log_dir)
+    Raises
+    ------
+    click.ClickException
+        If status check fails
+    """
+    try:
+        # Start the client and get model inference server status
+        client = VecInfClient()
+        status_response = client.get_status(slurm_job_id, log_dir)
+        # Display status information
+        status_formatter = StatusResponseFormatter(status_response)
+        if json_mode:
+            status_formatter.output_json()
+        else:
+            status_info_table = status_formatter.output_table()
+            CONSOLE.print(status_info_table)
 
-    status_helper.process_job_state()
-    if json_mode:
-        status_helper.output_json()
-    else:
-        status_helper.output_table(CONSOLE)
+    except click.ClickException as e:
+        raise e
+    except Exception as e:
+        raise click.ClickException(f"Status check failed: {str(e)}") from e
 
 
 @cli.command("shutdown")
 @click.argument("slurm_job_id", type=int, nargs=1)
 def shutdown(slurm_job_id: int) -> None:
-    """Shutdown a running model on the cluster."""
-    shutdown_cmd = f"scancel {slurm_job_id}"
-    utils.run_bash_command(shutdown_cmd)
-    click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")
+    """Shutdown a running model on the cluster.
+
+    Parameters
+    ----------
+    slurm_job_id : int
+        ID of the SLURM job to shut down
+
+    Raises
+    ------
+    click.ClickException
+        If shutdown operation fails
+    """
+    try:
+        client = VecInfClient()
+        client.shutdown_model(slurm_job_id)
+        click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}")
+    except Exception as e:
+        raise click.ClickException(f"Shutdown failed: {str(e)}") from e
 
 
 @cli.command("list")
@@ -188,9 +249,34 @@ def shutdown(slurm_job_id: int) -> None:
     help="Output in JSON string",
 )
 def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None:
-    """List all available models, or get default setup of a specific model."""
-    list_helper = ListHelper(model_name, json_mode)
-    list_helper.process_list_command(CONSOLE)
+    """List all available models, or get default setup of a specific model.
+
+    Parameters
+    ----------
+    model_name : str, optional
+        Name of specific model to get information for
+    json_mode : bool, default=False
+        Whether to output in JSON format
+
+    Raises
+    ------
+    click.ClickException
+        If list operation fails
+    """
+    try:
+        # Start the client
+        client = VecInfClient()
+        list_display = ListCmdDisplay(CONSOLE, json_mode)
+        if model_name:
+            model_config = client.get_model_config(model_name)
+            list_display.display_single_model_output(model_config)
+        else:
+            model_infos = client.list_models()
+            list_display.display_all_models_output(model_infos)
+    except click.ClickException as e:
+        raise e
+    except Exception as e:
+        raise click.ClickException(f"List models failed: {str(e)}") from e
 
 
 @cli.command("metrics")
@@ -199,31 +285,55 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> No
     "--log-dir", type=str, help="Path to slurm log directory (if used during launch)"
 )
 def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None:
-    """Stream real-time performance metrics from the model endpoint."""
-    helper = MetricsHelper(slurm_job_id, log_dir)
-
-    # Check if metrics URL is ready
-    if not helper.metrics_url.startswith("http"):
-        table = utils.create_table("Metric", "Value")
-        helper.display_failed_metrics(
-            table, f"Metrics endpoint unavailable - {helper.metrics_url}"
-        )
-        CONSOLE.print(table)
-        return
-
-    with Live(refresh_per_second=1, console=CONSOLE) as live:
-        while True:
-            metrics = helper.fetch_metrics()
-            table = utils.create_table("Metric", "Value")
-
-            if isinstance(metrics, str):
-                # Show status information if metrics aren't available
-                helper.display_failed_metrics(table, metrics)
-            else:
-                helper.display_metrics(table, metrics)
-
-            live.update(table)
-            time.sleep(2)
+    """Stream real-time performance metrics from the model endpoint.
+
+    Parameters
+    ----------
+    slurm_job_id : int
+        ID of the SLURM job to monitor
+    log_dir : str, optional
+        Path to SLURM log directory
+
+    Raises
+    ------
+    click.ClickException
+        If metrics collection fails
+
+    Notes
+    -----
+    This command continuously streams metrics with a 2-second refresh interval
+    until interrupted. If metrics are not available, it will display status
+    information instead.
+    """
+    try:
+        # Start the client and get inference server metrics
+        client = VecInfClient()
+        metrics_response = client.get_metrics(slurm_job_id, log_dir)
+        metrics_formatter = MetricsResponseFormatter(metrics_response.metrics)
+
+        # Check if metrics response is ready
+        if isinstance(metrics_response.metrics, str):
+            metrics_formatter.format_failed_metrics(metrics_response.metrics)
+            CONSOLE.print(metrics_formatter.table)
+            return
+
+        with Live(refresh_per_second=1, console=CONSOLE) as live:
+            while True:
+                metrics_response = client.get_metrics(slurm_job_id, log_dir)
+                metrics_formatter = MetricsResponseFormatter(metrics_response.metrics)
+
+                if isinstance(metrics_response.metrics, str):
+                    # Show status information if metrics aren't available
+                    metrics_formatter.format_failed_metrics(metrics_response.metrics)
+                else:
+                    metrics_formatter.format_metrics()
+
+                live.update(metrics_formatter.table)
+                time.sleep(2)
+    except click.ClickException as e:
+        raise e
+    except Exception as e:
+        raise click.ClickException(f"Metrics check failed: {str(e)}") from e
 
 
 if __name__ == "__main__":
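Outside the CLI's `Live` display, the same metrics call can be polled directly. Per the code above, `metrics_response.metrics` is a plain string while the endpoint is unavailable and structured data once it is serving. A bounded-poll sketch with a placeholder job ID:

```python
import time

from rich.console import Console

from vec_inf.cli._helper import MetricsResponseFormatter
from vec_inf.client import VecInfClient

console = Console()
client = VecInfClient()
slurm_job_id = 12345678  # placeholder Slurm job ID

for _ in range(5):  # a short, bounded poll instead of the CLI's endless loop
    metrics_response = client.get_metrics(slurm_job_id, None)
    formatter = MetricsResponseFormatter(metrics_response.metrics)
    if isinstance(metrics_response.metrics, str):
        # A string payload means the endpoint is not serving metrics yet.
        formatter.format_failed_metrics(metrics_response.metrics)
    else:
        formatter.format_metrics()
    console.print(formatter.table)
    time.sleep(2)  # same 2-second cadence the CLI uses
```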