vec-inf 0.4.1__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vec_inf/client/api.py ADDED
@@ -0,0 +1,302 @@
+ """Vector Inference client for programmatic access.
+
+ This module provides the main client class for interacting with Vector Inference
+ services programmatically. It includes functionality for launching models, monitoring
+ their status, collecting metrics, and managing their lifecycle.
+
+ See Also
+ --------
+ vec_inf.client._helper : Helper classes for model inference server management
+ vec_inf.client.models : Data models for API responses
+ """
+
+ import time
+ import warnings
+ from typing import Any, Optional, Union
+
+ from vec_inf.client._exceptions import (
+     ServerError,
+     SlurmJobError,
+ )
+ from vec_inf.client._helper import (
+     ModelLauncher,
+     ModelRegistry,
+     ModelStatusMonitor,
+     PerformanceMetricsCollector,
+ )
+ from vec_inf.client._utils import run_bash_command
+ from vec_inf.client.config import ModelConfig
+ from vec_inf.client.models import (
+     LaunchOptions,
+     LaunchResponse,
+     MetricsResponse,
+     ModelInfo,
+     ModelStatus,
+     StatusResponse,
+ )
+
+
+ class VecInfClient:
+     """Client for interacting with Vector Inference programmatically.
+
+     This class provides methods for launching models, checking their status,
+     retrieving metrics, and shutting down models using the Vector Inference
+     infrastructure.
+
+     Methods
+     -------
+     list_models()
+         List all available models
+     get_model_config(model_name)
+         Get configuration for a specific model
+     launch_model(model_name, options)
+         Launch a model on the cluster
+     get_status(slurm_job_id, log_dir)
+         Get status of a running model
+     get_metrics(slurm_job_id, log_dir)
+         Get performance metrics of a running model
+     shutdown_model(slurm_job_id)
+         Shut down a running model
+     wait_until_ready(slurm_job_id, timeout_seconds, poll_interval_seconds, log_dir)
+         Wait for a model to become ready
+
+     Examples
+     --------
+     >>> from vec_inf.client.api import VecInfClient
+     >>> client = VecInfClient()
+     >>> response = client.launch_model("Meta-Llama-3.1-8B-Instruct")
+     >>> job_id = response.slurm_job_id
+     >>> status = client.get_status(job_id)
+     >>> if status.server_status == ModelStatus.READY:
+     ...     print(f"Model is ready at {status.base_url}")
+     >>> client.shutdown_model(job_id)
+     """
+
+     def __init__(self) -> None:
+         """Initialize the Vector Inference client."""
+         pass
+
+     def list_models(self) -> list[ModelInfo]:
+         """List all available models.
+
+         Returns
+         -------
+         list[ModelInfo]
+             List of ModelInfo objects containing information about available models,
+             including their configurations and specifications.
+         """
+         model_registry = ModelRegistry()
+         return model_registry.get_all_models()
+
+     def get_model_config(self, model_name: str) -> ModelConfig:
+         """Get the configuration for a specific model.
+
+         Parameters
+         ----------
+         model_name : str
+             Name of the model to get configuration for
+
+         Returns
+         -------
+         ModelConfig
+             Complete configuration for the specified model
+
+         Raises
+         ------
+         ModelNotFoundError
+             If the specified model is not found in the configuration
+         """
+         model_registry = ModelRegistry()
+         return model_registry.get_single_model_config(model_name)
+
+     def launch_model(
+         self, model_name: str, options: Optional[LaunchOptions] = None
+     ) -> LaunchResponse:
+         """Launch a model on the cluster.
+
+         Parameters
+         ----------
+         model_name : str
+             Name of the model to launch
+         options : LaunchOptions, optional
+             Launch options to override default configuration
+
+         Returns
+         -------
+         LaunchResponse
+             Response containing launch details including:
+             - SLURM job ID
+             - Model configuration
+             - Launch status
+
+         Raises
+         ------
+         ModelConfigurationError
+             If the model configuration is invalid
+         SlurmJobError
+             If there's an error launching the SLURM job
+         """
+         # Convert LaunchOptions to dictionary if provided
+         options_dict: dict[str, Any] = {}
+         if options:
+             options_dict = {k: v for k, v in vars(options).items() if v is not None}
+
+         # Create and use the API Launch Helper
+         model_launcher = ModelLauncher(model_name, options_dict)
+         return model_launcher.launch()
+
+     def get_status(
+         self, slurm_job_id: int, log_dir: Optional[str] = None
+     ) -> StatusResponse:
+         """Get the status of a running model.
+
+         Parameters
+         ----------
+         slurm_job_id : int
+             The SLURM job ID to check
+         log_dir : str, optional
+             Path to the SLURM log directory. If None, uses default location
+
+         Returns
+         -------
+         StatusResponse
+             Status information including:
+             - Model name
+             - Server status
+             - Job state
+             - Base URL (if ready)
+             - Error information (if failed)
+         """
+         model_status_monitor = ModelStatusMonitor(slurm_job_id, log_dir)
+         return model_status_monitor.process_model_status()
+
+     def get_metrics(
+         self, slurm_job_id: int, log_dir: Optional[str] = None
+     ) -> MetricsResponse:
+         """Get the performance metrics of a running model.
+
+         Parameters
+         ----------
+         slurm_job_id : int
+             The SLURM job ID to get metrics for
+         log_dir : str, optional
+             Path to the SLURM log directory. If None, uses default location
+
+         Returns
+         -------
+         MetricsResponse
+             Response containing:
+             - Model name
+             - Performance metrics or error message
+             - Timestamp of collection
+         """
+         performance_metrics_collector = PerformanceMetricsCollector(
+             slurm_job_id, log_dir
+         )
+
+         metrics: Union[dict[str, float], str]
+         if not performance_metrics_collector.metrics_url.startswith("http"):
+             metrics = performance_metrics_collector.metrics_url
+         else:
+             metrics = performance_metrics_collector.fetch_metrics()
+
+         return MetricsResponse(
+             model_name=performance_metrics_collector.status_info.model_name,
+             metrics=metrics,
+             timestamp=time.time(),
+         )
+
+     def shutdown_model(self, slurm_job_id: int) -> bool:
+         """Shut down a running model.
+
+         Parameters
+         ----------
+         slurm_job_id : int
+             The SLURM job ID to shut down
+
+         Returns
+         -------
+         bool
+             True if the model was successfully shut down
+
+         Raises
+         ------
+         SlurmJobError
+             If there was an error shutting down the model
+         """
+         shutdown_cmd = f"scancel {slurm_job_id}"
+         _, stderr = run_bash_command(shutdown_cmd)
+         if stderr:
+             raise SlurmJobError(f"Failed to shutdown model: {stderr}")
+         return True
+
+     def wait_until_ready(
+         self,
+         slurm_job_id: int,
+         timeout_seconds: int = 1800,
+         poll_interval_seconds: int = 10,
+         log_dir: Optional[str] = None,
+     ) -> StatusResponse:
+         """Wait until a model is ready or fails.
+
+         Parameters
+         ----------
+         slurm_job_id : int
+             The SLURM job ID to wait for
+         timeout_seconds : int, optional
+             Maximum time to wait in seconds, by default 1800 (30 mins)
+         poll_interval_seconds : int, optional
+             How often to check status in seconds, by default 10
+         log_dir : str, optional
+             Path to the SLURM log directory. If None, uses default location
+
+         Returns
+         -------
+         StatusResponse
+             Status information when the model becomes ready
+
+         Raises
+         ------
+         SlurmJobError
+             If the specified job is not found or there's an error with the job
+         ServerError
+             If the server fails to start within the timeout period
+         APIError
+             If there was an error checking the status
+
+         Notes
+         -----
+         The timeout is reset if the model is still in PENDING state after the
+         initial timeout period. This allows for longer queue times in the SLURM
+         scheduler.
+         """
+         start_time = time.time()
+
+         while True:
+             status_info = self.get_status(slurm_job_id, log_dir)
+
+             if status_info.server_status == ModelStatus.READY:
+                 return status_info
+
+             if status_info.server_status == ModelStatus.FAILED:
+                 error_message = status_info.failed_reason or "Unknown error"
+                 raise ServerError(f"Model failed to start: {error_message}")
+
+             if status_info.server_status == ModelStatus.SHUTDOWN:
+                 raise ServerError("Model was shutdown before it became ready")
+
+             # Check timeout
+             if time.time() - start_time > timeout_seconds:
+                 if status_info.server_status == ModelStatus.PENDING:
+                     warnings.warn(
+                         f"Model is still pending after {timeout_seconds} seconds, resetting timer...",
+                         UserWarning,
+                         stacklevel=2,
+                     )
+                     start_time = time.time()
+                 else:
+                     raise ServerError(
+                         f"Timed out waiting for model to become ready after {timeout_seconds} seconds"
+                     )
+
+             # Wait before checking again
+             time.sleep(poll_interval_seconds)
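
For readers skimming the new API surface, here is a minimal end-to-end sketch using only the methods defined in this file. The model name and resource overrides are illustrative values, not defaults shipped with the package.

    from vec_inf.client.api import VecInfClient
    from vec_inf.client.models import LaunchOptions

    client = VecInfClient()

    # Launch with a couple of overrides; any LaunchOptions field left as None
    # falls back to the model's configured defaults.
    options = LaunchOptions(num_nodes=1, gpus_per_node=4)
    response = client.launch_model("Meta-Llama-3.1-8B-Instruct", options=options)

    # Block until the server is READY (raises ServerError on failure or timeout).
    status = client.wait_until_ready(response.slurm_job_id)
    print(f"Serving at {status.base_url}")

    # Grab one metrics snapshot, then cancel the SLURM job.
    metrics = client.get_metrics(response.slurm_job_id)
    print(metrics.metrics)
    client.shutdown_model(response.slurm_job_id)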
vec_inf/client/config.py ADDED
@@ -0,0 +1,128 @@
+ """Model configuration.
+
+ This module provides a Pydantic model for validating and managing model deployment
+ configurations, including hardware requirements and model specifications.
+ """
+
+ from pathlib import Path
+ from typing import Any, Optional, Union, cast
+
+ from pydantic import BaseModel, ConfigDict, Field
+ from typing_extensions import Literal
+
+ from vec_inf.client.slurm_vars import (
+     DEFAULT_ARGS,
+     MAX_CPUS_PER_TASK,
+     MAX_GPUS_PER_NODE,
+     MAX_NUM_NODES,
+     PARTITION,
+     QOS,
+ )
+
+
+ class ModelConfig(BaseModel):
+     """Pydantic model for validating and managing model deployment configurations.
+
+     A configuration class that handles validation and management of model deployment
+     settings, including model specifications, hardware requirements, and runtime
+     parameters.
+
+     Parameters
+     ----------
+     model_name : str
+         Name of the model, must be alphanumeric with allowed characters: '-', '_', '.'
+     model_family : str
+         Family/architecture of the model
+     model_variant : str, optional
+         Specific variant or version of the model family
+     model_type : {'LLM', 'VLM', 'Text_Embedding', 'Reward_Modeling'}
+         Type of model architecture
+     gpus_per_node : int
+         Number of GPUs to use per node (1-MAX_GPUS_PER_NODE)
+     num_nodes : int
+         Number of nodes to use for deployment (1-MAX_NUM_NODES)
+     cpus_per_task : int, optional
+         Number of CPU cores per task (1-MAX_CPUS_PER_TASK)
+     mem_per_node : str, optional
+         Memory allocation per node in GB format (e.g., '32G')
+     vocab_size : int
+         Size of the model's vocabulary (1-1,000,000)
+     account : str, optional
+         Charge resources used by this job to the specified account
+     qos : Union[QOS, str], optional
+         Quality of Service tier for job scheduling
+     time : str, optional
+         Time limit for the job in HH:MM:SS format
+     partition : Union[PARTITION, str], optional
+         GPU partition type for job scheduling
+     venv : str, optional
+         Virtual environment or container system to use
+     log_dir : Path, optional
+         Directory path for storing logs
+     model_weights_parent_dir : Path, optional
+         Base directory containing model weights
+     vllm_args : dict[str, Any], optional
+         Additional arguments for vLLM engine configuration
+
+     Notes
+     -----
+     All fields are validated using Pydantic's validation system. The model is
+     configured to be immutable (frozen) and forbids extra fields.
+     """
+
+     model_name: str = Field(..., min_length=3, pattern=r"^[a-zA-Z0-9\-_\.]+$")
+     model_family: str = Field(..., min_length=2)
+     model_variant: Optional[str] = Field(
+         default=None, description="Specific variant/version of the model family"
+     )
+     model_type: Literal["LLM", "VLM", "Text_Embedding", "Reward_Modeling"] = Field(
+         ..., description="Type of model architecture"
+     )
+     gpus_per_node: int = Field(
+         ..., gt=0, le=MAX_GPUS_PER_NODE, description="GPUs per node"
+     )
+     num_nodes: int = Field(..., gt=0, le=MAX_NUM_NODES, description="Number of nodes")
+     cpus_per_task: int = Field(
+         default=cast(int, DEFAULT_ARGS["cpus_per_task"]),
+         gt=0,
+         le=MAX_CPUS_PER_TASK,
+         description="CPUs per task",
+     )
+     mem_per_node: str = Field(
+         default=cast(str, DEFAULT_ARGS["mem_per_node"]),
+         pattern=r"^\d{1,4}G$",
+         description="Memory per node",
+     )
+     vocab_size: int = Field(..., gt=0, le=1_000_000)
+     account: Optional[str] = Field(
+         default=None, description="Account name for job scheduling"
+     )
+     qos: Union[QOS, str] = Field(
+         default=cast(str, DEFAULT_ARGS["qos"]), description="Quality of Service tier"
+     )
+     time: str = Field(
+         default=cast(str, DEFAULT_ARGS["time"]),
+         pattern=r"^\d{2}:\d{2}:\d{2}$",
+         description="HH:MM:SS time limit",
+     )
+     partition: Union[PARTITION, str] = Field(
+         default=cast(str, DEFAULT_ARGS["partition"]), description="GPU partition type"
+     )
+     venv: str = Field(
+         default="singularity", description="Virtual environment/container system"
+     )
+     log_dir: Path = Field(
+         default=Path(cast(str, DEFAULT_ARGS["log_dir"])),
+         description="Log directory path",
+     )
+     model_weights_parent_dir: Path = Field(
+         default=Path(cast(str, DEFAULT_ARGS["model_weights_parent_dir"])),
+         description="Base directory for model weights",
+     )
+     vllm_args: Optional[dict[str, Any]] = Field(
+         default={}, description="vLLM engine arguments"
+     )
+
+     model_config = ConfigDict(
+         extra="forbid", str_strip_whitespace=True, validate_default=True, frozen=True
+     )
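
To make the validation behavior concrete, here is a small sketch of constructing a ModelConfig directly. The field values are illustrative (the vocabulary size shown is the published Llama 3.1 value); omitted fields fall back to the DEFAULT_ARGS-derived defaults above, and the frozen config rejects mutation.

    from vec_inf.client.config import ModelConfig

    config = ModelConfig(
        model_name="Meta-Llama-3.1-8B-Instruct",
        model_family="Llama-3.1",
        model_variant="8B-Instruct",
        model_type="LLM",
        gpus_per_node=4,
        num_nodes=1,
        vocab_size=128256,
    )

    # Unset fields are filled from DEFAULT_ARGS (qos, time, partition, ...).
    print(config.partition, config.qos, config.time)

    # frozen=True makes instances immutable; uncommenting this raises a
    # pydantic ValidationError.
    # config.num_nodes = 2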
vec_inf/client/models.py ADDED
@@ -0,0 +1,225 @@
+ """Data models for Vector Inference API.
+
+ This module contains the data model classes used by the Vector Inference API
+ for both request parameters and response objects.
+
+ Classes
+ -------
+ ModelStatus : Enum
+     Status states of a model
+ ModelType : Enum
+     Types of supported models
+ LaunchResponse : dataclass
+     Response from model launch operation
+ StatusResponse : dataclass
+     Response from model status check
+ MetricsResponse : dataclass
+     Response from metrics collection
+ LaunchOptions : dataclass
+     Options for model launch
+ LaunchOptionsDict : TypedDict
+     Dictionary representation of launch options
+ ModelInfo : dataclass
+     Information about available models
+ """
+
+ from dataclasses import dataclass, field
+ from enum import Enum
+ from typing import Any, Optional, Union
+
+
+ class ModelStatus(str, Enum):
+     """Enum representing the possible status states of a model.
+
+     Attributes
+     ----------
+     PENDING : str
+         Model is waiting for SLURM to allocate resources
+     LAUNCHING : str
+         Model is in the process of starting
+     READY : str
+         Model is running and ready to serve requests
+     FAILED : str
+         Model failed to start or encountered an error
+     SHUTDOWN : str
+         Model was intentionally stopped
+     UNAVAILABLE : str
+         Model status cannot be determined
+     """
+
+     PENDING = "PENDING"
+     LAUNCHING = "LAUNCHING"
+     READY = "READY"
+     FAILED = "FAILED"
+     SHUTDOWN = "SHUTDOWN"
+     UNAVAILABLE = "UNAVAILABLE"
+
+
+ class ModelType(str, Enum):
+     """Enum representing the possible model types.
+
+     Attributes
+     ----------
+     LLM : str
+         Large Language Model
+     VLM : str
+         Vision Language Model
+     TEXT_EMBEDDING : str
+         Text Embedding Model
+     REWARD_MODELING : str
+         Reward Modeling Model
+     """
+
+     LLM = "LLM"
+     VLM = "VLM"
+     TEXT_EMBEDDING = "Text_Embedding"
+     REWARD_MODELING = "Reward_Modeling"
+
+
+ @dataclass
+ class LaunchResponse:
+     """Response from launching a model.
+
+     Parameters
+     ----------
+     slurm_job_id : int
+         ID of the launched SLURM job
+     model_name : str
+         Name of the launched model
+     config : dict[str, Any]
+         Configuration used for the launch
+     raw_output : str
+         Raw output from the launch command (hidden from repr)
+     """
+
+     slurm_job_id: int
+     model_name: str
+     config: dict[str, Any]
+     raw_output: str = field(repr=False)
+
+
+ @dataclass
+ class StatusResponse:
+     """Response from checking a model's status.
+
+     Parameters
+     ----------
+     model_name : str
+         Name of the model
+     server_status : ModelStatus
+         Current status of the server
+     job_state : Union[str, ModelStatus]
+         Current state of the SLURM job
+     raw_output : str
+         Raw output from status check (hidden from repr)
+     base_url : str, optional
+         Base URL of the model server if ready
+     pending_reason : str, optional
+         Reason for pending state if applicable
+     failed_reason : str, optional
+         Reason for failure if applicable
+     """
+
+     model_name: str
+     server_status: ModelStatus
+     job_state: Union[str, ModelStatus]
+     raw_output: str = field(repr=False)
+     base_url: Optional[str] = None
+     pending_reason: Optional[str] = None
+     failed_reason: Optional[str] = None
+
+
+ @dataclass
+ class MetricsResponse:
+     """Response from retrieving model metrics.
+
+     Parameters
+     ----------
+     model_name : str
+         Name of the model
+     metrics : Union[dict[str, float], str]
+         Either a dictionary of metrics or an error message
+     timestamp : float
+         Unix timestamp of when metrics were collected
+     """
+
+     model_name: str
+     metrics: Union[dict[str, float], str]
+     timestamp: float
+
+
+ @dataclass
+ class LaunchOptions:
+     """Options for launching a model.
+
+     Parameters
+     ----------
+     model_family : str, optional
+         Family/architecture of the model
+     model_variant : str, optional
+         Specific variant/version of the model
+     partition : str, optional
+         SLURM partition to use
+     num_nodes : int, optional
+         Number of nodes to allocate
+     gpus_per_node : int, optional
+         Number of GPUs per node
+     account : str, optional
+         Account name for job scheduling
+     qos : str, optional
+         Quality of Service level
+     time : str, optional
+         Time limit for the job
+     vocab_size : int, optional
+         Size of model vocabulary
+     data_type : str, optional
+         Data type for model weights
+     venv : str, optional
+         Virtual environment to use
+     log_dir : str, optional
+         Directory for logs
+     model_weights_parent_dir : str, optional
+         Parent directory containing model weights
+     vllm_args : str, optional
+         Additional arguments for vLLM
+     """
+
+     model_family: Optional[str] = None
+     model_variant: Optional[str] = None
+     partition: Optional[str] = None
+     num_nodes: Optional[int] = None
+     gpus_per_node: Optional[int] = None
+     account: Optional[str] = None
+     qos: Optional[str] = None
+     time: Optional[str] = None
+     vocab_size: Optional[int] = None
+     data_type: Optional[str] = None
+     venv: Optional[str] = None
+     log_dir: Optional[str] = None
+     model_weights_parent_dir: Optional[str] = None
+     vllm_args: Optional[str] = None
+
+
+ @dataclass
+ class ModelInfo:
+     """Information about an available model.
+
+     Parameters
+     ----------
+     name : str
+         Name of the model
+     family : str
+         Family/architecture of the model
+     variant : str, optional
+         Specific variant/version of the model
+     model_type : ModelType
+         Type of the model
+     config : dict[str, Any]
+         Additional configuration parameters
+     """
+
+     name: str
+     family: str
+     variant: Optional[str]
+     model_type: ModelType
+     config: dict[str, Any]
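
A brief sketch of how these response objects behave in practice; the values below are illustrative, not output from a real cluster. Because ModelStatus and ModelType subclass str, members compare equal to their string values, which keeps raw SLURM state strings and enum members interchangeable in comparisons.

    from vec_inf.client.models import ModelStatus, StatusResponse

    status = StatusResponse(
        model_name="Meta-Llama-3.1-8B-Instruct",
        server_status=ModelStatus.READY,
        job_state="RUNNING",
        raw_output="...",                  # hidden from repr by field(repr=False)
        base_url="http://gpu001:8080/v1",  # hypothetical endpoint
    )

    assert status.server_status == "READY"  # str-backed enum comparison
    if status.server_status == ModelStatus.READY:
        print(f"Model ready at {status.base_url}")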
vec_inf/client/slurm_vars.py ADDED
@@ -0,0 +1,49 @@
+ """Slurm cluster configuration variables."""
+
+ from pathlib import Path
+
+ from typing_extensions import Literal
+
+
+ CACHED_CONFIG = Path("/", "model-weights", "vec-inf-shared", "models_latest.yaml")
+ LD_LIBRARY_PATH = "/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/"
+ SINGULARITY_IMAGE = "/model-weights/vec-inf-shared/vector-inference_latest.sif"
+ SINGULARITY_LOAD_CMD = "module load singularity-ce/3.8.2"
+ VLLM_NCCL_SO_PATH = "/vec-inf/nccl/libnccl.so.2.18.1"
+ MAX_GPUS_PER_NODE = 8
+ MAX_NUM_NODES = 16
+ MAX_CPUS_PER_TASK = 128
+
+ QOS = Literal[
+     "normal",
+     "m",
+     "m2",
+     "m3",
+     "m4",
+     "m5",
+     "long",
+     "deadline",
+     "high",
+     "scavenger",
+     "llm",
+     "a100",
+ ]
+
+ PARTITION = Literal[
+     "a40",
+     "a100",
+     "t4v1",
+     "t4v2",
+     "rtx6000",
+ ]
+
+ DEFAULT_ARGS = {
+     "cpus_per_task": 16,
+     "mem_per_node": "64G",
+     "qos": "m2",
+     "time": "08:00:00",
+     "partition": "a40",
+     "data_type": "auto",
+     "log_dir": "~/.vec-inf-logs",
+     "model_weights_parent_dir": "/model-weights",
+ }
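
These constants anchor the rest of the client: ModelConfig (config.py above) takes its defaults from DEFAULT_ARGS and bounds-checks requests against the MAX_* limits. A minimal sketch of that layering, assuming dict-merge semantics for overrides and with the override values chosen purely for illustration:

    from vec_inf.client.slurm_vars import DEFAULT_ARGS, MAX_GPUS_PER_NODE

    requested_gpus = 8
    if requested_gpus > MAX_GPUS_PER_NODE:
        raise ValueError(f"At most {MAX_GPUS_PER_NODE} GPUs per node are supported")

    # Layer user overrides on top of the cluster defaults.
    params = {**DEFAULT_ARGS, "partition": "a100", "time": "04:00:00"}
    print(params["qos"], params["partition"], params["time"])  # m2 a100 04:00:00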