vec-inf 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
vec_inf/client/config.py CHANGED
@@ -5,18 +5,19 @@ configurations, including hardware requirements and model specifications.
 """
 
 from pathlib import Path
-from typing import Any, Optional, Union, cast
+from typing import Any, Optional, Union
 
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import Literal
 
-from vec_inf.client.slurm_vars import (
+from vec_inf.client._slurm_vars import (
     DEFAULT_ARGS,
     MAX_CPUS_PER_TASK,
     MAX_GPUS_PER_NODE,
     MAX_NUM_NODES,
     PARTITION,
     QOS,
+    RESOURCE_TYPE,
 )
 
 
@@ -47,14 +48,18 @@ class ModelConfig(BaseModel):
         Memory allocation per node in GB format (e.g., '32G')
     vocab_size : int
         Size of the model's vocabulary (1-1,000,000)
-    account : Optional[str], optional
+    account : str, optional
         Charge resources used by this job to specified account.
+    work_dir : str, optional
+        Set working directory for the batch job
     qos : Union[QOS, str], optional
         Quality of Service tier for job scheduling
     time : str, optional
         Time limit for the job in HH:MM:SS format
     partition : Union[PARTITION, str], optional
-        GPU partition type for job scheduling
+        Slurm partition for job scheduling
+    resource_type : Union[RESOURCE_TYPE, str], optional
+        Type of resource to request for the job
     venv : str, optional
         Virtual environment or container system to use
     log_dir : Path, optional
@@ -83,13 +88,13 @@ class ModelConfig(BaseModel):
     )
     num_nodes: int = Field(..., gt=0, le=MAX_NUM_NODES, description="Number of nodes")
     cpus_per_task: int = Field(
-        default=cast(int, DEFAULT_ARGS["cpus_per_task"]),
+        default=int(DEFAULT_ARGS["cpus_per_task"]),
         gt=0,
         le=MAX_CPUS_PER_TASK,
         description="CPUs per task",
     )
     mem_per_node: str = Field(
-        default=cast(str, DEFAULT_ARGS["mem_per_node"]),
+        default=DEFAULT_ARGS["mem_per_node"],
         pattern=r"^\d{1,4}G$",
         description="Memory per node",
     )
@@ -97,32 +102,58 @@ class ModelConfig(BaseModel):
     account: Optional[str] = Field(
         default=None, description="Account name for job scheduling"
     )
-    qos: Union[QOS, str] = Field(
-        default=cast(str, DEFAULT_ARGS["qos"]), description="Quality of Service tier"
+    work_dir: Optional[str] = Field(
+        default=None, description="Working directory for the job"
+    )
+    qos: Optional[Union[QOS, str]] = Field(
+        default=DEFAULT_ARGS["qos"] if DEFAULT_ARGS["qos"] != "" else None,
+        description="Quality of Service tier",
     )
     time: str = Field(
-        default=cast(str, DEFAULT_ARGS["time"]),
+        default=DEFAULT_ARGS["time"],
         pattern=r"^\d{2}:\d{2}:\d{2}$",
         description="HH:MM:SS time limit",
     )
-    partition: Union[PARTITION, str] = Field(
-        default=cast(str, DEFAULT_ARGS["partition"]), description="GPU partition type"
+    partition: Optional[Union[PARTITION, str]] = Field(
+        default=DEFAULT_ARGS["partition"] if DEFAULT_ARGS["partition"] != "" else None,
+        description="GPU partition type",
+    )
+    resource_type: Optional[Union[RESOURCE_TYPE, str]] = Field(
+        default=DEFAULT_ARGS["resource_type"]
+        if DEFAULT_ARGS["resource_type"] != ""
+        else None,
+        description="Resource type",
+    )
+    exclude: Optional[str] = Field(
+        default=DEFAULT_ARGS["exclude"],
+        description="Exclude certain nodes from the resources granted to the job",
+    )
+    nodelist: Optional[str] = Field(
+        default=DEFAULT_ARGS["nodelist"],
+        description="Request a specific list of nodes for deployment",
+    )
+    bind: Optional[str] = Field(
+        default=DEFAULT_ARGS["bind"],
+        description="Additional binds for the container",
     )
     venv: str = Field(
-        default="singularity", description="Virtual environment/container system"
+        default=DEFAULT_ARGS["venv"],
+        description="Virtual environment/container system",
     )
     log_dir: Path = Field(
-        default=Path(cast(str, DEFAULT_ARGS["log_dir"])),
+        default=Path(DEFAULT_ARGS["log_dir"]),
         description="Log directory path",
     )
     model_weights_parent_dir: Path = Field(
-        default=Path(cast(str, DEFAULT_ARGS["model_weights_parent_dir"])),
+        default=Path(DEFAULT_ARGS["model_weights_parent_dir"]),
         description="Base directory for model weights",
     )
     vllm_args: Optional[dict[str, Any]] = Field(
         default={}, description="vLLM engine arguments"
     )
-
+    env: Optional[dict[str, Any]] = Field(
+        default={}, description="Environment variables to be set"
+    )
     model_config = ConfigDict(
         extra="forbid", str_strip_whitespace=True, validate_default=True, frozen=True
     )
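
A quick sketch of the defaulting idiom the new optional Slurm fields share: cluster defaults arrive as strings in `DEFAULT_ARGS`, and an empty string means "not configured", which the new `Optional` fields map to `None`. The `DEFAULT_ARGS` contents below are illustrative assumptions, not the package's actual values:

```python
from typing import Optional

# Assumed shape for illustration; the real values come from the cluster
# environment config (see environment.yaml below).
DEFAULT_ARGS = {"qos": "", "partition": "gpu_partition", "resource_type": ""}


def default_or_none(key: str) -> Optional[str]:
    """Mirror the `DEFAULT_ARGS[key] if DEFAULT_ARGS[key] != "" else None` idiom."""
    value = DEFAULT_ARGS[key]
    return value if value != "" else None


assert default_or_none("qos") is None  # "" means unset -> None
assert default_or_none("partition") == "gpu_partition"
```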
vec_inf/client/models.py CHANGED
@@ -82,7 +82,7 @@ class LaunchResponse:
 
     Parameters
     ----------
-    slurm_job_id : int
+    slurm_job_id : str
         ID of the launched SLURM job
     model_name : str
         Name of the launched model
@@ -92,12 +92,37 @@ class LaunchResponse:
         Raw output from the launch command (hidden from repr)
     """
 
-    slurm_job_id: int
+    slurm_job_id: str
     model_name: str
     config: dict[str, Any]
     raw_output: str = field(repr=False)
 
 
+@dataclass
+class BatchLaunchResponse:
+    """Response from launching multiple models in batch mode.
+
+    Parameters
+    ----------
+    slurm_job_id : str
+        ID of the launched SLURM job
+    slurm_job_name : str
+        Name of the launched SLURM job
+    model_names : list[str]
+        Names of the launched models
+    config : dict[str, Any]
+        Configuration used for the launch
+    raw_output : str
+        Raw output from the launch command (hidden from repr)
+    """
+
+    slurm_job_id: str
+    slurm_job_name: str
+    model_names: list[str]
+    config: dict[str, Any]
+    raw_output: str = field(repr=False)
+
+
 @dataclass
 class StatusResponse:
     """Response from checking a model's status.
@@ -106,6 +131,8 @@ class StatusResponse:
     ----------
     model_name : str
         Name of the model
+    log_dir : str
+        Path to the SLURM log directory
     server_status : ModelStatus
         Current status of the server
     job_state : Union[str, ModelStatus]
@@ -121,6 +148,7 @@ class StatusResponse:
     """
 
     model_name: str
+    log_dir: str
     server_status: ModelStatus
     job_state: Union[str, ModelStatus]
     raw_output: str = field(repr=False)
@@ -160,16 +188,26 @@ class LaunchOptions:
         Specific variant/version of the model
     partition : str, optional
         SLURM partition to use
+    resource_type : str, optional
+        Type of resource to request for the job
     num_nodes : int, optional
         Number of nodes to allocate
     gpus_per_node : int, optional
         Number of GPUs per node
     account : str, optional
         Account name for job scheduling
+    work_dir : str, optional
+        Set working directory for the batch job
     qos : str, optional
         Quality of Service level
     time : str, optional
         Time limit for the job
+    exclude : str, optional
+        Exclude certain nodes from the resources granted to the job
+    nodelist : str, optional
+        Request a specific list of nodes for deployment
+    bind : str, optional
+        Additional binds for the container as a comma-separated list of bind paths
     vocab_size : int, optional
         Size of model vocabulary
     data_type : str, optional
@@ -182,15 +220,24 @@ class LaunchOptions:
         Parent directory containing model weights
     vllm_args : str, optional
         Additional arguments for vLLM
+    env : str, optional
+        Environment variables to be set
+    config : str, optional
+        Path to custom model config yaml
     """
 
     model_family: Optional[str] = None
     model_variant: Optional[str] = None
    partition: Optional[str] = None
+    resource_type: Optional[str] = None
    num_nodes: Optional[int] = None
    gpus_per_node: Optional[int] = None
    account: Optional[str] = None
+    work_dir: Optional[str] = None
    qos: Optional[str] = None
+    exclude: Optional[str] = None
+    nodelist: Optional[str] = None
+    bind: Optional[str] = None
    time: Optional[str] = None
    vocab_size: Optional[int] = None
    data_type: Optional[str] = None
@@ -198,6 +245,8 @@ class LaunchOptions:
     log_dir: Optional[str] = None
     model_weights_parent_dir: Optional[str] = None
     vllm_args: Optional[str] = None
+    env: Optional[str] = None
+    config: Optional[str] = None
 
 
 @dataclass
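
For orientation, a minimal sketch constructing the new `BatchLaunchResponse`; the field names and import path follow the diff above, while the values (including the job-name scheme) are placeholder assumptions:

```python
from vec_inf.client.models import BatchLaunchResponse

# Placeholder values only; a real instance would come from a batch launch call.
response = BatchLaunchResponse(
    slurm_job_id="12345678",             # job IDs are now strings, not ints
    slurm_job_name="batch-job",          # assumed name, for illustration only
    model_names=["model-a", "model-b"],  # hypothetical model names
    config={},
    raw_output="...",
)

print(response)  # raw_output is omitted from the repr via field(repr=False)
```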
vec_inf/config/README.md CHANGED
@@ -1,245 +1,6 @@
-# Available Models
-More profiling metrics coming soon!
+# Configs
 
-## Text Generation Models
+* [`environment.yaml`](environment.yaml): Configuration for the Slurm cluster environment, including image paths, resource availability, default values, etc.
+* [`models.yaml`](models.yaml): Configuration for launching model inference servers, including Slurm parameters as well as `vllm serve` arguments.
 
-### [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus) | 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
-| [`c4ai-command-r-plus-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-| [`c4ai-command-r-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-### [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`CodeLlama-7b-hf`](https://huggingface.co/meta-llama/CodeLlama-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-7b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-7b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-13b-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-13b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-34b-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-34b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-Instruct-hf) | 2x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`gemma-2-9b`](https://huggingface.co/google/gemma-2-9b) | 1x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it) | 1x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
-
-| Variant | Suggested resource allocation |
-|:----------:|:----------:|
-| [`Llama-2-7b-hf`](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 1x a40 |
-| [`Llama-2-7b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1x a40 |
-| [`Llama-2-13b-hf`](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 1x a40 |
-| [`Llama-2-13b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 1x a40 |
-| [`Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
-| [`Llama-2-70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
-
-### [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Meta-Llama-3-8B`](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 1x a40 | 222 tokens/s | 1811 tokens/s |
-| [`Meta-Llama-3-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | 1x a40 | 371 tokens/s | 1990 tokens/s |
-| [`Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
-| [`Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |
-
-### [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Meta-Llama-3.1-8B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) | 1x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-70B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) | 4x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 32x a40 (8 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Llama-3.2-1B`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 1x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-1B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-3B`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 1x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-
-### [Mistral AI: Mistral](https://huggingface.co/mistralai)
-
-| Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-| [`Mistral-Large-Instruct-2411`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2411) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-### [Mistral AI: Mixtral](https://huggingface.co/mistralai)
-
-| Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 4x a40 | 222 tokens/s | 1543 tokens/s |
-| [`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s |
-| [`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s |
-
-### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Nvidia: Llama-3.1-Nemotron](https://huggingface.co/collections/nvidia/llama-31-nemotron-70b-670e93cd366feea16abc13d8)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Llama-3.1-Nemotron-70B-Instruct-HF`](https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: Qwen2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-0.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-3B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-14B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-32B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: Qwen2.5-Coder](https://huggingface.co/collections/Qwen/qwen25-coder-66eaa22e6f99801bf65b0c2f)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-Coder-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: QwQ](https://huggingface.co/collections/Qwen/qwq-674762b79b75eac01735070a)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`QwQ-32B-Preview`](https://huggingface.co/Qwen/QwQ-32B-Preview) | 2x a40 | - tokens/s | - tokens/s |
-
-### [DeepSeek-R1: Distilled Models](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`DeepSeek-R1-Distill-Llama-8B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) | 1x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Llama-70B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) | 4x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-1.5B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) | 1x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-7B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | 1x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-14B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) | 2x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-32B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) | 4x a40 | - tokens/s | - tokens/s |
-
-
-## Vision Language Models
-
-### [allenai: Molmo](https://huggingface.co/collections/allenai/molmo-66f379e6fe3b8ef090a8ca19)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Molmo-7B-D-0924`](https://huggingface.co/allenai/Molmo-7B-D-0924) | 1x a40 | - tokens/s | - tokens/s |
-
-
-### [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
-
-### [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
-| [`Phi-3.5-vision-instruct`](https://huggingface.co/microsoft/Phi-3.5-vision-instruct) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Llama-3.2-11B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 2x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-11B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-90B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-| [`Llama-3.2-90B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-**NOTE**: `MllamaForConditionalGeneration` currently doesn't support pipeline parallelsim, to save memory, maximum number of requests is reduced and enforce eager mode is on.
-
-### [Mistral: Pixtral](https://huggingface.co/mistralai)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Pixtral-12B-2409`](https://huggingface.co/mistralai/Pixtral-12B-2409) | 1x a40 | - tokens/s | - tokens/s |
-
-### [OpenGVLab: InternVL2.5](https://huggingface.co/collections/OpenGVLab/internvl25-673e1019b66e2218f68d7c1c)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`InternVL2_5-8B`](https://huggingface.co/OpenGVLab/InternVL2_5-8B) | 1x a40 | - tokens/s | - tokens/s |
-| [`InternVL2_5-26B`](https://huggingface.co/OpenGVLab/InternVL2_5-26B) | 2x a40 | - tokens/s | - tokens/s |
-| [`InternVL2_5-38B`](https://huggingface.co/OpenGVLab/InternVL2_5-38B) | 4x a40 | - tokens/s | - tokens/s |
-
-### [THUDM: GLM-4](https://huggingface.co/collections/THUDM/glm-4-665fcf188c414b03c2f7e3b7)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`glm-4v-9b`](https://huggingface.co/THUDM/glm-4v-9b) | 1x a40 | - tokens/s | - tokens/s |
-
-### [DeepSeek: DeepSeek-VL2](https://huggingface.co/collections/deepseek-ai/deepseek-vl2-675c22accc456d3beb4613ab)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`deepseek-vl2`](https://huggingface.co/deepseek-ai/deepseek-vl2) | 2x a40 | - tokens/s | - tokens/s |
-| [`deepseek-vl2-small`](https://huggingface.co/deepseek-ai/deepseek-vl2-small) | 1x a40 | - tokens/s | - tokens/s |
-
-
-## Text Embedding Models
-
-### [Liang Wang: e5](https://huggingface.co/intfloat)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`e5-mistral-7b-instruct`](https://huggingface.co/intfloat/e5-mistral-7b-instruct) | 1x a40 | - tokens/s | - tokens/s |
-
-### [BAAI: bge](https://huggingface.co/BAAI)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`bge-base-en-v1.5`](https://huggingface.co/BAAI/bge-base-en-v1.5) | 1x A40 | - tokens/s | - tokens/s |
-
-### [Sentence Transformers: MiniLM](https://huggingface.co/sentence-transformers)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | 1x A40 | - tokens/s | - tokens/s |
-
-
-
-## Reward Modeling Models
-
-### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-Math-RM-72B`](https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B) | 4x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-Math-PRM-7B`](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B) | 1x a40 | - tokens/s | - tokens/s |
+**NOTE**: These configs act as last-resort fallbacks in the `vec-inf` package; they will be updated to match the latest cached config on the Vector Killarney cluster with each new package version release.
vec_inf/config/environment.yaml ADDED
@@ -0,0 +1,31 @@
+paths:
+  image_path: "/model-weights/vec-inf-shared/vector-inference_latest.sif"
+
+containerization:
+  module_load_cmd: "module load apptainer"
+  module_name: "apptainer"
+
+limits:
+  max_gpus_per_node: 8
+  max_num_nodes: 178
+  max_cpus_per_task: 64
+
+allowed_values:
+  qos: []
+  partition: []
+  resource_type: ["l40s", "h100"]
+
+default_args:
+  cpus_per_task: "16"
+  mem_per_node: "64G"
+  time: "08:00:00"
+  qos: ""
+  partition: ""
+  resource_type: ""
+  exclude: ""
+  nodelist: ""
+  bind: ""
+  venv: "apptainer"
+  data_type: "auto"
+  log_dir: "~/.vec-inf-logs"
+  model_weights_parent_dir: "/model-weights"
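
As a sanity check, a short sketch of reading this fallback config with PyYAML; the relative path is an assumption based on the package layout shown in this diff, and an installed package would resolve it differently:

```python
import yaml  # PyYAML

# Load the fallback environment config shipped with the package.
with open("vec_inf/config/environment.yaml") as f:
    env_cfg = yaml.safe_load(f)

assert env_cfg["limits"]["max_gpus_per_node"] == 8
assert env_cfg["allowed_values"]["resource_type"] == ["l40s", "h100"]

# Empty-string defaults mean "not configured"; config.py maps them to None.
qos_default = env_cfg["default_args"]["qos"] or None
print(qos_default)  # -> None
```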