vec-inf 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vec_inf/README.md +18 -4
- vec_inf/cli/_cli.py +212 -30
- vec_inf/cli/_helper.py +95 -14
- vec_inf/client/_client_vars.py +19 -152
- vec_inf/client/_helper.py +386 -53
- vec_inf/client/_slurm_script_generator.py +210 -43
- vec_inf/client/_slurm_templates.py +248 -0
- vec_inf/client/_slurm_vars.py +82 -0
- vec_inf/client/_utils.py +190 -71
- vec_inf/client/api.py +96 -25
- vec_inf/client/config.py +46 -15
- vec_inf/client/models.py +51 -2
- vec_inf/config/README.md +4 -243
- vec_inf/config/environment.yaml +31 -0
- vec_inf/config/models.yaml +102 -281
- {vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/METADATA +25 -67
- vec_inf-0.7.0.dist-info/RECORD +27 -0
- vec_inf/client/slurm_vars.py +0 -49
- vec_inf-0.6.0.dist-info/RECORD +0 -25
- {vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/WHEEL +0 -0
- {vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/entry_points.txt +0 -0
- {vec_inf-0.6.0.dist-info → vec_inf-0.7.0.dist-info}/licenses/LICENSE +0 -0
vec_inf/client/config.py
CHANGED
@@ -5,18 +5,19 @@ configurations, including hardware requirements and model specifications.
 """
 
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import Any, Optional, Union
 
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import Literal
 
-from vec_inf.client.slurm_vars import (
+from vec_inf.client._slurm_vars import (
     DEFAULT_ARGS,
     MAX_CPUS_PER_TASK,
     MAX_GPUS_PER_NODE,
     MAX_NUM_NODES,
     PARTITION,
     QOS,
+    RESOURCE_TYPE,
 )
@@ -47,14 +48,18 @@ class ModelConfig(BaseModel):
         Memory allocation per node in GB format (e.g., '32G')
     vocab_size : int
         Size of the model's vocabulary (1-1,000,000)
-    account :
+    account : str, optional
         Charge resources used by this job to specified account.
+    work_dir : str, optional
+        Set working directory for the batch job
     qos : Union[QOS, str], optional
         Quality of Service tier for job scheduling
     time : str, optional
         Time limit for the job in HH:MM:SS format
     partition : Union[PARTITION, str], optional
-
+        Slurm partition for job scheduling
+    resource_type : Union[RESOURCE_TYPE, str], optional
+        Type of resource to request for the job
     venv : str, optional
         Virtual environment or container system to use
     log_dir : Path, optional
@@ -83,13 +88,13 @@
     )
     num_nodes: int = Field(..., gt=0, le=MAX_NUM_NODES, description="Number of nodes")
     cpus_per_task: int = Field(
-        default=
+        default=int(DEFAULT_ARGS["cpus_per_task"]),
         gt=0,
         le=MAX_CPUS_PER_TASK,
         description="CPUs per task",
     )
     mem_per_node: str = Field(
-        default=
+        default=DEFAULT_ARGS["mem_per_node"],
         pattern=r"^\d{1,4}G$",
         description="Memory per node",
     )
@@ -97,32 +102,58 @@
     account: Optional[str] = Field(
         default=None, description="Account name for job scheduling"
     )
-
-        default=
+    work_dir: Optional[str] = Field(
+        default=None, description="Working directory for the job"
+    )
+    qos: Optional[Union[QOS, str]] = Field(
+        default=DEFAULT_ARGS["qos"] if DEFAULT_ARGS["qos"] != "" else None,
+        description="Quality of Service tier",
     )
     time: str = Field(
-        default=
+        default=DEFAULT_ARGS["time"],
         pattern=r"^\d{2}:\d{2}:\d{2}$",
         description="HH:MM:SS time limit",
     )
-    partition: Union[PARTITION, str] = Field(
-        default=
+    partition: Optional[Union[PARTITION, str]] = Field(
+        default=DEFAULT_ARGS["partition"] if DEFAULT_ARGS["partition"] != "" else None,
+        description="GPU partition type",
+    )
+    resource_type: Optional[Union[RESOURCE_TYPE, str]] = Field(
+        default=DEFAULT_ARGS["resource_type"]
+        if DEFAULT_ARGS["resource_type"] != ""
+        else None,
+        description="Resource type",
+    )
+    exclude: Optional[str] = Field(
+        default=DEFAULT_ARGS["exclude"],
+        description="Exclude certain nodes from the resources granted to the job",
+    )
+    nodelist: Optional[str] = Field(
+        default=DEFAULT_ARGS["nodelist"],
+        description="Request a specific list of nodes for deployment",
+    )
+    bind: Optional[str] = Field(
+        default=DEFAULT_ARGS["bind"],
+        description="Additional binds for the container",
     )
     venv: str = Field(
-        default="
+        default=DEFAULT_ARGS["venv"],
+        description="Virtual environment/container system",
     )
     log_dir: Path = Field(
-        default=Path(
+        default=Path(DEFAULT_ARGS["log_dir"]),
         description="Log directory path",
     )
     model_weights_parent_dir: Path = Field(
-        default=Path(
+        default=Path(DEFAULT_ARGS["model_weights_parent_dir"]),
         description="Base directory for model weights",
     )
     vllm_args: Optional[dict[str, Any]] = Field(
         default={}, description="vLLM engine arguments"
     )
-
+    env: Optional[dict[str, Any]] = Field(
+        default={}, description="Environment variables to be set"
+    )
     model_config = ConfigDict(
         extra="forbid", str_strip_whitespace=True, validate_default=True, frozen=True
     )
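The recurring pattern in this file's changes, pulling a cluster-level default from `DEFAULT_ARGS` and normalizing an empty string to `None`, is easy to miss across the truncated lines. Below is a minimal runnable sketch of that pattern, not the package's actual code: `DEFAULT_ARGS` here is a stand-in for `vec_inf.client._slurm_vars.DEFAULT_ARGS` (whose real values come from `environment.yaml`), and the field set is trimmed to a few examples.

```python
# Sketch of the default-resolution pattern used by ModelConfig above.
# DEFAULT_ARGS is a stand-in for vec_inf.client._slurm_vars.DEFAULT_ARGS;
# the real mapping is populated from environment.yaml's default_args section.
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field

DEFAULT_ARGS = {"qos": "", "partition": "", "time": "08:00:00", "cpus_per_task": "16"}


class SlurmDefaults(BaseModel):
    # Empty-string defaults mean "no cluster-wide default", so they become None.
    qos: Optional[str] = Field(
        default=DEFAULT_ARGS["qos"] if DEFAULT_ARGS["qos"] != "" else None,
        description="Quality of Service tier",
    )
    partition: Optional[str] = Field(
        default=DEFAULT_ARGS["partition"] if DEFAULT_ARGS["partition"] != "" else None,
        description="GPU partition type",
    )
    # Non-empty defaults pass through and are validated like user input.
    time: str = Field(default=DEFAULT_ARGS["time"], pattern=r"^\d{2}:\d{2}:\d{2}$")
    cpus_per_task: int = Field(default=int(DEFAULT_ARGS["cpus_per_task"]), gt=0)

    model_config = ConfigDict(validate_default=True)


print(SlurmDefaults())
# qos=None partition=None time='08:00:00' cpus_per_task=16
```

Because the real class sets `validate_default=True`, the cluster-supplied defaults are checked against the same constraints (patterns, bounds) as user-supplied values.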
vec_inf/client/models.py
CHANGED
@@ -82,7 +82,7 @@ class LaunchResponse:
 
     Parameters
     ----------
-    slurm_job_id :
+    slurm_job_id : str
         ID of the launched SLURM job
     model_name : str
         Name of the launched model
@@ -92,12 +92,37 @@
         Raw output from the launch command (hidden from repr)
     """
 
-    slurm_job_id:
+    slurm_job_id: str
     model_name: str
     config: dict[str, Any]
     raw_output: str = field(repr=False)
 
 
+@dataclass
+class BatchLaunchResponse:
+    """Response from launching multiple models in batch mode.
+
+    Parameters
+    ----------
+    slurm_job_id : str
+        ID of the launched SLURM job
+    slurm_job_name : str
+        Name of the launched SLURM job
+    model_names : list[str]
+        Names of the launched models
+    config : dict[str, Any]
+        Configuration used for the launch
+    raw_output : str
+        Raw output from the launch command (hidden from repr)
+    """
+
+    slurm_job_id: str
+    slurm_job_name: str
+    model_names: list[str]
+    config: dict[str, Any]
+    raw_output: str = field(repr=False)
+
+
 @dataclass
 class StatusResponse:
     """Response from checking a model's status.
@@ -106,6 +131,8 @@ class StatusResponse:
     ----------
     model_name : str
         Name of the model
+    log_dir : str
+        Path to the SLURM log directory
     server_status : ModelStatus
         Current status of the server
     job_state : Union[str, ModelStatus]
@@ -121,6 +148,7 @@
     """
 
     model_name: str
+    log_dir: str
     server_status: ModelStatus
     job_state: Union[str, ModelStatus]
     raw_output: str = field(repr=False)
@@ -160,16 +188,26 @@ class LaunchOptions:
         Specific variant/version of the model
     partition : str, optional
         SLURM partition to use
+    resource_type : str, optional
+        Type of resource to request for the job
     num_nodes : int, optional
         Number of nodes to allocate
     gpus_per_node : int, optional
         Number of GPUs per node
     account : str, optional
         Account name for job scheduling
+    work_dir : str, optional
+        Set working directory for the batch job
     qos : str, optional
         Quality of Service level
     time : str, optional
         Time limit for the job
+    exclude : str, optional
+        Exclude certain nodes from the resources granted to the job
+    node_list : str, optional
+        Request a specific list of nodes for deployment
+    bind : str, optional
+        Additional binds for the container as a comma separated list of bind paths
     vocab_size : int, optional
         Size of model vocabulary
     data_type : str, optional
@@ -182,15 +220,24 @@
         Parent directory containing model weights
     vllm_args : str, optional
         Additional arguments for vLLM
+    env : str, optional
+        Environment variables to be set
+    config : str, optional
+        Path to custom model config yaml
     """
 
     model_family: Optional[str] = None
     model_variant: Optional[str] = None
    partition: Optional[str] = None
+    resource_type: Optional[str] = None
     num_nodes: Optional[int] = None
     gpus_per_node: Optional[int] = None
     account: Optional[str] = None
+    work_dir: Optional[str] = None
     qos: Optional[str] = None
+    exclude: Optional[str] = None
+    nodelist: Optional[str] = None
+    bind: Optional[str] = None
     time: Optional[str] = None
     vocab_size: Optional[int] = None
     data_type: Optional[str] = None
@@ -198,6 +245,8 @@
     log_dir: Optional[str] = None
     model_weights_parent_dir: Optional[str] = None
     vllm_args: Optional[str] = None
+    env: Optional[str] = None
+    config: Optional[str] = None
 
 
 @dataclass
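For context, here is a standalone sketch of the new `BatchLaunchResponse` in use. The dataclass body mirrors the diff; the constructor values are invented for illustration only.

```python
# Standalone sketch of the new BatchLaunchResponse; fields mirror the diff.
from dataclasses import dataclass, field
from typing import Any


@dataclass
class BatchLaunchResponse:
    slurm_job_id: str
    slurm_job_name: str
    model_names: list[str]
    config: dict[str, Any]
    raw_output: str = field(repr=False)  # verbose launch output, hidden from repr


resp = BatchLaunchResponse(
    slurm_job_id="12345",                 # illustrative job ID
    slurm_job_name="batch-example",       # hypothetical job name
    model_names=["model-a", "model-b"],   # hypothetical model names
    config={"num_nodes": 1},
    raw_output="Submitted batch job 12345",
)
print(resp)  # raw_output is omitted, matching LaunchResponse's behavior
```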
vec_inf/config/README.md
CHANGED
@@ -1,245 +1,6 @@
-#
-More profiling metrics coming soon!
+# Configs
 
-
+* [`environment.yaml`](environment.yaml): Configuration for the Slurm cluster environment, including image paths, resource availability, default values, etc.
+* [`models.yaml`](models.yaml): Configuration for launching model inference servers, including Slurm parameters as well as `vllm serve` arguments.
 
-
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus) | 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
-| [`c4ai-command-r-plus-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-| [`c4ai-command-r-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-### [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`CodeLlama-7b-hf`](https://huggingface.co/meta-llama/CodeLlama-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-7b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-7b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-13b-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-13b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-34b-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-34b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-Instruct-hf) | 2x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`gemma-2-9b`](https://huggingface.co/google/gemma-2-9b) | 1x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it) | 1x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
-
-| Variant | Suggested resource allocation |
-|:----------:|:----------:|
-| [`Llama-2-7b-hf`](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 1x a40 |
-| [`Llama-2-7b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1x a40 |
-| [`Llama-2-13b-hf`](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 1x a40 |
-| [`Llama-2-13b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 1x a40 |
-| [`Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
-| [`Llama-2-70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
-
-### [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Meta-Llama-3-8B`](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 1x a40 | 222 tokens/s | 1811 tokens/s |
-| [`Meta-Llama-3-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | 1x a40 | 371 tokens/s | 1990 tokens/s |
-| [`Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
-| [`Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |
-
-### [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Meta-Llama-3.1-8B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) | 1x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-70B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) | 4x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 32x a40 (8 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Llama-3.2-1B`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 1x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-1B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-3B`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 1x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-
-### [Mistral AI: Mistral](https://huggingface.co/mistralai)
-
-| Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1) | 1x a40 | - tokens/s | - tokens/s|
-| [`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) | 1x a40 | - tokens/s | - tokens/s|
-| [`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2) | 1x a40 | - tokens/s | - tokens/s|
-| [`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | 1x a40 | - tokens/s | - tokens/s|
-| [`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
-| [`Mistral-Large-Instruct-2411`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2411) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
-
-### [Mistral AI: Mixtral](https://huggingface.co/mistralai)
-
-| Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 4x a40 | 222 tokens/s | 1543 tokens/s |
-| [`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
-| [`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
-
-### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Nvidia: Llama-3.1-Nemotron](https://huggingface.co/collections/nvidia/llama-31-nemotron-70b-670e93cd366feea16abc13d8)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Llama-3.1-Nemotron-70B-Instruct-HF`](https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: Qwen2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-0.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-3B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-14B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-32B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: Qwen2.5-Coder](https://huggingface.co/collections/Qwen/qwen25-coder-66eaa22e6f99801bf65b0c2f)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-Coder-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: QwQ](https://huggingface.co/collections/Qwen/qwq-674762b79b75eac01735070a)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`QwQ-32B-Preview`](https://huggingface.co/Qwen/QwQ-32B-Preview) | 2x a40 | - tokens/s | - tokens/s |
-
-### [DeepSeek-R1: Distilled Models](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`DeepSeek-R1-Distill-Llama-8B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) | 1x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Llama-70B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) | 4x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-1.5B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) | 1x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-7B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | 1x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-14B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) | 2x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-32B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) | 4x a40 | - tokens/s | - tokens/s |
-
-
-## Vision Language Models
-
-### [allenai: Molmo](https://huggingface.co/collections/allenai/molmo-66f379e6fe3b8ef090a8ca19)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Molmo-7B-D-0924`](https://huggingface.co/allenai/Molmo-7B-D-0924) | 1x a40 | - tokens/s | - tokens/s |
-
-
-### [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
-
-### [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
-| [`Phi-3.5-vision-instruct`](https://huggingface.co/microsoft/Phi-3.5-vision-instruct) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Llama-3.2-11B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 2x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-11B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-90B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-| [`Llama-3.2-90B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-**NOTE**: `MllamaForConditionalGeneration` currently doesn't support pipeline parallelism; to save memory, the maximum number of requests is reduced and enforce-eager mode is on.
-
-### [Mistral: Pixtral](https://huggingface.co/mistralai)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Pixtral-12B-2409`](https://huggingface.co/mistralai/Pixtral-12B-2409) | 1x a40 | - tokens/s | - tokens/s |
-
-### [OpenGVLab: InternVL2.5](https://huggingface.co/collections/OpenGVLab/internvl25-673e1019b66e2218f68d7c1c)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`InternVL2_5-8B`](https://huggingface.co/OpenGVLab/InternVL2_5-8B) | 1x a40 | - tokens/s | - tokens/s |
-| [`InternVL2_5-26B`](https://huggingface.co/OpenGVLab/InternVL2_5-26B) | 2x a40 | - tokens/s | - tokens/s |
-| [`InternVL2_5-38B`](https://huggingface.co/OpenGVLab/InternVL2_5-38B) | 4x a40 | - tokens/s | - tokens/s |
-
-### [THUDM: GLM-4](https://huggingface.co/collections/THUDM/glm-4-665fcf188c414b03c2f7e3b7)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`glm-4v-9b`](https://huggingface.co/THUDM/glm-4v-9b) | 1x a40 | - tokens/s | - tokens/s |
-
-### [DeepSeek: DeepSeek-VL2](https://huggingface.co/collections/deepseek-ai/deepseek-vl2-675c22accc456d3beb4613ab)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`deepseek-vl2`](https://huggingface.co/deepseek-ai/deepseek-vl2) | 2x a40 | - tokens/s | - tokens/s |
-| [`deepseek-vl2-small`](https://huggingface.co/deepseek-ai/deepseek-vl2-small) | 1x a40 | - tokens/s | - tokens/s |
-
-
-## Text Embedding Models
-
-### [Liang Wang: e5](https://huggingface.co/intfloat)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`e5-mistral-7b-instruct`](https://huggingface.co/intfloat/e5-mistral-7b-instruct) | 1x a40 | - tokens/s | - tokens/s |
-
-### [BAAI: bge](https://huggingface.co/BAAI)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`bge-base-en-v1.5`](https://huggingface.co/BAAI/bge-base-en-v1.5) | 1x A40 | - tokens/s | - tokens/s |
-
-### [Sentence Transformers: MiniLM](https://huggingface.co/sentence-transformers)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | 1x A40 | - tokens/s | - tokens/s |
-
-
-
-## Reward Modeling Models
-
-### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-Math-RM-72B`](https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B) | 4x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-Math-PRM-7B`](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B) | 1x a40 | - tokens/s | - tokens/s |
+**NOTE**: These configs act as last-resort fallbacks in the `vec-inf` package; they will be updated to match the latest cached config on the Vector Killarney cluster with each new package version release.
vec_inf/config/environment.yaml
ADDED
@@ -0,0 +1,31 @@
+paths:
+  image_path: "/model-weights/vec-inf-shared/vector-inference_latest.sif"
+
+containerization:
+  module_load_cmd: "module load apptainer"
+  module_name: "apptainer"
+
+limits:
+  max_gpus_per_node: 8
+  max_num_nodes: 178
+  max_cpus_per_task: 64
+
+allowed_values:
+  qos: []
+  partition: []
+  resource_type: ["l40s", "h100"]
+
+default_args:
+  cpus_per_task: "16"
+  mem_per_node: "64G"
+  time: "08:00:00"
+  qos: ""
+  partition: ""
+  resource_type: ""
+  exclude: ""
+  nodelist: ""
+  bind: ""
+  venv: "apptainer"
+  data_type: "auto"
+  log_dir: "~/.vec-inf-logs"
+  model_weights_parent_dir: "/model-weights"
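A short sketch of how a client might consume this file, assuming PyYAML and the key layout shown above; the path, variable names, and the empty-string-means-unset convention follow the `config.py` diff earlier in this changeset, not the package's actual loader.

```python
# Sketch of reading environment.yaml; assumes PyYAML (pip install pyyaml)
# and the key layout shown in the diff above. The path is illustrative.
import yaml

with open("vec_inf/config/environment.yaml") as f:
    env = yaml.safe_load(f)

defaults = env["default_args"]
# Empty strings mean "no cluster-wide default"; normalize them to None,
# mirroring how ModelConfig treats qos/partition/resource_type.
qos = defaults["qos"] or None
partition = defaults["partition"] or None

print(env["limits"]["max_num_nodes"])          # 178
print(env["allowed_values"]["resource_type"])  # ['l40s', 'h100']
print(qos, partition)                          # None None
```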