vec_inf-0.6.1-py3-none-any.whl → vec_inf-0.7.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vec_inf/README.md +18 -4
- vec_inf/cli/_cli.py +191 -34
- vec_inf/cli/_helper.py +95 -14
- vec_inf/client/_client_vars.py +7 -165
- vec_inf/client/_helper.py +386 -40
- vec_inf/client/_slurm_script_generator.py +204 -36
- vec_inf/client/_slurm_templates.py +248 -0
- vec_inf/client/_slurm_vars.py +86 -0
- vec_inf/client/_utils.py +189 -70
- vec_inf/client/api.py +96 -25
- vec_inf/client/config.py +40 -19
- vec_inf/client/models.py +44 -4
- vec_inf/config/README.md +4 -243
- vec_inf/config/environment.yaml +35 -0
- vec_inf/config/models.yaml +102 -274
- {vec_inf-0.6.1.dist-info → vec_inf-0.7.1.dist-info}/METADATA +43 -73
- vec_inf-0.7.1.dist-info/RECORD +27 -0
- vec_inf/client/slurm_vars.py +0 -49
- vec_inf-0.6.1.dist-info/RECORD +0 -25
- {vec_inf-0.6.1.dist-info → vec_inf-0.7.1.dist-info}/WHEEL +0 -0
- {vec_inf-0.6.1.dist-info → vec_inf-0.7.1.dist-info}/entry_points.txt +0 -0
- {vec_inf-0.6.1.dist-info → vec_inf-0.7.1.dist-info}/licenses/LICENSE +0 -0
vec_inf/client/config.py
CHANGED

@@ -5,18 +5,19 @@ configurations, including hardware requirements and model specifications.
 """
 
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import Any, Optional, Union
 
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import Literal
 
-from vec_inf.client.slurm_vars import (
+from vec_inf.client._slurm_vars import (
     DEFAULT_ARGS,
     MAX_CPUS_PER_TASK,
     MAX_GPUS_PER_NODE,
     MAX_NUM_NODES,
     PARTITION,
     QOS,
+    RESOURCE_TYPE,
 )
 
 
@@ -47,14 +48,18 @@ class ModelConfig(BaseModel):
         Memory allocation per node in GB format (e.g., '32G')
     vocab_size : int
         Size of the model's vocabulary (1-1,000,000)
-    account :
+    account : str, optional
         Charge resources used by this job to specified account.
+    work_dir : str, optional
+        Set working directory for the batch job
     qos : Union[QOS, str], optional
         Quality of Service tier for job scheduling
     time : str, optional
         Time limit for the job in HH:MM:SS format
     partition : Union[PARTITION, str], optional
-
+        Slurm partition for job scheduling
+    resource_type : Union[RESOURCE_TYPE, str], optional
+        Type of resource to request for the job
     venv : str, optional
         Virtual environment or container system to use
     log_dir : Path, optional
@@ -83,13 +88,13 @@ class ModelConfig(BaseModel):
     )
     num_nodes: int = Field(..., gt=0, le=MAX_NUM_NODES, description="Number of nodes")
     cpus_per_task: int = Field(
-        default=
+        default=int(DEFAULT_ARGS["cpus_per_task"]),
         gt=0,
         le=MAX_CPUS_PER_TASK,
         description="CPUs per task",
     )
     mem_per_node: str = Field(
-        default=
+        default=DEFAULT_ARGS["mem_per_node"],
         pattern=r"^\d{1,4}G$",
         description="Memory per node",
     )
@@ -97,42 +102,58 @@ class ModelConfig(BaseModel):
     account: Optional[str] = Field(
         default=None, description="Account name for job scheduling"
     )
-
-        default=
+    work_dir: Optional[str] = Field(
+        default=None, description="Working directory for the job"
+    )
+    qos: Optional[Union[QOS, str]] = Field(
+        default=DEFAULT_ARGS["qos"] if DEFAULT_ARGS["qos"] != "" else None,
+        description="Quality of Service tier",
     )
     time: str = Field(
-        default=
+        default=DEFAULT_ARGS["time"],
         pattern=r"^\d{2}:\d{2}:\d{2}$",
         description="HH:MM:SS time limit",
     )
-    partition: Union[PARTITION, str] = Field(
-        default=
+    partition: Optional[Union[PARTITION, str]] = Field(
+        default=DEFAULT_ARGS["partition"] if DEFAULT_ARGS["partition"] != "" else None,
+        description="GPU partition type",
+    )
+    resource_type: Optional[Union[RESOURCE_TYPE, str]] = Field(
+        default=DEFAULT_ARGS["resource_type"]
+        if DEFAULT_ARGS["resource_type"] != ""
+        else None,
+        description="Resource type",
     )
     exclude: Optional[str] = Field(
-        default=
+        default=DEFAULT_ARGS["exclude"],
         description="Exclude certain nodes from the resources granted to the job",
     )
-
-        default=
+    nodelist: Optional[str] = Field(
+        default=DEFAULT_ARGS["nodelist"],
+        description="Request a specific list of nodes for deployment",
    )
     bind: Optional[str] = Field(
-        default=
+        default=DEFAULT_ARGS["bind"],
+        description="Additional binds for the container",
     )
     venv: str = Field(
-        default="
+        default=DEFAULT_ARGS["venv"],
+        description="Virtual environment/container system",
     )
     log_dir: Path = Field(
-        default=Path(
+        default=Path(DEFAULT_ARGS["log_dir"]),
         description="Log directory path",
     )
     model_weights_parent_dir: Path = Field(
-        default=Path(
+        default=Path(DEFAULT_ARGS["model_weights_parent_dir"]),
         description="Base directory for model weights",
     )
     vllm_args: Optional[dict[str, Any]] = Field(
         default={}, description="vLLM engine arguments"
     )
-
+    env: Optional[dict[str, Any]] = Field(
+        default={}, description="Environment variables to be set"
+    )
     model_config = ConfigDict(
         extra="forbid", str_strip_whitespace=True, validate_default=True, frozen=True
     )
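The recurring pattern in this file is that every hard-coded default from 0.6.1 is now looked up in `DEFAULT_ARGS` (populated from `environment.yaml`), with empty strings treated as "no default" and mapped to `None` for optional Slurm fields. Below is a minimal, self-contained sketch of that pattern, assuming pydantic v2; `SlurmJobConfig` is an invented name, and the sample `DEFAULT_ARGS` values mirror the `default_args` section of `environment.yaml` shown later in this diff:

```python
from typing import Optional

from pydantic import BaseModel, ConfigDict, Field

# Stand-in for vec_inf.client._slurm_vars.DEFAULT_ARGS; values mirror
# default_args in environment.yaml (qos/partition are intentionally empty).
DEFAULT_ARGS = {
    "cpus_per_task": "16",
    "mem_per_node": "64G",
    "time": "08:00:00",
    "qos": "",
    "partition": "",
}


class SlurmJobConfig(BaseModel):
    """Illustrative subset of ModelConfig's resource fields (not the real class)."""

    cpus_per_task: int = Field(default=int(DEFAULT_ARGS["cpus_per_task"]), gt=0)
    mem_per_node: str = Field(default=DEFAULT_ARGS["mem_per_node"], pattern=r"^\d{1,4}G$")
    time: str = Field(default=DEFAULT_ARGS["time"], pattern=r"^\d{2}:\d{2}:\d{2}$")
    # Empty-string cluster defaults become None, so "unset" survives validation.
    qos: Optional[str] = Field(
        default=DEFAULT_ARGS["qos"] if DEFAULT_ARGS["qos"] != "" else None
    )
    partition: Optional[str] = Field(
        default=DEFAULT_ARGS["partition"] if DEFAULT_ARGS["partition"] != "" else None
    )

    model_config = ConfigDict(extra="forbid", validate_default=True, frozen=True)


cfg = SlurmJobConfig()
assert cfg.cpus_per_task == 16 and cfg.qos is None
```

Because `validate_default=True`, a malformed cluster default (say, a bad `time` string) fails as soon as a config object is created, rather than at job submission.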
vec_inf/client/models.py
CHANGED

@@ -82,7 +82,7 @@ class LaunchResponse:
 
     Parameters
     ----------
-    slurm_job_id :
+    slurm_job_id : str
         ID of the launched SLURM job
     model_name : str
         Name of the launched model
@@ -92,12 +92,37 @@ class LaunchResponse:
         Raw output from the launch command (hidden from repr)
     """
 
-    slurm_job_id:
+    slurm_job_id: str
     model_name: str
     config: dict[str, Any]
     raw_output: str = field(repr=False)
 
 
+@dataclass
+class BatchLaunchResponse:
+    """Response from launching multiple models in batch mode.
+
+    Parameters
+    ----------
+    slurm_job_id : str
+        ID of the launched SLURM job
+    slurm_job_name : str
+        Name of the launched SLURM job
+    model_names : list[str]
+        Names of the launched models
+    config : dict[str, Any]
+        Configuration used for the launch
+    raw_output : str
+        Raw output from the launch command (hidden from repr)
+    """
+
+    slurm_job_id: str
+    slurm_job_name: str
+    model_names: list[str]
+    config: dict[str, Any]
+    raw_output: str = field(repr=False)
+
+
 @dataclass
 class StatusResponse:
     """Response from checking a model's status.
@@ -106,6 +131,8 @@ class StatusResponse:
     ----------
     model_name : str
         Name of the model
+    log_dir : str
+        Path to the SLURM log directory
     server_status : ModelStatus
         Current status of the server
     job_state : Union[str, ModelStatus]
@@ -121,6 +148,7 @@ class StatusResponse:
     """
 
     model_name: str
+    log_dir: str
     server_status: ModelStatus
     job_state: Union[str, ModelStatus]
     raw_output: str = field(repr=False)
@@ -160,12 +188,16 @@ class LaunchOptions:
         Specific variant/version of the model
     partition : str, optional
         SLURM partition to use
+    resource_type : str, optional
+        Type of resource to request for the job
     num_nodes : int, optional
         Number of nodes to allocate
     gpus_per_node : int, optional
         Number of GPUs per node
     account : str, optional
         Account name for job scheduling
+    work_dir : str, optional
+        Set working directory for the batch job
     qos : str, optional
         Quality of Service level
     time : str, optional
@@ -175,7 +207,7 @@ class LaunchOptions:
     node_list : str, optional
         Request a specific list of nodes for deployment
     bind : str, optional
-        Additional binds for the
+        Additional binds for the container as a comma separated list of bind paths
     vocab_size : int, optional
         Size of model vocabulary
     data_type : str, optional
@@ -188,17 +220,23 @@ class LaunchOptions:
         Parent directory containing model weights
     vllm_args : str, optional
         Additional arguments for vLLM
+    env : str, optional
+        Environment variables to be set
+    config : str, optional
+        Path to custom model config yaml
     """
 
     model_family: Optional[str] = None
     model_variant: Optional[str] = None
     partition: Optional[str] = None
+    resource_type: Optional[str] = None
     num_nodes: Optional[int] = None
     gpus_per_node: Optional[int] = None
     account: Optional[str] = None
+    work_dir: Optional[str] = None
     qos: Optional[str] = None
     exclude: Optional[str] = None
-
+    nodelist: Optional[str] = None
     bind: Optional[str] = None
     time: Optional[str] = None
     vocab_size: Optional[int] = None
@@ -207,6 +245,8 @@ class LaunchOptions:
     log_dir: Optional[str] = None
     model_weights_parent_dir: Optional[str] = None
     vllm_args: Optional[str] = None
+    env: Optional[str] = None
+    config: Optional[str] = None
 
 
 @dataclass
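The new `BatchLaunchResponse` mirrors `LaunchResponse` but describes one Slurm job hosting several models. A quick sketch of how the dataclass behaves as defined in the diff; the sample values below are invented for illustration:

```python
from dataclasses import dataclass, field
from typing import Any


@dataclass
class BatchLaunchResponse:
    """Response from launching multiple models in batch mode (fields as in the diff)."""

    slurm_job_id: str  # a string, matching LaunchResponse's changed type
    slurm_job_name: str
    model_names: list[str]
    config: dict[str, Any]
    raw_output: str = field(repr=False)  # bulky sbatch output, hidden from repr


# Invented sample values:
resp = BatchLaunchResponse(
    slurm_job_id="123456",
    slurm_job_name="BATCH-demo",
    model_names=["Meta-Llama-3.1-8B-Instruct", "Qwen2.5-7B-Instruct"],
    config={},
    raw_output="Submitted batch job 123456\n",
)
print(resp)  # raw_output is omitted from the repr because repr=False
```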
vec_inf/config/README.md
CHANGED

@@ -1,245 +1,6 @@
-#
-More profiling metrics coming soon!
+# Configs
 
-
+* [`environment.yaml`](environment.yaml): Configuration for the Slurm cluster environment, including image paths, resource availability limits, and default values.
+* [`models.yaml`](models.yaml): Configuration for launching model inference servers, including Slurm parameters as well as `vllm serve` arguments.
 
-
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus) | 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
-| [`c4ai-command-r-plus-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-| [`c4ai-command-r-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-### [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`CodeLlama-7b-hf`](https://huggingface.co/meta-llama/CodeLlama-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-7b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-7b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-13b-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-13b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-34b-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-34b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-Instruct-hf) | 2x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`gemma-2-9b`](https://huggingface.co/google/gemma-2-9b) | 1x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it) | 1x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
-
-| Variant | Suggested resource allocation |
-|:----------:|:----------:|
-| [`Llama-2-7b-hf`](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 1x a40 |
-| [`Llama-2-7b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1x a40 |
-| [`Llama-2-13b-hf`](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 1x a40 |
-| [`Llama-2-13b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 1x a40 |
-| [`Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
-| [`Llama-2-70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
-
-### [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Meta-Llama-3-8B`](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 1x a40 | 222 tokens/s | 1811 tokens/s |
-| [`Meta-Llama-3-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | 1x a40 | 371 tokens/s | 1990 tokens/s |
-| [`Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
-| [`Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |
-
-### [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Meta-Llama-3.1-8B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) | 1x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-70B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) | 4x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 32x a40 (8 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Llama-3.2-1B`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 1x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-1B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-3B`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 1x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-
-### [Mistral AI: Mistral](https://huggingface.co/mistralai)
-
-| Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-| [`Mistral-Large-Instruct-2411`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2411) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-### [Mistral AI: Mixtral](https://huggingface.co/mistralai)
-
-| Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 4x a40 | 222 tokens/s | 1543 tokens/s |
-| [`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s |
-| [`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s |
-
-### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Nvidia: Llama-3.1-Nemotron](https://huggingface.co/collections/nvidia/llama-31-nemotron-70b-670e93cd366feea16abc13d8)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Llama-3.1-Nemotron-70B-Instruct-HF`](https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: Qwen2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-0.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-3B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-14B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-32B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: Qwen2.5-Coder](https://huggingface.co/collections/Qwen/qwen25-coder-66eaa22e6f99801bf65b0c2f)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-Coder-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: QwQ](https://huggingface.co/collections/Qwen/qwq-674762b79b75eac01735070a)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`QwQ-32B-Preview`](https://huggingface.co/Qwen/QwQ-32B-Preview) | 2x a40 | - tokens/s | - tokens/s |
-
-### [DeepSeek-R1: Distilled Models](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`DeepSeek-R1-Distill-Llama-8B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) | 1x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Llama-70B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) | 4x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-1.5B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) | 1x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-7B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | 1x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-14B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) | 2x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-32B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) | 4x a40 | - tokens/s | - tokens/s |
-
-
-## Vision Language Models
-
-### [allenai: Molmo](https://huggingface.co/collections/allenai/molmo-66f379e6fe3b8ef090a8ca19)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Molmo-7B-D-0924`](https://huggingface.co/allenai/Molmo-7B-D-0924) | 1x a40 | - tokens/s | - tokens/s |
-
-
-### [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
-
-### [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
-| [`Phi-3.5-vision-instruct`](https://huggingface.co/microsoft/Phi-3.5-vision-instruct) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Llama-3.2-11B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 2x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-11B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-90B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-| [`Llama-3.2-90B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-**NOTE**: `MllamaForConditionalGeneration` currently doesn't support pipeline parallelism; to save memory, the maximum number of requests is reduced and enforce-eager mode is on.
-
-### [Mistral: Pixtral](https://huggingface.co/mistralai)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Pixtral-12B-2409`](https://huggingface.co/mistralai/Pixtral-12B-2409) | 1x a40 | - tokens/s | - tokens/s |
-
-### [OpenGVLab: InternVL2.5](https://huggingface.co/collections/OpenGVLab/internvl25-673e1019b66e2218f68d7c1c)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`InternVL2_5-8B`](https://huggingface.co/OpenGVLab/InternVL2_5-8B) | 1x a40 | - tokens/s | - tokens/s |
-| [`InternVL2_5-26B`](https://huggingface.co/OpenGVLab/InternVL2_5-26B) | 2x a40 | - tokens/s | - tokens/s |
-| [`InternVL2_5-38B`](https://huggingface.co/OpenGVLab/InternVL2_5-38B) | 4x a40 | - tokens/s | - tokens/s |
-
-### [THUDM: GLM-4](https://huggingface.co/collections/THUDM/glm-4-665fcf188c414b03c2f7e3b7)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`glm-4v-9b`](https://huggingface.co/THUDM/glm-4v-9b) | 1x a40 | - tokens/s | - tokens/s |
-
-### [DeepSeek: DeepSeek-VL2](https://huggingface.co/collections/deepseek-ai/deepseek-vl2-675c22accc456d3beb4613ab)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`deepseek-vl2`](https://huggingface.co/deepseek-ai/deepseek-vl2) | 2x a40 | - tokens/s | - tokens/s |
-| [`deepseek-vl2-small`](https://huggingface.co/deepseek-ai/deepseek-vl2-small) | 1x a40 | - tokens/s | - tokens/s |
-
-
-## Text Embedding Models
-
-### [Liang Wang: e5](https://huggingface.co/intfloat)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`e5-mistral-7b-instruct`](https://huggingface.co/intfloat/e5-mistral-7b-instruct) | 1x a40 | - tokens/s | - tokens/s |
-
-### [BAAI: bge](https://huggingface.co/BAAI)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`bge-base-en-v1.5`](https://huggingface.co/BAAI/bge-base-en-v1.5) | 1x A40 | - tokens/s | - tokens/s |
-
-### [Sentence Transformers: MiniLM](https://huggingface.co/sentence-transformers)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | 1x A40 | - tokens/s | - tokens/s |
-
-
-
-## Reward Modeling Models
-
-### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-Math-RM-72B`](https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B) | 4x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-Math-PRM-7B`](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B) | 1x a40 | - tokens/s | - tokens/s |
+**NOTE**: These configs act as last-resort fallbacks in the `vec-inf` package; they will be updated to match the latest cached config on the Vector Killarney cluster with each new package version release.
vec_inf/config/environment.yaml
ADDED

@@ -0,0 +1,35 @@
+paths:
+  image_path: "/model-weights/vec-inf-shared/vector-inference_latest.sif"
+
+containerization:
+  module_load_cmd: "module load apptainer"
+  module_name: "apptainer"
+
+limits:
+  max_gpus_per_node: 8
+  max_num_nodes: 178
+  max_cpus_per_task: 64
+
+allowed_values:
+  qos: []
+  partition: []
+  resource_type: ["l40s", "h100"]
+
+required_args:
+  account: "VEC_INF_ACCOUNT"
+  work_dir: "VEC_INF_WORK_DIR"
+
+default_args:
+  cpus_per_task: "16"
+  mem_per_node: "64G"
+  time: "08:00:00"
+  qos: ""
+  partition: ""
+  resource_type: ""
+  exclude: ""
+  nodelist: ""
+  bind: ""
+  venv: "apptainer"
+  data_type: "auto"
+  log_dir: "~/.vec-inf-logs"
+  model_weights_parent_dir: "/model-weights"