vec-inf 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vec_inf/README.md +3 -3
- vec_inf/cli/_cli.py +239 -104
- vec_inf/cli/_helper.py +289 -564
- vec_inf/cli/_utils.py +26 -150
- vec_inf/cli/_vars.py +32 -0
- vec_inf/client/__init__.py +31 -0
- vec_inf/client/_client_vars.py +231 -0
- vec_inf/client/_exceptions.py +37 -0
- vec_inf/client/_helper.py +661 -0
- vec_inf/client/_slurm_script_generator.py +178 -0
- vec_inf/client/_utils.py +287 -0
- vec_inf/client/api.py +302 -0
- vec_inf/client/config.py +138 -0
- vec_inf/client/models.py +234 -0
- vec_inf/client/slurm_vars.py +49 -0
- vec_inf/config/README.md +0 -12
- vec_inf/config/models.yaml +410 -391
- {vec_inf-0.5.0.dist-info → vec_inf-0.6.1.dist-info}/METADATA +52 -63
- vec_inf-0.6.1.dist-info/RECORD +25 -0
- vec_inf/cli/_config.py +0 -87
- vec_inf/multinode_vllm.slurm +0 -154
- vec_inf/vllm.slurm +0 -90
- vec_inf-0.5.0.dist-info/RECORD +0 -17
- {vec_inf-0.5.0.dist-info → vec_inf-0.6.1.dist-info}/WHEEL +0 -0
- {vec_inf-0.5.0.dist-info → vec_inf-0.6.1.dist-info}/entry_points.txt +0 -0
- {vec_inf-0.5.0.dist-info → vec_inf-0.6.1.dist-info}/licenses/LICENSE +0 -0
vec_inf/config/models.yaml
CHANGED
|
@@ -6,13 +6,14 @@ models:
|
|
|
6
6
|
gpus_per_node: 4
|
|
7
7
|
num_nodes: 2
|
|
8
8
|
vocab_size: 256000
|
|
9
|
-
max_model_len: 8192
|
|
10
|
-
max_num_seqs: 256
|
|
11
|
-
pipeline_parallelism: true
|
|
12
|
-
enforce_eager: false
|
|
13
9
|
qos: m2
|
|
14
10
|
time: 08:00:00
|
|
15
11
|
partition: a40
|
|
12
|
+
vllm_args:
|
|
13
|
+
--pipeline-parallel-size: 2
|
|
14
|
+
--tensor-parallel-size: 4
|
|
15
|
+
--max-model-len: 8192
|
|
16
|
+
--max-num-seqs: 256
|
|
16
17
|
c4ai-command-r-plus-08-2024:
|
|
17
18
|
model_family: c4ai-command-r
|
|
18
19
|
model_variant: plus-08-2024
|
|
@@ -20,13 +21,14 @@ models:
|
|
|
20
21
|
gpus_per_node: 4
|
|
21
22
|
num_nodes: 2
|
|
22
23
|
vocab_size: 256000
|
|
23
|
-
max_model_len: 65536
|
|
24
|
-
max_num_seqs: 256
|
|
25
|
-
pipeline_parallelism: true
|
|
26
|
-
enforce_eager: false
|
|
27
24
|
qos: m2
|
|
28
25
|
time: 08:00:00
|
|
29
26
|
partition: a40
|
|
27
|
+
vllm_args:
|
|
28
|
+
--pipeline-parallel-size: 2
|
|
29
|
+
--tensor-parallel-size: 4
|
|
30
|
+
--max-model-len: 65536
|
|
31
|
+
--max-num-seqs: 256
|
|
30
32
|
c4ai-command-r-08-2024:
|
|
31
33
|
model_family: c4ai-command-r
|
|
32
34
|
model_variant: 08-2024
|
|
@@ -34,13 +36,14 @@ models:
|
|
|
34
36
|
gpus_per_node: 2
|
|
35
37
|
num_nodes: 1
|
|
36
38
|
vocab_size: 256000
|
|
37
|
-
max_model_len: 32768
|
|
38
|
-
max_num_seqs: 256
|
|
39
|
-
pipeline_parallelism: true
|
|
40
|
-
enforce_eager: false
|
|
41
39
|
qos: m2
|
|
42
40
|
time: 08:00:00
|
|
43
41
|
partition: a40
|
|
42
|
+
vllm_args:
|
|
43
|
+
--tensor-parallel-size: 2
|
|
44
|
+
--max-model-len: 32768
|
|
45
|
+
--max-num-seqs: 256
|
|
46
|
+
--compilation-config: 3
|
|
44
47
|
CodeLlama-7b-hf:
|
|
45
48
|
model_family: CodeLlama
|
|
46
49
|
model_variant: 7b-hf
|
|
@@ -48,13 +51,13 @@ models:
|
|
|
48
51
|
gpus_per_node: 1
|
|
49
52
|
num_nodes: 1
|
|
50
53
|
vocab_size: 32000
|
|
51
|
-
max_model_len: 16384
|
|
52
|
-
max_num_seqs: 256
|
|
53
|
-
pipeline_parallelism: true
|
|
54
|
-
enforce_eager: false
|
|
55
54
|
qos: m2
|
|
56
55
|
time: 08:00:00
|
|
57
56
|
partition: a40
|
|
57
|
+
vllm_args:
|
|
58
|
+
--max-model-len: 16384
|
|
59
|
+
--max-num-seqs: 256
|
|
60
|
+
--compilation-config: 3
|
|
58
61
|
CodeLlama-7b-Instruct-hf:
|
|
59
62
|
model_family: CodeLlama
|
|
60
63
|
model_variant: 7b-Instruct-hf
|
|
@@ -62,13 +65,13 @@ models:
|
|
|
62
65
|
gpus_per_node: 1
|
|
63
66
|
num_nodes: 1
|
|
64
67
|
vocab_size: 32000
|
|
65
|
-
max_model_len: 16384
|
|
66
|
-
max_num_seqs: 256
|
|
67
|
-
pipeline_parallelism: true
|
|
68
|
-
enforce_eager: false
|
|
69
68
|
qos: m2
|
|
70
69
|
time: 08:00:00
|
|
71
70
|
partition: a40
|
|
71
|
+
vllm_args:
|
|
72
|
+
--max-model-len: 16384
|
|
73
|
+
--max-num-seqs: 256
|
|
74
|
+
--compilation-config: 3
|
|
72
75
|
CodeLlama-13b-hf:
|
|
73
76
|
model_family: CodeLlama
|
|
74
77
|
model_variant: 13b-hf
|
|
@@ -76,13 +79,13 @@ models:
|
|
|
76
79
|
gpus_per_node: 1
|
|
77
80
|
num_nodes: 1
|
|
78
81
|
vocab_size: 32000
|
|
79
|
-
max_model_len: 16384
|
|
80
|
-
max_num_seqs: 256
|
|
81
|
-
pipeline_parallelism: true
|
|
82
|
-
enforce_eager: false
|
|
83
82
|
qos: m2
|
|
84
83
|
time: 08:00:00
|
|
85
84
|
partition: a40
|
|
85
|
+
vllm_args:
|
|
86
|
+
--max-model-len: 16384
|
|
87
|
+
--max-num-seqs: 256
|
|
88
|
+
--compilation-config: 3
|
|
86
89
|
CodeLlama-13b-Instruct-hf:
|
|
87
90
|
model_family: CodeLlama
|
|
88
91
|
model_variant: 13b-Instruct-hf
|
|
@@ -90,13 +93,13 @@ models:
|
|
|
90
93
|
gpus_per_node: 1
|
|
91
94
|
num_nodes: 1
|
|
92
95
|
vocab_size: 32000
|
|
93
|
-
max_model_len: 16384
|
|
94
|
-
max_num_seqs: 256
|
|
95
|
-
pipeline_parallelism: true
|
|
96
|
-
enforce_eager: false
|
|
97
96
|
qos: m2
|
|
98
97
|
time: 08:00:00
|
|
99
98
|
partition: a40
|
|
99
|
+
vllm_args:
|
|
100
|
+
--max-model-len: 16384
|
|
101
|
+
--max-num-seqs: 256
|
|
102
|
+
--compilation-config: 3
|
|
100
103
|
CodeLlama-34b-hf:
|
|
101
104
|
model_family: CodeLlama
|
|
102
105
|
model_variant: 34b-hf
|
|
@@ -104,13 +107,14 @@ models:
|
|
|
104
107
|
gpus_per_node: 2
|
|
105
108
|
num_nodes: 1
|
|
106
109
|
vocab_size: 32000
|
|
107
|
-
max_model_len: 16384
|
|
108
|
-
max_num_seqs: 256
|
|
109
|
-
pipeline_parallelism: true
|
|
110
|
-
enforce_eager: false
|
|
111
110
|
qos: m2
|
|
112
111
|
time: 08:00:00
|
|
113
112
|
partition: a40
|
|
113
|
+
vllm_args:
|
|
114
|
+
--tensor-parallel-size: 2
|
|
115
|
+
--max-model-len: 16384
|
|
116
|
+
--max-num-seqs: 256
|
|
117
|
+
--compilation-config: 3
|
|
114
118
|
CodeLlama-34b-Instruct-hf:
|
|
115
119
|
model_family: CodeLlama
|
|
116
120
|
model_variant: 34b-Instruct-hf
|
|
@@ -118,55 +122,44 @@ models:
|
|
|
118
122
|
gpus_per_node: 2
|
|
119
123
|
num_nodes: 1
|
|
120
124
|
vocab_size: 32000
|
|
121
|
-
max_model_len: 16384
|
|
122
|
-
max_num_seqs: 256
|
|
123
|
-
pipeline_parallelism: true
|
|
124
|
-
enforce_eager: false
|
|
125
125
|
qos: m2
|
|
126
126
|
time: 08:00:00
|
|
127
127
|
partition: a40
|
|
128
|
+
vllm_args:
|
|
129
|
+
--tensor-parallel-size: 2
|
|
130
|
+
--max-model-len: 16384
|
|
131
|
+
--max-num-seqs: 256
|
|
132
|
+
--compilation-config: 3
|
|
128
133
|
CodeLlama-70b-hf:
|
|
129
134
|
model_family: CodeLlama
|
|
130
135
|
model_variant: 70b-hf
|
|
131
136
|
model_type: LLM
|
|
132
137
|
gpus_per_node: 4
|
|
133
138
|
num_nodes: 1
|
|
134
|
-
vocab_size:
|
|
135
|
-
max_model_len: 4096
|
|
136
|
-
max_num_seqs: 256
|
|
137
|
-
pipeline_parallelism: true
|
|
138
|
-
enforce_eager: false
|
|
139
|
+
vocab_size: 32016
|
|
139
140
|
qos: m2
|
|
140
141
|
time: 08:00:00
|
|
141
142
|
partition: a40
|
|
143
|
+
vllm_args:
|
|
144
|
+
--tensor-parallel-size: 4
|
|
145
|
+
--max-model-len: 4096
|
|
146
|
+
--max-num-seqs: 256
|
|
147
|
+
--compilation-config: 3
|
|
142
148
|
CodeLlama-70b-Instruct-hf:
|
|
143
149
|
model_family: CodeLlama
|
|
144
150
|
model_variant: 70b-Instruct-hf
|
|
145
151
|
model_type: LLM
|
|
146
152
|
gpus_per_node: 4
|
|
147
153
|
num_nodes: 1
|
|
148
|
-
vocab_size:
|
|
149
|
-
max_model_len: 4096
|
|
150
|
-
max_num_seqs: 256
|
|
151
|
-
pipeline_parallelism: true
|
|
152
|
-
enforce_eager: false
|
|
153
|
-
qos: m2
|
|
154
|
-
time: 08:00:00
|
|
155
|
-
partition: a40
|
|
156
|
-
dbrx-instruct:
|
|
157
|
-
model_family: dbrx
|
|
158
|
-
model_variant: instruct
|
|
159
|
-
model_type: LLM
|
|
160
|
-
gpus_per_node: 4
|
|
161
|
-
num_nodes: 2
|
|
162
|
-
vocab_size: 100352
|
|
163
|
-
max_model_len: 32000
|
|
164
|
-
max_num_seqs: 256
|
|
165
|
-
pipeline_parallelism: true
|
|
166
|
-
enforce_eager: false
|
|
154
|
+
vocab_size: 32016
|
|
167
155
|
qos: m2
|
|
168
156
|
time: 08:00:00
|
|
169
157
|
partition: a40
|
|
158
|
+
vllm_args:
|
|
159
|
+
--tensor-parallel-size: 4
|
|
160
|
+
--max-model-len: 4096
|
|
161
|
+
--max-num-seqs: 256
|
|
162
|
+
--compilation-config: 3
|
|
170
163
|
gemma-2-9b:
|
|
171
164
|
model_family: gemma-2
|
|
172
165
|
model_variant: 9b
|
|
@@ -174,13 +167,13 @@ models:
|
|
|
174
167
|
gpus_per_node: 1
|
|
175
168
|
num_nodes: 1
|
|
176
169
|
vocab_size: 256000
|
|
177
|
-
max_model_len: 4096
|
|
178
|
-
max_num_seqs: 256
|
|
179
|
-
pipeline_parallelism: true
|
|
180
|
-
enforce_eager: false
|
|
181
170
|
qos: m2
|
|
182
171
|
time: 08:00:00
|
|
183
172
|
partition: a40
|
|
173
|
+
vllm_args:
|
|
174
|
+
--max-model-len: 4096
|
|
175
|
+
--max-num-seqs: 256
|
|
176
|
+
--compilation-config: 3
|
|
184
177
|
gemma-2-9b-it:
|
|
185
178
|
model_family: gemma-2
|
|
186
179
|
model_variant: 9b-it
|
|
@@ -188,13 +181,13 @@ models:
|
|
|
188
181
|
gpus_per_node: 1
|
|
189
182
|
num_nodes: 1
|
|
190
183
|
vocab_size: 256000
|
|
191
|
-
max_model_len: 4096
|
|
192
|
-
max_num_seqs: 256
|
|
193
|
-
pipeline_parallelism: true
|
|
194
|
-
enforce_eager: false
|
|
195
184
|
qos: m2
|
|
196
185
|
time: 08:00:00
|
|
197
186
|
partition: a40
|
|
187
|
+
vllm_args:
|
|
188
|
+
--max-model-len: 4096
|
|
189
|
+
--max-num-seqs: 256
|
|
190
|
+
--compilation-config: 3
|
|
198
191
|
gemma-2-27b:
|
|
199
192
|
model_family: gemma-2
|
|
200
193
|
model_variant: 27b
|
|
@@ -202,13 +195,14 @@ models:
|
|
|
202
195
|
gpus_per_node: 2
|
|
203
196
|
num_nodes: 1
|
|
204
197
|
vocab_size: 256000
|
|
205
|
-
max_model_len: 4096
|
|
206
|
-
max_num_seqs: 256
|
|
207
|
-
pipeline_parallelism: true
|
|
208
|
-
enforce_eager: false
|
|
209
198
|
qos: m2
|
|
210
199
|
time: 08:00:00
|
|
211
200
|
partition: a40
|
|
201
|
+
vllm_args:
|
|
202
|
+
--tensor-parallel-size: 2
|
|
203
|
+
--max-model-len: 4096
|
|
204
|
+
--max-num-seqs: 256
|
|
205
|
+
--compilation-config: 3
|
|
212
206
|
gemma-2-27b-it:
|
|
213
207
|
model_family: gemma-2
|
|
214
208
|
model_variant: 27b-it
|
|
@@ -216,13 +210,14 @@ models:
|
|
|
216
210
|
gpus_per_node: 2
|
|
217
211
|
num_nodes: 1
|
|
218
212
|
vocab_size: 256000
|
|
219
|
-
max_model_len: 4096
|
|
220
|
-
max_num_seqs: 256
|
|
221
|
-
pipeline_parallelism: true
|
|
222
|
-
enforce_eager: false
|
|
223
213
|
qos: m2
|
|
224
214
|
time: 08:00:00
|
|
225
215
|
partition: a40
|
|
216
|
+
vllm_args:
|
|
217
|
+
--tensor-parallel-size: 2
|
|
218
|
+
--max-model-len: 4096
|
|
219
|
+
--max-num-seqs: 256
|
|
220
|
+
--compilation-config: 3
|
|
226
221
|
Llama-2-7b-hf:
|
|
227
222
|
model_family: Llama-2
|
|
228
223
|
model_variant: 7b-hf
|
|
@@ -230,13 +225,13 @@ models:
|
|
|
230
225
|
gpus_per_node: 1
|
|
231
226
|
num_nodes: 1
|
|
232
227
|
vocab_size: 32000
|
|
233
|
-
max_model_len: 4096
|
|
234
|
-
max_num_seqs: 256
|
|
235
|
-
pipeline_parallelism: true
|
|
236
|
-
enforce_eager: false
|
|
237
228
|
qos: m2
|
|
238
229
|
time: 08:00:00
|
|
239
230
|
partition: a40
|
|
231
|
+
vllm_args:
|
|
232
|
+
--max-model-len: 4096
|
|
233
|
+
--max-num-seqs: 256
|
|
234
|
+
--compilation-config: 3
|
|
240
235
|
Llama-2-7b-chat-hf:
|
|
241
236
|
model_family: Llama-2
|
|
242
237
|
model_variant: 7b-chat-hf
|
|
@@ -244,13 +239,13 @@ models:
|
|
|
244
239
|
gpus_per_node: 1
|
|
245
240
|
num_nodes: 1
|
|
246
241
|
vocab_size: 32000
|
|
247
|
-
max_model_len: 4096
|
|
248
|
-
max_num_seqs: 256
|
|
249
|
-
pipeline_parallelism: true
|
|
250
|
-
enforce_eager: false
|
|
251
242
|
qos: m2
|
|
252
243
|
time: 08:00:00
|
|
253
244
|
partition: a40
|
|
245
|
+
vllm_args:
|
|
246
|
+
--max-model-len: 4096
|
|
247
|
+
--max-num-seqs: 256
|
|
248
|
+
--compilation-config: 3
|
|
254
249
|
Llama-2-13b-hf:
|
|
255
250
|
model_family: Llama-2
|
|
256
251
|
model_variant: 13b-hf
|
|
@@ -258,13 +253,13 @@ models:
|
|
|
258
253
|
gpus_per_node: 1
|
|
259
254
|
num_nodes: 1
|
|
260
255
|
vocab_size: 32000
|
|
261
|
-
max_model_len: 4096
|
|
262
|
-
max_num_seqs: 256
|
|
263
|
-
pipeline_parallelism: true
|
|
264
|
-
enforce_eager: false
|
|
265
256
|
qos: m2
|
|
266
257
|
time: 08:00:00
|
|
267
258
|
partition: a40
|
|
259
|
+
vllm_args:
|
|
260
|
+
--max-model-len: 4096
|
|
261
|
+
--max-num-seqs: 256
|
|
262
|
+
--compilation-config: 3
|
|
268
263
|
Llama-2-13b-chat-hf:
|
|
269
264
|
model_family: Llama-2
|
|
270
265
|
model_variant: 13b-chat-hf
|
|
@@ -272,13 +267,13 @@ models:
|
|
|
272
267
|
gpus_per_node: 1
|
|
273
268
|
num_nodes: 1
|
|
274
269
|
vocab_size: 32000
|
|
275
|
-
max_model_len: 4096
|
|
276
|
-
max_num_seqs: 256
|
|
277
|
-
pipeline_parallelism: true
|
|
278
|
-
enforce_eager: false
|
|
279
270
|
qos: m2
|
|
280
271
|
time: 08:00:00
|
|
281
272
|
partition: a40
|
|
273
|
+
vllm_args:
|
|
274
|
+
--max-model-len: 4096
|
|
275
|
+
--max-num-seqs: 256
|
|
276
|
+
--compilation-config: 3
|
|
282
277
|
Llama-2-70b-hf:
|
|
283
278
|
model_family: Llama-2
|
|
284
279
|
model_variant: 70b-hf
|
|
@@ -286,13 +281,14 @@ models:
|
|
|
286
281
|
gpus_per_node: 4
|
|
287
282
|
num_nodes: 1
|
|
288
283
|
vocab_size: 32000
|
|
289
|
-
max_model_len: 4096
|
|
290
|
-
max_num_seqs: 256
|
|
291
|
-
pipeline_parallelism: true
|
|
292
|
-
enforce_eager: false
|
|
293
284
|
qos: m2
|
|
294
285
|
time: 08:00:00
|
|
295
286
|
partition: a40
|
|
287
|
+
vllm_args:
|
|
288
|
+
--tensor-parallel-size: 4
|
|
289
|
+
--max-model-len: 4096
|
|
290
|
+
--max-num-seqs: 256
|
|
291
|
+
--compilation-config: 3
|
|
296
292
|
Llama-2-70b-chat-hf:
|
|
297
293
|
model_family: Llama-2
|
|
298
294
|
model_variant: 70b-chat-hf
|
|
@@ -300,13 +296,14 @@ models:
|
|
|
300
296
|
gpus_per_node: 4
|
|
301
297
|
num_nodes: 1
|
|
302
298
|
vocab_size: 32000
|
|
303
|
-
max_model_len: 4096
|
|
304
|
-
max_num_seqs: 256
|
|
305
|
-
pipeline_parallelism: true
|
|
306
|
-
enforce_eager: false
|
|
307
299
|
qos: m2
|
|
308
300
|
time: 08:00:00
|
|
309
301
|
partition: a40
|
|
302
|
+
vllm_args:
|
|
303
|
+
--tensor-parallel-size: 4
|
|
304
|
+
--max-model-len: 4096
|
|
305
|
+
--max-num-seqs: 256
|
|
306
|
+
--compilation-config: 3
|
|
310
307
|
llava-1.5-7b-hf:
|
|
311
308
|
model_family: llava-1.5
|
|
312
309
|
model_variant: 7b-hf
|
|
@@ -314,13 +311,13 @@ models:
|
|
|
314
311
|
gpus_per_node: 1
|
|
315
312
|
num_nodes: 1
|
|
316
313
|
vocab_size: 32000
|
|
317
|
-
max_model_len: 4096
|
|
318
|
-
max_num_seqs: 256
|
|
319
|
-
pipeline_parallelism: true
|
|
320
|
-
enforce_eager: false
|
|
321
314
|
qos: m2
|
|
322
315
|
time: 08:00:00
|
|
323
316
|
partition: a40
|
|
317
|
+
vllm_args:
|
|
318
|
+
--max-model-len: 4096
|
|
319
|
+
--max-num-seqs: 256
|
|
320
|
+
--compilation-config: 3
|
|
324
321
|
llava-1.5-13b-hf:
|
|
325
322
|
model_family: llava-1.5
|
|
326
323
|
model_variant: 13b-hf
|
|
@@ -328,13 +325,13 @@ models:
|
|
|
328
325
|
gpus_per_node: 1
|
|
329
326
|
num_nodes: 1
|
|
330
327
|
vocab_size: 32000
|
|
331
|
-
max_model_len: 4096
|
|
332
|
-
max_num_seqs: 256
|
|
333
|
-
pipeline_parallelism: true
|
|
334
|
-
enforce_eager: false
|
|
335
328
|
qos: m2
|
|
336
329
|
time: 08:00:00
|
|
337
330
|
partition: a40
|
|
331
|
+
vllm_args:
|
|
332
|
+
--max-model-len: 4096
|
|
333
|
+
--max-num-seqs: 256
|
|
334
|
+
--compilation-config: 3
|
|
338
335
|
llava-v1.6-mistral-7b-hf:
|
|
339
336
|
model_family: llava-v1.6
|
|
340
337
|
model_variant: mistral-7b-hf
|
|
@@ -342,13 +339,13 @@ models:
|
|
|
342
339
|
gpus_per_node: 1
|
|
343
340
|
num_nodes: 1
|
|
344
341
|
vocab_size: 32064
|
|
345
|
-
max_model_len: 32768
|
|
346
|
-
max_num_seqs: 256
|
|
347
|
-
pipeline_parallelism: true
|
|
348
|
-
enforce_eager: false
|
|
349
342
|
qos: m2
|
|
350
343
|
time: 08:00:00
|
|
351
344
|
partition: a40
|
|
345
|
+
vllm_args:
|
|
346
|
+
--max-model-len: 32768
|
|
347
|
+
--max-num-seqs: 256
|
|
348
|
+
--compilation-config: 3
|
|
352
349
|
llava-v1.6-34b-hf:
|
|
353
350
|
model_family: llava-v1.6
|
|
354
351
|
model_variant: 34b-hf
|
|
@@ -356,13 +353,14 @@ models:
|
|
|
356
353
|
gpus_per_node: 2
|
|
357
354
|
num_nodes: 1
|
|
358
355
|
vocab_size: 64064
|
|
359
|
-
max_model_len: 4096
|
|
360
|
-
max_num_seqs: 256
|
|
361
|
-
pipeline_parallelism: true
|
|
362
|
-
enforce_eager: false
|
|
363
356
|
qos: m2
|
|
364
357
|
time: 08:00:00
|
|
365
358
|
partition: a40
|
|
359
|
+
vllm_args:
|
|
360
|
+
--tensor-parallel-size: 2
|
|
361
|
+
--max-model-len: 4096
|
|
362
|
+
--max-num-seqs: 256
|
|
363
|
+
--compilation-config: 3
|
|
366
364
|
Meta-Llama-3-8B:
|
|
367
365
|
model_family: Meta-Llama-3
|
|
368
366
|
model_variant: 8B
|
|
@@ -370,13 +368,13 @@ models:
|
|
|
370
368
|
gpus_per_node: 1
|
|
371
369
|
num_nodes: 1
|
|
372
370
|
vocab_size: 128256
|
|
373
|
-
max_model_len: 8192
|
|
374
|
-
max_num_seqs: 256
|
|
375
|
-
pipeline_parallelism: true
|
|
376
|
-
enforce_eager: false
|
|
377
371
|
qos: m2
|
|
378
372
|
time: 08:00:00
|
|
379
373
|
partition: a40
|
|
374
|
+
vllm_args:
|
|
375
|
+
--max-model-len: 8192
|
|
376
|
+
--max-num-seqs: 256
|
|
377
|
+
--compilation-config: 3
|
|
380
378
|
Meta-Llama-3-8B-Instruct:
|
|
381
379
|
model_family: Meta-Llama-3
|
|
382
380
|
model_variant: 8B-Instruct
|
|
@@ -384,13 +382,13 @@ models:
|
|
|
384
382
|
gpus_per_node: 1
|
|
385
383
|
num_nodes: 1
|
|
386
384
|
vocab_size: 128256
|
|
387
|
-
max_model_len: 8192
|
|
388
|
-
max_num_seqs: 256
|
|
389
|
-
pipeline_parallelism: true
|
|
390
|
-
enforce_eager: false
|
|
391
385
|
qos: m2
|
|
392
386
|
time: 08:00:00
|
|
393
387
|
partition: a40
|
|
388
|
+
vllm_args:
|
|
389
|
+
--max-model-len: 8192
|
|
390
|
+
--max-num-seqs: 256
|
|
391
|
+
--compilation-config: 3
|
|
394
392
|
Meta-Llama-3-70B:
|
|
395
393
|
model_family: Meta-Llama-3
|
|
396
394
|
model_variant: 70B
|
|
@@ -398,13 +396,14 @@ models:
|
|
|
398
396
|
gpus_per_node: 4
|
|
399
397
|
num_nodes: 1
|
|
400
398
|
vocab_size: 128256
|
|
401
|
-
max_model_len: 8192
|
|
402
|
-
max_num_seqs: 256
|
|
403
|
-
pipeline_parallelism: true
|
|
404
|
-
enforce_eager: false
|
|
405
399
|
qos: m2
|
|
406
400
|
time: 08:00:00
|
|
407
401
|
partition: a40
|
|
402
|
+
vllm_args:
|
|
403
|
+
--tensor-parallel-size: 4
|
|
404
|
+
--max-model-len: 8192
|
|
405
|
+
--max-num-seqs: 256
|
|
406
|
+
--compilation-config: 3
|
|
408
407
|
Meta-Llama-3-70B-Instruct:
|
|
409
408
|
model_family: Meta-Llama-3
|
|
410
409
|
model_variant: 70B-Instruct
|
|
@@ -412,13 +411,14 @@ models:
|
|
|
412
411
|
gpus_per_node: 4
|
|
413
412
|
num_nodes: 1
|
|
414
413
|
vocab_size: 128256
|
|
415
|
-
max_model_len: 8192
|
|
416
|
-
max_num_seqs: 256
|
|
417
|
-
pipeline_parallelism: true
|
|
418
|
-
enforce_eager: false
|
|
419
414
|
qos: m2
|
|
420
415
|
time: 08:00:00
|
|
421
416
|
partition: a40
|
|
417
|
+
vllm_args:
|
|
418
|
+
--tensor-parallel-size: 4
|
|
419
|
+
--max-model-len: 8192
|
|
420
|
+
--max-num-seqs: 256
|
|
421
|
+
--compilation-config: 3
|
|
422
422
|
Meta-Llama-3.1-8B:
|
|
423
423
|
model_family: Meta-Llama-3.1
|
|
424
424
|
model_variant: 8B
|
|
@@ -426,13 +426,13 @@ models:
|
|
|
426
426
|
gpus_per_node: 1
|
|
427
427
|
num_nodes: 1
|
|
428
428
|
vocab_size: 128256
|
|
429
|
-
max_model_len: 131072
|
|
430
|
-
max_num_seqs: 256
|
|
431
|
-
pipeline_parallelism: true
|
|
432
|
-
enforce_eager: false
|
|
433
429
|
qos: m2
|
|
434
430
|
time: 08:00:00
|
|
435
431
|
partition: a40
|
|
432
|
+
vllm_args:
|
|
433
|
+
--max-model-len: 131072
|
|
434
|
+
--max-num-seqs: 256
|
|
435
|
+
--compilation-config: 3
|
|
436
436
|
Meta-Llama-3.1-8B-Instruct:
|
|
437
437
|
model_family: Meta-Llama-3.1
|
|
438
438
|
model_variant: 8B-Instruct
|
|
@@ -440,13 +440,13 @@ models:
|
|
|
440
440
|
gpus_per_node: 1
|
|
441
441
|
num_nodes: 1
|
|
442
442
|
vocab_size: 128256
|
|
443
|
-
max_model_len: 131072
|
|
444
|
-
max_num_seqs: 256
|
|
445
|
-
pipeline_parallelism: true
|
|
446
|
-
enforce_eager: false
|
|
447
443
|
qos: m2
|
|
448
444
|
time: 08:00:00
|
|
449
445
|
partition: a40
|
|
446
|
+
vllm_args:
|
|
447
|
+
--max-model-len: 131072
|
|
448
|
+
--max-num-seqs: 256
|
|
449
|
+
--compilation-config: 3
|
|
450
450
|
Meta-Llama-3.1-70B:
|
|
451
451
|
model_family: Meta-Llama-3.1
|
|
452
452
|
model_variant: 70B
|
|
@@ -454,13 +454,14 @@ models:
|
|
|
454
454
|
gpus_per_node: 4
|
|
455
455
|
num_nodes: 1
|
|
456
456
|
vocab_size: 128256
|
|
457
|
-
max_model_len: 65536
|
|
458
|
-
max_num_seqs: 256
|
|
459
|
-
pipeline_parallelism: true
|
|
460
|
-
enforce_eager: false
|
|
461
457
|
qos: m2
|
|
462
458
|
time: 08:00:00
|
|
463
459
|
partition: a40
|
|
460
|
+
vllm_args:
|
|
461
|
+
--tensor-parallel-size: 4
|
|
462
|
+
--max-model-len: 65536
|
|
463
|
+
--max-num-seqs: 256
|
|
464
|
+
--compilation-config: 3
|
|
464
465
|
Meta-Llama-3.1-70B-Instruct:
|
|
465
466
|
model_family: Meta-Llama-3.1
|
|
466
467
|
model_variant: 70B-Instruct
|
|
@@ -468,13 +469,14 @@ models:
|
|
|
468
469
|
gpus_per_node: 4
|
|
469
470
|
num_nodes: 1
|
|
470
471
|
vocab_size: 128256
|
|
471
|
-
max_model_len: 65536
|
|
472
|
-
max_num_seqs: 256
|
|
473
|
-
pipeline_parallelism: true
|
|
474
|
-
enforce_eager: false
|
|
475
472
|
qos: m2
|
|
476
473
|
time: 08:00:00
|
|
477
474
|
partition: a40
|
|
475
|
+
vllm_args:
|
|
476
|
+
--tensor-parallel-size: 4
|
|
477
|
+
--max-model-len: 65536
|
|
478
|
+
--max-num-seqs: 256
|
|
479
|
+
--compilation-config: 3
|
|
478
480
|
Meta-Llama-3.1-405B-Instruct:
|
|
479
481
|
model_family: Meta-Llama-3.1
|
|
480
482
|
model_variant: 405B-Instruct
|
|
@@ -482,27 +484,14 @@ models:
|
|
|
482
484
|
gpus_per_node: 4
|
|
483
485
|
num_nodes: 8
|
|
484
486
|
vocab_size: 128256
|
|
485
|
-
max_model_len: 16384
|
|
486
|
-
max_num_seqs: 256
|
|
487
|
-
pipeline_parallelism: true
|
|
488
|
-
enforce_eager: false
|
|
489
487
|
qos: m4
|
|
490
488
|
time: 02:00:00
|
|
491
489
|
partition: a40
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
num_nodes: 1
|
|
498
|
-
vocab_size: 32000
|
|
499
|
-
max_model_len: 32768
|
|
500
|
-
max_num_seqs: 256
|
|
501
|
-
pipeline_parallelism: true
|
|
502
|
-
enforce_eager: false
|
|
503
|
-
qos: m2
|
|
504
|
-
time: 08:00:00
|
|
505
|
-
partition: a40
|
|
490
|
+
vllm_args:
|
|
491
|
+
--pipeline-parallel-size: 8
|
|
492
|
+
--tensor-parallel-size: 4
|
|
493
|
+
--max-model-len: 16384
|
|
494
|
+
--max-num-seqs: 256
|
|
506
495
|
Mistral-7B-Instruct-v0.1:
|
|
507
496
|
model_family: Mistral
|
|
508
497
|
model_variant: 7B-Instruct-v0.1
|
|
@@ -510,13 +499,13 @@ models:
|
|
|
510
499
|
gpus_per_node: 1
|
|
511
500
|
num_nodes: 1
|
|
512
501
|
vocab_size: 32000
|
|
513
|
-
max_model_len: 32768
|
|
514
|
-
max_num_seqs: 256
|
|
515
|
-
pipeline_parallelism: true
|
|
516
|
-
enforce_eager: false
|
|
517
502
|
qos: m2
|
|
518
503
|
time: 08:00:00
|
|
519
504
|
partition: a40
|
|
505
|
+
vllm_args:
|
|
506
|
+
--max-model-len: 32768
|
|
507
|
+
--max-num-seqs: 256
|
|
508
|
+
--compilation-config: 3
|
|
520
509
|
Mistral-7B-Instruct-v0.2:
|
|
521
510
|
model_family: Mistral
|
|
522
511
|
model_variant: 7B-Instruct-v0.2
|
|
@@ -524,13 +513,13 @@ models:
|
|
|
524
513
|
gpus_per_node: 1
|
|
525
514
|
num_nodes: 1
|
|
526
515
|
vocab_size: 32000
|
|
527
|
-
max_model_len: 32768
|
|
528
|
-
max_num_seqs: 256
|
|
529
|
-
pipeline_parallelism: true
|
|
530
|
-
enforce_eager: false
|
|
531
516
|
qos: m2
|
|
532
517
|
time: 08:00:00
|
|
533
518
|
partition: a40
|
|
519
|
+
vllm_args:
|
|
520
|
+
--max-model-len: 32768
|
|
521
|
+
--max-num-seqs: 256
|
|
522
|
+
--compilation-config: 3
|
|
534
523
|
Mistral-7B-v0.3:
|
|
535
524
|
model_family: Mistral
|
|
536
525
|
model_variant: 7B-v0.3
|
|
@@ -538,13 +527,13 @@ models:
|
|
|
538
527
|
gpus_per_node: 1
|
|
539
528
|
num_nodes: 1
|
|
540
529
|
vocab_size: 32768
|
|
541
|
-
max_model_len: 32768
|
|
542
|
-
max_num_seqs: 256
|
|
543
|
-
pipeline_parallelism: true
|
|
544
|
-
enforce_eager: false
|
|
545
530
|
qos: m2
|
|
546
531
|
time: 08:00:00
|
|
547
532
|
partition: a40
|
|
533
|
+
vllm_args:
|
|
534
|
+
--max-model-len: 32768
|
|
535
|
+
--max-num-seqs: 256
|
|
536
|
+
--compilation-config: 3
|
|
548
537
|
Mistral-7B-Instruct-v0.3:
|
|
549
538
|
model_family: Mistral
|
|
550
539
|
model_variant: 7B-Instruct-v0.3
|
|
@@ -552,13 +541,13 @@ models:
|
|
|
552
541
|
gpus_per_node: 1
|
|
553
542
|
num_nodes: 1
|
|
554
543
|
vocab_size: 32768
|
|
555
|
-
max_model_len: 32768
|
|
556
|
-
max_num_seqs: 256
|
|
557
|
-
pipeline_parallelism: true
|
|
558
|
-
enforce_eager: false
|
|
559
544
|
qos: m2
|
|
560
545
|
time: 08:00:00
|
|
561
546
|
partition: a40
|
|
547
|
+
vllm_args:
|
|
548
|
+
--max-model-len: 32768
|
|
549
|
+
--max-num-seqs: 256
|
|
550
|
+
--compilation-config: 3
|
|
562
551
|
Mistral-Large-Instruct-2407:
|
|
563
552
|
model_family: Mistral
|
|
564
553
|
model_variant: Large-Instruct-2407
|
|
@@ -566,13 +555,14 @@ models:
|
|
|
566
555
|
gpus_per_node: 4
|
|
567
556
|
num_nodes: 2
|
|
568
557
|
vocab_size: 32768
|
|
569
|
-
max_model_len: 32768
|
|
570
|
-
max_num_seqs: 256
|
|
571
|
-
pipeline_parallelism: true
|
|
572
|
-
enforce_eager: false
|
|
573
558
|
qos: m2
|
|
574
559
|
time: 08:00:00
|
|
575
560
|
partition: a40
|
|
561
|
+
vllm_args:
|
|
562
|
+
--pipeline-parallel-size: 2
|
|
563
|
+
--tensor-parallel-size: 4
|
|
564
|
+
--max-model-len: 32768
|
|
565
|
+
--max-num-seqs: 256
|
|
576
566
|
Mistral-Large-Instruct-2411:
|
|
577
567
|
model_family: Mistral
|
|
578
568
|
model_variant: Large-Instruct-2411
|
|
@@ -580,13 +570,14 @@ models:
|
|
|
580
570
|
gpus_per_node: 4
|
|
581
571
|
num_nodes: 2
|
|
582
572
|
vocab_size: 32768
|
|
583
|
-
max_model_len: 32768
|
|
584
|
-
max_num_seqs: 256
|
|
585
|
-
pipeline_parallelism: true
|
|
586
|
-
enforce_eager: false
|
|
587
573
|
qos: m2
|
|
588
574
|
time: 08:00:00
|
|
589
575
|
partition: a40
|
|
576
|
+
vllm_args:
|
|
577
|
+
--pipeline-parallel-size: 2
|
|
578
|
+
--tensor-parallel-size: 4
|
|
579
|
+
--max-model-len: 32768
|
|
580
|
+
--max-num-seqs: 256
|
|
590
581
|
Mixtral-8x7B-Instruct-v0.1:
|
|
591
582
|
model_family: Mixtral
|
|
592
583
|
model_variant: 8x7B-Instruct-v0.1
|
|
@@ -594,13 +585,14 @@ models:
|
|
|
594
585
|
gpus_per_node: 4
|
|
595
586
|
num_nodes: 1
|
|
596
587
|
vocab_size: 32000
|
|
597
|
-
max_model_len: 32768
|
|
598
|
-
max_num_seqs: 256
|
|
599
|
-
pipeline_parallelism: true
|
|
600
|
-
enforce_eager: false
|
|
601
588
|
qos: m2
|
|
602
589
|
time: 08:00:00
|
|
603
590
|
partition: a40
|
|
591
|
+
vllm_args:
|
|
592
|
+
--tensor-parallel-size: 4
|
|
593
|
+
--max-model-len: 32768
|
|
594
|
+
--max-num-seqs: 256
|
|
595
|
+
--compilation-config: 3
|
|
604
596
|
Mixtral-8x22B-v0.1:
|
|
605
597
|
model_family: Mixtral
|
|
606
598
|
model_variant: 8x22B-v0.1
|
|
@@ -608,13 +600,14 @@ models:
|
|
|
608
600
|
gpus_per_node: 4
|
|
609
601
|
num_nodes: 2
|
|
610
602
|
vocab_size: 32768
|
|
611
|
-
max_model_len: 65536
|
|
612
|
-
max_num_seqs: 256
|
|
613
|
-
pipeline_parallelism: true
|
|
614
|
-
enforce_eager: false
|
|
615
603
|
qos: m2
|
|
616
604
|
time: 08:00:00
|
|
617
605
|
partition: a40
|
|
606
|
+
vllm_args:
|
|
607
|
+
--pipeline-parallel-size: 2
|
|
608
|
+
--tensor-parallel-size: 4
|
|
609
|
+
--max-model-len: 65536
|
|
610
|
+
--max-num-seqs: 256
|
|
618
611
|
Mixtral-8x22B-Instruct-v0.1:
|
|
619
612
|
model_family: Mixtral
|
|
620
613
|
model_variant: 8x22B-Instruct-v0.1
|
|
@@ -622,13 +615,14 @@ models:
|
|
|
622
615
|
gpus_per_node: 4
|
|
623
616
|
num_nodes: 2
|
|
624
617
|
vocab_size: 32768
|
|
625
|
-
max_model_len: 65536
|
|
626
|
-
max_num_seqs: 256
|
|
627
|
-
pipeline_parallelism: true
|
|
628
|
-
enforce_eager: false
|
|
629
618
|
qos: m2
|
|
630
619
|
time: 08:00:00
|
|
631
620
|
partition: a40
|
|
621
|
+
vllm_args:
|
|
622
|
+
--pipeline-parallel-size: 2
|
|
623
|
+
--tensor-parallel-size: 4
|
|
624
|
+
--max-model-len: 65536
|
|
625
|
+
--max-num-seqs: 256
|
|
632
626
|
Phi-3-medium-128k-instruct:
|
|
633
627
|
model_family: Phi-3
|
|
634
628
|
model_variant: medium-128k-instruct
|
|
@@ -636,13 +630,14 @@ models:
|
|
|
636
630
|
gpus_per_node: 2
|
|
637
631
|
num_nodes: 1
|
|
638
632
|
vocab_size: 32064
|
|
639
|
-
max_model_len: 131072
|
|
640
|
-
max_num_seqs: 256
|
|
641
|
-
pipeline_parallelism: true
|
|
642
|
-
enforce_eager: false
|
|
643
633
|
qos: m2
|
|
644
634
|
time: 08:00:00
|
|
645
635
|
partition: a40
|
|
636
|
+
vllm_args:
|
|
637
|
+
--tensor-parallel-size: 2
|
|
638
|
+
--max-model-len: 131072
|
|
639
|
+
--max-num-seqs: 256
|
|
640
|
+
--compilation-config: 3
|
|
646
641
|
Phi-3-vision-128k-instruct:
|
|
647
642
|
model_family: Phi-3-vision
|
|
648
643
|
model_variant: 128k-instruct
|
|
@@ -650,13 +645,14 @@ models:
|
|
|
650
645
|
gpus_per_node: 2
|
|
651
646
|
num_nodes: 1
|
|
652
647
|
vocab_size: 32064
|
|
653
|
-
max_model_len: 65536
|
|
654
|
-
max_num_seqs: 256
|
|
655
|
-
pipeline_parallelism: true
|
|
656
|
-
enforce_eager: false
|
|
657
648
|
qos: m2
|
|
658
649
|
time: 08:00:00
|
|
659
650
|
partition: a40
|
|
651
|
+
vllm_args:
|
|
652
|
+
--tensor-parallel-size: 2
|
|
653
|
+
--max-model-len: 65536
|
|
654
|
+
--max-num-seqs: 256
|
|
655
|
+
--compilation-config: 3
|
|
660
656
|
Llama3-OpenBioLLM-70B:
|
|
661
657
|
model_family: Llama3-OpenBioLLM
|
|
662
658
|
model_variant: 70B
|
|
@@ -664,13 +660,14 @@ models:
|
|
|
664
660
|
gpus_per_node: 4
|
|
665
661
|
num_nodes: 1
|
|
666
662
|
vocab_size: 128256
|
|
667
|
-
max_model_len: 8192
|
|
668
|
-
max_num_seqs: 256
|
|
669
|
-
pipeline_parallelism: true
|
|
670
|
-
enforce_eager: false
|
|
671
663
|
qos: m2
|
|
672
664
|
time: 08:00:00
|
|
673
665
|
partition: a40
|
|
666
|
+
vllm_args:
|
|
667
|
+
--tensor-parallel-size: 4
|
|
668
|
+
--max-model-len: 8192
|
|
669
|
+
--max-num-seqs: 256
|
|
670
|
+
--compilation-config: 3
|
|
674
671
|
Llama-3.1-Nemotron-70B-Instruct-HF:
|
|
675
672
|
model_family: Llama-3.1-Nemotron
|
|
676
673
|
model_variant: 70B-Instruct-HF
|
|
@@ -678,13 +675,14 @@ models:
|
|
|
678
675
|
gpus_per_node: 4
|
|
679
676
|
num_nodes: 1
|
|
680
677
|
vocab_size: 128256
|
|
681
|
-
max_model_len: 65536
|
|
682
|
-
max_num_seqs: 256
|
|
683
|
-
pipeline_parallelism: true
|
|
684
|
-
enforce_eager: false
|
|
685
678
|
qos: m2
|
|
686
679
|
time: 08:00:00
|
|
687
680
|
partition: a40
|
|
681
|
+
vllm_args:
|
|
682
|
+
--tensor-parallel-size: 4
|
|
683
|
+
--max-model-len: 65536
|
|
684
|
+
--max-num-seqs: 256
|
|
685
|
+
--compilation-config: 3
|
|
688
686
|
Llama-3.2-1B:
|
|
689
687
|
model_family: Llama-3.2
|
|
690
688
|
model_variant: 1B
|
|
@@ -692,13 +690,13 @@ models:
|
|
|
692
690
|
gpus_per_node: 1
|
|
693
691
|
num_nodes: 1
|
|
694
692
|
vocab_size: 128256
|
|
695
|
-
max_model_len: 131072
|
|
696
|
-
max_num_seqs: 256
|
|
697
|
-
pipeline_parallelism: true
|
|
698
|
-
enforce_eager: false
|
|
699
693
|
qos: m2
|
|
700
694
|
time: 08:00:00
|
|
701
695
|
partition: a40
|
|
696
|
+
vllm_args:
|
|
697
|
+
--max-model-len: 131072
|
|
698
|
+
--max-num-seqs: 256
|
|
699
|
+
--compilation-config: 3
|
|
702
700
|
Llama-3.2-1B-Instruct:
|
|
703
701
|
model_family: Llama-3.2
|
|
704
702
|
model_variant: 1B-Instruct
|
|
@@ -706,13 +704,13 @@ models:
|
|
|
706
704
|
gpus_per_node: 1
|
|
707
705
|
num_nodes: 1
|
|
708
706
|
vocab_size: 128256
|
|
709
|
-
max_model_len: 131072
|
|
710
|
-
max_num_seqs: 256
|
|
711
|
-
pipeline_parallelism: true
|
|
712
|
-
enforce_eager: false
|
|
713
707
|
qos: m2
|
|
714
708
|
time: 08:00:00
|
|
715
709
|
partition: a40
|
|
710
|
+
vllm_args:
|
|
711
|
+
--max-model-len: 131072
|
|
712
|
+
--max-num-seqs: 256
|
|
713
|
+
--compilation-config: 3
|
|
716
714
|
Llama-3.2-3B:
|
|
717
715
|
model_family: Llama-3.2
|
|
718
716
|
model_variant: 3B
|
|
@@ -720,13 +718,13 @@ models:
|
|
|
720
718
|
gpus_per_node: 1
|
|
721
719
|
num_nodes: 1
|
|
722
720
|
vocab_size: 128256
|
|
723
|
-
max_model_len: 131072
|
|
724
|
-
max_num_seqs: 256
|
|
725
|
-
pipeline_parallelism: true
|
|
726
|
-
enforce_eager: false
|
|
727
721
|
qos: m2
|
|
728
722
|
time: 08:00:00
|
|
729
723
|
partition: a40
|
|
724
|
+
vllm_args:
|
|
725
|
+
--max-model-len: 131072
|
|
726
|
+
--max-num-seqs: 256
|
|
727
|
+
--compilation-config: 3
|
|
730
728
|
Llama-3.2-3B-Instruct:
|
|
731
729
|
model_family: Llama-3.2
|
|
732
730
|
model_variant: 3B-Instruct
|
|
@@ -734,13 +732,13 @@ models:
|
|
|
734
732
|
gpus_per_node: 1
|
|
735
733
|
num_nodes: 1
|
|
736
734
|
vocab_size: 128256
|
|
737
|
-
max_model_len: 131072
|
|
738
|
-
max_num_seqs: 256
|
|
739
|
-
pipeline_parallelism: true
|
|
740
|
-
enforce_eager: false
|
|
741
735
|
qos: m2
|
|
742
736
|
time: 08:00:00
|
|
743
737
|
partition: a40
|
|
738
|
+
vllm_args:
|
|
739
|
+
--max-model-len: 131072
|
|
740
|
+
--max-num-seqs: 256
|
|
741
|
+
--compilation-config: 3
|
|
744
742
|
Llama-3.2-11B-Vision:
|
|
745
743
|
model_family: Llama-3.2
|
|
746
744
|
model_variant: 11B-Vision
|
|
@@ -748,13 +746,15 @@ models:
|
|
|
748
746
|
gpus_per_node: 2
|
|
749
747
|
num_nodes: 1
|
|
750
748
|
vocab_size: 128256
|
|
751
|
-
max_model_len: 4096
|
|
752
|
-
max_num_seqs: 64
|
|
753
|
-
pipeline_parallelism: false
|
|
754
|
-
enforce_eager: true
|
|
755
749
|
qos: m2
|
|
756
750
|
time: 08:00:00
|
|
757
751
|
partition: a40
|
|
752
|
+
vllm_args:
|
|
753
|
+
--tensor-parallel-size: 2
|
|
754
|
+
--max-model-len: 4096
|
|
755
|
+
--max-num-seqs: 64
|
|
756
|
+
--compilation-config: 3
|
|
757
|
+
--enforce-eager: true
|
|
758
758
|
Llama-3.2-11B-Vision-Instruct:
|
|
759
759
|
model_family: Llama-3.2
|
|
760
760
|
model_variant: 11B-Vision-Instruct
|
|
@@ -762,13 +762,15 @@ models:
|
|
|
762
762
|
gpus_per_node: 2
|
|
763
763
|
num_nodes: 1
|
|
764
764
|
vocab_size: 128256
|
|
765
|
-
max_model_len: 4096
|
|
766
|
-
max_num_seqs: 64
|
|
767
|
-
pipeline_parallelism: false
|
|
768
|
-
enforce_eager: true
|
|
769
765
|
qos: m2
|
|
770
766
|
time: 08:00:00
|
|
771
767
|
partition: a40
|
|
768
|
+
vllm_args:
|
|
769
|
+
--tensor-parallel-size: 2
|
|
770
|
+
--max-model-len: 4096
|
|
771
|
+
--max-num-seqs: 64
|
|
772
|
+
--compilation-config: 3
|
|
773
|
+
--enforce-eager: true
|
|
772
774
|
Llama-3.2-90B-Vision:
|
|
773
775
|
model_family: Llama-3.2
|
|
774
776
|
model_variant: 90B-Vision
|
|
@@ -776,13 +778,15 @@ models:
|
|
|
776
778
|
gpus_per_node: 4
|
|
777
779
|
num_nodes: 2
|
|
778
780
|
vocab_size: 128256
|
|
779
|
-
max_model_len: 4096
|
|
780
|
-
max_num_seqs: 32
|
|
781
|
-
pipeline_parallelism: false
|
|
782
|
-
enforce_eager: true
|
|
783
781
|
qos: m2
|
|
784
782
|
time: 08:00:00
|
|
785
783
|
partition: a40
|
|
784
|
+
vllm_args:
|
|
785
|
+
--tensor-parallel-size: 8
|
|
786
|
+
--max-model-len: 4096
|
|
787
|
+
--max-num-seqs: 32
|
|
788
|
+
--compilation-config: 3
|
|
789
|
+
--enforce-eager: true
|
|
786
790
|
Llama-3.2-90B-Vision-Instruct:
|
|
787
791
|
model_family: Llama-3.2
|
|
788
792
|
model_variant: 90B-Vision-Instruct
|
|
@@ -790,13 +794,15 @@ models:
|
|
|
790
794
|
gpus_per_node: 4
|
|
791
795
|
num_nodes: 2
|
|
792
796
|
vocab_size: 128256
|
|
793
|
-
max_model_len: 4096
|
|
794
|
-
max_num_seqs: 32
|
|
795
|
-
pipeline_parallelism: false
|
|
796
|
-
enforce_eager: true
|
|
797
797
|
qos: m2
|
|
798
798
|
time: 08:00:00
|
|
799
799
|
partition: a40
|
|
800
|
+
vllm_args:
|
|
801
|
+
--tensor-parallel-size: 8
|
|
802
|
+
--max-model-len: 4096
|
|
803
|
+
--max-num-seqs: 32
|
|
804
|
+
--compilation-config: 3
|
|
805
|
+
--enforce-eager: true
|
|
800
806
|
Qwen2.5-0.5B-Instruct:
|
|
801
807
|
model_family: Qwen2.5
|
|
802
808
|
model_variant: 0.5B-Instruct
|
|
@@ -804,13 +810,13 @@ models:
|
|
|
804
810
|
gpus_per_node: 1
|
|
805
811
|
num_nodes: 1
|
|
806
812
|
vocab_size: 152064
|
|
807
|
-
max_model_len: 32768
|
|
808
|
-
max_num_seqs: 256
|
|
809
|
-
pipeline_parallelism: true
|
|
810
|
-
enforce_eager: false
|
|
811
813
|
qos: m2
|
|
812
814
|
time: 08:00:00
|
|
813
815
|
partition: a40
|
|
816
|
+
vllm_args:
|
|
817
|
+
--max-model-len: 32768
|
|
818
|
+
--max-num-seqs: 256
|
|
819
|
+
--compilation-config: 3
|
|
814
820
|
Qwen2.5-1.5B-Instruct:
|
|
815
821
|
model_family: Qwen2.5
|
|
816
822
|
model_variant: 1.5B-Instruct
|
|
@@ -818,13 +824,13 @@ models:
|
|
|
818
824
|
gpus_per_node: 1
|
|
819
825
|
num_nodes: 1
|
|
820
826
|
vocab_size: 152064
|
|
821
|
-
max_model_len: 32768
|
|
822
|
-
max_num_seqs: 256
|
|
823
|
-
pipeline_parallelism: true
|
|
824
|
-
enforce_eager: false
|
|
825
827
|
qos: m2
|
|
826
828
|
time: 08:00:00
|
|
827
829
|
partition: a40
|
|
830
|
+
vllm_args:
|
|
831
|
+
--max-model-len: 32768
|
|
832
|
+
--max-num-seqs: 256
|
|
833
|
+
--compilation-config: 3
|
|
828
834
|
Qwen2.5-3B-Instruct:
|
|
829
835
|
model_family: Qwen2.5
|
|
830
836
|
model_variant: 3B-Instruct
|
|
@@ -832,13 +838,13 @@ models:
|
|
|
832
838
|
gpus_per_node: 1
|
|
833
839
|
num_nodes: 1
|
|
834
840
|
vocab_size: 152064
|
|
835
|
-
max_model_len: 32768
|
|
836
|
-
max_num_seqs: 256
|
|
837
|
-
pipeline_parallelism: true
|
|
838
|
-
enforce_eager: false
|
|
839
841
|
qos: m2
|
|
840
842
|
time: 08:00:00
|
|
841
843
|
partition: a40
|
|
844
|
+
vllm_args:
|
|
845
|
+
--max-model-len: 32768
|
|
846
|
+
--max-num-seqs: 256
|
|
847
|
+
--compilation-config: 3
|
|
842
848
|
Qwen2.5-7B-Instruct:
|
|
843
849
|
model_family: Qwen2.5
|
|
844
850
|
model_variant: 7B-Instruct
|
|
@@ -846,13 +852,13 @@ models:
|
|
|
846
852
|
gpus_per_node: 1
|
|
847
853
|
num_nodes: 1
|
|
848
854
|
vocab_size: 152064
|
|
849
|
-
max_model_len: 32768
|
|
850
|
-
max_num_seqs: 256
|
|
851
|
-
pipeline_parallelism: true
|
|
852
|
-
enforce_eager: false
|
|
853
855
|
qos: m2
|
|
854
856
|
time: 08:00:00
|
|
855
857
|
partition: a40
|
|
858
|
+
vllm_args:
|
|
859
|
+
--max-model-len: 32768
|
|
860
|
+
--max-num-seqs: 256
|
|
861
|
+
--compilation-config: 3
|
|
856
862
|
Qwen2.5-14B-Instruct:
|
|
857
863
|
model_family: Qwen2.5
|
|
858
864
|
model_variant: 14B-Instruct
|
|
@@ -860,13 +866,13 @@ models:
|
|
|
860
866
|
gpus_per_node: 1
|
|
861
867
|
num_nodes: 1
|
|
862
868
|
vocab_size: 152064
|
|
863
|
-
max_model_len: 32768
|
|
864
|
-
max_num_seqs: 256
|
|
865
|
-
pipeline_parallelism: true
|
|
866
|
-
enforce_eager: false
|
|
867
869
|
qos: m2
|
|
868
870
|
time: 08:00:00
|
|
869
871
|
partition: a40
|
|
872
|
+
vllm_args:
|
|
873
|
+
--max-model-len: 32768
|
|
874
|
+
--max-num-seqs: 256
|
|
875
|
+
--compilation-config: 3
|
|
870
876
|
Qwen2.5-32B-Instruct:
|
|
871
877
|
model_family: Qwen2.5
|
|
872
878
|
model_variant: 32B-Instruct
|
|
@@ -874,13 +880,14 @@ models:
|
|
|
874
880
|
gpus_per_node: 2
|
|
875
881
|
num_nodes: 1
|
|
876
882
|
vocab_size: 152064
|
|
877
|
-
max_model_len: 32768
|
|
878
|
-
max_num_seqs: 256
|
|
879
|
-
pipeline_parallelism: true
|
|
880
|
-
enforce_eager: false
|
|
881
883
|
qos: m2
|
|
882
884
|
time: 08:00:00
|
|
883
885
|
partition: a40
|
|
886
|
+
vllm_args:
|
|
887
|
+
--tensor-parallel-size: 2
|
|
888
|
+
--max-model-len: 32768
|
|
889
|
+
--max-num-seqs: 256
|
|
890
|
+
--compilation-config: 3
|
|
884
891
|
Qwen2.5-72B-Instruct:
|
|
885
892
|
model_family: Qwen2.5
|
|
886
893
|
model_variant: 72B-Instruct
|
|
@@ -888,13 +895,14 @@ models:
|
|
|
888
895
|
gpus_per_node: 4
|
|
889
896
|
num_nodes: 1
|
|
890
897
|
vocab_size: 152064
|
|
891
|
-
max_model_len: 16384
|
|
892
|
-
max_num_seqs: 256
|
|
893
|
-
pipeline_parallelism: true
|
|
894
|
-
enforce_eager: false
|
|
895
898
|
qos: m2
|
|
896
899
|
time: 08:00:00
|
|
897
900
|
partition: a40
|
|
901
|
+
vllm_args:
|
|
902
|
+
--tensor-parallel-size: 4
|
|
903
|
+
--max-model-len: 16384
|
|
904
|
+
--max-num-seqs: 256
|
|
905
|
+
--compilation-config: 3
|
|
898
906
|
Qwen2.5-Math-1.5B-Instruct:
|
|
899
907
|
model_family: Qwen2.5
|
|
900
908
|
model_variant: Math-1.5B-Instruct
|
|
@@ -902,13 +910,13 @@ models:
|
|
|
902
910
|
gpus_per_node: 1
|
|
903
911
|
num_nodes: 1
|
|
904
912
|
vocab_size: 152064
|
|
905
|
-
max_model_len: 4096
|
|
906
|
-
max_num_seqs: 256
|
|
907
|
-
pipeline_parallelism: true
|
|
908
|
-
enforce_eager: false
|
|
909
913
|
qos: m2
|
|
910
914
|
time: 08:00:00
|
|
911
915
|
partition: a40
|
|
916
|
+
vllm_args:
|
|
917
|
+
--max-model-len: 4096
|
|
918
|
+
--max-num-seqs: 256
|
|
919
|
+
--compilation-config: 3
|
|
912
920
|
Qwen2.5-Math-7B-Instruct:
|
|
913
921
|
model_family: Qwen2.5
|
|
914
922
|
model_variant: Math-7B-Instruct
|
|
@@ -916,13 +924,13 @@ models:
|
|
|
916
924
|
gpus_per_node: 1
|
|
917
925
|
num_nodes: 1
|
|
918
926
|
vocab_size: 152064
|
|
919
|
-
max_model_len: 4096
|
|
920
|
-
max_num_seqs: 256
|
|
921
|
-
pipeline_parallelism: true
|
|
922
|
-
enforce_eager: false
|
|
923
927
|
qos: m2
|
|
924
928
|
time: 08:00:00
|
|
925
929
|
partition: a40
|
|
930
|
+
vllm_args:
|
|
931
|
+
--max-model-len: 4096
|
|
932
|
+
--max-num-seqs: 256
|
|
933
|
+
--compilation-config: 3
|
|
926
934
|
Qwen2.5-Math-72B-Instruct:
|
|
927
935
|
model_family: Qwen2.5
|
|
928
936
|
model_variant: Math-72B-Instruct
|
|
@@ -930,13 +938,14 @@ models:
|
|
|
930
938
|
gpus_per_node: 4
|
|
931
939
|
num_nodes: 1
|
|
932
940
|
vocab_size: 152064
|
|
933
|
-
max_model_len: 4096
|
|
934
|
-
max_num_seqs: 256
|
|
935
|
-
pipeline_parallelism: true
|
|
936
|
-
enforce_eager: false
|
|
937
941
|
qos: m2
|
|
938
942
|
time: 08:00:00
|
|
939
943
|
partition: a40
|
|
944
|
+
vllm_args:
|
|
945
|
+
--tensor-parallel-size: 4
|
|
946
|
+
--max-model-len: 4096
|
|
947
|
+
--max-num-seqs: 256
|
|
948
|
+
--compilation-config: 3
|
|
940
949
|
Qwen2.5-Coder-7B-Instruct:
|
|
941
950
|
model_family: Qwen2.5
|
|
942
951
|
model_variant: Coder-7B-Instruct
|
|
@@ -944,13 +953,13 @@ models:
|
|
|
944
953
|
gpus_per_node: 1
|
|
945
954
|
num_nodes: 1
|
|
946
955
|
vocab_size: 152064
|
|
947
|
-
max_model_len: 32768
|
|
948
|
-
max_num_seqs: 256
|
|
949
|
-
pipeline_parallelism: true
|
|
950
|
-
enforce_eager: false
|
|
951
956
|
qos: m2
|
|
952
957
|
time: 08:00:00
|
|
953
958
|
partition: a40
|
|
959
|
+
vllm_args:
|
|
960
|
+
--max-model-len: 32768
|
|
961
|
+
--max-num-seqs: 256
|
|
962
|
+
--compilation-config: 3
|
|
954
963
|
Qwen2.5-Math-RM-72B:
|
|
955
964
|
model_family: Qwen2.5
|
|
956
965
|
model_variant: Math-RM-72B
|
|
@@ -958,13 +967,14 @@ models:
|
|
|
958
967
|
gpus_per_node: 4
|
|
959
968
|
num_nodes: 1
|
|
960
969
|
vocab_size: 152064
|
|
961
|
-
max_model_len: 4096
|
|
962
|
-
max_num_seqs: 256
|
|
963
|
-
pipeline_parallelism: true
|
|
964
|
-
enforce_eager: false
|
|
965
970
|
qos: m2
|
|
966
971
|
time: 08:00:00
|
|
967
972
|
partition: a40
|
|
973
|
+
vllm_args:
|
|
974
|
+
--tensor-parallel-size: 4
|
|
975
|
+
--max-model-len: 4096
|
|
976
|
+
--max-num-seqs: 256
|
|
977
|
+
--compilation-config: 3
|
|
968
978
|
Qwen2.5-Math-PRM-7B:
|
|
969
979
|
model_family: Qwen2.5
|
|
970
980
|
model_variant: Math-PRM-7B
|
|
@@ -972,13 +982,13 @@ models:
|
|
|
972
982
|
gpus_per_node: 1
|
|
973
983
|
num_nodes: 1
|
|
974
984
|
vocab_size: 152064
|
|
975
|
-
max_model_len: 4096
|
|
976
|
-
max_num_seqs: 256
|
|
977
|
-
pipeline_parallelism: true
|
|
978
|
-
enforce_eager: false
|
|
979
985
|
qos: m2
|
|
980
986
|
time: 08:00:00
|
|
981
987
|
partition: a40
|
|
988
|
+
vllm_args:
|
|
989
|
+
--max-model-len: 4096
|
|
990
|
+
--max-num-seqs: 256
|
|
991
|
+
--compilation-config: 3
|
|
982
992
|
QwQ-32B-Preview:
|
|
983
993
|
model_family: QwQ
|
|
984
994
|
model_variant: 32B-Preview
|
|
@@ -986,13 +996,14 @@ models:
|
|
|
986
996
|
gpus_per_node: 2
|
|
987
997
|
num_nodes: 1
|
|
988
998
|
vocab_size: 152064
|
|
989
|
-
max_model_len: 32768
|
|
990
|
-
max_num_seqs: 256
|
|
991
|
-
pipeline_parallelism: true
|
|
992
|
-
enforce_eager: false
|
|
993
999
|
qos: m2
|
|
994
1000
|
time: 08:00:00
|
|
995
1001
|
partition: a40
|
|
1002
|
+
vllm_args:
|
|
1003
|
+
--tensor-parallel-size: 2
|
|
1004
|
+
--max-model-len: 32768
|
|
1005
|
+
--max-num-seqs: 256
|
|
1006
|
+
--compilation-config: 3
|
|
996
1007
|
Pixtral-12B-2409:
|
|
997
1008
|
model_family: Pixtral
|
|
998
1009
|
model_variant: 12B-2409
|
|
@@ -1000,13 +1011,13 @@ models:
|
|
|
1000
1011
|
gpus_per_node: 1
|
|
1001
1012
|
num_nodes: 1
|
|
1002
1013
|
vocab_size: 131072
|
|
1003
|
-
max_model_len: 8192
|
|
1004
|
-
max_num_seqs: 256
|
|
1005
|
-
pipeline_parallelism: true
|
|
1006
|
-
enforce_eager: false
|
|
1007
1014
|
qos: m2
|
|
1008
1015
|
time: 08:00:00
|
|
1009
1016
|
partition: a40
|
|
1017
|
+
vllm_args:
|
|
1018
|
+
--max-model-len: 8192
|
|
1019
|
+
--max-num-seqs: 256
|
|
1020
|
+
--compilation-config: 3
|
|
1010
1021
|
e5-mistral-7b-instruct:
|
|
1011
1022
|
model_family: e5
|
|
1012
1023
|
model_variant: mistral-7b-instruct
|
|
@@ -1014,13 +1025,13 @@ models:
|
|
|
1014
1025
|
gpus_per_node: 1
|
|
1015
1026
|
num_nodes: 1
|
|
1016
1027
|
vocab_size: 32000
|
|
1017
|
-
max_model_len: 4096
|
|
1018
|
-
max_num_seqs: 256
|
|
1019
|
-
pipeline_parallelism: true
|
|
1020
|
-
enforce_eager: false
|
|
1021
1028
|
qos: m2
|
|
1022
1029
|
time: 08:00:00
|
|
1023
1030
|
partition: a40
|
|
1031
|
+
vllm_args:
|
|
1032
|
+
--max-model-len: 4096
|
|
1033
|
+
--max-num-seqs: 256
|
|
1034
|
+
--compilation-config: 3
|
|
1024
1035
|
bge-base-en-v1.5:
|
|
1025
1036
|
model_family: bge
|
|
1026
1037
|
model_variant: base-en-v1.5
|
|
@@ -1028,13 +1039,13 @@ models:
|
|
|
1028
1039
|
gpus_per_node: 1
|
|
1029
1040
|
num_nodes: 1
|
|
1030
1041
|
vocab_size: 30522
|
|
1031
|
-
max_model_len: 512
|
|
1032
|
-
max_num_seqs: 256
|
|
1033
|
-
pipeline_parallelism: true
|
|
1034
|
-
enforce_eager: false
|
|
1035
1042
|
qos: m2
|
|
1036
1043
|
time: 08:00:00
|
|
1037
1044
|
partition: a40
|
|
1045
|
+
vllm_args:
|
|
1046
|
+
--max-model-len: 512
|
|
1047
|
+
--max-num-seqs: 256
|
|
1048
|
+
--compilation-config: 3
|
|
1038
1049
|
all-MiniLM-L6-v2:
|
|
1039
1050
|
model_family: all-MiniLM
|
|
1040
1051
|
model_variant: L6-v2
|
|
@@ -1042,13 +1053,13 @@ models:
|
|
|
1042
1053
|
gpus_per_node: 1
|
|
1043
1054
|
num_nodes: 1
|
|
1044
1055
|
vocab_size: 30522
|
|
1045
|
-
max_model_len: 512
|
|
1046
|
-
max_num_seqs: 256
|
|
1047
|
-
pipeline_parallelism: true
|
|
1048
|
-
enforce_eager: false
|
|
1049
1056
|
qos: m2
|
|
1050
1057
|
time: 08:00:00
|
|
1051
1058
|
partition: a40
|
|
1059
|
+
vllm_args:
|
|
1060
|
+
--max-model-len: 512
|
|
1061
|
+
--max-num-seqs: 256
|
|
1062
|
+
--compilation-config: 3
|
|
1052
1063
|
Llama-3.3-70B-Instruct:
|
|
1053
1064
|
model_family: Llama-3.3
|
|
1054
1065
|
model_variant: 70B-Instruct
|
|
@@ -1056,13 +1067,14 @@ models:
|
|
|
1056
1067
|
gpus_per_node: 4
|
|
1057
1068
|
num_nodes: 1
|
|
1058
1069
|
vocab_size: 128256
|
|
1059
|
-
max_model_len: 65536
|
|
1060
|
-
max_num_seqs: 256
|
|
1061
|
-
pipeline_parallelism: true
|
|
1062
|
-
enforce_eager: false
|
|
1063
1070
|
qos: m2
|
|
1064
1071
|
time: 08:00:00
|
|
1065
1072
|
partition: a40
|
|
1073
|
+
vllm_args:
|
|
1074
|
+
--tensor-parallel-size: 4
|
|
1075
|
+
--max-model-len: 65536
|
|
1076
|
+
--max-num-seqs: 256
|
|
1077
|
+
--compilation-config: 3
|
|
1066
1078
|
InternVL2_5-26B:
|
|
1067
1079
|
model_family: InternVL2_5
|
|
1068
1080
|
model_variant: 26B
|
|
@@ -1070,13 +1082,14 @@ models:
|
|
|
1070
1082
|
gpus_per_node: 2
|
|
1071
1083
|
num_nodes: 1
|
|
1072
1084
|
vocab_size: 92553
|
|
1073
|
-
max_model_len: 32768
|
|
1074
|
-
max_num_seqs: 256
|
|
1075
|
-
pipeline_parallelism: true
|
|
1076
|
-
enforce_eager: false
|
|
1077
1085
|
qos: m2
|
|
1078
1086
|
time: 08:00:00
|
|
1079
1087
|
partition: a40
|
|
1088
|
+
vllm_args:
|
|
1089
|
+
--tensor-parallel-size: 2
|
|
1090
|
+
--max-model-len: 32768
|
|
1091
|
+
--max-num-seqs: 256
|
|
1092
|
+
--compilation-config: 3
|
|
1080
1093
|
InternVL2_5-38B:
|
|
1081
1094
|
model_family: InternVL2_5
|
|
1082
1095
|
model_variant: 38B
|
|
@@ -1084,13 +1097,14 @@ models:
|
|
|
1084
1097
|
gpus_per_node: 4
|
|
1085
1098
|
num_nodes: 1
|
|
1086
1099
|
vocab_size: 92553
|
|
1087
|
-
max_model_len: 32768
|
|
1088
|
-
max_num_seqs: 256
|
|
1089
|
-
pipeline_parallelism: true
|
|
1090
|
-
enforce_eager: false
|
|
1091
1100
|
qos: m2
|
|
1092
1101
|
time: 08:00:00
|
|
1093
1102
|
partition: a40
|
|
1103
|
+
vllm_args:
|
|
1104
|
+
--tensor-parallel-size: 4
|
|
1105
|
+
--max-model-len: 32768
|
|
1106
|
+
--max-num-seqs: 256
|
|
1107
|
+
--compilation-config: 3
|
|
1094
1108
|
Aya-Expanse-32B:
|
|
1095
1109
|
model_family: Aya-Expanse
|
|
1096
1110
|
model_variant: 32B
|
|
@@ -1098,69 +1112,72 @@ models:
|
|
|
1098
1112
|
gpus_per_node: 2
|
|
1099
1113
|
num_nodes: 1
|
|
1100
1114
|
vocab_size: 256000
|
|
1101
|
-
max_model_len: 8192
|
|
1102
|
-
max_num_seqs: 256
|
|
1103
|
-
pipeline_parallelism: true
|
|
1104
|
-
enforce_eager: false
|
|
1105
1115
|
qos: m2
|
|
1106
1116
|
time: 08:00:00
|
|
1107
1117
|
partition: a40
|
|
1118
|
+
vllm_args:
|
|
1119
|
+
--tensor-parallel-size: 2
|
|
1120
|
+
--max-model-len: 8192
|
|
1121
|
+
--max-num-seqs: 256
|
|
1122
|
+
--compilation-config: 3
|
|
1108
1123
|
DeepSeek-R1-Distill-Llama-70B:
|
|
1109
1124
|
model_family: DeepSeek-R1
|
|
1110
|
-
model_variant:
|
|
1125
|
+
model_variant: Distill-Llama-70B
|
|
1111
1126
|
model_type: LLM
|
|
1112
1127
|
gpus_per_node: 4
|
|
1113
|
-
num_nodes:
|
|
1128
|
+
num_nodes: 1
|
|
1114
1129
|
vocab_size: 128256
|
|
1115
|
-
max_model_len: 131072
|
|
1116
|
-
max_num_seqs: 256
|
|
1117
|
-
pipeline_parallelism: true
|
|
1118
|
-
enforce_eager: false
|
|
1119
1130
|
qos: m2
|
|
1120
1131
|
time: 08:00:00
|
|
1121
1132
|
partition: a40
|
|
1133
|
+
vllm_args:
|
|
1134
|
+
--tensor-parallel-size: 4
|
|
1135
|
+
--max-model-len: 65536
|
|
1136
|
+
--max-num-seqs: 256
|
|
1137
|
+
--compilation-config: 3
|
|
1122
1138
|
DeepSeek-R1-Distill-Llama-8B:
|
|
1123
1139
|
model_family: DeepSeek-R1
|
|
1124
|
-
model_variant:
|
|
1140
|
+
model_variant: Distill-Llama-8B
|
|
1125
1141
|
model_type: LLM
|
|
1126
1142
|
gpus_per_node: 1
|
|
1127
1143
|
num_nodes: 1
|
|
1128
1144
|
vocab_size: 128256
|
|
1129
|
-
max_model_len: 131072
|
|
1130
|
-
max_num_seqs: 256
|
|
1131
|
-
pipeline_parallelism: true
|
|
1132
|
-
enforce_eager: false
|
|
1133
1145
|
qos: m2
|
|
1134
1146
|
time: 08:00:00
|
|
1135
1147
|
partition: a40
|
|
1148
|
+
vllm_args:
|
|
1149
|
+
--max-model-len: 131072
|
|
1150
|
+
--max-num-seqs: 256
|
|
1151
|
+
--compilation-config: 3
|
|
1136
1152
|
DeepSeek-R1-Distill-Qwen-32B:
|
|
1137
1153
|
model_family: DeepSeek-R1
|
|
1138
1154
|
model_variant: Distill-Qwen-32B
|
|
1139
1155
|
model_type: LLM
|
|
1140
|
-
gpus_per_node:
|
|
1156
|
+
gpus_per_node: 2
|
|
1141
1157
|
num_nodes: 1
|
|
1142
1158
|
vocab_size: 152064
|
|
1143
|
-
max_model_len: 131072
|
|
1144
|
-
max_num_seqs: 256
|
|
1145
|
-
pipeline_parallelism: true
|
|
1146
|
-
enforce_eager: false
|
|
1147
1159
|
qos: m2
|
|
1148
1160
|
time: 08:00:00
|
|
1149
1161
|
partition: a40
|
|
1162
|
+
vllm_args:
|
|
1163
|
+
--tensor-parallel-size: 2
|
|
1164
|
+
--max-model-len: 65536
|
|
1165
|
+
--max-num-seqs: 256
|
|
1166
|
+
--compilation-config: 3
|
|
1150
1167
|
DeepSeek-R1-Distill-Qwen-14B:
|
|
1151
1168
|
model_family: DeepSeek-R1
|
|
1152
1169
|
model_variant: Distill-Qwen-14B
|
|
1153
1170
|
model_type: LLM
|
|
1154
|
-
gpus_per_node:
|
|
1171
|
+
gpus_per_node: 1
|
|
1155
1172
|
num_nodes: 1
|
|
1156
1173
|
vocab_size: 152064
|
|
1157
|
-
max_model_len: 131072
|
|
1158
|
-
max_num_seqs: 256
|
|
1159
|
-
pipeline_parallelism: true
|
|
1160
|
-
enforce_eager: false
|
|
1161
1174
|
qos: m2
|
|
1162
1175
|
time: 08:00:00
|
|
1163
1176
|
partition: a40
|
|
1177
|
+
vllm_args:
|
|
1178
|
+
--max-model-len: 65536
|
|
1179
|
+
--max-num-seqs: 256
|
|
1180
|
+
--compilation-config: 3
|
|
1164
1181
|
DeepSeek-R1-Distill-Qwen-7B:
|
|
1165
1182
|
model_family: DeepSeek-R1
|
|
1166
1183
|
model_variant: Distill-Qwen-7B
|
|
@@ -1168,13 +1185,13 @@ models:
|
|
|
1168
1185
|
gpus_per_node: 1
|
|
1169
1186
|
num_nodes: 1
|
|
1170
1187
|
vocab_size: 152064
|
|
1171
|
-
max_model_len: 131072
|
|
1172
|
-
max_num_seqs: 256
|
|
1173
|
-
pipeline_parallelism: true
|
|
1174
|
-
enforce_eager: false
|
|
1175
1188
|
qos: m2
|
|
1176
1189
|
time: 08:00:00
|
|
1177
1190
|
partition: a40
|
|
1191
|
+
vllm_args:
|
|
1192
|
+
--max-model-len: 131072
|
|
1193
|
+
--max-num-seqs: 256
|
|
1194
|
+
--compilation-config: 3
|
|
1178
1195
|
DeepSeek-R1-Distill-Qwen-1.5B:
|
|
1179
1196
|
model_family: DeepSeek-R1
|
|
1180
1197
|
model_variant: Distill-Qwen-1.5B
|
|
@@ -1182,13 +1199,13 @@ models:
|
|
|
1182
1199
|
gpus_per_node: 1
|
|
1183
1200
|
num_nodes: 1
|
|
1184
1201
|
vocab_size: 152064
|
|
1185
|
-
max_model_len: 131072
|
|
1186
|
-
max_num_seqs: 256
|
|
1187
|
-
pipeline_parallelism: true
|
|
1188
|
-
enforce_eager: false
|
|
1189
1202
|
qos: m2
|
|
1190
1203
|
time: 08:00:00
|
|
1191
1204
|
partition: a40
|
|
1205
|
+
vllm_args:
|
|
1206
|
+
--max-model-len: 131072
|
|
1207
|
+
--max-num-seqs: 256
|
|
1208
|
+
--compilation-config: 3
|
|
1192
1209
|
Phi-3.5-vision-instruct:
|
|
1193
1210
|
model_family: Phi-3.5-vision
|
|
1194
1211
|
model_variant: instruct
|
|
@@ -1196,13 +1213,14 @@ models:
|
|
|
1196
1213
|
gpus_per_node: 2
|
|
1197
1214
|
num_nodes: 1
|
|
1198
1215
|
vocab_size: 32064
|
|
1199
|
-
max_model_len: 65536
|
|
1200
|
-
max_num_seqs: 256
|
|
1201
|
-
pipeline_parallelism: true
|
|
1202
|
-
enforce_eager: false
|
|
1203
1216
|
qos: m2
|
|
1204
1217
|
time: 08:00:00
|
|
1205
1218
|
partition: a40
|
|
1219
|
+
vllm_args:
|
|
1220
|
+
--tensor-parallel-size: 2
|
|
1221
|
+
--max-model-len: 65536
|
|
1222
|
+
--max-num-seqs: 256
|
|
1223
|
+
--compilation-config: 3
|
|
1206
1224
|
InternVL2_5-8B:
|
|
1207
1225
|
model_family: InternVL2_5
|
|
1208
1226
|
model_variant: 8B
|
|
@@ -1210,13 +1228,13 @@ models:
|
|
|
1210
1228
|
gpus_per_node: 1
|
|
1211
1229
|
num_nodes: 1
|
|
1212
1230
|
vocab_size: 92553
|
|
1213
|
-
max_model_len: 32768
|
|
1214
|
-
max_num_seqs: 256
|
|
1215
|
-
pipeline_parallelism: true
|
|
1216
|
-
enforce_eager: false
|
|
1217
1231
|
qos: m2
|
|
1218
1232
|
time: 08:00:00
|
|
1219
1233
|
partition: a40
|
|
1234
|
+
vllm_args:
|
|
1235
|
+
--max-model-len: 32768
|
|
1236
|
+
--max-num-seqs: 256
|
|
1237
|
+
--compilation-config: 3
|
|
1220
1238
|
glm-4v-9b:
|
|
1221
1239
|
model_family: glm-4v
|
|
1222
1240
|
model_variant: 9b
|
|
@@ -1224,13 +1242,13 @@ models:
|
|
|
1224
1242
|
gpus_per_node: 1
|
|
1225
1243
|
num_nodes: 1
|
|
1226
1244
|
vocab_size: 151552
|
|
1227
|
-
max_model_len: 8192
|
|
1228
|
-
max_num_seqs: 256
|
|
1229
|
-
pipeline_parallelism: true
|
|
1230
|
-
enforce_eager: false
|
|
1231
1245
|
qos: m2
|
|
1232
1246
|
time: 08:00:00
|
|
1233
1247
|
partition: a40
|
|
1248
|
+
vllm_args:
|
|
1249
|
+
--max-model-len: 8192
|
|
1250
|
+
--max-num-seqs: 256
|
|
1251
|
+
--compilation-config: 3
|
|
1234
1252
|
Molmo-7B-D-0924:
|
|
1235
1253
|
model_family: Molmo
|
|
1236
1254
|
model_variant: 7B-D-0924
|
|
@@ -1238,26 +1256,27 @@ models:
|
|
|
1238
1256
|
gpus_per_node: 1
|
|
1239
1257
|
num_nodes: 1
|
|
1240
1258
|
vocab_size: 152064
|
|
1241
|
-
max_model_len: 4096
|
|
1242
|
-
max_num_seqs: 256
|
|
1243
|
-
pipeline_parallelism: true
|
|
1244
|
-
enforce_eager: false
|
|
1245
1259
|
qos: m2
|
|
1246
1260
|
time: 08:00:00
|
|
1247
1261
|
partition: a40
|
|
1262
|
+
vllm_args:
|
|
1263
|
+
--max-model-len: 4096
|
|
1264
|
+
--max-num-seqs: 256
|
|
1265
|
+
--compilation-config: 3
|
|
1248
1266
|
deepseek-vl2:
|
|
1249
1267
|
model_family: deepseek-vl2
|
|
1250
1268
|
model_type: VLM
|
|
1251
1269
|
gpus_per_node: 2
|
|
1252
1270
|
num_nodes: 1
|
|
1253
1271
|
vocab_size: 129280
|
|
1254
|
-
max_model_len: 4096
|
|
1255
|
-
max_num_seqs: 256
|
|
1256
|
-
pipeline_parallelism: true
|
|
1257
|
-
enforce_eager: false
|
|
1258
1272
|
qos: m2
|
|
1259
1273
|
time: 08:00:00
|
|
1260
1274
|
partition: a40
|
|
1275
|
+
vllm_args:
|
|
1276
|
+
--tensor-parallel-size: 2
|
|
1277
|
+
--max-model-len: 4096
|
|
1278
|
+
--max-num-seqs: 256
|
|
1279
|
+
--compilation-config: 3
|
|
1261
1280
|
deepseek-vl2-small:
|
|
1262
1281
|
model_family: deepseek-vl2
|
|
1263
1282
|
model_variant: small
|
|
@@ -1265,10 +1284,10 @@ models:
|
|
|
1265
1284
|
gpus_per_node: 1
|
|
1266
1285
|
num_nodes: 1
|
|
1267
1286
|
vocab_size: 129280
|
|
1268
|
-
max_model_len: 4096
|
|
1269
|
-
max_num_seqs: 256
|
|
1270
|
-
pipeline_parallelism: true
|
|
1271
|
-
enforce_eager: false
|
|
1272
1287
|
qos: m2
|
|
1273
1288
|
time: 08:00:00
|
|
1274
1289
|
partition: a40
|
|
1290
|
+
vllm_args:
|
|
1291
|
+
--max-model-len: 4096
|
|
1292
|
+
--max-num-seqs: 256
|
|
1293
|
+
--compilation-config: 3
|