vec-inf 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vec_inf/README.md +3 -3
- vec_inf/cli/_cli.py +214 -104
- vec_inf/cli/_helper.py +289 -564
- vec_inf/cli/_utils.py +26 -150
- vec_inf/cli/_vars.py +32 -0
- vec_inf/client/__init__.py +31 -0
- vec_inf/client/_client_vars.py +213 -0
- vec_inf/client/_exceptions.py +37 -0
- vec_inf/client/_helper.py +674 -0
- vec_inf/client/_slurm_script_generator.py +179 -0
- vec_inf/client/_utils.py +287 -0
- vec_inf/client/api.py +302 -0
- vec_inf/client/config.py +128 -0
- vec_inf/client/models.py +225 -0
- vec_inf/client/slurm_vars.py +49 -0
- vec_inf/config/README.md +0 -12
- vec_inf/config/models.yaml +417 -391
- {vec_inf-0.5.0.dist-info → vec_inf-0.6.0.dist-info}/METADATA +44 -61
- vec_inf-0.6.0.dist-info/RECORD +25 -0
- vec_inf/cli/_config.py +0 -87
- vec_inf/multinode_vllm.slurm +0 -154
- vec_inf/vllm.slurm +0 -90
- vec_inf-0.5.0.dist-info/RECORD +0 -17
- {vec_inf-0.5.0.dist-info → vec_inf-0.6.0.dist-info}/WHEEL +0 -0
- {vec_inf-0.5.0.dist-info → vec_inf-0.6.0.dist-info}/entry_points.txt +0 -0
- {vec_inf-0.5.0.dist-info → vec_inf-0.6.0.dist-info}/licenses/LICENSE +0 -0
vec_inf/config/models.yaml
CHANGED
|
@@ -6,13 +6,15 @@ models:
|
|
|
6
6
|
gpus_per_node: 4
|
|
7
7
|
num_nodes: 2
|
|
8
8
|
vocab_size: 256000
|
|
9
|
-
max_model_len: 8192
|
|
10
|
-
max_num_seqs: 256
|
|
11
|
-
pipeline_parallelism: true
|
|
12
|
-
enforce_eager: false
|
|
13
9
|
qos: m2
|
|
14
10
|
time: 08:00:00
|
|
15
11
|
partition: a40
|
|
12
|
+
vllm_args:
|
|
13
|
+
--pipeline-parallel-size: 2
|
|
14
|
+
--tensor-parallel-size: 4
|
|
15
|
+
--max-model-len: 8192
|
|
16
|
+
--max-num-seqs: 256
|
|
17
|
+
--compilation-config: 3
|
|
16
18
|
c4ai-command-r-plus-08-2024:
|
|
17
19
|
model_family: c4ai-command-r
|
|
18
20
|
model_variant: plus-08-2024
|
|
@@ -20,13 +22,15 @@ models:
|
|
|
20
22
|
gpus_per_node: 4
|
|
21
23
|
num_nodes: 2
|
|
22
24
|
vocab_size: 256000
|
|
23
|
-
max_model_len: 65536
|
|
24
|
-
max_num_seqs: 256
|
|
25
|
-
pipeline_parallelism: true
|
|
26
|
-
enforce_eager: false
|
|
27
25
|
qos: m2
|
|
28
26
|
time: 08:00:00
|
|
29
27
|
partition: a40
|
|
28
|
+
vllm_args:
|
|
29
|
+
--pipeline-parallel-size: 2
|
|
30
|
+
--tensor-parallel-size: 4
|
|
31
|
+
--max-model-len: 65536
|
|
32
|
+
--max-num-seqs: 256
|
|
33
|
+
--compilation-config: 3
|
|
30
34
|
c4ai-command-r-08-2024:
|
|
31
35
|
model_family: c4ai-command-r
|
|
32
36
|
model_variant: 08-2024
|
|
@@ -34,13 +38,14 @@ models:
|
|
|
34
38
|
gpus_per_node: 2
|
|
35
39
|
num_nodes: 1
|
|
36
40
|
vocab_size: 256000
|
|
37
|
-
max_model_len: 32768
|
|
38
|
-
max_num_seqs: 256
|
|
39
|
-
pipeline_parallelism: true
|
|
40
|
-
enforce_eager: false
|
|
41
41
|
qos: m2
|
|
42
42
|
time: 08:00:00
|
|
43
43
|
partition: a40
|
|
44
|
+
vllm_args:
|
|
45
|
+
--tensor-parallel-size: 2
|
|
46
|
+
--max-model-len: 32768
|
|
47
|
+
--max-num-seqs: 256
|
|
48
|
+
--compilation-config: 3
|
|
44
49
|
CodeLlama-7b-hf:
|
|
45
50
|
model_family: CodeLlama
|
|
46
51
|
model_variant: 7b-hf
|
|
@@ -48,13 +53,13 @@ models:
|
|
|
48
53
|
gpus_per_node: 1
|
|
49
54
|
num_nodes: 1
|
|
50
55
|
vocab_size: 32000
|
|
51
|
-
max_model_len: 16384
|
|
52
|
-
max_num_seqs: 256
|
|
53
|
-
pipeline_parallelism: true
|
|
54
|
-
enforce_eager: false
|
|
55
56
|
qos: m2
|
|
56
57
|
time: 08:00:00
|
|
57
58
|
partition: a40
|
|
59
|
+
vllm_args:
|
|
60
|
+
--max-model-len: 16384
|
|
61
|
+
--max-num-seqs: 256
|
|
62
|
+
--compilation-config: 3
|
|
58
63
|
CodeLlama-7b-Instruct-hf:
|
|
59
64
|
model_family: CodeLlama
|
|
60
65
|
model_variant: 7b-Instruct-hf
|
|
@@ -62,13 +67,13 @@ models:
|
|
|
62
67
|
gpus_per_node: 1
|
|
63
68
|
num_nodes: 1
|
|
64
69
|
vocab_size: 32000
|
|
65
|
-
max_model_len: 16384
|
|
66
|
-
max_num_seqs: 256
|
|
67
|
-
pipeline_parallelism: true
|
|
68
|
-
enforce_eager: false
|
|
69
70
|
qos: m2
|
|
70
71
|
time: 08:00:00
|
|
71
72
|
partition: a40
|
|
73
|
+
vllm_args:
|
|
74
|
+
--max-model-len: 16384
|
|
75
|
+
--max-num-seqs: 256
|
|
76
|
+
--compilation-config: 3
|
|
72
77
|
CodeLlama-13b-hf:
|
|
73
78
|
model_family: CodeLlama
|
|
74
79
|
model_variant: 13b-hf
|
|
@@ -76,13 +81,13 @@ models:
|
|
|
76
81
|
gpus_per_node: 1
|
|
77
82
|
num_nodes: 1
|
|
78
83
|
vocab_size: 32000
|
|
79
|
-
max_model_len: 16384
|
|
80
|
-
max_num_seqs: 256
|
|
81
|
-
pipeline_parallelism: true
|
|
82
|
-
enforce_eager: false
|
|
83
84
|
qos: m2
|
|
84
85
|
time: 08:00:00
|
|
85
86
|
partition: a40
|
|
87
|
+
vllm_args:
|
|
88
|
+
--max-model-len: 16384
|
|
89
|
+
--max-num-seqs: 256
|
|
90
|
+
--compilation-config: 3
|
|
86
91
|
CodeLlama-13b-Instruct-hf:
|
|
87
92
|
model_family: CodeLlama
|
|
88
93
|
model_variant: 13b-Instruct-hf
|
|
@@ -90,13 +95,13 @@ models:
|
|
|
90
95
|
gpus_per_node: 1
|
|
91
96
|
num_nodes: 1
|
|
92
97
|
vocab_size: 32000
|
|
93
|
-
max_model_len: 16384
|
|
94
|
-
max_num_seqs: 256
|
|
95
|
-
pipeline_parallelism: true
|
|
96
|
-
enforce_eager: false
|
|
97
98
|
qos: m2
|
|
98
99
|
time: 08:00:00
|
|
99
100
|
partition: a40
|
|
101
|
+
vllm_args:
|
|
102
|
+
--max-model-len: 16384
|
|
103
|
+
--max-num-seqs: 256
|
|
104
|
+
--compilation-config: 3
|
|
100
105
|
CodeLlama-34b-hf:
|
|
101
106
|
model_family: CodeLlama
|
|
102
107
|
model_variant: 34b-hf
|
|
@@ -104,13 +109,14 @@ models:
|
|
|
104
109
|
gpus_per_node: 2
|
|
105
110
|
num_nodes: 1
|
|
106
111
|
vocab_size: 32000
|
|
107
|
-
max_model_len: 16384
|
|
108
|
-
max_num_seqs: 256
|
|
109
|
-
pipeline_parallelism: true
|
|
110
|
-
enforce_eager: false
|
|
111
112
|
qos: m2
|
|
112
113
|
time: 08:00:00
|
|
113
114
|
partition: a40
|
|
115
|
+
vllm_args:
|
|
116
|
+
--tensor-parallel-size: 2
|
|
117
|
+
--max-model-len: 16384
|
|
118
|
+
--max-num-seqs: 256
|
|
119
|
+
--compilation-config: 3
|
|
114
120
|
CodeLlama-34b-Instruct-hf:
|
|
115
121
|
model_family: CodeLlama
|
|
116
122
|
model_variant: 34b-Instruct-hf
|
|
@@ -118,55 +124,44 @@ models:
|
|
|
118
124
|
gpus_per_node: 2
|
|
119
125
|
num_nodes: 1
|
|
120
126
|
vocab_size: 32000
|
|
121
|
-
max_model_len: 16384
|
|
122
|
-
max_num_seqs: 256
|
|
123
|
-
pipeline_parallelism: true
|
|
124
|
-
enforce_eager: false
|
|
125
127
|
qos: m2
|
|
126
128
|
time: 08:00:00
|
|
127
129
|
partition: a40
|
|
130
|
+
vllm_args:
|
|
131
|
+
--tensor-parallel-size: 2
|
|
132
|
+
--max-model-len: 16384
|
|
133
|
+
--max-num-seqs: 256
|
|
134
|
+
--compilation-config: 3
|
|
128
135
|
CodeLlama-70b-hf:
|
|
129
136
|
model_family: CodeLlama
|
|
130
137
|
model_variant: 70b-hf
|
|
131
138
|
model_type: LLM
|
|
132
139
|
gpus_per_node: 4
|
|
133
140
|
num_nodes: 1
|
|
134
|
-
vocab_size:
|
|
135
|
-
max_model_len: 4096
|
|
136
|
-
max_num_seqs: 256
|
|
137
|
-
pipeline_parallelism: true
|
|
138
|
-
enforce_eager: false
|
|
141
|
+
vocab_size: 32016
|
|
139
142
|
qos: m2
|
|
140
143
|
time: 08:00:00
|
|
141
144
|
partition: a40
|
|
145
|
+
vllm_args:
|
|
146
|
+
--tensor-parallel-size: 4
|
|
147
|
+
--max-model-len: 4096
|
|
148
|
+
--max-num-seqs: 256
|
|
149
|
+
--compilation-config: 3
|
|
142
150
|
CodeLlama-70b-Instruct-hf:
|
|
143
151
|
model_family: CodeLlama
|
|
144
152
|
model_variant: 70b-Instruct-hf
|
|
145
153
|
model_type: LLM
|
|
146
154
|
gpus_per_node: 4
|
|
147
155
|
num_nodes: 1
|
|
148
|
-
vocab_size:
|
|
149
|
-
max_model_len: 4096
|
|
150
|
-
max_num_seqs: 256
|
|
151
|
-
pipeline_parallelism: true
|
|
152
|
-
enforce_eager: false
|
|
153
|
-
qos: m2
|
|
154
|
-
time: 08:00:00
|
|
155
|
-
partition: a40
|
|
156
|
-
dbrx-instruct:
|
|
157
|
-
model_family: dbrx
|
|
158
|
-
model_variant: instruct
|
|
159
|
-
model_type: LLM
|
|
160
|
-
gpus_per_node: 4
|
|
161
|
-
num_nodes: 2
|
|
162
|
-
vocab_size: 100352
|
|
163
|
-
max_model_len: 32000
|
|
164
|
-
max_num_seqs: 256
|
|
165
|
-
pipeline_parallelism: true
|
|
166
|
-
enforce_eager: false
|
|
156
|
+
vocab_size: 32016
|
|
167
157
|
qos: m2
|
|
168
158
|
time: 08:00:00
|
|
169
159
|
partition: a40
|
|
160
|
+
vllm_args:
|
|
161
|
+
--tensor-parallel-size: 4
|
|
162
|
+
--max-model-len: 4096
|
|
163
|
+
--max-num-seqs: 256
|
|
164
|
+
--compilation-config: 3
|
|
170
165
|
gemma-2-9b:
|
|
171
166
|
model_family: gemma-2
|
|
172
167
|
model_variant: 9b
|
|
@@ -174,13 +169,13 @@ models:
|
|
|
174
169
|
gpus_per_node: 1
|
|
175
170
|
num_nodes: 1
|
|
176
171
|
vocab_size: 256000
|
|
177
|
-
max_model_len: 4096
|
|
178
|
-
max_num_seqs: 256
|
|
179
|
-
pipeline_parallelism: true
|
|
180
|
-
enforce_eager: false
|
|
181
172
|
qos: m2
|
|
182
173
|
time: 08:00:00
|
|
183
174
|
partition: a40
|
|
175
|
+
vllm_args:
|
|
176
|
+
--max-model-len: 4096
|
|
177
|
+
--max-num-seqs: 256
|
|
178
|
+
--compilation-config: 3
|
|
184
179
|
gemma-2-9b-it:
|
|
185
180
|
model_family: gemma-2
|
|
186
181
|
model_variant: 9b-it
|
|
@@ -188,13 +183,13 @@ models:
|
|
|
188
183
|
gpus_per_node: 1
|
|
189
184
|
num_nodes: 1
|
|
190
185
|
vocab_size: 256000
|
|
191
|
-
max_model_len: 4096
|
|
192
|
-
max_num_seqs: 256
|
|
193
|
-
pipeline_parallelism: true
|
|
194
|
-
enforce_eager: false
|
|
195
186
|
qos: m2
|
|
196
187
|
time: 08:00:00
|
|
197
188
|
partition: a40
|
|
189
|
+
vllm_args:
|
|
190
|
+
--max-model-len: 4096
|
|
191
|
+
--max-num-seqs: 256
|
|
192
|
+
--compilation-config: 3
|
|
198
193
|
gemma-2-27b:
|
|
199
194
|
model_family: gemma-2
|
|
200
195
|
model_variant: 27b
|
|
@@ -202,13 +197,14 @@ models:
|
|
|
202
197
|
gpus_per_node: 2
|
|
203
198
|
num_nodes: 1
|
|
204
199
|
vocab_size: 256000
|
|
205
|
-
max_model_len: 4096
|
|
206
|
-
max_num_seqs: 256
|
|
207
|
-
pipeline_parallelism: true
|
|
208
|
-
enforce_eager: false
|
|
209
200
|
qos: m2
|
|
210
201
|
time: 08:00:00
|
|
211
202
|
partition: a40
|
|
203
|
+
vllm_args:
|
|
204
|
+
--tensor-parallel-size: 2
|
|
205
|
+
--max-model-len: 4096
|
|
206
|
+
--max-num-seqs: 256
|
|
207
|
+
--compilation-config: 3
|
|
212
208
|
gemma-2-27b-it:
|
|
213
209
|
model_family: gemma-2
|
|
214
210
|
model_variant: 27b-it
|
|
@@ -216,13 +212,14 @@ models:
|
|
|
216
212
|
gpus_per_node: 2
|
|
217
213
|
num_nodes: 1
|
|
218
214
|
vocab_size: 256000
|
|
219
|
-
max_model_len: 4096
|
|
220
|
-
max_num_seqs: 256
|
|
221
|
-
pipeline_parallelism: true
|
|
222
|
-
enforce_eager: false
|
|
223
215
|
qos: m2
|
|
224
216
|
time: 08:00:00
|
|
225
217
|
partition: a40
|
|
218
|
+
vllm_args:
|
|
219
|
+
--tensor-parallel-size: 2
|
|
220
|
+
--max-model-len: 4096
|
|
221
|
+
--max-num-seqs: 256
|
|
222
|
+
--compilation-config: 3
|
|
226
223
|
Llama-2-7b-hf:
|
|
227
224
|
model_family: Llama-2
|
|
228
225
|
model_variant: 7b-hf
|
|
@@ -230,13 +227,13 @@ models:
|
|
|
230
227
|
gpus_per_node: 1
|
|
231
228
|
num_nodes: 1
|
|
232
229
|
vocab_size: 32000
|
|
233
|
-
max_model_len: 4096
|
|
234
|
-
max_num_seqs: 256
|
|
235
|
-
pipeline_parallelism: true
|
|
236
|
-
enforce_eager: false
|
|
237
230
|
qos: m2
|
|
238
231
|
time: 08:00:00
|
|
239
232
|
partition: a40
|
|
233
|
+
vllm_args:
|
|
234
|
+
--max-model-len: 4096
|
|
235
|
+
--max-num-seqs: 256
|
|
236
|
+
--compilation-config: 3
|
|
240
237
|
Llama-2-7b-chat-hf:
|
|
241
238
|
model_family: Llama-2
|
|
242
239
|
model_variant: 7b-chat-hf
|
|
@@ -244,13 +241,13 @@ models:
|
|
|
244
241
|
gpus_per_node: 1
|
|
245
242
|
num_nodes: 1
|
|
246
243
|
vocab_size: 32000
|
|
247
|
-
max_model_len: 4096
|
|
248
|
-
max_num_seqs: 256
|
|
249
|
-
pipeline_parallelism: true
|
|
250
|
-
enforce_eager: false
|
|
251
244
|
qos: m2
|
|
252
245
|
time: 08:00:00
|
|
253
246
|
partition: a40
|
|
247
|
+
vllm_args:
|
|
248
|
+
--max-model-len: 4096
|
|
249
|
+
--max-num-seqs: 256
|
|
250
|
+
--compilation-config: 3
|
|
254
251
|
Llama-2-13b-hf:
|
|
255
252
|
model_family: Llama-2
|
|
256
253
|
model_variant: 13b-hf
|
|
@@ -258,13 +255,13 @@ models:
|
|
|
258
255
|
gpus_per_node: 1
|
|
259
256
|
num_nodes: 1
|
|
260
257
|
vocab_size: 32000
|
|
261
|
-
max_model_len: 4096
|
|
262
|
-
max_num_seqs: 256
|
|
263
|
-
pipeline_parallelism: true
|
|
264
|
-
enforce_eager: false
|
|
265
258
|
qos: m2
|
|
266
259
|
time: 08:00:00
|
|
267
260
|
partition: a40
|
|
261
|
+
vllm_args:
|
|
262
|
+
--max-model-len: 4096
|
|
263
|
+
--max-num-seqs: 256
|
|
264
|
+
--compilation-config: 3
|
|
268
265
|
Llama-2-13b-chat-hf:
|
|
269
266
|
model_family: Llama-2
|
|
270
267
|
model_variant: 13b-chat-hf
|
|
@@ -272,13 +269,13 @@ models:
|
|
|
272
269
|
gpus_per_node: 1
|
|
273
270
|
num_nodes: 1
|
|
274
271
|
vocab_size: 32000
|
|
275
|
-
max_model_len: 4096
|
|
276
|
-
max_num_seqs: 256
|
|
277
|
-
pipeline_parallelism: true
|
|
278
|
-
enforce_eager: false
|
|
279
272
|
qos: m2
|
|
280
273
|
time: 08:00:00
|
|
281
274
|
partition: a40
|
|
275
|
+
vllm_args:
|
|
276
|
+
--max-model-len: 4096
|
|
277
|
+
--max-num-seqs: 256
|
|
278
|
+
--compilation-config: 3
|
|
282
279
|
Llama-2-70b-hf:
|
|
283
280
|
model_family: Llama-2
|
|
284
281
|
model_variant: 70b-hf
|
|
@@ -286,13 +283,14 @@ models:
|
|
|
286
283
|
gpus_per_node: 4
|
|
287
284
|
num_nodes: 1
|
|
288
285
|
vocab_size: 32000
|
|
289
|
-
max_model_len: 4096
|
|
290
|
-
max_num_seqs: 256
|
|
291
|
-
pipeline_parallelism: true
|
|
292
|
-
enforce_eager: false
|
|
293
286
|
qos: m2
|
|
294
287
|
time: 08:00:00
|
|
295
288
|
partition: a40
|
|
289
|
+
vllm_args:
|
|
290
|
+
--tensor-parallel-size: 4
|
|
291
|
+
--max-model-len: 4096
|
|
292
|
+
--max-num-seqs: 256
|
|
293
|
+
--compilation-config: 3
|
|
296
294
|
Llama-2-70b-chat-hf:
|
|
297
295
|
model_family: Llama-2
|
|
298
296
|
model_variant: 70b-chat-hf
|
|
@@ -300,13 +298,14 @@ models:
|
|
|
300
298
|
gpus_per_node: 4
|
|
301
299
|
num_nodes: 1
|
|
302
300
|
vocab_size: 32000
|
|
303
|
-
max_model_len: 4096
|
|
304
|
-
max_num_seqs: 256
|
|
305
|
-
pipeline_parallelism: true
|
|
306
|
-
enforce_eager: false
|
|
307
301
|
qos: m2
|
|
308
302
|
time: 08:00:00
|
|
309
303
|
partition: a40
|
|
304
|
+
vllm_args:
|
|
305
|
+
--tensor-parallel-size: 4
|
|
306
|
+
--max-model-len: 4096
|
|
307
|
+
--max-num-seqs: 256
|
|
308
|
+
--compilation-config: 3
|
|
310
309
|
llava-1.5-7b-hf:
|
|
311
310
|
model_family: llava-1.5
|
|
312
311
|
model_variant: 7b-hf
|
|
@@ -314,13 +313,13 @@ models:
|
|
|
314
313
|
gpus_per_node: 1
|
|
315
314
|
num_nodes: 1
|
|
316
315
|
vocab_size: 32000
|
|
317
|
-
max_model_len: 4096
|
|
318
|
-
max_num_seqs: 256
|
|
319
|
-
pipeline_parallelism: true
|
|
320
|
-
enforce_eager: false
|
|
321
316
|
qos: m2
|
|
322
317
|
time: 08:00:00
|
|
323
318
|
partition: a40
|
|
319
|
+
vllm_args:
|
|
320
|
+
--max-model-len: 4096
|
|
321
|
+
--max-num-seqs: 256
|
|
322
|
+
--compilation-config: 3
|
|
324
323
|
llava-1.5-13b-hf:
|
|
325
324
|
model_family: llava-1.5
|
|
326
325
|
model_variant: 13b-hf
|
|
@@ -328,13 +327,13 @@ models:
|
|
|
328
327
|
gpus_per_node: 1
|
|
329
328
|
num_nodes: 1
|
|
330
329
|
vocab_size: 32000
|
|
331
|
-
max_model_len: 4096
|
|
332
|
-
max_num_seqs: 256
|
|
333
|
-
pipeline_parallelism: true
|
|
334
|
-
enforce_eager: false
|
|
335
330
|
qos: m2
|
|
336
331
|
time: 08:00:00
|
|
337
332
|
partition: a40
|
|
333
|
+
vllm_args:
|
|
334
|
+
--max-model-len: 4096
|
|
335
|
+
--max-num-seqs: 256
|
|
336
|
+
--compilation-config: 3
|
|
338
337
|
llava-v1.6-mistral-7b-hf:
|
|
339
338
|
model_family: llava-v1.6
|
|
340
339
|
model_variant: mistral-7b-hf
|
|
@@ -342,13 +341,13 @@ models:
|
|
|
342
341
|
gpus_per_node: 1
|
|
343
342
|
num_nodes: 1
|
|
344
343
|
vocab_size: 32064
|
|
345
|
-
max_model_len: 32768
|
|
346
|
-
max_num_seqs: 256
|
|
347
|
-
pipeline_parallelism: true
|
|
348
|
-
enforce_eager: false
|
|
349
344
|
qos: m2
|
|
350
345
|
time: 08:00:00
|
|
351
346
|
partition: a40
|
|
347
|
+
vllm_args:
|
|
348
|
+
--max-model-len: 32768
|
|
349
|
+
--max-num-seqs: 256
|
|
350
|
+
--compilation-config: 3
|
|
352
351
|
llava-v1.6-34b-hf:
|
|
353
352
|
model_family: llava-v1.6
|
|
354
353
|
model_variant: 34b-hf
|
|
@@ -356,13 +355,14 @@ models:
|
|
|
356
355
|
gpus_per_node: 2
|
|
357
356
|
num_nodes: 1
|
|
358
357
|
vocab_size: 64064
|
|
359
|
-
max_model_len: 4096
|
|
360
|
-
max_num_seqs: 256
|
|
361
|
-
pipeline_parallelism: true
|
|
362
|
-
enforce_eager: false
|
|
363
358
|
qos: m2
|
|
364
359
|
time: 08:00:00
|
|
365
360
|
partition: a40
|
|
361
|
+
vllm_args:
|
|
362
|
+
--tensor-parallel-size: 2
|
|
363
|
+
--max-model-len: 4096
|
|
364
|
+
--max-num-seqs: 256
|
|
365
|
+
--compilation-config: 3
|
|
366
366
|
Meta-Llama-3-8B:
|
|
367
367
|
model_family: Meta-Llama-3
|
|
368
368
|
model_variant: 8B
|
|
@@ -370,13 +370,13 @@ models:
|
|
|
370
370
|
gpus_per_node: 1
|
|
371
371
|
num_nodes: 1
|
|
372
372
|
vocab_size: 128256
|
|
373
|
-
max_model_len: 8192
|
|
374
|
-
max_num_seqs: 256
|
|
375
|
-
pipeline_parallelism: true
|
|
376
|
-
enforce_eager: false
|
|
377
373
|
qos: m2
|
|
378
374
|
time: 08:00:00
|
|
379
375
|
partition: a40
|
|
376
|
+
vllm_args:
|
|
377
|
+
--max-model-len: 8192
|
|
378
|
+
--max-num-seqs: 256
|
|
379
|
+
--compilation-config: 3
|
|
380
380
|
Meta-Llama-3-8B-Instruct:
|
|
381
381
|
model_family: Meta-Llama-3
|
|
382
382
|
model_variant: 8B-Instruct
|
|
@@ -384,13 +384,13 @@ models:
|
|
|
384
384
|
gpus_per_node: 1
|
|
385
385
|
num_nodes: 1
|
|
386
386
|
vocab_size: 128256
|
|
387
|
-
max_model_len: 8192
|
|
388
|
-
max_num_seqs: 256
|
|
389
|
-
pipeline_parallelism: true
|
|
390
|
-
enforce_eager: false
|
|
391
387
|
qos: m2
|
|
392
388
|
time: 08:00:00
|
|
393
389
|
partition: a40
|
|
390
|
+
vllm_args:
|
|
391
|
+
--max-model-len: 8192
|
|
392
|
+
--max-num-seqs: 256
|
|
393
|
+
--compilation-config: 3
|
|
394
394
|
Meta-Llama-3-70B:
|
|
395
395
|
model_family: Meta-Llama-3
|
|
396
396
|
model_variant: 70B
|
|
@@ -398,13 +398,14 @@ models:
|
|
|
398
398
|
gpus_per_node: 4
|
|
399
399
|
num_nodes: 1
|
|
400
400
|
vocab_size: 128256
|
|
401
|
-
max_model_len: 8192
|
|
402
|
-
max_num_seqs: 256
|
|
403
|
-
pipeline_parallelism: true
|
|
404
|
-
enforce_eager: false
|
|
405
401
|
qos: m2
|
|
406
402
|
time: 08:00:00
|
|
407
403
|
partition: a40
|
|
404
|
+
vllm_args:
|
|
405
|
+
--tensor-parallel-size: 4
|
|
406
|
+
--max-model-len: 8192
|
|
407
|
+
--max-num-seqs: 256
|
|
408
|
+
--compilation-config: 3
|
|
408
409
|
Meta-Llama-3-70B-Instruct:
|
|
409
410
|
model_family: Meta-Llama-3
|
|
410
411
|
model_variant: 70B-Instruct
|
|
@@ -412,13 +413,14 @@ models:
|
|
|
412
413
|
gpus_per_node: 4
|
|
413
414
|
num_nodes: 1
|
|
414
415
|
vocab_size: 128256
|
|
415
|
-
max_model_len: 8192
|
|
416
|
-
max_num_seqs: 256
|
|
417
|
-
pipeline_parallelism: true
|
|
418
|
-
enforce_eager: false
|
|
419
416
|
qos: m2
|
|
420
417
|
time: 08:00:00
|
|
421
418
|
partition: a40
|
|
419
|
+
vllm_args:
|
|
420
|
+
--tensor-parallel-size: 4
|
|
421
|
+
--max-model-len: 8192
|
|
422
|
+
--max-num-seqs: 256
|
|
423
|
+
--compilation-config: 3
|
|
422
424
|
Meta-Llama-3.1-8B:
|
|
423
425
|
model_family: Meta-Llama-3.1
|
|
424
426
|
model_variant: 8B
|
|
@@ -426,13 +428,13 @@ models:
|
|
|
426
428
|
gpus_per_node: 1
|
|
427
429
|
num_nodes: 1
|
|
428
430
|
vocab_size: 128256
|
|
429
|
-
max_model_len: 131072
|
|
430
|
-
max_num_seqs: 256
|
|
431
|
-
pipeline_parallelism: true
|
|
432
|
-
enforce_eager: false
|
|
433
431
|
qos: m2
|
|
434
432
|
time: 08:00:00
|
|
435
433
|
partition: a40
|
|
434
|
+
vllm_args:
|
|
435
|
+
--max-model-len: 131072
|
|
436
|
+
--max-num-seqs: 256
|
|
437
|
+
--compilation-config: 3
|
|
436
438
|
Meta-Llama-3.1-8B-Instruct:
|
|
437
439
|
model_family: Meta-Llama-3.1
|
|
438
440
|
model_variant: 8B-Instruct
|
|
@@ -440,13 +442,13 @@ models:
|
|
|
440
442
|
gpus_per_node: 1
|
|
441
443
|
num_nodes: 1
|
|
442
444
|
vocab_size: 128256
|
|
443
|
-
max_model_len: 131072
|
|
444
|
-
max_num_seqs: 256
|
|
445
|
-
pipeline_parallelism: true
|
|
446
|
-
enforce_eager: false
|
|
447
445
|
qos: m2
|
|
448
446
|
time: 08:00:00
|
|
449
447
|
partition: a40
|
|
448
|
+
vllm_args:
|
|
449
|
+
--max-model-len: 131072
|
|
450
|
+
--max-num-seqs: 256
|
|
451
|
+
--compilation-config: 3
|
|
450
452
|
Meta-Llama-3.1-70B:
|
|
451
453
|
model_family: Meta-Llama-3.1
|
|
452
454
|
model_variant: 70B
|
|
@@ -454,13 +456,14 @@ models:
|
|
|
454
456
|
gpus_per_node: 4
|
|
455
457
|
num_nodes: 1
|
|
456
458
|
vocab_size: 128256
|
|
457
|
-
max_model_len: 65536
|
|
458
|
-
max_num_seqs: 256
|
|
459
|
-
pipeline_parallelism: true
|
|
460
|
-
enforce_eager: false
|
|
461
459
|
qos: m2
|
|
462
460
|
time: 08:00:00
|
|
463
461
|
partition: a40
|
|
462
|
+
vllm_args:
|
|
463
|
+
--tensor-parallel-size: 4
|
|
464
|
+
--max-model-len: 65536
|
|
465
|
+
--max-num-seqs: 256
|
|
466
|
+
--compilation-config: 3
|
|
464
467
|
Meta-Llama-3.1-70B-Instruct:
|
|
465
468
|
model_family: Meta-Llama-3.1
|
|
466
469
|
model_variant: 70B-Instruct
|
|
@@ -468,13 +471,14 @@ models:
|
|
|
468
471
|
gpus_per_node: 4
|
|
469
472
|
num_nodes: 1
|
|
470
473
|
vocab_size: 128256
|
|
471
|
-
max_model_len: 65536
|
|
472
|
-
max_num_seqs: 256
|
|
473
|
-
pipeline_parallelism: true
|
|
474
|
-
enforce_eager: false
|
|
475
474
|
qos: m2
|
|
476
475
|
time: 08:00:00
|
|
477
476
|
partition: a40
|
|
477
|
+
vllm_args:
|
|
478
|
+
--tensor-parallel-size: 4
|
|
479
|
+
--max-model-len: 65536
|
|
480
|
+
--max-num-seqs: 256
|
|
481
|
+
--compilation-config: 3
|
|
478
482
|
Meta-Llama-3.1-405B-Instruct:
|
|
479
483
|
model_family: Meta-Llama-3.1
|
|
480
484
|
model_variant: 405B-Instruct
|
|
@@ -482,27 +486,15 @@ models:
|
|
|
482
486
|
gpus_per_node: 4
|
|
483
487
|
num_nodes: 8
|
|
484
488
|
vocab_size: 128256
|
|
485
|
-
max_model_len: 16384
|
|
486
|
-
max_num_seqs: 256
|
|
487
|
-
pipeline_parallelism: true
|
|
488
|
-
enforce_eager: false
|
|
489
489
|
qos: m4
|
|
490
490
|
time: 02:00:00
|
|
491
491
|
partition: a40
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
vocab_size: 32000
|
|
499
|
-
max_model_len: 32768
|
|
500
|
-
max_num_seqs: 256
|
|
501
|
-
pipeline_parallelism: true
|
|
502
|
-
enforce_eager: false
|
|
503
|
-
qos: m2
|
|
504
|
-
time: 08:00:00
|
|
505
|
-
partition: a40
|
|
492
|
+
vllm_args:
|
|
493
|
+
--pipeline-parallel-size: 8
|
|
494
|
+
--tensor-parallel-size: 4
|
|
495
|
+
--max-model-len: 16384
|
|
496
|
+
--max-num-seqs: 256
|
|
497
|
+
--compilation-config: 3
|
|
506
498
|
Mistral-7B-Instruct-v0.1:
|
|
507
499
|
model_family: Mistral
|
|
508
500
|
model_variant: 7B-Instruct-v0.1
|
|
@@ -510,13 +502,13 @@ models:
|
|
|
510
502
|
gpus_per_node: 1
|
|
511
503
|
num_nodes: 1
|
|
512
504
|
vocab_size: 32000
|
|
513
|
-
max_model_len: 32768
|
|
514
|
-
max_num_seqs: 256
|
|
515
|
-
pipeline_parallelism: true
|
|
516
|
-
enforce_eager: false
|
|
517
505
|
qos: m2
|
|
518
506
|
time: 08:00:00
|
|
519
507
|
partition: a40
|
|
508
|
+
vllm_args:
|
|
509
|
+
--max-model-len: 32768
|
|
510
|
+
--max-num-seqs: 256
|
|
511
|
+
--compilation-config: 3
|
|
520
512
|
Mistral-7B-Instruct-v0.2:
|
|
521
513
|
model_family: Mistral
|
|
522
514
|
model_variant: 7B-Instruct-v0.2
|
|
@@ -524,13 +516,13 @@ models:
|
|
|
524
516
|
gpus_per_node: 1
|
|
525
517
|
num_nodes: 1
|
|
526
518
|
vocab_size: 32000
|
|
527
|
-
max_model_len: 32768
|
|
528
|
-
max_num_seqs: 256
|
|
529
|
-
pipeline_parallelism: true
|
|
530
|
-
enforce_eager: false
|
|
531
519
|
qos: m2
|
|
532
520
|
time: 08:00:00
|
|
533
521
|
partition: a40
|
|
522
|
+
vllm_args:
|
|
523
|
+
--max-model-len: 32768
|
|
524
|
+
--max-num-seqs: 256
|
|
525
|
+
--compilation-config: 3
|
|
534
526
|
Mistral-7B-v0.3:
|
|
535
527
|
model_family: Mistral
|
|
536
528
|
model_variant: 7B-v0.3
|
|
@@ -538,13 +530,13 @@ models:
|
|
|
538
530
|
gpus_per_node: 1
|
|
539
531
|
num_nodes: 1
|
|
540
532
|
vocab_size: 32768
|
|
541
|
-
max_model_len: 32768
|
|
542
|
-
max_num_seqs: 256
|
|
543
|
-
pipeline_parallelism: true
|
|
544
|
-
enforce_eager: false
|
|
545
533
|
qos: m2
|
|
546
534
|
time: 08:00:00
|
|
547
535
|
partition: a40
|
|
536
|
+
vllm_args:
|
|
537
|
+
--max-model-len: 32768
|
|
538
|
+
--max-num-seqs: 256
|
|
539
|
+
--compilation-config: 3
|
|
548
540
|
Mistral-7B-Instruct-v0.3:
|
|
549
541
|
model_family: Mistral
|
|
550
542
|
model_variant: 7B-Instruct-v0.3
|
|
@@ -552,13 +544,13 @@ models:
|
|
|
552
544
|
gpus_per_node: 1
|
|
553
545
|
num_nodes: 1
|
|
554
546
|
vocab_size: 32768
|
|
555
|
-
max_model_len: 32768
|
|
556
|
-
max_num_seqs: 256
|
|
557
|
-
pipeline_parallelism: true
|
|
558
|
-
enforce_eager: false
|
|
559
547
|
qos: m2
|
|
560
548
|
time: 08:00:00
|
|
561
549
|
partition: a40
|
|
550
|
+
vllm_args:
|
|
551
|
+
--max-model-len: 32768
|
|
552
|
+
--max-num-seqs: 256
|
|
553
|
+
--compilation-config: 3
|
|
562
554
|
Mistral-Large-Instruct-2407:
|
|
563
555
|
model_family: Mistral
|
|
564
556
|
model_variant: Large-Instruct-2407
|
|
@@ -566,13 +558,15 @@ models:
|
|
|
566
558
|
gpus_per_node: 4
|
|
567
559
|
num_nodes: 2
|
|
568
560
|
vocab_size: 32768
|
|
569
|
-
max_model_len: 32768
|
|
570
|
-
max_num_seqs: 256
|
|
571
|
-
pipeline_parallelism: true
|
|
572
|
-
enforce_eager: false
|
|
573
561
|
qos: m2
|
|
574
562
|
time: 08:00:00
|
|
575
563
|
partition: a40
|
|
564
|
+
vllm_args:
|
|
565
|
+
--pipeline-parallel-size: 2
|
|
566
|
+
--tensor-parallel-size: 4
|
|
567
|
+
--max-model-len: 32768
|
|
568
|
+
--max-num-seqs: 256
|
|
569
|
+
--compilation-config: 3
|
|
576
570
|
Mistral-Large-Instruct-2411:
|
|
577
571
|
model_family: Mistral
|
|
578
572
|
model_variant: Large-Instruct-2411
|
|
@@ -580,13 +574,15 @@ models:
|
|
|
580
574
|
gpus_per_node: 4
|
|
581
575
|
num_nodes: 2
|
|
582
576
|
vocab_size: 32768
|
|
583
|
-
max_model_len: 32768
|
|
584
|
-
max_num_seqs: 256
|
|
585
|
-
pipeline_parallelism: true
|
|
586
|
-
enforce_eager: false
|
|
587
577
|
qos: m2
|
|
588
578
|
time: 08:00:00
|
|
589
579
|
partition: a40
|
|
580
|
+
vllm_args:
|
|
581
|
+
--pipeline-parallel-size: 2
|
|
582
|
+
--tensor-parallel-size: 4
|
|
583
|
+
--max-model-len: 32768
|
|
584
|
+
--max-num-seqs: 256
|
|
585
|
+
--compilation-config: 3
|
|
590
586
|
Mixtral-8x7B-Instruct-v0.1:
|
|
591
587
|
model_family: Mixtral
|
|
592
588
|
model_variant: 8x7B-Instruct-v0.1
|
|
@@ -594,13 +590,14 @@ models:
|
|
|
594
590
|
gpus_per_node: 4
|
|
595
591
|
num_nodes: 1
|
|
596
592
|
vocab_size: 32000
|
|
597
|
-
max_model_len: 32768
|
|
598
|
-
max_num_seqs: 256
|
|
599
|
-
pipeline_parallelism: true
|
|
600
|
-
enforce_eager: false
|
|
601
593
|
qos: m2
|
|
602
594
|
time: 08:00:00
|
|
603
595
|
partition: a40
|
|
596
|
+
vllm_args:
|
|
597
|
+
--tensor-parallel-size: 4
|
|
598
|
+
--max-model-len: 32768
|
|
599
|
+
--max-num-seqs: 256
|
|
600
|
+
--compilation-config: 3
|
|
604
601
|
Mixtral-8x22B-v0.1:
|
|
605
602
|
model_family: Mixtral
|
|
606
603
|
model_variant: 8x22B-v0.1
|
|
@@ -608,13 +605,15 @@ models:
|
|
|
608
605
|
gpus_per_node: 4
|
|
609
606
|
num_nodes: 2
|
|
610
607
|
vocab_size: 32768
|
|
611
|
-
max_model_len: 65536
|
|
612
|
-
max_num_seqs: 256
|
|
613
|
-
pipeline_parallelism: true
|
|
614
|
-
enforce_eager: false
|
|
615
608
|
qos: m2
|
|
616
609
|
time: 08:00:00
|
|
617
610
|
partition: a40
|
|
611
|
+
vllm_args:
|
|
612
|
+
--pipeline-parallel-size: 2
|
|
613
|
+
--tensor-parallel-size: 4
|
|
614
|
+
--max-model-len: 65536
|
|
615
|
+
--max-num-seqs: 256
|
|
616
|
+
--compilation-config: 3
|
|
618
617
|
Mixtral-8x22B-Instruct-v0.1:
|
|
619
618
|
model_family: Mixtral
|
|
620
619
|
model_variant: 8x22B-Instruct-v0.1
|
|
@@ -622,13 +621,15 @@ models:
|
|
|
622
621
|
gpus_per_node: 4
|
|
623
622
|
num_nodes: 2
|
|
624
623
|
vocab_size: 32768
|
|
625
|
-
max_model_len: 65536
|
|
626
|
-
max_num_seqs: 256
|
|
627
|
-
pipeline_parallelism: true
|
|
628
|
-
enforce_eager: false
|
|
629
624
|
qos: m2
|
|
630
625
|
time: 08:00:00
|
|
631
626
|
partition: a40
|
|
627
|
+
vllm_args:
|
|
628
|
+
--pipeline-parallel-size: 2
|
|
629
|
+
--tensor-parallel-size: 4
|
|
630
|
+
--max-model-len: 65536
|
|
631
|
+
--max-num-seqs: 256
|
|
632
|
+
--compilation-config: 3
|
|
632
633
|
Phi-3-medium-128k-instruct:
|
|
633
634
|
model_family: Phi-3
|
|
634
635
|
model_variant: medium-128k-instruct
|
|
@@ -636,13 +637,14 @@ models:
|
|
|
636
637
|
gpus_per_node: 2
|
|
637
638
|
num_nodes: 1
|
|
638
639
|
vocab_size: 32064
|
|
639
|
-
max_model_len: 131072
|
|
640
|
-
max_num_seqs: 256
|
|
641
|
-
pipeline_parallelism: true
|
|
642
|
-
enforce_eager: false
|
|
643
640
|
qos: m2
|
|
644
641
|
time: 08:00:00
|
|
645
642
|
partition: a40
|
|
643
|
+
vllm_args:
|
|
644
|
+
--tensor-parallel-size: 2
|
|
645
|
+
--max-model-len: 131072
|
|
646
|
+
--max-num-seqs: 256
|
|
647
|
+
--compilation-config: 3
|
|
646
648
|
Phi-3-vision-128k-instruct:
|
|
647
649
|
model_family: Phi-3-vision
|
|
648
650
|
model_variant: 128k-instruct
|
|
@@ -650,13 +652,14 @@ models:
|
|
|
650
652
|
gpus_per_node: 2
|
|
651
653
|
num_nodes: 1
|
|
652
654
|
vocab_size: 32064
|
|
653
|
-
max_model_len: 65536
|
|
654
|
-
max_num_seqs: 256
|
|
655
|
-
pipeline_parallelism: true
|
|
656
|
-
enforce_eager: false
|
|
657
655
|
qos: m2
|
|
658
656
|
time: 08:00:00
|
|
659
657
|
partition: a40
|
|
658
|
+
vllm_args:
|
|
659
|
+
--tensor-parallel-size: 2
|
|
660
|
+
--max-model-len: 65536
|
|
661
|
+
--max-num-seqs: 256
|
|
662
|
+
--compilation-config: 3
|
|
660
663
|
Llama3-OpenBioLLM-70B:
|
|
661
664
|
model_family: Llama3-OpenBioLLM
|
|
662
665
|
model_variant: 70B
|
|
@@ -664,13 +667,14 @@ models:
|
|
|
664
667
|
gpus_per_node: 4
|
|
665
668
|
num_nodes: 1
|
|
666
669
|
vocab_size: 128256
|
|
667
|
-
max_model_len: 8192
|
|
668
|
-
max_num_seqs: 256
|
|
669
|
-
pipeline_parallelism: true
|
|
670
|
-
enforce_eager: false
|
|
671
670
|
qos: m2
|
|
672
671
|
time: 08:00:00
|
|
673
672
|
partition: a40
|
|
673
|
+
vllm_args:
|
|
674
|
+
--tensor-parallel-size: 4
|
|
675
|
+
--max-model-len: 8192
|
|
676
|
+
--max-num-seqs: 256
|
|
677
|
+
--compilation-config: 3
|
|
674
678
|
Llama-3.1-Nemotron-70B-Instruct-HF:
|
|
675
679
|
model_family: Llama-3.1-Nemotron
|
|
676
680
|
model_variant: 70B-Instruct-HF
|
|
@@ -678,13 +682,14 @@ models:
|
|
|
678
682
|
gpus_per_node: 4
|
|
679
683
|
num_nodes: 1
|
|
680
684
|
vocab_size: 128256
|
|
681
|
-
max_model_len: 65536
|
|
682
|
-
max_num_seqs: 256
|
|
683
|
-
pipeline_parallelism: true
|
|
684
|
-
enforce_eager: false
|
|
685
685
|
qos: m2
|
|
686
686
|
time: 08:00:00
|
|
687
687
|
partition: a40
|
|
688
|
+
vllm_args:
|
|
689
|
+
--tensor-parallel-size: 4
|
|
690
|
+
--max-model-len: 65536
|
|
691
|
+
--max-num-seqs: 256
|
|
692
|
+
--compilation-config: 3
|
|
688
693
|
Llama-3.2-1B:
|
|
689
694
|
model_family: Llama-3.2
|
|
690
695
|
model_variant: 1B
|
|
@@ -692,13 +697,13 @@ models:
|
|
|
692
697
|
gpus_per_node: 1
|
|
693
698
|
num_nodes: 1
|
|
694
699
|
vocab_size: 128256
|
|
695
|
-
max_model_len: 131072
|
|
696
|
-
max_num_seqs: 256
|
|
697
|
-
pipeline_parallelism: true
|
|
698
|
-
enforce_eager: false
|
|
699
700
|
qos: m2
|
|
700
701
|
time: 08:00:00
|
|
701
702
|
partition: a40
|
|
703
|
+
vllm_args:
|
|
704
|
+
--max-model-len: 131072
|
|
705
|
+
--max-num-seqs: 256
|
|
706
|
+
--compilation-config: 3
|
|
702
707
|
Llama-3.2-1B-Instruct:
|
|
703
708
|
model_family: Llama-3.2
|
|
704
709
|
model_variant: 1B-Instruct
|
|
@@ -706,13 +711,13 @@ models:
|
|
|
706
711
|
gpus_per_node: 1
|
|
707
712
|
num_nodes: 1
|
|
708
713
|
vocab_size: 128256
|
|
709
|
-
max_model_len: 131072
|
|
710
|
-
max_num_seqs: 256
|
|
711
|
-
pipeline_parallelism: true
|
|
712
|
-
enforce_eager: false
|
|
713
714
|
qos: m2
|
|
714
715
|
time: 08:00:00
|
|
715
716
|
partition: a40
|
|
717
|
+
vllm_args:
|
|
718
|
+
--max-model-len: 131072
|
|
719
|
+
--max-num-seqs: 256
|
|
720
|
+
--compilation-config: 3
|
|
716
721
|
Llama-3.2-3B:
|
|
717
722
|
model_family: Llama-3.2
|
|
718
723
|
model_variant: 3B
|
|
@@ -720,13 +725,13 @@ models:
|
|
|
720
725
|
gpus_per_node: 1
|
|
721
726
|
num_nodes: 1
|
|
722
727
|
vocab_size: 128256
|
|
723
|
-
max_model_len: 131072
|
|
724
|
-
max_num_seqs: 256
|
|
725
|
-
pipeline_parallelism: true
|
|
726
|
-
enforce_eager: false
|
|
727
728
|
qos: m2
|
|
728
729
|
time: 08:00:00
|
|
729
730
|
partition: a40
|
|
731
|
+
vllm_args:
|
|
732
|
+
--max-model-len: 131072
|
|
733
|
+
--max-num-seqs: 256
|
|
734
|
+
--compilation-config: 3
|
|
730
735
|
Llama-3.2-3B-Instruct:
|
|
731
736
|
model_family: Llama-3.2
|
|
732
737
|
model_variant: 3B-Instruct
|
|
@@ -734,13 +739,13 @@ models:
|
|
|
734
739
|
gpus_per_node: 1
|
|
735
740
|
num_nodes: 1
|
|
736
741
|
vocab_size: 128256
|
|
737
|
-
max_model_len: 131072
|
|
738
|
-
max_num_seqs: 256
|
|
739
|
-
pipeline_parallelism: true
|
|
740
|
-
enforce_eager: false
|
|
741
742
|
qos: m2
|
|
742
743
|
time: 08:00:00
|
|
743
744
|
partition: a40
|
|
745
|
+
vllm_args:
|
|
746
|
+
--max-model-len: 131072
|
|
747
|
+
--max-num-seqs: 256
|
|
748
|
+
--compilation-config: 3
|
|
744
749
|
Llama-3.2-11B-Vision:
|
|
745
750
|
model_family: Llama-3.2
|
|
746
751
|
model_variant: 11B-Vision
|
|
@@ -748,13 +753,15 @@ models:
|
|
|
748
753
|
gpus_per_node: 2
|
|
749
754
|
num_nodes: 1
|
|
750
755
|
vocab_size: 128256
|
|
751
|
-
max_model_len: 4096
|
|
752
|
-
max_num_seqs: 64
|
|
753
|
-
pipeline_parallelism: false
|
|
754
|
-
enforce_eager: true
|
|
755
756
|
qos: m2
|
|
756
757
|
time: 08:00:00
|
|
757
758
|
partition: a40
|
|
759
|
+
vllm_args:
|
|
760
|
+
--tensor-parallel-size: 2
|
|
761
|
+
--max-model-len: 4096
|
|
762
|
+
--max-num-seqs: 64
|
|
763
|
+
--compilation-config: 3
|
|
764
|
+
--enforce-eager: true
|
|
758
765
|
Llama-3.2-11B-Vision-Instruct:
|
|
759
766
|
model_family: Llama-3.2
|
|
760
767
|
model_variant: 11B-Vision-Instruct
|
|
@@ -762,13 +769,15 @@ models:
|
|
|
762
769
|
gpus_per_node: 2
|
|
763
770
|
num_nodes: 1
|
|
764
771
|
vocab_size: 128256
|
|
765
|
-
max_model_len: 4096
|
|
766
|
-
max_num_seqs: 64
|
|
767
|
-
pipeline_parallelism: false
|
|
768
|
-
enforce_eager: true
|
|
769
772
|
qos: m2
|
|
770
773
|
time: 08:00:00
|
|
771
774
|
partition: a40
|
|
775
|
+
vllm_args:
|
|
776
|
+
--tensor-parallel-size: 2
|
|
777
|
+
--max-model-len: 4096
|
|
778
|
+
--max-num-seqs: 64
|
|
779
|
+
--compilation-config: 3
|
|
780
|
+
--enforce-eager: true
|
|
772
781
|
Llama-3.2-90B-Vision:
|
|
773
782
|
model_family: Llama-3.2
|
|
774
783
|
model_variant: 90B-Vision
|
|
@@ -776,13 +785,15 @@ models:
|
|
|
776
785
|
gpus_per_node: 4
|
|
777
786
|
num_nodes: 2
|
|
778
787
|
vocab_size: 128256
|
|
779
|
-
max_model_len: 4096
|
|
780
|
-
max_num_seqs: 32
|
|
781
|
-
pipeline_parallelism: false
|
|
782
|
-
enforce_eager: true
|
|
783
788
|
qos: m2
|
|
784
789
|
time: 08:00:00
|
|
785
790
|
partition: a40
|
|
791
|
+
vllm_args:
|
|
792
|
+
--tensor-parallel-size: 8
|
|
793
|
+
--max-model-len: 4096
|
|
794
|
+
--max-num-seqs: 32
|
|
795
|
+
--compilation-config: 3
|
|
796
|
+
--enforce-eager: true
|
|
786
797
|
Llama-3.2-90B-Vision-Instruct:
|
|
787
798
|
model_family: Llama-3.2
|
|
788
799
|
model_variant: 90B-Vision-Instruct
|
|
@@ -790,13 +801,15 @@ models:
|
|
|
790
801
|
gpus_per_node: 4
|
|
791
802
|
num_nodes: 2
|
|
792
803
|
vocab_size: 128256
|
|
793
|
-
max_model_len: 4096
|
|
794
|
-
max_num_seqs: 32
|
|
795
|
-
pipeline_parallelism: false
|
|
796
|
-
enforce_eager: true
|
|
797
804
|
qos: m2
|
|
798
805
|
time: 08:00:00
|
|
799
806
|
partition: a40
|
|
807
|
+
vllm_args:
|
|
808
|
+
--tensor-parallel-size: 8
|
|
809
|
+
--max-model-len: 4096
|
|
810
|
+
--max-num-seqs: 32
|
|
811
|
+
--compilation-config: 3
|
|
812
|
+
--enforce-eager: true
|
|
800
813
|
Qwen2.5-0.5B-Instruct:
|
|
801
814
|
model_family: Qwen2.5
|
|
802
815
|
model_variant: 0.5B-Instruct
|
|
@@ -804,13 +817,13 @@ models:
|
|
|
804
817
|
gpus_per_node: 1
|
|
805
818
|
num_nodes: 1
|
|
806
819
|
vocab_size: 152064
|
|
807
|
-
max_model_len: 32768
|
|
808
|
-
max_num_seqs: 256
|
|
809
|
-
pipeline_parallelism: true
|
|
810
|
-
enforce_eager: false
|
|
811
820
|
qos: m2
|
|
812
821
|
time: 08:00:00
|
|
813
822
|
partition: a40
|
|
823
|
+
vllm_args:
|
|
824
|
+
--max-model-len: 32768
|
|
825
|
+
--max-num-seqs: 256
|
|
826
|
+
--compilation-config: 3
|
|
814
827
|
Qwen2.5-1.5B-Instruct:
|
|
815
828
|
model_family: Qwen2.5
|
|
816
829
|
model_variant: 1.5B-Instruct
|
|
@@ -818,13 +831,13 @@ models:
|
|
|
818
831
|
gpus_per_node: 1
|
|
819
832
|
num_nodes: 1
|
|
820
833
|
vocab_size: 152064
|
|
821
|
-
max_model_len: 32768
|
|
822
|
-
max_num_seqs: 256
|
|
823
|
-
pipeline_parallelism: true
|
|
824
|
-
enforce_eager: false
|
|
825
834
|
qos: m2
|
|
826
835
|
time: 08:00:00
|
|
827
836
|
partition: a40
|
|
837
|
+
vllm_args:
|
|
838
|
+
--max-model-len: 32768
|
|
839
|
+
--max-num-seqs: 256
|
|
840
|
+
--compilation-config: 3
|
|
828
841
|
Qwen2.5-3B-Instruct:
|
|
829
842
|
model_family: Qwen2.5
|
|
830
843
|
model_variant: 3B-Instruct
|
|
@@ -832,13 +845,13 @@ models:
|
|
|
832
845
|
gpus_per_node: 1
|
|
833
846
|
num_nodes: 1
|
|
834
847
|
vocab_size: 152064
|
|
835
|
-
max_model_len: 32768
|
|
836
|
-
max_num_seqs: 256
|
|
837
|
-
pipeline_parallelism: true
|
|
838
|
-
enforce_eager: false
|
|
839
848
|
qos: m2
|
|
840
849
|
time: 08:00:00
|
|
841
850
|
partition: a40
|
|
851
|
+
vllm_args:
|
|
852
|
+
--max-model-len: 32768
|
|
853
|
+
--max-num-seqs: 256
|
|
854
|
+
--compilation-config: 3
|
|
842
855
|
Qwen2.5-7B-Instruct:
|
|
843
856
|
model_family: Qwen2.5
|
|
844
857
|
model_variant: 7B-Instruct
|
|
@@ -846,13 +859,13 @@ models:
|
|
|
846
859
|
gpus_per_node: 1
|
|
847
860
|
num_nodes: 1
|
|
848
861
|
vocab_size: 152064
|
|
849
|
-
max_model_len: 32768
|
|
850
|
-
max_num_seqs: 256
|
|
851
|
-
pipeline_parallelism: true
|
|
852
|
-
enforce_eager: false
|
|
853
862
|
qos: m2
|
|
854
863
|
time: 08:00:00
|
|
855
864
|
partition: a40
|
|
865
|
+
vllm_args:
|
|
866
|
+
--max-model-len: 32768
|
|
867
|
+
--max-num-seqs: 256
|
|
868
|
+
--compilation-config: 3
|
|
856
869
|
Qwen2.5-14B-Instruct:
|
|
857
870
|
model_family: Qwen2.5
|
|
858
871
|
model_variant: 14B-Instruct
|
|
@@ -860,13 +873,13 @@ models:
|
|
|
860
873
|
gpus_per_node: 1
|
|
861
874
|
num_nodes: 1
|
|
862
875
|
vocab_size: 152064
|
|
863
|
-
max_model_len: 32768
|
|
864
|
-
max_num_seqs: 256
|
|
865
|
-
pipeline_parallelism: true
|
|
866
|
-
enforce_eager: false
|
|
867
876
|
qos: m2
|
|
868
877
|
time: 08:00:00
|
|
869
878
|
partition: a40
|
|
879
|
+
vllm_args:
|
|
880
|
+
--max-model-len: 32768
|
|
881
|
+
--max-num-seqs: 256
|
|
882
|
+
--compilation-config: 3
|
|
870
883
|
Qwen2.5-32B-Instruct:
|
|
871
884
|
model_family: Qwen2.5
|
|
872
885
|
model_variant: 32B-Instruct
|
|
@@ -874,13 +887,14 @@ models:
|
|
|
874
887
|
gpus_per_node: 2
|
|
875
888
|
num_nodes: 1
|
|
876
889
|
vocab_size: 152064
|
|
877
|
-
max_model_len: 32768
|
|
878
|
-
max_num_seqs: 256
|
|
879
|
-
pipeline_parallelism: true
|
|
880
|
-
enforce_eager: false
|
|
881
890
|
qos: m2
|
|
882
891
|
time: 08:00:00
|
|
883
892
|
partition: a40
|
|
893
|
+
vllm_args:
|
|
894
|
+
--tensor-parallel-size: 2
|
|
895
|
+
--max-model-len: 32768
|
|
896
|
+
--max-num-seqs: 256
|
|
897
|
+
--compilation-config: 3
|
|
884
898
|
Qwen2.5-72B-Instruct:
|
|
885
899
|
model_family: Qwen2.5
|
|
886
900
|
model_variant: 72B-Instruct
|
|
@@ -888,13 +902,14 @@ models:
|
|
|
888
902
|
gpus_per_node: 4
|
|
889
903
|
num_nodes: 1
|
|
890
904
|
vocab_size: 152064
|
|
891
|
-
max_model_len: 16384
|
|
892
|
-
max_num_seqs: 256
|
|
893
|
-
pipeline_parallelism: true
|
|
894
|
-
enforce_eager: false
|
|
895
905
|
qos: m2
|
|
896
906
|
time: 08:00:00
|
|
897
907
|
partition: a40
|
|
908
|
+
vllm_args:
|
|
909
|
+
--tensor-parallel-size: 4
|
|
910
|
+
--max-model-len: 16384
|
|
911
|
+
--max-num-seqs: 256
|
|
912
|
+
--compilation-config: 3
|
|
898
913
|
Qwen2.5-Math-1.5B-Instruct:
|
|
899
914
|
model_family: Qwen2.5
|
|
900
915
|
model_variant: Math-1.5B-Instruct
|
|
@@ -902,13 +917,13 @@ models:
|
|
|
902
917
|
gpus_per_node: 1
|
|
903
918
|
num_nodes: 1
|
|
904
919
|
vocab_size: 152064
|
|
905
|
-
max_model_len: 4096
|
|
906
|
-
max_num_seqs: 256
|
|
907
|
-
pipeline_parallelism: true
|
|
908
|
-
enforce_eager: false
|
|
909
920
|
qos: m2
|
|
910
921
|
time: 08:00:00
|
|
911
922
|
partition: a40
|
|
923
|
+
vllm_args:
|
|
924
|
+
--max-model-len: 4096
|
|
925
|
+
--max-num-seqs: 256
|
|
926
|
+
--compilation-config: 3
|
|
912
927
|
Qwen2.5-Math-7B-Instruct:
|
|
913
928
|
model_family: Qwen2.5
|
|
914
929
|
model_variant: Math-7B-Instruct
|
|
@@ -916,13 +931,13 @@ models:
|
|
|
916
931
|
gpus_per_node: 1
|
|
917
932
|
num_nodes: 1
|
|
918
933
|
vocab_size: 152064
|
|
919
|
-
max_model_len: 4096
|
|
920
|
-
max_num_seqs: 256
|
|
921
|
-
pipeline_parallelism: true
|
|
922
|
-
enforce_eager: false
|
|
923
934
|
qos: m2
|
|
924
935
|
time: 08:00:00
|
|
925
936
|
partition: a40
|
|
937
|
+
vllm_args:
|
|
938
|
+
--max-model-len: 4096
|
|
939
|
+
--max-num-seqs: 256
|
|
940
|
+
--compilation-config: 3
|
|
926
941
|
Qwen2.5-Math-72B-Instruct:
|
|
927
942
|
model_family: Qwen2.5
|
|
928
943
|
model_variant: Math-72B-Instruct
|
|
@@ -930,13 +945,14 @@ models:
|
|
|
930
945
|
gpus_per_node: 4
|
|
931
946
|
num_nodes: 1
|
|
932
947
|
vocab_size: 152064
|
|
933
|
-
max_model_len: 4096
|
|
934
|
-
max_num_seqs: 256
|
|
935
|
-
pipeline_parallelism: true
|
|
936
|
-
enforce_eager: false
|
|
937
948
|
qos: m2
|
|
938
949
|
time: 08:00:00
|
|
939
950
|
partition: a40
|
|
951
|
+
vllm_args:
|
|
952
|
+
--tensor-parallel-size: 4
|
|
953
|
+
--max-model-len: 4096
|
|
954
|
+
--max-num-seqs: 256
|
|
955
|
+
--compilation-config: 3
|
|
940
956
|
Qwen2.5-Coder-7B-Instruct:
|
|
941
957
|
model_family: Qwen2.5
|
|
942
958
|
model_variant: Coder-7B-Instruct
|
|
@@ -944,13 +960,13 @@ models:
|
|
|
944
960
|
gpus_per_node: 1
|
|
945
961
|
num_nodes: 1
|
|
946
962
|
vocab_size: 152064
|
|
947
|
-
max_model_len: 32768
|
|
948
|
-
max_num_seqs: 256
|
|
949
|
-
pipeline_parallelism: true
|
|
950
|
-
enforce_eager: false
|
|
951
963
|
qos: m2
|
|
952
964
|
time: 08:00:00
|
|
953
965
|
partition: a40
|
|
966
|
+
vllm_args:
|
|
967
|
+
--max-model-len: 32768
|
|
968
|
+
--max-num-seqs: 256
|
|
969
|
+
--compilation-config: 3
|
|
954
970
|
Qwen2.5-Math-RM-72B:
|
|
955
971
|
model_family: Qwen2.5
|
|
956
972
|
model_variant: Math-RM-72B
|
|
@@ -958,13 +974,14 @@ models:
|
|
|
958
974
|
gpus_per_node: 4
|
|
959
975
|
num_nodes: 1
|
|
960
976
|
vocab_size: 152064
|
|
961
|
-
max_model_len: 4096
|
|
962
|
-
max_num_seqs: 256
|
|
963
|
-
pipeline_parallelism: true
|
|
964
|
-
enforce_eager: false
|
|
965
977
|
qos: m2
|
|
966
978
|
time: 08:00:00
|
|
967
979
|
partition: a40
|
|
980
|
+
vllm_args:
|
|
981
|
+
--tensor-parallel-size: 4
|
|
982
|
+
--max-model-len: 4096
|
|
983
|
+
--max-num-seqs: 256
|
|
984
|
+
--compilation-config: 3
|
|
968
985
|
Qwen2.5-Math-PRM-7B:
|
|
969
986
|
model_family: Qwen2.5
|
|
970
987
|
model_variant: Math-PRM-7B
|
|
@@ -972,13 +989,13 @@ models:
|
|
|
972
989
|
gpus_per_node: 1
|
|
973
990
|
num_nodes: 1
|
|
974
991
|
vocab_size: 152064
|
|
975
|
-
max_model_len: 4096
|
|
976
|
-
max_num_seqs: 256
|
|
977
|
-
pipeline_parallelism: true
|
|
978
|
-
enforce_eager: false
|
|
979
992
|
qos: m2
|
|
980
993
|
time: 08:00:00
|
|
981
994
|
partition: a40
|
|
995
|
+
vllm_args:
|
|
996
|
+
--max-model-len: 4096
|
|
997
|
+
--max-num-seqs: 256
|
|
998
|
+
--compilation-config: 3
|
|
982
999
|
QwQ-32B-Preview:
|
|
983
1000
|
model_family: QwQ
|
|
984
1001
|
model_variant: 32B-Preview
|
|
@@ -986,13 +1003,14 @@ models:
|
|
|
986
1003
|
gpus_per_node: 2
|
|
987
1004
|
num_nodes: 1
|
|
988
1005
|
vocab_size: 152064
|
|
989
|
-
max_model_len: 32768
|
|
990
|
-
max_num_seqs: 256
|
|
991
|
-
pipeline_parallelism: true
|
|
992
|
-
enforce_eager: false
|
|
993
1006
|
qos: m2
|
|
994
1007
|
time: 08:00:00
|
|
995
1008
|
partition: a40
|
|
1009
|
+
vllm_args:
|
|
1010
|
+
--tensor-parallel-size: 2
|
|
1011
|
+
--max-model-len: 32768
|
|
1012
|
+
--max-num-seqs: 256
|
|
1013
|
+
--compilation-config: 3
|
|
996
1014
|
Pixtral-12B-2409:
|
|
997
1015
|
model_family: Pixtral
|
|
998
1016
|
model_variant: 12B-2409
|
|
@@ -1000,13 +1018,13 @@ models:
|
|
|
1000
1018
|
gpus_per_node: 1
|
|
1001
1019
|
num_nodes: 1
|
|
1002
1020
|
vocab_size: 131072
|
|
1003
|
-
max_model_len: 8192
|
|
1004
|
-
max_num_seqs: 256
|
|
1005
|
-
pipeline_parallelism: true
|
|
1006
|
-
enforce_eager: false
|
|
1007
1021
|
qos: m2
|
|
1008
1022
|
time: 08:00:00
|
|
1009
1023
|
partition: a40
|
|
1024
|
+
vllm_args:
|
|
1025
|
+
--max-model-len: 8192
|
|
1026
|
+
--max-num-seqs: 256
|
|
1027
|
+
--compilation-config: 3
|
|
1010
1028
|
e5-mistral-7b-instruct:
|
|
1011
1029
|
model_family: e5
|
|
1012
1030
|
model_variant: mistral-7b-instruct
|
|
@@ -1014,13 +1032,13 @@ models:
|
|
|
1014
1032
|
gpus_per_node: 1
|
|
1015
1033
|
num_nodes: 1
|
|
1016
1034
|
vocab_size: 32000
|
|
1017
|
-
max_model_len: 4096
|
|
1018
|
-
max_num_seqs: 256
|
|
1019
|
-
pipeline_parallelism: true
|
|
1020
|
-
enforce_eager: false
|
|
1021
1035
|
qos: m2
|
|
1022
1036
|
time: 08:00:00
|
|
1023
1037
|
partition: a40
|
|
1038
|
+
vllm_args:
|
|
1039
|
+
--max-model-len: 4096
|
|
1040
|
+
--max-num-seqs: 256
|
|
1041
|
+
--compilation-config: 3
|
|
1024
1042
|
bge-base-en-v1.5:
|
|
1025
1043
|
model_family: bge
|
|
1026
1044
|
model_variant: base-en-v1.5
|
|
@@ -1028,13 +1046,13 @@ models:
|
|
|
1028
1046
|
gpus_per_node: 1
|
|
1029
1047
|
num_nodes: 1
|
|
1030
1048
|
vocab_size: 30522
|
|
1031
|
-
max_model_len: 512
|
|
1032
|
-
max_num_seqs: 256
|
|
1033
|
-
pipeline_parallelism: true
|
|
1034
|
-
enforce_eager: false
|
|
1035
1049
|
qos: m2
|
|
1036
1050
|
time: 08:00:00
|
|
1037
1051
|
partition: a40
|
|
1052
|
+
vllm_args:
|
|
1053
|
+
--max-model-len: 512
|
|
1054
|
+
--max-num-seqs: 256
|
|
1055
|
+
--compilation-config: 3
|
|
1038
1056
|
all-MiniLM-L6-v2:
|
|
1039
1057
|
model_family: all-MiniLM
|
|
1040
1058
|
model_variant: L6-v2
|
|
@@ -1042,13 +1060,13 @@ models:
|
|
|
1042
1060
|
gpus_per_node: 1
|
|
1043
1061
|
num_nodes: 1
|
|
1044
1062
|
vocab_size: 30522
|
|
1045
|
-
max_model_len: 512
|
|
1046
|
-
max_num_seqs: 256
|
|
1047
|
-
pipeline_parallelism: true
|
|
1048
|
-
enforce_eager: false
|
|
1049
1063
|
qos: m2
|
|
1050
1064
|
time: 08:00:00
|
|
1051
1065
|
partition: a40
|
|
1066
|
+
vllm_args:
|
|
1067
|
+
--max-model-len: 512
|
|
1068
|
+
--max-num-seqs: 256
|
|
1069
|
+
--compilation-config: 3
|
|
1052
1070
|
Llama-3.3-70B-Instruct:
|
|
1053
1071
|
model_family: Llama-3.3
|
|
1054
1072
|
model_variant: 70B-Instruct
|
|
@@ -1056,13 +1074,14 @@ models:
|
|
|
1056
1074
|
gpus_per_node: 4
|
|
1057
1075
|
num_nodes: 1
|
|
1058
1076
|
vocab_size: 128256
|
|
1059
|
-
max_model_len: 65536
|
|
1060
|
-
max_num_seqs: 256
|
|
1061
|
-
pipeline_parallelism: true
|
|
1062
|
-
enforce_eager: false
|
|
1063
1077
|
qos: m2
|
|
1064
1078
|
time: 08:00:00
|
|
1065
1079
|
partition: a40
|
|
1080
|
+
vllm_args:
|
|
1081
|
+
--tensor-parallel-size: 4
|
|
1082
|
+
--max-model-len: 65536
|
|
1083
|
+
--max-num-seqs: 256
|
|
1084
|
+
--compilation-config: 3
|
|
1066
1085
|
InternVL2_5-26B:
|
|
1067
1086
|
model_family: InternVL2_5
|
|
1068
1087
|
model_variant: 26B
|
|
@@ -1070,13 +1089,14 @@ models:
|
|
|
1070
1089
|
gpus_per_node: 2
|
|
1071
1090
|
num_nodes: 1
|
|
1072
1091
|
vocab_size: 92553
|
|
1073
|
-
max_model_len: 32768
|
|
1074
|
-
max_num_seqs: 256
|
|
1075
|
-
pipeline_parallelism: true
|
|
1076
|
-
enforce_eager: false
|
|
1077
1092
|
qos: m2
|
|
1078
1093
|
time: 08:00:00
|
|
1079
1094
|
partition: a40
|
|
1095
|
+
vllm_args:
|
|
1096
|
+
--tensor-parallel-size: 2
|
|
1097
|
+
--max-model-len: 32768
|
|
1098
|
+
--max-num-seqs: 256
|
|
1099
|
+
--compilation-config: 3
|
|
1080
1100
|
InternVL2_5-38B:
|
|
1081
1101
|
model_family: InternVL2_5
|
|
1082
1102
|
model_variant: 38B
|
|
@@ -1084,13 +1104,14 @@ models:
|
|
|
1084
1104
|
gpus_per_node: 4
|
|
1085
1105
|
num_nodes: 1
|
|
1086
1106
|
vocab_size: 92553
|
|
1087
|
-
max_model_len: 32768
|
|
1088
|
-
max_num_seqs: 256
|
|
1089
|
-
pipeline_parallelism: true
|
|
1090
|
-
enforce_eager: false
|
|
1091
1107
|
qos: m2
|
|
1092
1108
|
time: 08:00:00
|
|
1093
1109
|
partition: a40
|
|
1110
|
+
vllm_args:
|
|
1111
|
+
--tensor-parallel-size: 4
|
|
1112
|
+
--max-model-len: 32768
|
|
1113
|
+
--max-num-seqs: 256
|
|
1114
|
+
--compilation-config: 3
|
|
1094
1115
|
Aya-Expanse-32B:
|
|
1095
1116
|
model_family: Aya-Expanse
|
|
1096
1117
|
model_variant: 32B
|
|
@@ -1098,69 +1119,72 @@ models:
|
|
|
1098
1119
|
gpus_per_node: 2
|
|
1099
1120
|
num_nodes: 1
|
|
1100
1121
|
vocab_size: 256000
|
|
1101
|
-
max_model_len: 8192
|
|
1102
|
-
max_num_seqs: 256
|
|
1103
|
-
pipeline_parallelism: true
|
|
1104
|
-
enforce_eager: false
|
|
1105
1122
|
qos: m2
|
|
1106
1123
|
time: 08:00:00
|
|
1107
1124
|
partition: a40
|
|
1125
|
+
vllm_args:
|
|
1126
|
+
--tensor-parallel-size: 2
|
|
1127
|
+
--max-model-len: 8192
|
|
1128
|
+
--max-num-seqs: 256
|
|
1129
|
+
--compilation-config: 3
|
|
1108
1130
|
DeepSeek-R1-Distill-Llama-70B:
|
|
1109
1131
|
model_family: DeepSeek-R1
|
|
1110
|
-
model_variant:
|
|
1132
|
+
model_variant: Distill-Llama-70B
|
|
1111
1133
|
model_type: LLM
|
|
1112
1134
|
gpus_per_node: 4
|
|
1113
|
-
num_nodes:
|
|
1135
|
+
num_nodes: 1
|
|
1114
1136
|
vocab_size: 128256
|
|
1115
|
-
max_model_len: 131072
|
|
1116
|
-
max_num_seqs: 256
|
|
1117
|
-
pipeline_parallelism: true
|
|
1118
|
-
enforce_eager: false
|
|
1119
1137
|
qos: m2
|
|
1120
1138
|
time: 08:00:00
|
|
1121
1139
|
partition: a40
|
|
1140
|
+
vllm_args:
|
|
1141
|
+
--tensor-parallel-size: 4
|
|
1142
|
+
--max-model-len: 65536
|
|
1143
|
+
--max-num-seqs: 256
|
|
1144
|
+
--compilation-config: 3
|
|
1122
1145
|
DeepSeek-R1-Distill-Llama-8B:
|
|
1123
1146
|
model_family: DeepSeek-R1
|
|
1124
|
-
model_variant:
|
|
1147
|
+
model_variant: Distill-Llama-8B
|
|
1125
1148
|
model_type: LLM
|
|
1126
1149
|
gpus_per_node: 1
|
|
1127
1150
|
num_nodes: 1
|
|
1128
1151
|
vocab_size: 128256
|
|
1129
|
-
max_model_len: 131072
|
|
1130
|
-
max_num_seqs: 256
|
|
1131
|
-
pipeline_parallelism: true
|
|
1132
|
-
enforce_eager: false
|
|
1133
1152
|
qos: m2
|
|
1134
1153
|
time: 08:00:00
|
|
1135
1154
|
partition: a40
|
|
1155
|
+
vllm_args:
|
|
1156
|
+
--max-model-len: 131072
|
|
1157
|
+
--max-num-seqs: 256
|
|
1158
|
+
--compilation-config: 3
|
|
1136
1159
|
DeepSeek-R1-Distill-Qwen-32B:
|
|
1137
1160
|
model_family: DeepSeek-R1
|
|
1138
1161
|
model_variant: Distill-Qwen-32B
|
|
1139
1162
|
model_type: LLM
|
|
1140
|
-
gpus_per_node:
|
|
1163
|
+
gpus_per_node: 2
|
|
1141
1164
|
num_nodes: 1
|
|
1142
1165
|
vocab_size: 152064
|
|
1143
|
-
max_model_len: 131072
|
|
1144
|
-
max_num_seqs: 256
|
|
1145
|
-
pipeline_parallelism: true
|
|
1146
|
-
enforce_eager: false
|
|
1147
1166
|
qos: m2
|
|
1148
1167
|
time: 08:00:00
|
|
1149
1168
|
partition: a40
|
|
1169
|
+
vllm_args:
|
|
1170
|
+
--tensor-parallel-size: 2
|
|
1171
|
+
--max-model-len: 65536
|
|
1172
|
+
--max-num-seqs: 256
|
|
1173
|
+
--compilation-config: 3
|
|
1150
1174
|
DeepSeek-R1-Distill-Qwen-14B:
|
|
1151
1175
|
model_family: DeepSeek-R1
|
|
1152
1176
|
model_variant: Distill-Qwen-14B
|
|
1153
1177
|
model_type: LLM
|
|
1154
|
-
gpus_per_node:
|
|
1178
|
+
gpus_per_node: 1
|
|
1155
1179
|
num_nodes: 1
|
|
1156
1180
|
vocab_size: 152064
|
|
1157
|
-
max_model_len: 131072
|
|
1158
|
-
max_num_seqs: 256
|
|
1159
|
-
pipeline_parallelism: true
|
|
1160
|
-
enforce_eager: false
|
|
1161
1181
|
qos: m2
|
|
1162
1182
|
time: 08:00:00
|
|
1163
1183
|
partition: a40
|
|
1184
|
+
vllm_args:
|
|
1185
|
+
--max-model-len: 65536
|
|
1186
|
+
--max-num-seqs: 256
|
|
1187
|
+
--compilation-config: 3
|
|
1164
1188
|
DeepSeek-R1-Distill-Qwen-7B:
|
|
1165
1189
|
model_family: DeepSeek-R1
|
|
1166
1190
|
model_variant: Distill-Qwen-7B
|
|
@@ -1168,13 +1192,13 @@ models:
|
|
|
1168
1192
|
gpus_per_node: 1
|
|
1169
1193
|
num_nodes: 1
|
|
1170
1194
|
vocab_size: 152064
|
|
1171
|
-
max_model_len: 131072
|
|
1172
|
-
max_num_seqs: 256
|
|
1173
|
-
pipeline_parallelism: true
|
|
1174
|
-
enforce_eager: false
|
|
1175
1195
|
qos: m2
|
|
1176
1196
|
time: 08:00:00
|
|
1177
1197
|
partition: a40
|
|
1198
|
+
vllm_args:
|
|
1199
|
+
--max-model-len: 131072
|
|
1200
|
+
--max-num-seqs: 256
|
|
1201
|
+
--compilation-config: 3
|
|
1178
1202
|
DeepSeek-R1-Distill-Qwen-1.5B:
|
|
1179
1203
|
model_family: DeepSeek-R1
|
|
1180
1204
|
model_variant: Distill-Qwen-1.5B
|
|
@@ -1182,13 +1206,13 @@ models:
|
|
|
1182
1206
|
gpus_per_node: 1
|
|
1183
1207
|
num_nodes: 1
|
|
1184
1208
|
vocab_size: 152064
|
|
1185
|
-
max_model_len: 131072
|
|
1186
|
-
max_num_seqs: 256
|
|
1187
|
-
pipeline_parallelism: true
|
|
1188
|
-
enforce_eager: false
|
|
1189
1209
|
qos: m2
|
|
1190
1210
|
time: 08:00:00
|
|
1191
1211
|
partition: a40
|
|
1212
|
+
vllm_args:
|
|
1213
|
+
--max-model-len: 131072
|
|
1214
|
+
--max-num-seqs: 256
|
|
1215
|
+
--compilation-config: 3
|
|
1192
1216
|
Phi-3.5-vision-instruct:
|
|
1193
1217
|
model_family: Phi-3.5-vision
|
|
1194
1218
|
model_variant: instruct
|
|
@@ -1196,13 +1220,14 @@ models:
|
|
|
1196
1220
|
gpus_per_node: 2
|
|
1197
1221
|
num_nodes: 1
|
|
1198
1222
|
vocab_size: 32064
|
|
1199
|
-
max_model_len: 65536
|
|
1200
|
-
max_num_seqs: 256
|
|
1201
|
-
pipeline_parallelism: true
|
|
1202
|
-
enforce_eager: false
|
|
1203
1223
|
qos: m2
|
|
1204
1224
|
time: 08:00:00
|
|
1205
1225
|
partition: a40
|
|
1226
|
+
vllm_args:
|
|
1227
|
+
--tensor-parallel-size: 2
|
|
1228
|
+
--max-model-len: 65536
|
|
1229
|
+
--max-num-seqs: 256
|
|
1230
|
+
--compilation-config: 3
|
|
1206
1231
|
InternVL2_5-8B:
|
|
1207
1232
|
model_family: InternVL2_5
|
|
1208
1233
|
model_variant: 8B
|
|
@@ -1210,13 +1235,13 @@ models:
|
|
|
1210
1235
|
gpus_per_node: 1
|
|
1211
1236
|
num_nodes: 1
|
|
1212
1237
|
vocab_size: 92553
|
|
1213
|
-
max_model_len: 32768
|
|
1214
|
-
max_num_seqs: 256
|
|
1215
|
-
pipeline_parallelism: true
|
|
1216
|
-
enforce_eager: false
|
|
1217
1238
|
qos: m2
|
|
1218
1239
|
time: 08:00:00
|
|
1219
1240
|
partition: a40
|
|
1241
|
+
vllm_args:
|
|
1242
|
+
--max-model-len: 32768
|
|
1243
|
+
--max-num-seqs: 256
|
|
1244
|
+
--compilation-config: 3
|
|
1220
1245
|
glm-4v-9b:
|
|
1221
1246
|
model_family: glm-4v
|
|
1222
1247
|
model_variant: 9b
|
|
@@ -1224,13 +1249,13 @@ models:
|
|
|
1224
1249
|
gpus_per_node: 1
|
|
1225
1250
|
num_nodes: 1
|
|
1226
1251
|
vocab_size: 151552
|
|
1227
|
-
max_model_len: 8192
|
|
1228
|
-
max_num_seqs: 256
|
|
1229
|
-
pipeline_parallelism: true
|
|
1230
|
-
enforce_eager: false
|
|
1231
1252
|
qos: m2
|
|
1232
1253
|
time: 08:00:00
|
|
1233
1254
|
partition: a40
|
|
1255
|
+
vllm_args:
|
|
1256
|
+
--max-model-len: 8192
|
|
1257
|
+
--max-num-seqs: 256
|
|
1258
|
+
--compilation-config: 3
|
|
1234
1259
|
Molmo-7B-D-0924:
|
|
1235
1260
|
model_family: Molmo
|
|
1236
1261
|
model_variant: 7B-D-0924
|
|
@@ -1238,26 +1263,27 @@ models:
|
|
|
1238
1263
|
gpus_per_node: 1
|
|
1239
1264
|
num_nodes: 1
|
|
1240
1265
|
vocab_size: 152064
|
|
1241
|
-
max_model_len: 4096
|
|
1242
|
-
max_num_seqs: 256
|
|
1243
|
-
pipeline_parallelism: true
|
|
1244
|
-
enforce_eager: false
|
|
1245
1266
|
qos: m2
|
|
1246
1267
|
time: 08:00:00
|
|
1247
1268
|
partition: a40
|
|
1269
|
+
vllm_args:
|
|
1270
|
+
--max-model-len: 4096
|
|
1271
|
+
--max-num-seqs: 256
|
|
1272
|
+
--compilation-config: 3
|
|
1248
1273
|
deepseek-vl2:
|
|
1249
1274
|
model_family: deepseek-vl2
|
|
1250
1275
|
model_type: VLM
|
|
1251
1276
|
gpus_per_node: 2
|
|
1252
1277
|
num_nodes: 1
|
|
1253
1278
|
vocab_size: 129280
|
|
1254
|
-
max_model_len: 4096
|
|
1255
|
-
max_num_seqs: 256
|
|
1256
|
-
pipeline_parallelism: true
|
|
1257
|
-
enforce_eager: false
|
|
1258
1279
|
qos: m2
|
|
1259
1280
|
time: 08:00:00
|
|
1260
1281
|
partition: a40
|
|
1282
|
+
vllm_args:
|
|
1283
|
+
--tensor-parallel-size: 2
|
|
1284
|
+
--max-model-len: 4096
|
|
1285
|
+
--max-num-seqs: 256
|
|
1286
|
+
--compilation-config: 3
|
|
1261
1287
|
deepseek-vl2-small:
|
|
1262
1288
|
model_family: deepseek-vl2
|
|
1263
1289
|
model_variant: small
|
|
@@ -1265,10 +1291,10 @@ models:
|
|
|
1265
1291
|
gpus_per_node: 1
|
|
1266
1292
|
num_nodes: 1
|
|
1267
1293
|
vocab_size: 129280
|
|
1268
|
-
max_model_len: 4096
|
|
1269
|
-
max_num_seqs: 256
|
|
1270
|
-
pipeline_parallelism: true
|
|
1271
|
-
enforce_eager: false
|
|
1272
1294
|
qos: m2
|
|
1273
1295
|
time: 08:00:00
|
|
1274
1296
|
partition: a40
|
|
1297
|
+
vllm_args:
|
|
1298
|
+
--max-model-len: 4096
|
|
1299
|
+
--max-num-seqs: 256
|
|
1300
|
+
--compilation-config: 3
|