vec-inf 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,13 +6,15 @@ models:
6
6
  gpus_per_node: 4
7
7
  num_nodes: 2
8
8
  vocab_size: 256000
9
- max_model_len: 8192
10
- max_num_seqs: 256
11
- pipeline_parallelism: true
12
- enforce_eager: false
13
9
  qos: m2
14
10
  time: 08:00:00
15
11
  partition: a40
12
+ vllm_args:
13
+ --pipeline-parallel-size: 2
14
+ --tensor-parallel-size: 4
15
+ --max-model-len: 8192
16
+ --max-num-seqs: 256
17
+ --compilation-config: 3
16
18
  c4ai-command-r-plus-08-2024:
17
19
  model_family: c4ai-command-r
18
20
  model_variant: plus-08-2024
@@ -20,13 +22,15 @@ models:
20
22
  gpus_per_node: 4
21
23
  num_nodes: 2
22
24
  vocab_size: 256000
23
- max_model_len: 65536
24
- max_num_seqs: 256
25
- pipeline_parallelism: true
26
- enforce_eager: false
27
25
  qos: m2
28
26
  time: 08:00:00
29
27
  partition: a40
28
+ vllm_args:
29
+ --pipeline-parallel-size: 2
30
+ --tensor-parallel-size: 4
31
+ --max-model-len: 65536
32
+ --max-num-seqs: 256
33
+ --compilation-config: 3
30
34
  c4ai-command-r-08-2024:
31
35
  model_family: c4ai-command-r
32
36
  model_variant: 08-2024
@@ -34,13 +38,14 @@ models:
34
38
  gpus_per_node: 2
35
39
  num_nodes: 1
36
40
  vocab_size: 256000
37
- max_model_len: 32768
38
- max_num_seqs: 256
39
- pipeline_parallelism: true
40
- enforce_eager: false
41
41
  qos: m2
42
42
  time: 08:00:00
43
43
  partition: a40
44
+ vllm_args:
45
+ --tensor-parallel-size: 2
46
+ --max-model-len: 32768
47
+ --max-num-seqs: 256
48
+ --compilation-config: 3
44
49
  CodeLlama-7b-hf:
45
50
  model_family: CodeLlama
46
51
  model_variant: 7b-hf
@@ -48,13 +53,13 @@ models:
48
53
  gpus_per_node: 1
49
54
  num_nodes: 1
50
55
  vocab_size: 32000
51
- max_model_len: 16384
52
- max_num_seqs: 256
53
- pipeline_parallelism: true
54
- enforce_eager: false
55
56
  qos: m2
56
57
  time: 08:00:00
57
58
  partition: a40
59
+ vllm_args:
60
+ --max-model-len: 16384
61
+ --max-num-seqs: 256
62
+ --compilation-config: 3
58
63
  CodeLlama-7b-Instruct-hf:
59
64
  model_family: CodeLlama
60
65
  model_variant: 7b-Instruct-hf
@@ -62,13 +67,13 @@ models:
62
67
  gpus_per_node: 1
63
68
  num_nodes: 1
64
69
  vocab_size: 32000
65
- max_model_len: 16384
66
- max_num_seqs: 256
67
- pipeline_parallelism: true
68
- enforce_eager: false
69
70
  qos: m2
70
71
  time: 08:00:00
71
72
  partition: a40
73
+ vllm_args:
74
+ --max-model-len: 16384
75
+ --max-num-seqs: 256
76
+ --compilation-config: 3
72
77
  CodeLlama-13b-hf:
73
78
  model_family: CodeLlama
74
79
  model_variant: 13b-hf
@@ -76,13 +81,13 @@ models:
76
81
  gpus_per_node: 1
77
82
  num_nodes: 1
78
83
  vocab_size: 32000
79
- max_model_len: 16384
80
- max_num_seqs: 256
81
- pipeline_parallelism: true
82
- enforce_eager: false
83
84
  qos: m2
84
85
  time: 08:00:00
85
86
  partition: a40
87
+ vllm_args:
88
+ --max-model-len: 16384
89
+ --max-num-seqs: 256
90
+ --compilation-config: 3
86
91
  CodeLlama-13b-Instruct-hf:
87
92
  model_family: CodeLlama
88
93
  model_variant: 13b-Instruct-hf
@@ -90,13 +95,13 @@ models:
90
95
  gpus_per_node: 1
91
96
  num_nodes: 1
92
97
  vocab_size: 32000
93
- max_model_len: 16384
94
- max_num_seqs: 256
95
- pipeline_parallelism: true
96
- enforce_eager: false
97
98
  qos: m2
98
99
  time: 08:00:00
99
100
  partition: a40
101
+ vllm_args:
102
+ --max-model-len: 16384
103
+ --max-num-seqs: 256
104
+ --compilation-config: 3
100
105
  CodeLlama-34b-hf:
101
106
  model_family: CodeLlama
102
107
  model_variant: 34b-hf
@@ -104,13 +109,14 @@ models:
104
109
  gpus_per_node: 2
105
110
  num_nodes: 1
106
111
  vocab_size: 32000
107
- max_model_len: 16384
108
- max_num_seqs: 256
109
- pipeline_parallelism: true
110
- enforce_eager: false
111
112
  qos: m2
112
113
  time: 08:00:00
113
114
  partition: a40
115
+ vllm_args:
116
+ --tensor-parallel-size: 2
117
+ --max-model-len: 16384
118
+ --max-num-seqs: 256
119
+ --compilation-config: 3
114
120
  CodeLlama-34b-Instruct-hf:
115
121
  model_family: CodeLlama
116
122
  model_variant: 34b-Instruct-hf
@@ -118,55 +124,44 @@ models:
118
124
  gpus_per_node: 2
119
125
  num_nodes: 1
120
126
  vocab_size: 32000
121
- max_model_len: 16384
122
- max_num_seqs: 256
123
- pipeline_parallelism: true
124
- enforce_eager: false
125
127
  qos: m2
126
128
  time: 08:00:00
127
129
  partition: a40
130
+ vllm_args:
131
+ --tensor-parallel-size: 2
132
+ --max-model-len: 16384
133
+ --max-num-seqs: 256
134
+ --compilation-config: 3
128
135
  CodeLlama-70b-hf:
129
136
  model_family: CodeLlama
130
137
  model_variant: 70b-hf
131
138
  model_type: LLM
132
139
  gpus_per_node: 4
133
140
  num_nodes: 1
134
- vocab_size: 32000
135
- max_model_len: 4096
136
- max_num_seqs: 256
137
- pipeline_parallelism: true
138
- enforce_eager: false
141
+ vocab_size: 32016
139
142
  qos: m2
140
143
  time: 08:00:00
141
144
  partition: a40
145
+ vllm_args:
146
+ --tensor-parallel-size: 4
147
+ --max-model-len: 4096
148
+ --max-num-seqs: 256
149
+ --compilation-config: 3
142
150
  CodeLlama-70b-Instruct-hf:
143
151
  model_family: CodeLlama
144
152
  model_variant: 70b-Instruct-hf
145
153
  model_type: LLM
146
154
  gpus_per_node: 4
147
155
  num_nodes: 1
148
- vocab_size: 32000
149
- max_model_len: 4096
150
- max_num_seqs: 256
151
- pipeline_parallelism: true
152
- enforce_eager: false
153
- qos: m2
154
- time: 08:00:00
155
- partition: a40
156
- dbrx-instruct:
157
- model_family: dbrx
158
- model_variant: instruct
159
- model_type: LLM
160
- gpus_per_node: 4
161
- num_nodes: 2
162
- vocab_size: 100352
163
- max_model_len: 32000
164
- max_num_seqs: 256
165
- pipeline_parallelism: true
166
- enforce_eager: false
156
+ vocab_size: 32016
167
157
  qos: m2
168
158
  time: 08:00:00
169
159
  partition: a40
160
+ vllm_args:
161
+ --tensor-parallel-size: 4
162
+ --max-model-len: 4096
163
+ --max-num-seqs: 256
164
+ --compilation-config: 3
170
165
  gemma-2-9b:
171
166
  model_family: gemma-2
172
167
  model_variant: 9b
@@ -174,13 +169,13 @@ models:
174
169
  gpus_per_node: 1
175
170
  num_nodes: 1
176
171
  vocab_size: 256000
177
- max_model_len: 4096
178
- max_num_seqs: 256
179
- pipeline_parallelism: true
180
- enforce_eager: false
181
172
  qos: m2
182
173
  time: 08:00:00
183
174
  partition: a40
175
+ vllm_args:
176
+ --max-model-len: 4096
177
+ --max-num-seqs: 256
178
+ --compilation-config: 3
184
179
  gemma-2-9b-it:
185
180
  model_family: gemma-2
186
181
  model_variant: 9b-it
@@ -188,13 +183,13 @@ models:
188
183
  gpus_per_node: 1
189
184
  num_nodes: 1
190
185
  vocab_size: 256000
191
- max_model_len: 4096
192
- max_num_seqs: 256
193
- pipeline_parallelism: true
194
- enforce_eager: false
195
186
  qos: m2
196
187
  time: 08:00:00
197
188
  partition: a40
189
+ vllm_args:
190
+ --max-model-len: 4096
191
+ --max-num-seqs: 256
192
+ --compilation-config: 3
198
193
  gemma-2-27b:
199
194
  model_family: gemma-2
200
195
  model_variant: 27b
@@ -202,13 +197,14 @@ models:
202
197
  gpus_per_node: 2
203
198
  num_nodes: 1
204
199
  vocab_size: 256000
205
- max_model_len: 4096
206
- max_num_seqs: 256
207
- pipeline_parallelism: true
208
- enforce_eager: false
209
200
  qos: m2
210
201
  time: 08:00:00
211
202
  partition: a40
203
+ vllm_args:
204
+ --tensor-parallel-size: 2
205
+ --max-model-len: 4096
206
+ --max-num-seqs: 256
207
+ --compilation-config: 3
212
208
  gemma-2-27b-it:
213
209
  model_family: gemma-2
214
210
  model_variant: 27b-it
@@ -216,13 +212,14 @@ models:
216
212
  gpus_per_node: 2
217
213
  num_nodes: 1
218
214
  vocab_size: 256000
219
- max_model_len: 4096
220
- max_num_seqs: 256
221
- pipeline_parallelism: true
222
- enforce_eager: false
223
215
  qos: m2
224
216
  time: 08:00:00
225
217
  partition: a40
218
+ vllm_args:
219
+ --tensor-parallel-size: 2
220
+ --max-model-len: 4096
221
+ --max-num-seqs: 256
222
+ --compilation-config: 3
226
223
  Llama-2-7b-hf:
227
224
  model_family: Llama-2
228
225
  model_variant: 7b-hf
@@ -230,13 +227,13 @@ models:
230
227
  gpus_per_node: 1
231
228
  num_nodes: 1
232
229
  vocab_size: 32000
233
- max_model_len: 4096
234
- max_num_seqs: 256
235
- pipeline_parallelism: true
236
- enforce_eager: false
237
230
  qos: m2
238
231
  time: 08:00:00
239
232
  partition: a40
233
+ vllm_args:
234
+ --max-model-len: 4096
235
+ --max-num-seqs: 256
236
+ --compilation-config: 3
240
237
  Llama-2-7b-chat-hf:
241
238
  model_family: Llama-2
242
239
  model_variant: 7b-chat-hf
@@ -244,13 +241,13 @@ models:
244
241
  gpus_per_node: 1
245
242
  num_nodes: 1
246
243
  vocab_size: 32000
247
- max_model_len: 4096
248
- max_num_seqs: 256
249
- pipeline_parallelism: true
250
- enforce_eager: false
251
244
  qos: m2
252
245
  time: 08:00:00
253
246
  partition: a40
247
+ vllm_args:
248
+ --max-model-len: 4096
249
+ --max-num-seqs: 256
250
+ --compilation-config: 3
254
251
  Llama-2-13b-hf:
255
252
  model_family: Llama-2
256
253
  model_variant: 13b-hf
@@ -258,13 +255,13 @@ models:
258
255
  gpus_per_node: 1
259
256
  num_nodes: 1
260
257
  vocab_size: 32000
261
- max_model_len: 4096
262
- max_num_seqs: 256
263
- pipeline_parallelism: true
264
- enforce_eager: false
265
258
  qos: m2
266
259
  time: 08:00:00
267
260
  partition: a40
261
+ vllm_args:
262
+ --max-model-len: 4096
263
+ --max-num-seqs: 256
264
+ --compilation-config: 3
268
265
  Llama-2-13b-chat-hf:
269
266
  model_family: Llama-2
270
267
  model_variant: 13b-chat-hf
@@ -272,13 +269,13 @@ models:
272
269
  gpus_per_node: 1
273
270
  num_nodes: 1
274
271
  vocab_size: 32000
275
- max_model_len: 4096
276
- max_num_seqs: 256
277
- pipeline_parallelism: true
278
- enforce_eager: false
279
272
  qos: m2
280
273
  time: 08:00:00
281
274
  partition: a40
275
+ vllm_args:
276
+ --max-model-len: 4096
277
+ --max-num-seqs: 256
278
+ --compilation-config: 3
282
279
  Llama-2-70b-hf:
283
280
  model_family: Llama-2
284
281
  model_variant: 70b-hf
@@ -286,13 +283,14 @@ models:
286
283
  gpus_per_node: 4
287
284
  num_nodes: 1
288
285
  vocab_size: 32000
289
- max_model_len: 4096
290
- max_num_seqs: 256
291
- pipeline_parallelism: true
292
- enforce_eager: false
293
286
  qos: m2
294
287
  time: 08:00:00
295
288
  partition: a40
289
+ vllm_args:
290
+ --tensor-parallel-size: 4
291
+ --max-model-len: 4096
292
+ --max-num-seqs: 256
293
+ --compilation-config: 3
296
294
  Llama-2-70b-chat-hf:
297
295
  model_family: Llama-2
298
296
  model_variant: 70b-chat-hf
@@ -300,13 +298,14 @@ models:
300
298
  gpus_per_node: 4
301
299
  num_nodes: 1
302
300
  vocab_size: 32000
303
- max_model_len: 4096
304
- max_num_seqs: 256
305
- pipeline_parallelism: true
306
- enforce_eager: false
307
301
  qos: m2
308
302
  time: 08:00:00
309
303
  partition: a40
304
+ vllm_args:
305
+ --tensor-parallel-size: 4
306
+ --max-model-len: 4096
307
+ --max-num-seqs: 256
308
+ --compilation-config: 3
310
309
  llava-1.5-7b-hf:
311
310
  model_family: llava-1.5
312
311
  model_variant: 7b-hf
@@ -314,13 +313,13 @@ models:
314
313
  gpus_per_node: 1
315
314
  num_nodes: 1
316
315
  vocab_size: 32000
317
- max_model_len: 4096
318
- max_num_seqs: 256
319
- pipeline_parallelism: true
320
- enforce_eager: false
321
316
  qos: m2
322
317
  time: 08:00:00
323
318
  partition: a40
319
+ vllm_args:
320
+ --max-model-len: 4096
321
+ --max-num-seqs: 256
322
+ --compilation-config: 3
324
323
  llava-1.5-13b-hf:
325
324
  model_family: llava-1.5
326
325
  model_variant: 13b-hf
@@ -328,13 +327,13 @@ models:
328
327
  gpus_per_node: 1
329
328
  num_nodes: 1
330
329
  vocab_size: 32000
331
- max_model_len: 4096
332
- max_num_seqs: 256
333
- pipeline_parallelism: true
334
- enforce_eager: false
335
330
  qos: m2
336
331
  time: 08:00:00
337
332
  partition: a40
333
+ vllm_args:
334
+ --max-model-len: 4096
335
+ --max-num-seqs: 256
336
+ --compilation-config: 3
338
337
  llava-v1.6-mistral-7b-hf:
339
338
  model_family: llava-v1.6
340
339
  model_variant: mistral-7b-hf
@@ -342,13 +341,13 @@ models:
342
341
  gpus_per_node: 1
343
342
  num_nodes: 1
344
343
  vocab_size: 32064
345
- max_model_len: 32768
346
- max_num_seqs: 256
347
- pipeline_parallelism: true
348
- enforce_eager: false
349
344
  qos: m2
350
345
  time: 08:00:00
351
346
  partition: a40
347
+ vllm_args:
348
+ --max-model-len: 32768
349
+ --max-num-seqs: 256
350
+ --compilation-config: 3
352
351
  llava-v1.6-34b-hf:
353
352
  model_family: llava-v1.6
354
353
  model_variant: 34b-hf
@@ -356,13 +355,14 @@ models:
356
355
  gpus_per_node: 2
357
356
  num_nodes: 1
358
357
  vocab_size: 64064
359
- max_model_len: 4096
360
- max_num_seqs: 256
361
- pipeline_parallelism: true
362
- enforce_eager: false
363
358
  qos: m2
364
359
  time: 08:00:00
365
360
  partition: a40
361
+ vllm_args:
362
+ --tensor-parallel-size: 2
363
+ --max-model-len: 4096
364
+ --max-num-seqs: 256
365
+ --compilation-config: 3
366
366
  Meta-Llama-3-8B:
367
367
  model_family: Meta-Llama-3
368
368
  model_variant: 8B
@@ -370,13 +370,13 @@ models:
370
370
  gpus_per_node: 1
371
371
  num_nodes: 1
372
372
  vocab_size: 128256
373
- max_model_len: 8192
374
- max_num_seqs: 256
375
- pipeline_parallelism: true
376
- enforce_eager: false
377
373
  qos: m2
378
374
  time: 08:00:00
379
375
  partition: a40
376
+ vllm_args:
377
+ --max-model-len: 8192
378
+ --max-num-seqs: 256
379
+ --compilation-config: 3
380
380
  Meta-Llama-3-8B-Instruct:
381
381
  model_family: Meta-Llama-3
382
382
  model_variant: 8B-Instruct
@@ -384,13 +384,13 @@ models:
384
384
  gpus_per_node: 1
385
385
  num_nodes: 1
386
386
  vocab_size: 128256
387
- max_model_len: 8192
388
- max_num_seqs: 256
389
- pipeline_parallelism: true
390
- enforce_eager: false
391
387
  qos: m2
392
388
  time: 08:00:00
393
389
  partition: a40
390
+ vllm_args:
391
+ --max-model-len: 8192
392
+ --max-num-seqs: 256
393
+ --compilation-config: 3
394
394
  Meta-Llama-3-70B:
395
395
  model_family: Meta-Llama-3
396
396
  model_variant: 70B
@@ -398,13 +398,14 @@ models:
398
398
  gpus_per_node: 4
399
399
  num_nodes: 1
400
400
  vocab_size: 128256
401
- max_model_len: 8192
402
- max_num_seqs: 256
403
- pipeline_parallelism: true
404
- enforce_eager: false
405
401
  qos: m2
406
402
  time: 08:00:00
407
403
  partition: a40
404
+ vllm_args:
405
+ --tensor-parallel-size: 4
406
+ --max-model-len: 8192
407
+ --max-num-seqs: 256
408
+ --compilation-config: 3
408
409
  Meta-Llama-3-70B-Instruct:
409
410
  model_family: Meta-Llama-3
410
411
  model_variant: 70B-Instruct
@@ -412,13 +413,14 @@ models:
412
413
  gpus_per_node: 4
413
414
  num_nodes: 1
414
415
  vocab_size: 128256
415
- max_model_len: 8192
416
- max_num_seqs: 256
417
- pipeline_parallelism: true
418
- enforce_eager: false
419
416
  qos: m2
420
417
  time: 08:00:00
421
418
  partition: a40
419
+ vllm_args:
420
+ --tensor-parallel-size: 4
421
+ --max-model-len: 8192
422
+ --max-num-seqs: 256
423
+ --compilation-config: 3
422
424
  Meta-Llama-3.1-8B:
423
425
  model_family: Meta-Llama-3.1
424
426
  model_variant: 8B
@@ -426,13 +428,13 @@ models:
426
428
  gpus_per_node: 1
427
429
  num_nodes: 1
428
430
  vocab_size: 128256
429
- max_model_len: 131072
430
- max_num_seqs: 256
431
- pipeline_parallelism: true
432
- enforce_eager: false
433
431
  qos: m2
434
432
  time: 08:00:00
435
433
  partition: a40
434
+ vllm_args:
435
+ --max-model-len: 131072
436
+ --max-num-seqs: 256
437
+ --compilation-config: 3
436
438
  Meta-Llama-3.1-8B-Instruct:
437
439
  model_family: Meta-Llama-3.1
438
440
  model_variant: 8B-Instruct
@@ -440,13 +442,13 @@ models:
440
442
  gpus_per_node: 1
441
443
  num_nodes: 1
442
444
  vocab_size: 128256
443
- max_model_len: 131072
444
- max_num_seqs: 256
445
- pipeline_parallelism: true
446
- enforce_eager: false
447
445
  qos: m2
448
446
  time: 08:00:00
449
447
  partition: a40
448
+ vllm_args:
449
+ --max-model-len: 131072
450
+ --max-num-seqs: 256
451
+ --compilation-config: 3
450
452
  Meta-Llama-3.1-70B:
451
453
  model_family: Meta-Llama-3.1
452
454
  model_variant: 70B
@@ -454,13 +456,14 @@ models:
454
456
  gpus_per_node: 4
455
457
  num_nodes: 1
456
458
  vocab_size: 128256
457
- max_model_len: 65536
458
- max_num_seqs: 256
459
- pipeline_parallelism: true
460
- enforce_eager: false
461
459
  qos: m2
462
460
  time: 08:00:00
463
461
  partition: a40
462
+ vllm_args:
463
+ --tensor-parallel-size: 4
464
+ --max-model-len: 65536
465
+ --max-num-seqs: 256
466
+ --compilation-config: 3
464
467
  Meta-Llama-3.1-70B-Instruct:
465
468
  model_family: Meta-Llama-3.1
466
469
  model_variant: 70B-Instruct
@@ -468,13 +471,14 @@ models:
468
471
  gpus_per_node: 4
469
472
  num_nodes: 1
470
473
  vocab_size: 128256
471
- max_model_len: 65536
472
- max_num_seqs: 256
473
- pipeline_parallelism: true
474
- enforce_eager: false
475
474
  qos: m2
476
475
  time: 08:00:00
477
476
  partition: a40
477
+ vllm_args:
478
+ --tensor-parallel-size: 4
479
+ --max-model-len: 65536
480
+ --max-num-seqs: 256
481
+ --compilation-config: 3
478
482
  Meta-Llama-3.1-405B-Instruct:
479
483
  model_family: Meta-Llama-3.1
480
484
  model_variant: 405B-Instruct
@@ -482,27 +486,15 @@ models:
482
486
  gpus_per_node: 4
483
487
  num_nodes: 8
484
488
  vocab_size: 128256
485
- max_model_len: 16384
486
- max_num_seqs: 256
487
- pipeline_parallelism: true
488
- enforce_eager: false
489
489
  qos: m4
490
490
  time: 02:00:00
491
491
  partition: a40
492
- Mistral-7B-v0.1:
493
- model_family: Mistral
494
- model_variant: 7B-v0.1
495
- model_type: LLM
496
- gpus_per_node: 1
497
- num_nodes: 1
498
- vocab_size: 32000
499
- max_model_len: 32768
500
- max_num_seqs: 256
501
- pipeline_parallelism: true
502
- enforce_eager: false
503
- qos: m2
504
- time: 08:00:00
505
- partition: a40
492
+ vllm_args:
493
+ --pipeline-parallel-size: 8
494
+ --tensor-parallel-size: 4
495
+ --max-model-len: 16384
496
+ --max-num-seqs: 256
497
+ --compilation-config: 3
506
498
  Mistral-7B-Instruct-v0.1:
507
499
  model_family: Mistral
508
500
  model_variant: 7B-Instruct-v0.1
@@ -510,13 +502,13 @@ models:
510
502
  gpus_per_node: 1
511
503
  num_nodes: 1
512
504
  vocab_size: 32000
513
- max_model_len: 32768
514
- max_num_seqs: 256
515
- pipeline_parallelism: true
516
- enforce_eager: false
517
505
  qos: m2
518
506
  time: 08:00:00
519
507
  partition: a40
508
+ vllm_args:
509
+ --max-model-len: 32768
510
+ --max-num-seqs: 256
511
+ --compilation-config: 3
520
512
  Mistral-7B-Instruct-v0.2:
521
513
  model_family: Mistral
522
514
  model_variant: 7B-Instruct-v0.2
@@ -524,13 +516,13 @@ models:
524
516
  gpus_per_node: 1
525
517
  num_nodes: 1
526
518
  vocab_size: 32000
527
- max_model_len: 32768
528
- max_num_seqs: 256
529
- pipeline_parallelism: true
530
- enforce_eager: false
531
519
  qos: m2
532
520
  time: 08:00:00
533
521
  partition: a40
522
+ vllm_args:
523
+ --max-model-len: 32768
524
+ --max-num-seqs: 256
525
+ --compilation-config: 3
534
526
  Mistral-7B-v0.3:
535
527
  model_family: Mistral
536
528
  model_variant: 7B-v0.3
@@ -538,13 +530,13 @@ models:
538
530
  gpus_per_node: 1
539
531
  num_nodes: 1
540
532
  vocab_size: 32768
541
- max_model_len: 32768
542
- max_num_seqs: 256
543
- pipeline_parallelism: true
544
- enforce_eager: false
545
533
  qos: m2
546
534
  time: 08:00:00
547
535
  partition: a40
536
+ vllm_args:
537
+ --max-model-len: 32768
538
+ --max-num-seqs: 256
539
+ --compilation-config: 3
548
540
  Mistral-7B-Instruct-v0.3:
549
541
  model_family: Mistral
550
542
  model_variant: 7B-Instruct-v0.3
@@ -552,13 +544,13 @@ models:
552
544
  gpus_per_node: 1
553
545
  num_nodes: 1
554
546
  vocab_size: 32768
555
- max_model_len: 32768
556
- max_num_seqs: 256
557
- pipeline_parallelism: true
558
- enforce_eager: false
559
547
  qos: m2
560
548
  time: 08:00:00
561
549
  partition: a40
550
+ vllm_args:
551
+ --max-model-len: 32768
552
+ --max-num-seqs: 256
553
+ --compilation-config: 3
562
554
  Mistral-Large-Instruct-2407:
563
555
  model_family: Mistral
564
556
  model_variant: Large-Instruct-2407
@@ -566,13 +558,15 @@ models:
566
558
  gpus_per_node: 4
567
559
  num_nodes: 2
568
560
  vocab_size: 32768
569
- max_model_len: 32768
570
- max_num_seqs: 256
571
- pipeline_parallelism: true
572
- enforce_eager: false
573
561
  qos: m2
574
562
  time: 08:00:00
575
563
  partition: a40
564
+ vllm_args:
565
+ --pipeline-parallel-size: 2
566
+ --tensor-parallel-size: 4
567
+ --max-model-len: 32768
568
+ --max-num-seqs: 256
569
+ --compilation-config: 3
576
570
  Mistral-Large-Instruct-2411:
577
571
  model_family: Mistral
578
572
  model_variant: Large-Instruct-2411
@@ -580,13 +574,15 @@ models:
580
574
  gpus_per_node: 4
581
575
  num_nodes: 2
582
576
  vocab_size: 32768
583
- max_model_len: 32768
584
- max_num_seqs: 256
585
- pipeline_parallelism: true
586
- enforce_eager: false
587
577
  qos: m2
588
578
  time: 08:00:00
589
579
  partition: a40
580
+ vllm_args:
581
+ --pipeline-parallel-size: 2
582
+ --tensor-parallel-size: 4
583
+ --max-model-len: 32768
584
+ --max-num-seqs: 256
585
+ --compilation-config: 3
590
586
  Mixtral-8x7B-Instruct-v0.1:
591
587
  model_family: Mixtral
592
588
  model_variant: 8x7B-Instruct-v0.1
@@ -594,13 +590,14 @@ models:
594
590
  gpus_per_node: 4
595
591
  num_nodes: 1
596
592
  vocab_size: 32000
597
- max_model_len: 32768
598
- max_num_seqs: 256
599
- pipeline_parallelism: true
600
- enforce_eager: false
601
593
  qos: m2
602
594
  time: 08:00:00
603
595
  partition: a40
596
+ vllm_args:
597
+ --tensor-parallel-size: 4
598
+ --max-model-len: 32768
599
+ --max-num-seqs: 256
600
+ --compilation-config: 3
604
601
  Mixtral-8x22B-v0.1:
605
602
  model_family: Mixtral
606
603
  model_variant: 8x22B-v0.1
@@ -608,13 +605,15 @@ models:
608
605
  gpus_per_node: 4
609
606
  num_nodes: 2
610
607
  vocab_size: 32768
611
- max_model_len: 65536
612
- max_num_seqs: 256
613
- pipeline_parallelism: true
614
- enforce_eager: false
615
608
  qos: m2
616
609
  time: 08:00:00
617
610
  partition: a40
611
+ vllm_args:
612
+ --pipeline-parallel-size: 2
613
+ --tensor-parallel-size: 4
614
+ --max-model-len: 65536
615
+ --max-num-seqs: 256
616
+ --compilation-config: 3
618
617
  Mixtral-8x22B-Instruct-v0.1:
619
618
  model_family: Mixtral
620
619
  model_variant: 8x22B-Instruct-v0.1
@@ -622,13 +621,15 @@ models:
622
621
  gpus_per_node: 4
623
622
  num_nodes: 2
624
623
  vocab_size: 32768
625
- max_model_len: 65536
626
- max_num_seqs: 256
627
- pipeline_parallelism: true
628
- enforce_eager: false
629
624
  qos: m2
630
625
  time: 08:00:00
631
626
  partition: a40
627
+ vllm_args:
628
+ --pipeline-parallel-size: 2
629
+ --tensor-parallel-size: 4
630
+ --max-model-len: 65536
631
+ --max-num-seqs: 256
632
+ --compilation-config: 3
632
633
  Phi-3-medium-128k-instruct:
633
634
  model_family: Phi-3
634
635
  model_variant: medium-128k-instruct
@@ -636,13 +637,14 @@ models:
636
637
  gpus_per_node: 2
637
638
  num_nodes: 1
638
639
  vocab_size: 32064
639
- max_model_len: 131072
640
- max_num_seqs: 256
641
- pipeline_parallelism: true
642
- enforce_eager: false
643
640
  qos: m2
644
641
  time: 08:00:00
645
642
  partition: a40
643
+ vllm_args:
644
+ --tensor-parallel-size: 2
645
+ --max-model-len: 131072
646
+ --max-num-seqs: 256
647
+ --compilation-config: 3
646
648
  Phi-3-vision-128k-instruct:
647
649
  model_family: Phi-3-vision
648
650
  model_variant: 128k-instruct
@@ -650,13 +652,14 @@ models:
650
652
  gpus_per_node: 2
651
653
  num_nodes: 1
652
654
  vocab_size: 32064
653
- max_model_len: 65536
654
- max_num_seqs: 256
655
- pipeline_parallelism: true
656
- enforce_eager: false
657
655
  qos: m2
658
656
  time: 08:00:00
659
657
  partition: a40
658
+ vllm_args:
659
+ --tensor-parallel-size: 2
660
+ --max-model-len: 65536
661
+ --max-num-seqs: 256
662
+ --compilation-config: 3
660
663
  Llama3-OpenBioLLM-70B:
661
664
  model_family: Llama3-OpenBioLLM
662
665
  model_variant: 70B
@@ -664,13 +667,14 @@ models:
664
667
  gpus_per_node: 4
665
668
  num_nodes: 1
666
669
  vocab_size: 128256
667
- max_model_len: 8192
668
- max_num_seqs: 256
669
- pipeline_parallelism: true
670
- enforce_eager: false
671
670
  qos: m2
672
671
  time: 08:00:00
673
672
  partition: a40
673
+ vllm_args:
674
+ --tensor-parallel-size: 4
675
+ --max-model-len: 8192
676
+ --max-num-seqs: 256
677
+ --compilation-config: 3
674
678
  Llama-3.1-Nemotron-70B-Instruct-HF:
675
679
  model_family: Llama-3.1-Nemotron
676
680
  model_variant: 70B-Instruct-HF
@@ -678,13 +682,14 @@ models:
678
682
  gpus_per_node: 4
679
683
  num_nodes: 1
680
684
  vocab_size: 128256
681
- max_model_len: 65536
682
- max_num_seqs: 256
683
- pipeline_parallelism: true
684
- enforce_eager: false
685
685
  qos: m2
686
686
  time: 08:00:00
687
687
  partition: a40
688
+ vllm_args:
689
+ --tensor-parallel-size: 4
690
+ --max-model-len: 65536
691
+ --max-num-seqs: 256
692
+ --compilation-config: 3
688
693
  Llama-3.2-1B:
689
694
  model_family: Llama-3.2
690
695
  model_variant: 1B
@@ -692,13 +697,13 @@ models:
692
697
  gpus_per_node: 1
693
698
  num_nodes: 1
694
699
  vocab_size: 128256
695
- max_model_len: 131072
696
- max_num_seqs: 256
697
- pipeline_parallelism: true
698
- enforce_eager: false
699
700
  qos: m2
700
701
  time: 08:00:00
701
702
  partition: a40
703
+ vllm_args:
704
+ --max-model-len: 131072
705
+ --max-num-seqs: 256
706
+ --compilation-config: 3
702
707
  Llama-3.2-1B-Instruct:
703
708
  model_family: Llama-3.2
704
709
  model_variant: 1B-Instruct
@@ -706,13 +711,13 @@ models:
706
711
  gpus_per_node: 1
707
712
  num_nodes: 1
708
713
  vocab_size: 128256
709
- max_model_len: 131072
710
- max_num_seqs: 256
711
- pipeline_parallelism: true
712
- enforce_eager: false
713
714
  qos: m2
714
715
  time: 08:00:00
715
716
  partition: a40
717
+ vllm_args:
718
+ --max-model-len: 131072
719
+ --max-num-seqs: 256
720
+ --compilation-config: 3
716
721
  Llama-3.2-3B:
717
722
  model_family: Llama-3.2
718
723
  model_variant: 3B
@@ -720,13 +725,13 @@ models:
720
725
  gpus_per_node: 1
721
726
  num_nodes: 1
722
727
  vocab_size: 128256
723
- max_model_len: 131072
724
- max_num_seqs: 256
725
- pipeline_parallelism: true
726
- enforce_eager: false
727
728
  qos: m2
728
729
  time: 08:00:00
729
730
  partition: a40
731
+ vllm_args:
732
+ --max-model-len: 131072
733
+ --max-num-seqs: 256
734
+ --compilation-config: 3
730
735
  Llama-3.2-3B-Instruct:
731
736
  model_family: Llama-3.2
732
737
  model_variant: 3B-Instruct
@@ -734,13 +739,13 @@ models:
734
739
  gpus_per_node: 1
735
740
  num_nodes: 1
736
741
  vocab_size: 128256
737
- max_model_len: 131072
738
- max_num_seqs: 256
739
- pipeline_parallelism: true
740
- enforce_eager: false
741
742
  qos: m2
742
743
  time: 08:00:00
743
744
  partition: a40
745
+ vllm_args:
746
+ --max-model-len: 131072
747
+ --max-num-seqs: 256
748
+ --compilation-config: 3
744
749
  Llama-3.2-11B-Vision:
745
750
  model_family: Llama-3.2
746
751
  model_variant: 11B-Vision
@@ -748,13 +753,15 @@ models:
748
753
  gpus_per_node: 2
749
754
  num_nodes: 1
750
755
  vocab_size: 128256
751
- max_model_len: 4096
752
- max_num_seqs: 64
753
- pipeline_parallelism: false
754
- enforce_eager: true
755
756
  qos: m2
756
757
  time: 08:00:00
757
758
  partition: a40
759
+ vllm_args:
760
+ --tensor-parallel-size: 2
761
+ --max-model-len: 4096
762
+ --max-num-seqs: 64
763
+ --compilation-config: 3
764
+ --enforce-eager: true
758
765
  Llama-3.2-11B-Vision-Instruct:
759
766
  model_family: Llama-3.2
760
767
  model_variant: 11B-Vision-Instruct
@@ -762,13 +769,15 @@ models:
762
769
  gpus_per_node: 2
763
770
  num_nodes: 1
764
771
  vocab_size: 128256
765
- max_model_len: 4096
766
- max_num_seqs: 64
767
- pipeline_parallelism: false
768
- enforce_eager: true
769
772
  qos: m2
770
773
  time: 08:00:00
771
774
  partition: a40
775
+ vllm_args:
776
+ --tensor-parallel-size: 2
777
+ --max-model-len: 4096
778
+ --max-num-seqs: 64
779
+ --compilation-config: 3
780
+ --enforce-eager: true
772
781
  Llama-3.2-90B-Vision:
773
782
  model_family: Llama-3.2
774
783
  model_variant: 90B-Vision
@@ -776,13 +785,15 @@ models:
776
785
  gpus_per_node: 4
777
786
  num_nodes: 2
778
787
  vocab_size: 128256
779
- max_model_len: 4096
780
- max_num_seqs: 32
781
- pipeline_parallelism: false
782
- enforce_eager: true
783
788
  qos: m2
784
789
  time: 08:00:00
785
790
  partition: a40
791
+ vllm_args:
792
+ --tensor-parallel-size: 8
793
+ --max-model-len: 4096
794
+ --max-num-seqs: 32
795
+ --compilation-config: 3
796
+ --enforce-eager: true
786
797
  Llama-3.2-90B-Vision-Instruct:
787
798
  model_family: Llama-3.2
788
799
  model_variant: 90B-Vision-Instruct
@@ -790,13 +801,15 @@ models:
790
801
  gpus_per_node: 4
791
802
  num_nodes: 2
792
803
  vocab_size: 128256
793
- max_model_len: 4096
794
- max_num_seqs: 32
795
- pipeline_parallelism: false
796
- enforce_eager: true
797
804
  qos: m2
798
805
  time: 08:00:00
799
806
  partition: a40
807
+ vllm_args:
808
+ --tensor-parallel-size: 8
809
+ --max-model-len: 4096
810
+ --max-num-seqs: 32
811
+ --compilation-config: 3
812
+ --enforce-eager: true
800
813
  Qwen2.5-0.5B-Instruct:
801
814
  model_family: Qwen2.5
802
815
  model_variant: 0.5B-Instruct
@@ -804,13 +817,13 @@ models:
804
817
  gpus_per_node: 1
805
818
  num_nodes: 1
806
819
  vocab_size: 152064
807
- max_model_len: 32768
808
- max_num_seqs: 256
809
- pipeline_parallelism: true
810
- enforce_eager: false
811
820
  qos: m2
812
821
  time: 08:00:00
813
822
  partition: a40
823
+ vllm_args:
824
+ --max-model-len: 32768
825
+ --max-num-seqs: 256
826
+ --compilation-config: 3
814
827
  Qwen2.5-1.5B-Instruct:
815
828
  model_family: Qwen2.5
816
829
  model_variant: 1.5B-Instruct
@@ -818,13 +831,13 @@ models:
818
831
  gpus_per_node: 1
819
832
  num_nodes: 1
820
833
  vocab_size: 152064
821
- max_model_len: 32768
822
- max_num_seqs: 256
823
- pipeline_parallelism: true
824
- enforce_eager: false
825
834
  qos: m2
826
835
  time: 08:00:00
827
836
  partition: a40
837
+ vllm_args:
838
+ --max-model-len: 32768
839
+ --max-num-seqs: 256
840
+ --compilation-config: 3
828
841
  Qwen2.5-3B-Instruct:
829
842
  model_family: Qwen2.5
830
843
  model_variant: 3B-Instruct
@@ -832,13 +845,13 @@ models:
832
845
  gpus_per_node: 1
833
846
  num_nodes: 1
834
847
  vocab_size: 152064
835
- max_model_len: 32768
836
- max_num_seqs: 256
837
- pipeline_parallelism: true
838
- enforce_eager: false
839
848
  qos: m2
840
849
  time: 08:00:00
841
850
  partition: a40
851
+ vllm_args:
852
+ --max-model-len: 32768
853
+ --max-num-seqs: 256
854
+ --compilation-config: 3
842
855
  Qwen2.5-7B-Instruct:
843
856
  model_family: Qwen2.5
844
857
  model_variant: 7B-Instruct
@@ -846,13 +859,13 @@ models:
846
859
  gpus_per_node: 1
847
860
  num_nodes: 1
848
861
  vocab_size: 152064
849
- max_model_len: 32768
850
- max_num_seqs: 256
851
- pipeline_parallelism: true
852
- enforce_eager: false
853
862
  qos: m2
854
863
  time: 08:00:00
855
864
  partition: a40
865
+ vllm_args:
866
+ --max-model-len: 32768
867
+ --max-num-seqs: 256
868
+ --compilation-config: 3
856
869
  Qwen2.5-14B-Instruct:
857
870
  model_family: Qwen2.5
858
871
  model_variant: 14B-Instruct
@@ -860,13 +873,13 @@ models:
860
873
  gpus_per_node: 1
861
874
  num_nodes: 1
862
875
  vocab_size: 152064
863
- max_model_len: 32768
864
- max_num_seqs: 256
865
- pipeline_parallelism: true
866
- enforce_eager: false
867
876
  qos: m2
868
877
  time: 08:00:00
869
878
  partition: a40
879
+ vllm_args:
880
+ --max-model-len: 32768
881
+ --max-num-seqs: 256
882
+ --compilation-config: 3
870
883
  Qwen2.5-32B-Instruct:
871
884
  model_family: Qwen2.5
872
885
  model_variant: 32B-Instruct
@@ -874,13 +887,14 @@ models:
874
887
  gpus_per_node: 2
875
888
  num_nodes: 1
876
889
  vocab_size: 152064
877
- max_model_len: 32768
878
- max_num_seqs: 256
879
- pipeline_parallelism: true
880
- enforce_eager: false
881
890
  qos: m2
882
891
  time: 08:00:00
883
892
  partition: a40
893
+ vllm_args:
894
+ --tensor-parallel-size: 2
895
+ --max-model-len: 32768
896
+ --max-num-seqs: 256
897
+ --compilation-config: 3
884
898
  Qwen2.5-72B-Instruct:
885
899
  model_family: Qwen2.5
886
900
  model_variant: 72B-Instruct
@@ -888,13 +902,14 @@ models:
888
902
  gpus_per_node: 4
889
903
  num_nodes: 1
890
904
  vocab_size: 152064
891
- max_model_len: 16384
892
- max_num_seqs: 256
893
- pipeline_parallelism: true
894
- enforce_eager: false
895
905
  qos: m2
896
906
  time: 08:00:00
897
907
  partition: a40
908
+ vllm_args:
909
+ --tensor-parallel-size: 4
910
+ --max-model-len: 16384
911
+ --max-num-seqs: 256
912
+ --compilation-config: 3
898
913
  Qwen2.5-Math-1.5B-Instruct:
899
914
  model_family: Qwen2.5
900
915
  model_variant: Math-1.5B-Instruct
@@ -902,13 +917,13 @@ models:
902
917
  gpus_per_node: 1
903
918
  num_nodes: 1
904
919
  vocab_size: 152064
905
- max_model_len: 4096
906
- max_num_seqs: 256
907
- pipeline_parallelism: true
908
- enforce_eager: false
909
920
  qos: m2
910
921
  time: 08:00:00
911
922
  partition: a40
923
+ vllm_args:
924
+ --max-model-len: 4096
925
+ --max-num-seqs: 256
926
+ --compilation-config: 3
912
927
  Qwen2.5-Math-7B-Instruct:
913
928
  model_family: Qwen2.5
914
929
  model_variant: Math-7B-Instruct
@@ -916,13 +931,13 @@ models:
916
931
  gpus_per_node: 1
917
932
  num_nodes: 1
918
933
  vocab_size: 152064
919
- max_model_len: 4096
920
- max_num_seqs: 256
921
- pipeline_parallelism: true
922
- enforce_eager: false
923
934
  qos: m2
924
935
  time: 08:00:00
925
936
  partition: a40
937
+ vllm_args:
938
+ --max-model-len: 4096
939
+ --max-num-seqs: 256
940
+ --compilation-config: 3
926
941
  Qwen2.5-Math-72B-Instruct:
927
942
  model_family: Qwen2.5
928
943
  model_variant: Math-72B-Instruct
@@ -930,13 +945,14 @@ models:
930
945
  gpus_per_node: 4
931
946
  num_nodes: 1
932
947
  vocab_size: 152064
933
- max_model_len: 4096
934
- max_num_seqs: 256
935
- pipeline_parallelism: true
936
- enforce_eager: false
937
948
  qos: m2
938
949
  time: 08:00:00
939
950
  partition: a40
951
+ vllm_args:
952
+ --tensor-parallel-size: 4
953
+ --max-model-len: 4096
954
+ --max-num-seqs: 256
955
+ --compilation-config: 3
940
956
  Qwen2.5-Coder-7B-Instruct:
941
957
  model_family: Qwen2.5
942
958
  model_variant: Coder-7B-Instruct
@@ -944,13 +960,13 @@ models:
944
960
  gpus_per_node: 1
945
961
  num_nodes: 1
946
962
  vocab_size: 152064
947
- max_model_len: 32768
948
- max_num_seqs: 256
949
- pipeline_parallelism: true
950
- enforce_eager: false
951
963
  qos: m2
952
964
  time: 08:00:00
953
965
  partition: a40
966
+ vllm_args:
967
+ --max-model-len: 32768
968
+ --max-num-seqs: 256
969
+ --compilation-config: 3
954
970
  Qwen2.5-Math-RM-72B:
955
971
  model_family: Qwen2.5
956
972
  model_variant: Math-RM-72B
@@ -958,13 +974,14 @@ models:
958
974
  gpus_per_node: 4
959
975
  num_nodes: 1
960
976
  vocab_size: 152064
961
- max_model_len: 4096
962
- max_num_seqs: 256
963
- pipeline_parallelism: true
964
- enforce_eager: false
965
977
  qos: m2
966
978
  time: 08:00:00
967
979
  partition: a40
980
+ vllm_args:
981
+ --tensor-parallel-size: 4
982
+ --max-model-len: 4096
983
+ --max-num-seqs: 256
984
+ --compilation-config: 3
968
985
  Qwen2.5-Math-PRM-7B:
969
986
  model_family: Qwen2.5
970
987
  model_variant: Math-PRM-7B
@@ -972,13 +989,13 @@ models:
972
989
  gpus_per_node: 1
973
990
  num_nodes: 1
974
991
  vocab_size: 152064
975
- max_model_len: 4096
976
- max_num_seqs: 256
977
- pipeline_parallelism: true
978
- enforce_eager: false
979
992
  qos: m2
980
993
  time: 08:00:00
981
994
  partition: a40
995
+ vllm_args:
996
+ --max-model-len: 4096
997
+ --max-num-seqs: 256
998
+ --compilation-config: 3
982
999
  QwQ-32B-Preview:
983
1000
  model_family: QwQ
984
1001
  model_variant: 32B-Preview
@@ -986,13 +1003,14 @@ models:
986
1003
  gpus_per_node: 2
987
1004
  num_nodes: 1
988
1005
  vocab_size: 152064
989
- max_model_len: 32768
990
- max_num_seqs: 256
991
- pipeline_parallelism: true
992
- enforce_eager: false
993
1006
  qos: m2
994
1007
  time: 08:00:00
995
1008
  partition: a40
1009
+ vllm_args:
1010
+ --tensor-parallel-size: 2
1011
+ --max-model-len: 32768
1012
+ --max-num-seqs: 256
1013
+ --compilation-config: 3
996
1014
  Pixtral-12B-2409:
997
1015
  model_family: Pixtral
998
1016
  model_variant: 12B-2409
@@ -1000,13 +1018,13 @@ models:
1000
1018
  gpus_per_node: 1
1001
1019
  num_nodes: 1
1002
1020
  vocab_size: 131072
1003
- max_model_len: 8192
1004
- max_num_seqs: 256
1005
- pipeline_parallelism: true
1006
- enforce_eager: false
1007
1021
  qos: m2
1008
1022
  time: 08:00:00
1009
1023
  partition: a40
1024
+ vllm_args:
1025
+ --max-model-len: 8192
1026
+ --max-num-seqs: 256
1027
+ --compilation-config: 3
1010
1028
  e5-mistral-7b-instruct:
1011
1029
  model_family: e5
1012
1030
  model_variant: mistral-7b-instruct
@@ -1014,13 +1032,13 @@ models:
1014
1032
  gpus_per_node: 1
1015
1033
  num_nodes: 1
1016
1034
  vocab_size: 32000
1017
- max_model_len: 4096
1018
- max_num_seqs: 256
1019
- pipeline_parallelism: true
1020
- enforce_eager: false
1021
1035
  qos: m2
1022
1036
  time: 08:00:00
1023
1037
  partition: a40
1038
+ vllm_args:
1039
+ --max-model-len: 4096
1040
+ --max-num-seqs: 256
1041
+ --compilation-config: 3
1024
1042
  bge-base-en-v1.5:
1025
1043
  model_family: bge
1026
1044
  model_variant: base-en-v1.5
@@ -1028,13 +1046,13 @@ models:
1028
1046
  gpus_per_node: 1
1029
1047
  num_nodes: 1
1030
1048
  vocab_size: 30522
1031
- max_model_len: 512
1032
- max_num_seqs: 256
1033
- pipeline_parallelism: true
1034
- enforce_eager: false
1035
1049
  qos: m2
1036
1050
  time: 08:00:00
1037
1051
  partition: a40
1052
+ vllm_args:
1053
+ --max-model-len: 512
1054
+ --max-num-seqs: 256
1055
+ --compilation-config: 3
1038
1056
  all-MiniLM-L6-v2:
1039
1057
  model_family: all-MiniLM
1040
1058
  model_variant: L6-v2
@@ -1042,13 +1060,13 @@ models:
1042
1060
  gpus_per_node: 1
1043
1061
  num_nodes: 1
1044
1062
  vocab_size: 30522
1045
- max_model_len: 512
1046
- max_num_seqs: 256
1047
- pipeline_parallelism: true
1048
- enforce_eager: false
1049
1063
  qos: m2
1050
1064
  time: 08:00:00
1051
1065
  partition: a40
1066
+ vllm_args:
1067
+ --max-model-len: 512
1068
+ --max-num-seqs: 256
1069
+ --compilation-config: 3
1052
1070
  Llama-3.3-70B-Instruct:
1053
1071
  model_family: Llama-3.3
1054
1072
  model_variant: 70B-Instruct
@@ -1056,13 +1074,14 @@ models:
1056
1074
  gpus_per_node: 4
1057
1075
  num_nodes: 1
1058
1076
  vocab_size: 128256
1059
- max_model_len: 65536
1060
- max_num_seqs: 256
1061
- pipeline_parallelism: true
1062
- enforce_eager: false
1063
1077
  qos: m2
1064
1078
  time: 08:00:00
1065
1079
  partition: a40
1080
+ vllm_args:
1081
+ --tensor-parallel-size: 4
1082
+ --max-model-len: 65536
1083
+ --max-num-seqs: 256
1084
+ --compilation-config: 3
1066
1085
  InternVL2_5-26B:
1067
1086
  model_family: InternVL2_5
1068
1087
  model_variant: 26B
@@ -1070,13 +1089,14 @@ models:
1070
1089
  gpus_per_node: 2
1071
1090
  num_nodes: 1
1072
1091
  vocab_size: 92553
1073
- max_model_len: 32768
1074
- max_num_seqs: 256
1075
- pipeline_parallelism: true
1076
- enforce_eager: false
1077
1092
  qos: m2
1078
1093
  time: 08:00:00
1079
1094
  partition: a40
1095
+ vllm_args:
1096
+ --tensor-parallel-size: 2
1097
+ --max-model-len: 32768
1098
+ --max-num-seqs: 256
1099
+ --compilation-config: 3
1080
1100
  InternVL2_5-38B:
1081
1101
  model_family: InternVL2_5
1082
1102
  model_variant: 38B
@@ -1084,13 +1104,14 @@ models:
1084
1104
  gpus_per_node: 4
1085
1105
  num_nodes: 1
1086
1106
  vocab_size: 92553
1087
- max_model_len: 32768
1088
- max_num_seqs: 256
1089
- pipeline_parallelism: true
1090
- enforce_eager: false
1091
1107
  qos: m2
1092
1108
  time: 08:00:00
1093
1109
  partition: a40
1110
+ vllm_args:
1111
+ --tensor-parallel-size: 4
1112
+ --max-model-len: 32768
1113
+ --max-num-seqs: 256
1114
+ --compilation-config: 3
1094
1115
  Aya-Expanse-32B:
1095
1116
  model_family: Aya-Expanse
1096
1117
  model_variant: 32B
@@ -1098,69 +1119,72 @@ models:
1098
1119
  gpus_per_node: 2
1099
1120
  num_nodes: 1
1100
1121
  vocab_size: 256000
1101
- max_model_len: 8192
1102
- max_num_seqs: 256
1103
- pipeline_parallelism: true
1104
- enforce_eager: false
1105
1122
  qos: m2
1106
1123
  time: 08:00:00
1107
1124
  partition: a40
1125
+ vllm_args:
1126
+ --tensor-parallel-size: 2
1127
+ --max-model-len: 8192
1128
+ --max-num-seqs: 256
1129
+ --compilation-config: 3
1108
1130
  DeepSeek-R1-Distill-Llama-70B:
1109
1131
  model_family: DeepSeek-R1
1110
- model_variant: 'Distill-Llama-70B '
1132
+ model_variant: Distill-Llama-70B
1111
1133
  model_type: LLM
1112
1134
  gpus_per_node: 4
1113
- num_nodes: 2
1135
+ num_nodes: 1
1114
1136
  vocab_size: 128256
1115
- max_model_len: 131072
1116
- max_num_seqs: 256
1117
- pipeline_parallelism: true
1118
- enforce_eager: false
1119
1137
  qos: m2
1120
1138
  time: 08:00:00
1121
1139
  partition: a40
1140
+ vllm_args:
1141
+ --tensor-parallel-size: 4
1142
+ --max-model-len: 65536
1143
+ --max-num-seqs: 256
1144
+ --compilation-config: 3
1122
1145
  DeepSeek-R1-Distill-Llama-8B:
1123
1146
  model_family: DeepSeek-R1
1124
- model_variant: 'Distill-Llama-8B '
1147
+ model_variant: Distill-Llama-8B
1125
1148
  model_type: LLM
1126
1149
  gpus_per_node: 1
1127
1150
  num_nodes: 1
1128
1151
  vocab_size: 128256
1129
- max_model_len: 131072
1130
- max_num_seqs: 256
1131
- pipeline_parallelism: true
1132
- enforce_eager: false
1133
1152
  qos: m2
1134
1153
  time: 08:00:00
1135
1154
  partition: a40
1155
+ vllm_args:
1156
+ --max-model-len: 131072
1157
+ --max-num-seqs: 256
1158
+ --compilation-config: 3
1136
1159
  DeepSeek-R1-Distill-Qwen-32B:
1137
1160
  model_family: DeepSeek-R1
1138
1161
  model_variant: Distill-Qwen-32B
1139
1162
  model_type: LLM
1140
- gpus_per_node: 4
1163
+ gpus_per_node: 2
1141
1164
  num_nodes: 1
1142
1165
  vocab_size: 152064
1143
- max_model_len: 131072
1144
- max_num_seqs: 256
1145
- pipeline_parallelism: true
1146
- enforce_eager: false
1147
1166
  qos: m2
1148
1167
  time: 08:00:00
1149
1168
  partition: a40
1169
+ vllm_args:
1170
+ --tensor-parallel-size: 2
1171
+ --max-model-len: 65536
1172
+ --max-num-seqs: 256
1173
+ --compilation-config: 3
1150
1174
  DeepSeek-R1-Distill-Qwen-14B:
1151
1175
  model_family: DeepSeek-R1
1152
1176
  model_variant: Distill-Qwen-14B
1153
1177
  model_type: LLM
1154
- gpus_per_node: 2
1178
+ gpus_per_node: 1
1155
1179
  num_nodes: 1
1156
1180
  vocab_size: 152064
1157
- max_model_len: 131072
1158
- max_num_seqs: 256
1159
- pipeline_parallelism: true
1160
- enforce_eager: false
1161
1181
  qos: m2
1162
1182
  time: 08:00:00
1163
1183
  partition: a40
1184
+ vllm_args:
1185
+ --max-model-len: 65536
1186
+ --max-num-seqs: 256
1187
+ --compilation-config: 3
1164
1188
  DeepSeek-R1-Distill-Qwen-7B:
1165
1189
  model_family: DeepSeek-R1
1166
1190
  model_variant: Distill-Qwen-7B
@@ -1168,13 +1192,13 @@ models:
1168
1192
  gpus_per_node: 1
1169
1193
  num_nodes: 1
1170
1194
  vocab_size: 152064
1171
- max_model_len: 131072
1172
- max_num_seqs: 256
1173
- pipeline_parallelism: true
1174
- enforce_eager: false
1175
1195
  qos: m2
1176
1196
  time: 08:00:00
1177
1197
  partition: a40
1198
+ vllm_args:
1199
+ --max-model-len: 131072
1200
+ --max-num-seqs: 256
1201
+ --compilation-config: 3
1178
1202
  DeepSeek-R1-Distill-Qwen-1.5B:
1179
1203
  model_family: DeepSeek-R1
1180
1204
  model_variant: Distill-Qwen-1.5B
@@ -1182,13 +1206,13 @@ models:
1182
1206
  gpus_per_node: 1
1183
1207
  num_nodes: 1
1184
1208
  vocab_size: 152064
1185
- max_model_len: 131072
1186
- max_num_seqs: 256
1187
- pipeline_parallelism: true
1188
- enforce_eager: false
1189
1209
  qos: m2
1190
1210
  time: 08:00:00
1191
1211
  partition: a40
1212
+ vllm_args:
1213
+ --max-model-len: 131072
1214
+ --max-num-seqs: 256
1215
+ --compilation-config: 3
1192
1216
  Phi-3.5-vision-instruct:
1193
1217
  model_family: Phi-3.5-vision
1194
1218
  model_variant: instruct
@@ -1196,13 +1220,14 @@ models:
1196
1220
  gpus_per_node: 2
1197
1221
  num_nodes: 1
1198
1222
  vocab_size: 32064
1199
- max_model_len: 65536
1200
- max_num_seqs: 256
1201
- pipeline_parallelism: true
1202
- enforce_eager: false
1203
1223
  qos: m2
1204
1224
  time: 08:00:00
1205
1225
  partition: a40
1226
+ vllm_args:
1227
+ --tensor-parallel-size: 2
1228
+ --max-model-len: 65536
1229
+ --max-num-seqs: 256
1230
+ --compilation-config: 3
1206
1231
  InternVL2_5-8B:
1207
1232
  model_family: InternVL2_5
1208
1233
  model_variant: 8B
@@ -1210,13 +1235,13 @@ models:
1210
1235
  gpus_per_node: 1
1211
1236
  num_nodes: 1
1212
1237
  vocab_size: 92553
1213
- max_model_len: 32768
1214
- max_num_seqs: 256
1215
- pipeline_parallelism: true
1216
- enforce_eager: false
1217
1238
  qos: m2
1218
1239
  time: 08:00:00
1219
1240
  partition: a40
1241
+ vllm_args:
1242
+ --max-model-len: 32768
1243
+ --max-num-seqs: 256
1244
+ --compilation-config: 3
1220
1245
  glm-4v-9b:
1221
1246
  model_family: glm-4v
1222
1247
  model_variant: 9b
@@ -1224,13 +1249,13 @@ models:
1224
1249
  gpus_per_node: 1
1225
1250
  num_nodes: 1
1226
1251
  vocab_size: 151552
1227
- max_model_len: 8192
1228
- max_num_seqs: 256
1229
- pipeline_parallelism: true
1230
- enforce_eager: false
1231
1252
  qos: m2
1232
1253
  time: 08:00:00
1233
1254
  partition: a40
1255
+ vllm_args:
1256
+ --max-model-len: 8192
1257
+ --max-num-seqs: 256
1258
+ --compilation-config: 3
1234
1259
  Molmo-7B-D-0924:
1235
1260
  model_family: Molmo
1236
1261
  model_variant: 7B-D-0924
@@ -1238,26 +1263,27 @@ models:
1238
1263
  gpus_per_node: 1
1239
1264
  num_nodes: 1
1240
1265
  vocab_size: 152064
1241
- max_model_len: 4096
1242
- max_num_seqs: 256
1243
- pipeline_parallelism: true
1244
- enforce_eager: false
1245
1266
  qos: m2
1246
1267
  time: 08:00:00
1247
1268
  partition: a40
1269
+ vllm_args:
1270
+ --max-model-len: 4096
1271
+ --max-num-seqs: 256
1272
+ --compilation-config: 3
1248
1273
  deepseek-vl2:
1249
1274
  model_family: deepseek-vl2
1250
1275
  model_type: VLM
1251
1276
  gpus_per_node: 2
1252
1277
  num_nodes: 1
1253
1278
  vocab_size: 129280
1254
- max_model_len: 4096
1255
- max_num_seqs: 256
1256
- pipeline_parallelism: true
1257
- enforce_eager: false
1258
1279
  qos: m2
1259
1280
  time: 08:00:00
1260
1281
  partition: a40
1282
+ vllm_args:
1283
+ --tensor-parallel-size: 2
1284
+ --max-model-len: 4096
1285
+ --max-num-seqs: 256
1286
+ --compilation-config: 3
1261
1287
  deepseek-vl2-small:
1262
1288
  model_family: deepseek-vl2
1263
1289
  model_variant: small
@@ -1265,10 +1291,10 @@ models:
1265
1291
  gpus_per_node: 1
1266
1292
  num_nodes: 1
1267
1293
  vocab_size: 129280
1268
- max_model_len: 4096
1269
- max_num_seqs: 256
1270
- pipeline_parallelism: true
1271
- enforce_eager: false
1272
1294
  qos: m2
1273
1295
  time: 08:00:00
1274
1296
  partition: a40
1297
+ vllm_args:
1298
+ --max-model-len: 4096
1299
+ --max-num-seqs: 256
1300
+ --compilation-config: 3