vec-inf 0.5.0__py3-none-any.whl → 0.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,13 +6,14 @@ models:
6
6
  gpus_per_node: 4
7
7
  num_nodes: 2
8
8
  vocab_size: 256000
9
- max_model_len: 8192
10
- max_num_seqs: 256
11
- pipeline_parallelism: true
12
- enforce_eager: false
13
9
  qos: m2
14
10
  time: 08:00:00
15
11
  partition: a40
12
+ vllm_args:
13
+ --pipeline-parallel-size: 2
14
+ --tensor-parallel-size: 4
15
+ --max-model-len: 8192
16
+ --max-num-seqs: 256
16
17
  c4ai-command-r-plus-08-2024:
17
18
  model_family: c4ai-command-r
18
19
  model_variant: plus-08-2024
@@ -20,13 +21,14 @@ models:
20
21
  gpus_per_node: 4
21
22
  num_nodes: 2
22
23
  vocab_size: 256000
23
- max_model_len: 65536
24
- max_num_seqs: 256
25
- pipeline_parallelism: true
26
- enforce_eager: false
27
24
  qos: m2
28
25
  time: 08:00:00
29
26
  partition: a40
27
+ vllm_args:
28
+ --pipeline-parallel-size: 2
29
+ --tensor-parallel-size: 4
30
+ --max-model-len: 65536
31
+ --max-num-seqs: 256
30
32
  c4ai-command-r-08-2024:
31
33
  model_family: c4ai-command-r
32
34
  model_variant: 08-2024
@@ -34,13 +36,14 @@ models:
34
36
  gpus_per_node: 2
35
37
  num_nodes: 1
36
38
  vocab_size: 256000
37
- max_model_len: 32768
38
- max_num_seqs: 256
39
- pipeline_parallelism: true
40
- enforce_eager: false
41
39
  qos: m2
42
40
  time: 08:00:00
43
41
  partition: a40
42
+ vllm_args:
43
+ --tensor-parallel-size: 2
44
+ --max-model-len: 32768
45
+ --max-num-seqs: 256
46
+ --compilation-config: 3
44
47
  CodeLlama-7b-hf:
45
48
  model_family: CodeLlama
46
49
  model_variant: 7b-hf
@@ -48,13 +51,13 @@ models:
48
51
  gpus_per_node: 1
49
52
  num_nodes: 1
50
53
  vocab_size: 32000
51
- max_model_len: 16384
52
- max_num_seqs: 256
53
- pipeline_parallelism: true
54
- enforce_eager: false
55
54
  qos: m2
56
55
  time: 08:00:00
57
56
  partition: a40
57
+ vllm_args:
58
+ --max-model-len: 16384
59
+ --max-num-seqs: 256
60
+ --compilation-config: 3
58
61
  CodeLlama-7b-Instruct-hf:
59
62
  model_family: CodeLlama
60
63
  model_variant: 7b-Instruct-hf
@@ -62,13 +65,13 @@ models:
62
65
  gpus_per_node: 1
63
66
  num_nodes: 1
64
67
  vocab_size: 32000
65
- max_model_len: 16384
66
- max_num_seqs: 256
67
- pipeline_parallelism: true
68
- enforce_eager: false
69
68
  qos: m2
70
69
  time: 08:00:00
71
70
  partition: a40
71
+ vllm_args:
72
+ --max-model-len: 16384
73
+ --max-num-seqs: 256
74
+ --compilation-config: 3
72
75
  CodeLlama-13b-hf:
73
76
  model_family: CodeLlama
74
77
  model_variant: 13b-hf
@@ -76,13 +79,13 @@ models:
76
79
  gpus_per_node: 1
77
80
  num_nodes: 1
78
81
  vocab_size: 32000
79
- max_model_len: 16384
80
- max_num_seqs: 256
81
- pipeline_parallelism: true
82
- enforce_eager: false
83
82
  qos: m2
84
83
  time: 08:00:00
85
84
  partition: a40
85
+ vllm_args:
86
+ --max-model-len: 16384
87
+ --max-num-seqs: 256
88
+ --compilation-config: 3
86
89
  CodeLlama-13b-Instruct-hf:
87
90
  model_family: CodeLlama
88
91
  model_variant: 13b-Instruct-hf
@@ -90,13 +93,13 @@ models:
90
93
  gpus_per_node: 1
91
94
  num_nodes: 1
92
95
  vocab_size: 32000
93
- max_model_len: 16384
94
- max_num_seqs: 256
95
- pipeline_parallelism: true
96
- enforce_eager: false
97
96
  qos: m2
98
97
  time: 08:00:00
99
98
  partition: a40
99
+ vllm_args:
100
+ --max-model-len: 16384
101
+ --max-num-seqs: 256
102
+ --compilation-config: 3
100
103
  CodeLlama-34b-hf:
101
104
  model_family: CodeLlama
102
105
  model_variant: 34b-hf
@@ -104,13 +107,14 @@ models:
104
107
  gpus_per_node: 2
105
108
  num_nodes: 1
106
109
  vocab_size: 32000
107
- max_model_len: 16384
108
- max_num_seqs: 256
109
- pipeline_parallelism: true
110
- enforce_eager: false
111
110
  qos: m2
112
111
  time: 08:00:00
113
112
  partition: a40
113
+ vllm_args:
114
+ --tensor-parallel-size: 2
115
+ --max-model-len: 16384
116
+ --max-num-seqs: 256
117
+ --compilation-config: 3
114
118
  CodeLlama-34b-Instruct-hf:
115
119
  model_family: CodeLlama
116
120
  model_variant: 34b-Instruct-hf
@@ -118,55 +122,44 @@ models:
118
122
  gpus_per_node: 2
119
123
  num_nodes: 1
120
124
  vocab_size: 32000
121
- max_model_len: 16384
122
- max_num_seqs: 256
123
- pipeline_parallelism: true
124
- enforce_eager: false
125
125
  qos: m2
126
126
  time: 08:00:00
127
127
  partition: a40
128
+ vllm_args:
129
+ --tensor-parallel-size: 2
130
+ --max-model-len: 16384
131
+ --max-num-seqs: 256
132
+ --compilation-config: 3
128
133
  CodeLlama-70b-hf:
129
134
  model_family: CodeLlama
130
135
  model_variant: 70b-hf
131
136
  model_type: LLM
132
137
  gpus_per_node: 4
133
138
  num_nodes: 1
134
- vocab_size: 32000
135
- max_model_len: 4096
136
- max_num_seqs: 256
137
- pipeline_parallelism: true
138
- enforce_eager: false
139
+ vocab_size: 32016
139
140
  qos: m2
140
141
  time: 08:00:00
141
142
  partition: a40
143
+ vllm_args:
144
+ --tensor-parallel-size: 4
145
+ --max-model-len: 4096
146
+ --max-num-seqs: 256
147
+ --compilation-config: 3
142
148
  CodeLlama-70b-Instruct-hf:
143
149
  model_family: CodeLlama
144
150
  model_variant: 70b-Instruct-hf
145
151
  model_type: LLM
146
152
  gpus_per_node: 4
147
153
  num_nodes: 1
148
- vocab_size: 32000
149
- max_model_len: 4096
150
- max_num_seqs: 256
151
- pipeline_parallelism: true
152
- enforce_eager: false
153
- qos: m2
154
- time: 08:00:00
155
- partition: a40
156
- dbrx-instruct:
157
- model_family: dbrx
158
- model_variant: instruct
159
- model_type: LLM
160
- gpus_per_node: 4
161
- num_nodes: 2
162
- vocab_size: 100352
163
- max_model_len: 32000
164
- max_num_seqs: 256
165
- pipeline_parallelism: true
166
- enforce_eager: false
154
+ vocab_size: 32016
167
155
  qos: m2
168
156
  time: 08:00:00
169
157
  partition: a40
158
+ vllm_args:
159
+ --tensor-parallel-size: 4
160
+ --max-model-len: 4096
161
+ --max-num-seqs: 256
162
+ --compilation-config: 3
170
163
  gemma-2-9b:
171
164
  model_family: gemma-2
172
165
  model_variant: 9b
@@ -174,13 +167,13 @@ models:
174
167
  gpus_per_node: 1
175
168
  num_nodes: 1
176
169
  vocab_size: 256000
177
- max_model_len: 4096
178
- max_num_seqs: 256
179
- pipeline_parallelism: true
180
- enforce_eager: false
181
170
  qos: m2
182
171
  time: 08:00:00
183
172
  partition: a40
173
+ vllm_args:
174
+ --max-model-len: 4096
175
+ --max-num-seqs: 256
176
+ --compilation-config: 3
184
177
  gemma-2-9b-it:
185
178
  model_family: gemma-2
186
179
  model_variant: 9b-it
@@ -188,13 +181,13 @@ models:
188
181
  gpus_per_node: 1
189
182
  num_nodes: 1
190
183
  vocab_size: 256000
191
- max_model_len: 4096
192
- max_num_seqs: 256
193
- pipeline_parallelism: true
194
- enforce_eager: false
195
184
  qos: m2
196
185
  time: 08:00:00
197
186
  partition: a40
187
+ vllm_args:
188
+ --max-model-len: 4096
189
+ --max-num-seqs: 256
190
+ --compilation-config: 3
198
191
  gemma-2-27b:
199
192
  model_family: gemma-2
200
193
  model_variant: 27b
@@ -202,13 +195,14 @@ models:
202
195
  gpus_per_node: 2
203
196
  num_nodes: 1
204
197
  vocab_size: 256000
205
- max_model_len: 4096
206
- max_num_seqs: 256
207
- pipeline_parallelism: true
208
- enforce_eager: false
209
198
  qos: m2
210
199
  time: 08:00:00
211
200
  partition: a40
201
+ vllm_args:
202
+ --tensor-parallel-size: 2
203
+ --max-model-len: 4096
204
+ --max-num-seqs: 256
205
+ --compilation-config: 3
212
206
  gemma-2-27b-it:
213
207
  model_family: gemma-2
214
208
  model_variant: 27b-it
@@ -216,13 +210,14 @@ models:
216
210
  gpus_per_node: 2
217
211
  num_nodes: 1
218
212
  vocab_size: 256000
219
- max_model_len: 4096
220
- max_num_seqs: 256
221
- pipeline_parallelism: true
222
- enforce_eager: false
223
213
  qos: m2
224
214
  time: 08:00:00
225
215
  partition: a40
216
+ vllm_args:
217
+ --tensor-parallel-size: 2
218
+ --max-model-len: 4096
219
+ --max-num-seqs: 256
220
+ --compilation-config: 3
226
221
  Llama-2-7b-hf:
227
222
  model_family: Llama-2
228
223
  model_variant: 7b-hf
@@ -230,13 +225,13 @@ models:
230
225
  gpus_per_node: 1
231
226
  num_nodes: 1
232
227
  vocab_size: 32000
233
- max_model_len: 4096
234
- max_num_seqs: 256
235
- pipeline_parallelism: true
236
- enforce_eager: false
237
228
  qos: m2
238
229
  time: 08:00:00
239
230
  partition: a40
231
+ vllm_args:
232
+ --max-model-len: 4096
233
+ --max-num-seqs: 256
234
+ --compilation-config: 3
240
235
  Llama-2-7b-chat-hf:
241
236
  model_family: Llama-2
242
237
  model_variant: 7b-chat-hf
@@ -244,13 +239,13 @@ models:
244
239
  gpus_per_node: 1
245
240
  num_nodes: 1
246
241
  vocab_size: 32000
247
- max_model_len: 4096
248
- max_num_seqs: 256
249
- pipeline_parallelism: true
250
- enforce_eager: false
251
242
  qos: m2
252
243
  time: 08:00:00
253
244
  partition: a40
245
+ vllm_args:
246
+ --max-model-len: 4096
247
+ --max-num-seqs: 256
248
+ --compilation-config: 3
254
249
  Llama-2-13b-hf:
255
250
  model_family: Llama-2
256
251
  model_variant: 13b-hf
@@ -258,13 +253,13 @@ models:
258
253
  gpus_per_node: 1
259
254
  num_nodes: 1
260
255
  vocab_size: 32000
261
- max_model_len: 4096
262
- max_num_seqs: 256
263
- pipeline_parallelism: true
264
- enforce_eager: false
265
256
  qos: m2
266
257
  time: 08:00:00
267
258
  partition: a40
259
+ vllm_args:
260
+ --max-model-len: 4096
261
+ --max-num-seqs: 256
262
+ --compilation-config: 3
268
263
  Llama-2-13b-chat-hf:
269
264
  model_family: Llama-2
270
265
  model_variant: 13b-chat-hf
@@ -272,13 +267,13 @@ models:
272
267
  gpus_per_node: 1
273
268
  num_nodes: 1
274
269
  vocab_size: 32000
275
- max_model_len: 4096
276
- max_num_seqs: 256
277
- pipeline_parallelism: true
278
- enforce_eager: false
279
270
  qos: m2
280
271
  time: 08:00:00
281
272
  partition: a40
273
+ vllm_args:
274
+ --max-model-len: 4096
275
+ --max-num-seqs: 256
276
+ --compilation-config: 3
282
277
  Llama-2-70b-hf:
283
278
  model_family: Llama-2
284
279
  model_variant: 70b-hf
@@ -286,13 +281,14 @@ models:
286
281
  gpus_per_node: 4
287
282
  num_nodes: 1
288
283
  vocab_size: 32000
289
- max_model_len: 4096
290
- max_num_seqs: 256
291
- pipeline_parallelism: true
292
- enforce_eager: false
293
284
  qos: m2
294
285
  time: 08:00:00
295
286
  partition: a40
287
+ vllm_args:
288
+ --tensor-parallel-size: 4
289
+ --max-model-len: 4096
290
+ --max-num-seqs: 256
291
+ --compilation-config: 3
296
292
  Llama-2-70b-chat-hf:
297
293
  model_family: Llama-2
298
294
  model_variant: 70b-chat-hf
@@ -300,13 +296,14 @@ models:
300
296
  gpus_per_node: 4
301
297
  num_nodes: 1
302
298
  vocab_size: 32000
303
- max_model_len: 4096
304
- max_num_seqs: 256
305
- pipeline_parallelism: true
306
- enforce_eager: false
307
299
  qos: m2
308
300
  time: 08:00:00
309
301
  partition: a40
302
+ vllm_args:
303
+ --tensor-parallel-size: 4
304
+ --max-model-len: 4096
305
+ --max-num-seqs: 256
306
+ --compilation-config: 3
310
307
  llava-1.5-7b-hf:
311
308
  model_family: llava-1.5
312
309
  model_variant: 7b-hf
@@ -314,13 +311,13 @@ models:
314
311
  gpus_per_node: 1
315
312
  num_nodes: 1
316
313
  vocab_size: 32000
317
- max_model_len: 4096
318
- max_num_seqs: 256
319
- pipeline_parallelism: true
320
- enforce_eager: false
321
314
  qos: m2
322
315
  time: 08:00:00
323
316
  partition: a40
317
+ vllm_args:
318
+ --max-model-len: 4096
319
+ --max-num-seqs: 256
320
+ --compilation-config: 3
324
321
  llava-1.5-13b-hf:
325
322
  model_family: llava-1.5
326
323
  model_variant: 13b-hf
@@ -328,13 +325,13 @@ models:
328
325
  gpus_per_node: 1
329
326
  num_nodes: 1
330
327
  vocab_size: 32000
331
- max_model_len: 4096
332
- max_num_seqs: 256
333
- pipeline_parallelism: true
334
- enforce_eager: false
335
328
  qos: m2
336
329
  time: 08:00:00
337
330
  partition: a40
331
+ vllm_args:
332
+ --max-model-len: 4096
333
+ --max-num-seqs: 256
334
+ --compilation-config: 3
338
335
  llava-v1.6-mistral-7b-hf:
339
336
  model_family: llava-v1.6
340
337
  model_variant: mistral-7b-hf
@@ -342,13 +339,13 @@ models:
342
339
  gpus_per_node: 1
343
340
  num_nodes: 1
344
341
  vocab_size: 32064
345
- max_model_len: 32768
346
- max_num_seqs: 256
347
- pipeline_parallelism: true
348
- enforce_eager: false
349
342
  qos: m2
350
343
  time: 08:00:00
351
344
  partition: a40
345
+ vllm_args:
346
+ --max-model-len: 32768
347
+ --max-num-seqs: 256
348
+ --compilation-config: 3
352
349
  llava-v1.6-34b-hf:
353
350
  model_family: llava-v1.6
354
351
  model_variant: 34b-hf
@@ -356,13 +353,14 @@ models:
356
353
  gpus_per_node: 2
357
354
  num_nodes: 1
358
355
  vocab_size: 64064
359
- max_model_len: 4096
360
- max_num_seqs: 256
361
- pipeline_parallelism: true
362
- enforce_eager: false
363
356
  qos: m2
364
357
  time: 08:00:00
365
358
  partition: a40
359
+ vllm_args:
360
+ --tensor-parallel-size: 2
361
+ --max-model-len: 4096
362
+ --max-num-seqs: 256
363
+ --compilation-config: 3
366
364
  Meta-Llama-3-8B:
367
365
  model_family: Meta-Llama-3
368
366
  model_variant: 8B
@@ -370,13 +368,13 @@ models:
370
368
  gpus_per_node: 1
371
369
  num_nodes: 1
372
370
  vocab_size: 128256
373
- max_model_len: 8192
374
- max_num_seqs: 256
375
- pipeline_parallelism: true
376
- enforce_eager: false
377
371
  qos: m2
378
372
  time: 08:00:00
379
373
  partition: a40
374
+ vllm_args:
375
+ --max-model-len: 8192
376
+ --max-num-seqs: 256
377
+ --compilation-config: 3
380
378
  Meta-Llama-3-8B-Instruct:
381
379
  model_family: Meta-Llama-3
382
380
  model_variant: 8B-Instruct
@@ -384,13 +382,13 @@ models:
384
382
  gpus_per_node: 1
385
383
  num_nodes: 1
386
384
  vocab_size: 128256
387
- max_model_len: 8192
388
- max_num_seqs: 256
389
- pipeline_parallelism: true
390
- enforce_eager: false
391
385
  qos: m2
392
386
  time: 08:00:00
393
387
  partition: a40
388
+ vllm_args:
389
+ --max-model-len: 8192
390
+ --max-num-seqs: 256
391
+ --compilation-config: 3
394
392
  Meta-Llama-3-70B:
395
393
  model_family: Meta-Llama-3
396
394
  model_variant: 70B
@@ -398,13 +396,14 @@ models:
398
396
  gpus_per_node: 4
399
397
  num_nodes: 1
400
398
  vocab_size: 128256
401
- max_model_len: 8192
402
- max_num_seqs: 256
403
- pipeline_parallelism: true
404
- enforce_eager: false
405
399
  qos: m2
406
400
  time: 08:00:00
407
401
  partition: a40
402
+ vllm_args:
403
+ --tensor-parallel-size: 4
404
+ --max-model-len: 8192
405
+ --max-num-seqs: 256
406
+ --compilation-config: 3
408
407
  Meta-Llama-3-70B-Instruct:
409
408
  model_family: Meta-Llama-3
410
409
  model_variant: 70B-Instruct
@@ -412,13 +411,14 @@ models:
412
411
  gpus_per_node: 4
413
412
  num_nodes: 1
414
413
  vocab_size: 128256
415
- max_model_len: 8192
416
- max_num_seqs: 256
417
- pipeline_parallelism: true
418
- enforce_eager: false
419
414
  qos: m2
420
415
  time: 08:00:00
421
416
  partition: a40
417
+ vllm_args:
418
+ --tensor-parallel-size: 4
419
+ --max-model-len: 8192
420
+ --max-num-seqs: 256
421
+ --compilation-config: 3
422
422
  Meta-Llama-3.1-8B:
423
423
  model_family: Meta-Llama-3.1
424
424
  model_variant: 8B
@@ -426,13 +426,13 @@ models:
426
426
  gpus_per_node: 1
427
427
  num_nodes: 1
428
428
  vocab_size: 128256
429
- max_model_len: 131072
430
- max_num_seqs: 256
431
- pipeline_parallelism: true
432
- enforce_eager: false
433
429
  qos: m2
434
430
  time: 08:00:00
435
431
  partition: a40
432
+ vllm_args:
433
+ --max-model-len: 131072
434
+ --max-num-seqs: 256
435
+ --compilation-config: 3
436
436
  Meta-Llama-3.1-8B-Instruct:
437
437
  model_family: Meta-Llama-3.1
438
438
  model_variant: 8B-Instruct
@@ -440,13 +440,13 @@ models:
440
440
  gpus_per_node: 1
441
441
  num_nodes: 1
442
442
  vocab_size: 128256
443
- max_model_len: 131072
444
- max_num_seqs: 256
445
- pipeline_parallelism: true
446
- enforce_eager: false
447
443
  qos: m2
448
444
  time: 08:00:00
449
445
  partition: a40
446
+ vllm_args:
447
+ --max-model-len: 131072
448
+ --max-num-seqs: 256
449
+ --compilation-config: 3
450
450
  Meta-Llama-3.1-70B:
451
451
  model_family: Meta-Llama-3.1
452
452
  model_variant: 70B
@@ -454,13 +454,14 @@ models:
454
454
  gpus_per_node: 4
455
455
  num_nodes: 1
456
456
  vocab_size: 128256
457
- max_model_len: 65536
458
- max_num_seqs: 256
459
- pipeline_parallelism: true
460
- enforce_eager: false
461
457
  qos: m2
462
458
  time: 08:00:00
463
459
  partition: a40
460
+ vllm_args:
461
+ --tensor-parallel-size: 4
462
+ --max-model-len: 65536
463
+ --max-num-seqs: 256
464
+ --compilation-config: 3
464
465
  Meta-Llama-3.1-70B-Instruct:
465
466
  model_family: Meta-Llama-3.1
466
467
  model_variant: 70B-Instruct
@@ -468,13 +469,14 @@ models:
468
469
  gpus_per_node: 4
469
470
  num_nodes: 1
470
471
  vocab_size: 128256
471
- max_model_len: 65536
472
- max_num_seqs: 256
473
- pipeline_parallelism: true
474
- enforce_eager: false
475
472
  qos: m2
476
473
  time: 08:00:00
477
474
  partition: a40
475
+ vllm_args:
476
+ --tensor-parallel-size: 4
477
+ --max-model-len: 65536
478
+ --max-num-seqs: 256
479
+ --compilation-config: 3
478
480
  Meta-Llama-3.1-405B-Instruct:
479
481
  model_family: Meta-Llama-3.1
480
482
  model_variant: 405B-Instruct
@@ -482,27 +484,14 @@ models:
482
484
  gpus_per_node: 4
483
485
  num_nodes: 8
484
486
  vocab_size: 128256
485
- max_model_len: 16384
486
- max_num_seqs: 256
487
- pipeline_parallelism: true
488
- enforce_eager: false
489
487
  qos: m4
490
488
  time: 02:00:00
491
489
  partition: a40
492
- Mistral-7B-v0.1:
493
- model_family: Mistral
494
- model_variant: 7B-v0.1
495
- model_type: LLM
496
- gpus_per_node: 1
497
- num_nodes: 1
498
- vocab_size: 32000
499
- max_model_len: 32768
500
- max_num_seqs: 256
501
- pipeline_parallelism: true
502
- enforce_eager: false
503
- qos: m2
504
- time: 08:00:00
505
- partition: a40
490
+ vllm_args:
491
+ --pipeline-parallel-size: 8
492
+ --tensor-parallel-size: 4
493
+ --max-model-len: 16384
494
+ --max-num-seqs: 256
506
495
  Mistral-7B-Instruct-v0.1:
507
496
  model_family: Mistral
508
497
  model_variant: 7B-Instruct-v0.1
@@ -510,13 +499,13 @@ models:
510
499
  gpus_per_node: 1
511
500
  num_nodes: 1
512
501
  vocab_size: 32000
513
- max_model_len: 32768
514
- max_num_seqs: 256
515
- pipeline_parallelism: true
516
- enforce_eager: false
517
502
  qos: m2
518
503
  time: 08:00:00
519
504
  partition: a40
505
+ vllm_args:
506
+ --max-model-len: 32768
507
+ --max-num-seqs: 256
508
+ --compilation-config: 3
520
509
  Mistral-7B-Instruct-v0.2:
521
510
  model_family: Mistral
522
511
  model_variant: 7B-Instruct-v0.2
@@ -524,13 +513,13 @@ models:
524
513
  gpus_per_node: 1
525
514
  num_nodes: 1
526
515
  vocab_size: 32000
527
- max_model_len: 32768
528
- max_num_seqs: 256
529
- pipeline_parallelism: true
530
- enforce_eager: false
531
516
  qos: m2
532
517
  time: 08:00:00
533
518
  partition: a40
519
+ vllm_args:
520
+ --max-model-len: 32768
521
+ --max-num-seqs: 256
522
+ --compilation-config: 3
534
523
  Mistral-7B-v0.3:
535
524
  model_family: Mistral
536
525
  model_variant: 7B-v0.3
@@ -538,13 +527,13 @@ models:
538
527
  gpus_per_node: 1
539
528
  num_nodes: 1
540
529
  vocab_size: 32768
541
- max_model_len: 32768
542
- max_num_seqs: 256
543
- pipeline_parallelism: true
544
- enforce_eager: false
545
530
  qos: m2
546
531
  time: 08:00:00
547
532
  partition: a40
533
+ vllm_args:
534
+ --max-model-len: 32768
535
+ --max-num-seqs: 256
536
+ --compilation-config: 3
548
537
  Mistral-7B-Instruct-v0.3:
549
538
  model_family: Mistral
550
539
  model_variant: 7B-Instruct-v0.3
@@ -552,13 +541,13 @@ models:
552
541
  gpus_per_node: 1
553
542
  num_nodes: 1
554
543
  vocab_size: 32768
555
- max_model_len: 32768
556
- max_num_seqs: 256
557
- pipeline_parallelism: true
558
- enforce_eager: false
559
544
  qos: m2
560
545
  time: 08:00:00
561
546
  partition: a40
547
+ vllm_args:
548
+ --max-model-len: 32768
549
+ --max-num-seqs: 256
550
+ --compilation-config: 3
562
551
  Mistral-Large-Instruct-2407:
563
552
  model_family: Mistral
564
553
  model_variant: Large-Instruct-2407
@@ -566,13 +555,14 @@ models:
566
555
  gpus_per_node: 4
567
556
  num_nodes: 2
568
557
  vocab_size: 32768
569
- max_model_len: 32768
570
- max_num_seqs: 256
571
- pipeline_parallelism: true
572
- enforce_eager: false
573
558
  qos: m2
574
559
  time: 08:00:00
575
560
  partition: a40
561
+ vllm_args:
562
+ --pipeline-parallel-size: 2
563
+ --tensor-parallel-size: 4
564
+ --max-model-len: 32768
565
+ --max-num-seqs: 256
576
566
  Mistral-Large-Instruct-2411:
577
567
  model_family: Mistral
578
568
  model_variant: Large-Instruct-2411
@@ -580,13 +570,14 @@ models:
580
570
  gpus_per_node: 4
581
571
  num_nodes: 2
582
572
  vocab_size: 32768
583
- max_model_len: 32768
584
- max_num_seqs: 256
585
- pipeline_parallelism: true
586
- enforce_eager: false
587
573
  qos: m2
588
574
  time: 08:00:00
589
575
  partition: a40
576
+ vllm_args:
577
+ --pipeline-parallel-size: 2
578
+ --tensor-parallel-size: 4
579
+ --max-model-len: 32768
580
+ --max-num-seqs: 256
590
581
  Mixtral-8x7B-Instruct-v0.1:
591
582
  model_family: Mixtral
592
583
  model_variant: 8x7B-Instruct-v0.1
@@ -594,13 +585,14 @@ models:
594
585
  gpus_per_node: 4
595
586
  num_nodes: 1
596
587
  vocab_size: 32000
597
- max_model_len: 32768
598
- max_num_seqs: 256
599
- pipeline_parallelism: true
600
- enforce_eager: false
601
588
  qos: m2
602
589
  time: 08:00:00
603
590
  partition: a40
591
+ vllm_args:
592
+ --tensor-parallel-size: 4
593
+ --max-model-len: 32768
594
+ --max-num-seqs: 256
595
+ --compilation-config: 3
604
596
  Mixtral-8x22B-v0.1:
605
597
  model_family: Mixtral
606
598
  model_variant: 8x22B-v0.1
@@ -608,13 +600,14 @@ models:
608
600
  gpus_per_node: 4
609
601
  num_nodes: 2
610
602
  vocab_size: 32768
611
- max_model_len: 65536
612
- max_num_seqs: 256
613
- pipeline_parallelism: true
614
- enforce_eager: false
615
603
  qos: m2
616
604
  time: 08:00:00
617
605
  partition: a40
606
+ vllm_args:
607
+ --pipeline-parallel-size: 2
608
+ --tensor-parallel-size: 4
609
+ --max-model-len: 65536
610
+ --max-num-seqs: 256
618
611
  Mixtral-8x22B-Instruct-v0.1:
619
612
  model_family: Mixtral
620
613
  model_variant: 8x22B-Instruct-v0.1
@@ -622,13 +615,14 @@ models:
622
615
  gpus_per_node: 4
623
616
  num_nodes: 2
624
617
  vocab_size: 32768
625
- max_model_len: 65536
626
- max_num_seqs: 256
627
- pipeline_parallelism: true
628
- enforce_eager: false
629
618
  qos: m2
630
619
  time: 08:00:00
631
620
  partition: a40
621
+ vllm_args:
622
+ --pipeline-parallel-size: 2
623
+ --tensor-parallel-size: 4
624
+ --max-model-len: 65536
625
+ --max-num-seqs: 256
632
626
  Phi-3-medium-128k-instruct:
633
627
  model_family: Phi-3
634
628
  model_variant: medium-128k-instruct
@@ -636,13 +630,14 @@ models:
636
630
  gpus_per_node: 2
637
631
  num_nodes: 1
638
632
  vocab_size: 32064
639
- max_model_len: 131072
640
- max_num_seqs: 256
641
- pipeline_parallelism: true
642
- enforce_eager: false
643
633
  qos: m2
644
634
  time: 08:00:00
645
635
  partition: a40
636
+ vllm_args:
637
+ --tensor-parallel-size: 2
638
+ --max-model-len: 131072
639
+ --max-num-seqs: 256
640
+ --compilation-config: 3
646
641
  Phi-3-vision-128k-instruct:
647
642
  model_family: Phi-3-vision
648
643
  model_variant: 128k-instruct
@@ -650,13 +645,14 @@ models:
650
645
  gpus_per_node: 2
651
646
  num_nodes: 1
652
647
  vocab_size: 32064
653
- max_model_len: 65536
654
- max_num_seqs: 256
655
- pipeline_parallelism: true
656
- enforce_eager: false
657
648
  qos: m2
658
649
  time: 08:00:00
659
650
  partition: a40
651
+ vllm_args:
652
+ --tensor-parallel-size: 2
653
+ --max-model-len: 65536
654
+ --max-num-seqs: 256
655
+ --compilation-config: 3
660
656
  Llama3-OpenBioLLM-70B:
661
657
  model_family: Llama3-OpenBioLLM
662
658
  model_variant: 70B
@@ -664,13 +660,14 @@ models:
664
660
  gpus_per_node: 4
665
661
  num_nodes: 1
666
662
  vocab_size: 128256
667
- max_model_len: 8192
668
- max_num_seqs: 256
669
- pipeline_parallelism: true
670
- enforce_eager: false
671
663
  qos: m2
672
664
  time: 08:00:00
673
665
  partition: a40
666
+ vllm_args:
667
+ --tensor-parallel-size: 4
668
+ --max-model-len: 8192
669
+ --max-num-seqs: 256
670
+ --compilation-config: 3
674
671
  Llama-3.1-Nemotron-70B-Instruct-HF:
675
672
  model_family: Llama-3.1-Nemotron
676
673
  model_variant: 70B-Instruct-HF
@@ -678,13 +675,14 @@ models:
678
675
  gpus_per_node: 4
679
676
  num_nodes: 1
680
677
  vocab_size: 128256
681
- max_model_len: 65536
682
- max_num_seqs: 256
683
- pipeline_parallelism: true
684
- enforce_eager: false
685
678
  qos: m2
686
679
  time: 08:00:00
687
680
  partition: a40
681
+ vllm_args:
682
+ --tensor-parallel-size: 4
683
+ --max-model-len: 65536
684
+ --max-num-seqs: 256
685
+ --compilation-config: 3
688
686
  Llama-3.2-1B:
689
687
  model_family: Llama-3.2
690
688
  model_variant: 1B
@@ -692,13 +690,13 @@ models:
692
690
  gpus_per_node: 1
693
691
  num_nodes: 1
694
692
  vocab_size: 128256
695
- max_model_len: 131072
696
- max_num_seqs: 256
697
- pipeline_parallelism: true
698
- enforce_eager: false
699
693
  qos: m2
700
694
  time: 08:00:00
701
695
  partition: a40
696
+ vllm_args:
697
+ --max-model-len: 131072
698
+ --max-num-seqs: 256
699
+ --compilation-config: 3
702
700
  Llama-3.2-1B-Instruct:
703
701
  model_family: Llama-3.2
704
702
  model_variant: 1B-Instruct
@@ -706,13 +704,13 @@ models:
706
704
  gpus_per_node: 1
707
705
  num_nodes: 1
708
706
  vocab_size: 128256
709
- max_model_len: 131072
710
- max_num_seqs: 256
711
- pipeline_parallelism: true
712
- enforce_eager: false
713
707
  qos: m2
714
708
  time: 08:00:00
715
709
  partition: a40
710
+ vllm_args:
711
+ --max-model-len: 131072
712
+ --max-num-seqs: 256
713
+ --compilation-config: 3
716
714
  Llama-3.2-3B:
717
715
  model_family: Llama-3.2
718
716
  model_variant: 3B
@@ -720,13 +718,13 @@ models:
720
718
  gpus_per_node: 1
721
719
  num_nodes: 1
722
720
  vocab_size: 128256
723
- max_model_len: 131072
724
- max_num_seqs: 256
725
- pipeline_parallelism: true
726
- enforce_eager: false
727
721
  qos: m2
728
722
  time: 08:00:00
729
723
  partition: a40
724
+ vllm_args:
725
+ --max-model-len: 131072
726
+ --max-num-seqs: 256
727
+ --compilation-config: 3
730
728
  Llama-3.2-3B-Instruct:
731
729
  model_family: Llama-3.2
732
730
  model_variant: 3B-Instruct
@@ -734,13 +732,13 @@ models:
734
732
  gpus_per_node: 1
735
733
  num_nodes: 1
736
734
  vocab_size: 128256
737
- max_model_len: 131072
738
- max_num_seqs: 256
739
- pipeline_parallelism: true
740
- enforce_eager: false
741
735
  qos: m2
742
736
  time: 08:00:00
743
737
  partition: a40
738
+ vllm_args:
739
+ --max-model-len: 131072
740
+ --max-num-seqs: 256
741
+ --compilation-config: 3
744
742
  Llama-3.2-11B-Vision:
745
743
  model_family: Llama-3.2
746
744
  model_variant: 11B-Vision
@@ -748,13 +746,15 @@ models:
748
746
  gpus_per_node: 2
749
747
  num_nodes: 1
750
748
  vocab_size: 128256
751
- max_model_len: 4096
752
- max_num_seqs: 64
753
- pipeline_parallelism: false
754
- enforce_eager: true
755
749
  qos: m2
756
750
  time: 08:00:00
757
751
  partition: a40
752
+ vllm_args:
753
+ --tensor-parallel-size: 2
754
+ --max-model-len: 4096
755
+ --max-num-seqs: 64
756
+ --compilation-config: 3
757
+ --enforce-eager: true
758
758
  Llama-3.2-11B-Vision-Instruct:
759
759
  model_family: Llama-3.2
760
760
  model_variant: 11B-Vision-Instruct
@@ -762,13 +762,15 @@ models:
762
762
  gpus_per_node: 2
763
763
  num_nodes: 1
764
764
  vocab_size: 128256
765
- max_model_len: 4096
766
- max_num_seqs: 64
767
- pipeline_parallelism: false
768
- enforce_eager: true
769
765
  qos: m2
770
766
  time: 08:00:00
771
767
  partition: a40
768
+ vllm_args:
769
+ --tensor-parallel-size: 2
770
+ --max-model-len: 4096
771
+ --max-num-seqs: 64
772
+ --compilation-config: 3
773
+ --enforce-eager: true
772
774
  Llama-3.2-90B-Vision:
773
775
  model_family: Llama-3.2
774
776
  model_variant: 90B-Vision
@@ -776,13 +778,15 @@ models:
776
778
  gpus_per_node: 4
777
779
  num_nodes: 2
778
780
  vocab_size: 128256
779
- max_model_len: 4096
780
- max_num_seqs: 32
781
- pipeline_parallelism: false
782
- enforce_eager: true
783
781
  qos: m2
784
782
  time: 08:00:00
785
783
  partition: a40
784
+ vllm_args:
785
+ --tensor-parallel-size: 8
786
+ --max-model-len: 4096
787
+ --max-num-seqs: 32
788
+ --compilation-config: 3
789
+ --enforce-eager: true
786
790
  Llama-3.2-90B-Vision-Instruct:
787
791
  model_family: Llama-3.2
788
792
  model_variant: 90B-Vision-Instruct
@@ -790,13 +794,15 @@ models:
790
794
  gpus_per_node: 4
791
795
  num_nodes: 2
792
796
  vocab_size: 128256
793
- max_model_len: 4096
794
- max_num_seqs: 32
795
- pipeline_parallelism: false
796
- enforce_eager: true
797
797
  qos: m2
798
798
  time: 08:00:00
799
799
  partition: a40
800
+ vllm_args:
801
+ --tensor-parallel-size: 8
802
+ --max-model-len: 4096
803
+ --max-num-seqs: 32
804
+ --compilation-config: 3
805
+ --enforce-eager: true
800
806
  Qwen2.5-0.5B-Instruct:
801
807
  model_family: Qwen2.5
802
808
  model_variant: 0.5B-Instruct
@@ -804,13 +810,13 @@ models:
804
810
  gpus_per_node: 1
805
811
  num_nodes: 1
806
812
  vocab_size: 152064
807
- max_model_len: 32768
808
- max_num_seqs: 256
809
- pipeline_parallelism: true
810
- enforce_eager: false
811
813
  qos: m2
812
814
  time: 08:00:00
813
815
  partition: a40
816
+ vllm_args:
817
+ --max-model-len: 32768
818
+ --max-num-seqs: 256
819
+ --compilation-config: 3
814
820
  Qwen2.5-1.5B-Instruct:
815
821
  model_family: Qwen2.5
816
822
  model_variant: 1.5B-Instruct
@@ -818,13 +824,13 @@ models:
818
824
  gpus_per_node: 1
819
825
  num_nodes: 1
820
826
  vocab_size: 152064
821
- max_model_len: 32768
822
- max_num_seqs: 256
823
- pipeline_parallelism: true
824
- enforce_eager: false
825
827
  qos: m2
826
828
  time: 08:00:00
827
829
  partition: a40
830
+ vllm_args:
831
+ --max-model-len: 32768
832
+ --max-num-seqs: 256
833
+ --compilation-config: 3
828
834
  Qwen2.5-3B-Instruct:
829
835
  model_family: Qwen2.5
830
836
  model_variant: 3B-Instruct
@@ -832,13 +838,13 @@ models:
832
838
  gpus_per_node: 1
833
839
  num_nodes: 1
834
840
  vocab_size: 152064
835
- max_model_len: 32768
836
- max_num_seqs: 256
837
- pipeline_parallelism: true
838
- enforce_eager: false
839
841
  qos: m2
840
842
  time: 08:00:00
841
843
  partition: a40
844
+ vllm_args:
845
+ --max-model-len: 32768
846
+ --max-num-seqs: 256
847
+ --compilation-config: 3
842
848
  Qwen2.5-7B-Instruct:
843
849
  model_family: Qwen2.5
844
850
  model_variant: 7B-Instruct
@@ -846,13 +852,13 @@ models:
846
852
  gpus_per_node: 1
847
853
  num_nodes: 1
848
854
  vocab_size: 152064
849
- max_model_len: 32768
850
- max_num_seqs: 256
851
- pipeline_parallelism: true
852
- enforce_eager: false
853
855
  qos: m2
854
856
  time: 08:00:00
855
857
  partition: a40
858
+ vllm_args:
859
+ --max-model-len: 32768
860
+ --max-num-seqs: 256
861
+ --compilation-config: 3
856
862
  Qwen2.5-14B-Instruct:
857
863
  model_family: Qwen2.5
858
864
  model_variant: 14B-Instruct
@@ -860,13 +866,13 @@ models:
860
866
  gpus_per_node: 1
861
867
  num_nodes: 1
862
868
  vocab_size: 152064
863
- max_model_len: 32768
864
- max_num_seqs: 256
865
- pipeline_parallelism: true
866
- enforce_eager: false
867
869
  qos: m2
868
870
  time: 08:00:00
869
871
  partition: a40
872
+ vllm_args:
873
+ --max-model-len: 32768
874
+ --max-num-seqs: 256
875
+ --compilation-config: 3
870
876
  Qwen2.5-32B-Instruct:
871
877
  model_family: Qwen2.5
872
878
  model_variant: 32B-Instruct
@@ -874,13 +880,14 @@ models:
874
880
  gpus_per_node: 2
875
881
  num_nodes: 1
876
882
  vocab_size: 152064
877
- max_model_len: 32768
878
- max_num_seqs: 256
879
- pipeline_parallelism: true
880
- enforce_eager: false
881
883
  qos: m2
882
884
  time: 08:00:00
883
885
  partition: a40
886
+ vllm_args:
887
+ --tensor-parallel-size: 2
888
+ --max-model-len: 32768
889
+ --max-num-seqs: 256
890
+ --compilation-config: 3
884
891
  Qwen2.5-72B-Instruct:
885
892
  model_family: Qwen2.5
886
893
  model_variant: 72B-Instruct
@@ -888,13 +895,14 @@ models:
888
895
  gpus_per_node: 4
889
896
  num_nodes: 1
890
897
  vocab_size: 152064
891
- max_model_len: 16384
892
- max_num_seqs: 256
893
- pipeline_parallelism: true
894
- enforce_eager: false
895
898
  qos: m2
896
899
  time: 08:00:00
897
900
  partition: a40
901
+ vllm_args:
902
+ --tensor-parallel-size: 4
903
+ --max-model-len: 16384
904
+ --max-num-seqs: 256
905
+ --compilation-config: 3
898
906
  Qwen2.5-Math-1.5B-Instruct:
899
907
  model_family: Qwen2.5
900
908
  model_variant: Math-1.5B-Instruct
@@ -902,13 +910,13 @@ models:
902
910
  gpus_per_node: 1
903
911
  num_nodes: 1
904
912
  vocab_size: 152064
905
- max_model_len: 4096
906
- max_num_seqs: 256
907
- pipeline_parallelism: true
908
- enforce_eager: false
909
913
  qos: m2
910
914
  time: 08:00:00
911
915
  partition: a40
916
+ vllm_args:
917
+ --max-model-len: 4096
918
+ --max-num-seqs: 256
919
+ --compilation-config: 3
912
920
  Qwen2.5-Math-7B-Instruct:
913
921
  model_family: Qwen2.5
914
922
  model_variant: Math-7B-Instruct
@@ -916,13 +924,13 @@ models:
916
924
  gpus_per_node: 1
917
925
  num_nodes: 1
918
926
  vocab_size: 152064
919
- max_model_len: 4096
920
- max_num_seqs: 256
921
- pipeline_parallelism: true
922
- enforce_eager: false
923
927
  qos: m2
924
928
  time: 08:00:00
925
929
  partition: a40
930
+ vllm_args:
931
+ --max-model-len: 4096
932
+ --max-num-seqs: 256
933
+ --compilation-config: 3
926
934
  Qwen2.5-Math-72B-Instruct:
927
935
  model_family: Qwen2.5
928
936
  model_variant: Math-72B-Instruct
@@ -930,13 +938,14 @@ models:
930
938
  gpus_per_node: 4
931
939
  num_nodes: 1
932
940
  vocab_size: 152064
933
- max_model_len: 4096
934
- max_num_seqs: 256
935
- pipeline_parallelism: true
936
- enforce_eager: false
937
941
  qos: m2
938
942
  time: 08:00:00
939
943
  partition: a40
944
+ vllm_args:
945
+ --tensor-parallel-size: 4
946
+ --max-model-len: 4096
947
+ --max-num-seqs: 256
948
+ --compilation-config: 3
940
949
  Qwen2.5-Coder-7B-Instruct:
941
950
  model_family: Qwen2.5
942
951
  model_variant: Coder-7B-Instruct
@@ -944,13 +953,13 @@ models:
944
953
  gpus_per_node: 1
945
954
  num_nodes: 1
946
955
  vocab_size: 152064
947
- max_model_len: 32768
948
- max_num_seqs: 256
949
- pipeline_parallelism: true
950
- enforce_eager: false
951
956
  qos: m2
952
957
  time: 08:00:00
953
958
  partition: a40
959
+ vllm_args:
960
+ --max-model-len: 32768
961
+ --max-num-seqs: 256
962
+ --compilation-config: 3
954
963
  Qwen2.5-Math-RM-72B:
955
964
  model_family: Qwen2.5
956
965
  model_variant: Math-RM-72B
@@ -958,13 +967,14 @@ models:
958
967
  gpus_per_node: 4
959
968
  num_nodes: 1
960
969
  vocab_size: 152064
961
- max_model_len: 4096
962
- max_num_seqs: 256
963
- pipeline_parallelism: true
964
- enforce_eager: false
965
970
  qos: m2
966
971
  time: 08:00:00
967
972
  partition: a40
973
+ vllm_args:
974
+ --tensor-parallel-size: 4
975
+ --max-model-len: 4096
976
+ --max-num-seqs: 256
977
+ --compilation-config: 3
968
978
  Qwen2.5-Math-PRM-7B:
969
979
  model_family: Qwen2.5
970
980
  model_variant: Math-PRM-7B
@@ -972,13 +982,13 @@ models:
972
982
  gpus_per_node: 1
973
983
  num_nodes: 1
974
984
  vocab_size: 152064
975
- max_model_len: 4096
976
- max_num_seqs: 256
977
- pipeline_parallelism: true
978
- enforce_eager: false
979
985
  qos: m2
980
986
  time: 08:00:00
981
987
  partition: a40
988
+ vllm_args:
989
+ --max-model-len: 4096
990
+ --max-num-seqs: 256
991
+ --compilation-config: 3
982
992
  QwQ-32B-Preview:
983
993
  model_family: QwQ
984
994
  model_variant: 32B-Preview
@@ -986,13 +996,14 @@ models:
986
996
  gpus_per_node: 2
987
997
  num_nodes: 1
988
998
  vocab_size: 152064
989
- max_model_len: 32768
990
- max_num_seqs: 256
991
- pipeline_parallelism: true
992
- enforce_eager: false
993
999
  qos: m2
994
1000
  time: 08:00:00
995
1001
  partition: a40
1002
+ vllm_args:
1003
+ --tensor-parallel-size: 2
1004
+ --max-model-len: 32768
1005
+ --max-num-seqs: 256
1006
+ --compilation-config: 3
996
1007
  Pixtral-12B-2409:
997
1008
  model_family: Pixtral
998
1009
  model_variant: 12B-2409
@@ -1000,13 +1011,13 @@ models:
1000
1011
  gpus_per_node: 1
1001
1012
  num_nodes: 1
1002
1013
  vocab_size: 131072
1003
- max_model_len: 8192
1004
- max_num_seqs: 256
1005
- pipeline_parallelism: true
1006
- enforce_eager: false
1007
1014
  qos: m2
1008
1015
  time: 08:00:00
1009
1016
  partition: a40
1017
+ vllm_args:
1018
+ --max-model-len: 8192
1019
+ --max-num-seqs: 256
1020
+ --compilation-config: 3
1010
1021
  e5-mistral-7b-instruct:
1011
1022
  model_family: e5
1012
1023
  model_variant: mistral-7b-instruct
@@ -1014,13 +1025,13 @@ models:
1014
1025
  gpus_per_node: 1
1015
1026
  num_nodes: 1
1016
1027
  vocab_size: 32000
1017
- max_model_len: 4096
1018
- max_num_seqs: 256
1019
- pipeline_parallelism: true
1020
- enforce_eager: false
1021
1028
  qos: m2
1022
1029
  time: 08:00:00
1023
1030
  partition: a40
1031
+ vllm_args:
1032
+ --max-model-len: 4096
1033
+ --max-num-seqs: 256
1034
+ --compilation-config: 3
1024
1035
  bge-base-en-v1.5:
1025
1036
  model_family: bge
1026
1037
  model_variant: base-en-v1.5
@@ -1028,13 +1039,13 @@ models:
1028
1039
  gpus_per_node: 1
1029
1040
  num_nodes: 1
1030
1041
  vocab_size: 30522
1031
- max_model_len: 512
1032
- max_num_seqs: 256
1033
- pipeline_parallelism: true
1034
- enforce_eager: false
1035
1042
  qos: m2
1036
1043
  time: 08:00:00
1037
1044
  partition: a40
1045
+ vllm_args:
1046
+ --max-model-len: 512
1047
+ --max-num-seqs: 256
1048
+ --compilation-config: 3
1038
1049
  all-MiniLM-L6-v2:
1039
1050
  model_family: all-MiniLM
1040
1051
  model_variant: L6-v2
@@ -1042,13 +1053,13 @@ models:
1042
1053
  gpus_per_node: 1
1043
1054
  num_nodes: 1
1044
1055
  vocab_size: 30522
1045
- max_model_len: 512
1046
- max_num_seqs: 256
1047
- pipeline_parallelism: true
1048
- enforce_eager: false
1049
1056
  qos: m2
1050
1057
  time: 08:00:00
1051
1058
  partition: a40
1059
+ vllm_args:
1060
+ --max-model-len: 512
1061
+ --max-num-seqs: 256
1062
+ --compilation-config: 3
1052
1063
  Llama-3.3-70B-Instruct:
1053
1064
  model_family: Llama-3.3
1054
1065
  model_variant: 70B-Instruct
@@ -1056,13 +1067,14 @@ models:
1056
1067
  gpus_per_node: 4
1057
1068
  num_nodes: 1
1058
1069
  vocab_size: 128256
1059
- max_model_len: 65536
1060
- max_num_seqs: 256
1061
- pipeline_parallelism: true
1062
- enforce_eager: false
1063
1070
  qos: m2
1064
1071
  time: 08:00:00
1065
1072
  partition: a40
1073
+ vllm_args:
1074
+ --tensor-parallel-size: 4
1075
+ --max-model-len: 65536
1076
+ --max-num-seqs: 256
1077
+ --compilation-config: 3
1066
1078
  InternVL2_5-26B:
1067
1079
  model_family: InternVL2_5
1068
1080
  model_variant: 26B
@@ -1070,13 +1082,14 @@ models:
1070
1082
  gpus_per_node: 2
1071
1083
  num_nodes: 1
1072
1084
  vocab_size: 92553
1073
- max_model_len: 32768
1074
- max_num_seqs: 256
1075
- pipeline_parallelism: true
1076
- enforce_eager: false
1077
1085
  qos: m2
1078
1086
  time: 08:00:00
1079
1087
  partition: a40
1088
+ vllm_args:
1089
+ --tensor-parallel-size: 2
1090
+ --max-model-len: 32768
1091
+ --max-num-seqs: 256
1092
+ --compilation-config: 3
1080
1093
  InternVL2_5-38B:
1081
1094
  model_family: InternVL2_5
1082
1095
  model_variant: 38B
@@ -1084,13 +1097,14 @@ models:
1084
1097
  gpus_per_node: 4
1085
1098
  num_nodes: 1
1086
1099
  vocab_size: 92553
1087
- max_model_len: 32768
1088
- max_num_seqs: 256
1089
- pipeline_parallelism: true
1090
- enforce_eager: false
1091
1100
  qos: m2
1092
1101
  time: 08:00:00
1093
1102
  partition: a40
1103
+ vllm_args:
1104
+ --tensor-parallel-size: 4
1105
+ --max-model-len: 32768
1106
+ --max-num-seqs: 256
1107
+ --compilation-config: 3
1094
1108
  Aya-Expanse-32B:
1095
1109
  model_family: Aya-Expanse
1096
1110
  model_variant: 32B
@@ -1098,69 +1112,72 @@ models:
1098
1112
  gpus_per_node: 2
1099
1113
  num_nodes: 1
1100
1114
  vocab_size: 256000
1101
- max_model_len: 8192
1102
- max_num_seqs: 256
1103
- pipeline_parallelism: true
1104
- enforce_eager: false
1105
1115
  qos: m2
1106
1116
  time: 08:00:00
1107
1117
  partition: a40
1118
+ vllm_args:
1119
+ --tensor-parallel-size: 2
1120
+ --max-model-len: 8192
1121
+ --max-num-seqs: 256
1122
+ --compilation-config: 3
1108
1123
  DeepSeek-R1-Distill-Llama-70B:
1109
1124
  model_family: DeepSeek-R1
1110
- model_variant: 'Distill-Llama-70B '
1125
+ model_variant: Distill-Llama-70B
1111
1126
  model_type: LLM
1112
1127
  gpus_per_node: 4
1113
- num_nodes: 2
1128
+ num_nodes: 1
1114
1129
  vocab_size: 128256
1115
- max_model_len: 131072
1116
- max_num_seqs: 256
1117
- pipeline_parallelism: true
1118
- enforce_eager: false
1119
1130
  qos: m2
1120
1131
  time: 08:00:00
1121
1132
  partition: a40
1133
+ vllm_args:
1134
+ --tensor-parallel-size: 4
1135
+ --max-model-len: 65536
1136
+ --max-num-seqs: 256
1137
+ --compilation-config: 3
1122
1138
  DeepSeek-R1-Distill-Llama-8B:
1123
1139
  model_family: DeepSeek-R1
1124
- model_variant: 'Distill-Llama-8B '
1140
+ model_variant: Distill-Llama-8B
1125
1141
  model_type: LLM
1126
1142
  gpus_per_node: 1
1127
1143
  num_nodes: 1
1128
1144
  vocab_size: 128256
1129
- max_model_len: 131072
1130
- max_num_seqs: 256
1131
- pipeline_parallelism: true
1132
- enforce_eager: false
1133
1145
  qos: m2
1134
1146
  time: 08:00:00
1135
1147
  partition: a40
1148
+ vllm_args:
1149
+ --max-model-len: 131072
1150
+ --max-num-seqs: 256
1151
+ --compilation-config: 3
1136
1152
  DeepSeek-R1-Distill-Qwen-32B:
1137
1153
  model_family: DeepSeek-R1
1138
1154
  model_variant: Distill-Qwen-32B
1139
1155
  model_type: LLM
1140
- gpus_per_node: 4
1156
+ gpus_per_node: 2
1141
1157
  num_nodes: 1
1142
1158
  vocab_size: 152064
1143
- max_model_len: 131072
1144
- max_num_seqs: 256
1145
- pipeline_parallelism: true
1146
- enforce_eager: false
1147
1159
  qos: m2
1148
1160
  time: 08:00:00
1149
1161
  partition: a40
1162
+ vllm_args:
1163
+ --tensor-parallel-size: 2
1164
+ --max-model-len: 65536
1165
+ --max-num-seqs: 256
1166
+ --compilation-config: 3
1150
1167
  DeepSeek-R1-Distill-Qwen-14B:
1151
1168
  model_family: DeepSeek-R1
1152
1169
  model_variant: Distill-Qwen-14B
1153
1170
  model_type: LLM
1154
- gpus_per_node: 2
1171
+ gpus_per_node: 1
1155
1172
  num_nodes: 1
1156
1173
  vocab_size: 152064
1157
- max_model_len: 131072
1158
- max_num_seqs: 256
1159
- pipeline_parallelism: true
1160
- enforce_eager: false
1161
1174
  qos: m2
1162
1175
  time: 08:00:00
1163
1176
  partition: a40
1177
+ vllm_args:
1178
+ --max-model-len: 65536
1179
+ --max-num-seqs: 256
1180
+ --compilation-config: 3
1164
1181
  DeepSeek-R1-Distill-Qwen-7B:
1165
1182
  model_family: DeepSeek-R1
1166
1183
  model_variant: Distill-Qwen-7B
@@ -1168,13 +1185,13 @@ models:
1168
1185
  gpus_per_node: 1
1169
1186
  num_nodes: 1
1170
1187
  vocab_size: 152064
1171
- max_model_len: 131072
1172
- max_num_seqs: 256
1173
- pipeline_parallelism: true
1174
- enforce_eager: false
1175
1188
  qos: m2
1176
1189
  time: 08:00:00
1177
1190
  partition: a40
1191
+ vllm_args:
1192
+ --max-model-len: 131072
1193
+ --max-num-seqs: 256
1194
+ --compilation-config: 3
1178
1195
  DeepSeek-R1-Distill-Qwen-1.5B:
1179
1196
  model_family: DeepSeek-R1
1180
1197
  model_variant: Distill-Qwen-1.5B
@@ -1182,13 +1199,13 @@ models:
1182
1199
  gpus_per_node: 1
1183
1200
  num_nodes: 1
1184
1201
  vocab_size: 152064
1185
- max_model_len: 131072
1186
- max_num_seqs: 256
1187
- pipeline_parallelism: true
1188
- enforce_eager: false
1189
1202
  qos: m2
1190
1203
  time: 08:00:00
1191
1204
  partition: a40
1205
+ vllm_args:
1206
+ --max-model-len: 131072
1207
+ --max-num-seqs: 256
1208
+ --compilation-config: 3
1192
1209
  Phi-3.5-vision-instruct:
1193
1210
  model_family: Phi-3.5-vision
1194
1211
  model_variant: instruct
@@ -1196,13 +1213,14 @@ models:
1196
1213
  gpus_per_node: 2
1197
1214
  num_nodes: 1
1198
1215
  vocab_size: 32064
1199
- max_model_len: 65536
1200
- max_num_seqs: 256
1201
- pipeline_parallelism: true
1202
- enforce_eager: false
1203
1216
  qos: m2
1204
1217
  time: 08:00:00
1205
1218
  partition: a40
1219
+ vllm_args:
1220
+ --tensor-parallel-size: 2
1221
+ --max-model-len: 65536
1222
+ --max-num-seqs: 256
1223
+ --compilation-config: 3
1206
1224
  InternVL2_5-8B:
1207
1225
  model_family: InternVL2_5
1208
1226
  model_variant: 8B
@@ -1210,13 +1228,13 @@ models:
1210
1228
  gpus_per_node: 1
1211
1229
  num_nodes: 1
1212
1230
  vocab_size: 92553
1213
- max_model_len: 32768
1214
- max_num_seqs: 256
1215
- pipeline_parallelism: true
1216
- enforce_eager: false
1217
1231
  qos: m2
1218
1232
  time: 08:00:00
1219
1233
  partition: a40
1234
+ vllm_args:
1235
+ --max-model-len: 32768
1236
+ --max-num-seqs: 256
1237
+ --compilation-config: 3
1220
1238
  glm-4v-9b:
1221
1239
  model_family: glm-4v
1222
1240
  model_variant: 9b
@@ -1224,13 +1242,13 @@ models:
1224
1242
  gpus_per_node: 1
1225
1243
  num_nodes: 1
1226
1244
  vocab_size: 151552
1227
- max_model_len: 8192
1228
- max_num_seqs: 256
1229
- pipeline_parallelism: true
1230
- enforce_eager: false
1231
1245
  qos: m2
1232
1246
  time: 08:00:00
1233
1247
  partition: a40
1248
+ vllm_args:
1249
+ --max-model-len: 8192
1250
+ --max-num-seqs: 256
1251
+ --compilation-config: 3
1234
1252
  Molmo-7B-D-0924:
1235
1253
  model_family: Molmo
1236
1254
  model_variant: 7B-D-0924
@@ -1238,26 +1256,27 @@ models:
1238
1256
  gpus_per_node: 1
1239
1257
  num_nodes: 1
1240
1258
  vocab_size: 152064
1241
- max_model_len: 4096
1242
- max_num_seqs: 256
1243
- pipeline_parallelism: true
1244
- enforce_eager: false
1245
1259
  qos: m2
1246
1260
  time: 08:00:00
1247
1261
  partition: a40
1262
+ vllm_args:
1263
+ --max-model-len: 4096
1264
+ --max-num-seqs: 256
1265
+ --compilation-config: 3
1248
1266
  deepseek-vl2:
1249
1267
  model_family: deepseek-vl2
1250
1268
  model_type: VLM
1251
1269
  gpus_per_node: 2
1252
1270
  num_nodes: 1
1253
1271
  vocab_size: 129280
1254
- max_model_len: 4096
1255
- max_num_seqs: 256
1256
- pipeline_parallelism: true
1257
- enforce_eager: false
1258
1272
  qos: m2
1259
1273
  time: 08:00:00
1260
1274
  partition: a40
1275
+ vllm_args:
1276
+ --tensor-parallel-size: 2
1277
+ --max-model-len: 4096
1278
+ --max-num-seqs: 256
1279
+ --compilation-config: 3
1261
1280
  deepseek-vl2-small:
1262
1281
  model_family: deepseek-vl2
1263
1282
  model_variant: small
@@ -1265,10 +1284,10 @@ models:
1265
1284
  gpus_per_node: 1
1266
1285
  num_nodes: 1
1267
1286
  vocab_size: 129280
1268
- max_model_len: 4096
1269
- max_num_seqs: 256
1270
- pipeline_parallelism: true
1271
- enforce_eager: false
1272
1287
  qos: m2
1273
1288
  time: 08:00:00
1274
1289
  partition: a40
1290
+ vllm_args:
1291
+ --max-model-len: 4096
1292
+ --max-num-seqs: 256
1293
+ --compilation-config: 3