vec-inf 0.4.0.post1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1274 @@
1
+ models:
2
+ c4ai-command-r-plus:
3
+ model_family: c4ai-command-r
4
+ model_variant: plus
5
+ model_type: LLM
6
+ gpus_per_node: 4
7
+ num_nodes: 2
8
+ vocab_size: 256000
9
+ max_model_len: 8192
10
+ max_num_seqs: 256
11
+ pipeline_parallelism: true
12
+ enforce_eager: false
13
+ qos: m2
14
+ time: 08:00:00
15
+ partition: a40
16
+ c4ai-command-r-plus-08-2024:
17
+ model_family: c4ai-command-r
18
+ model_variant: plus-08-2024
19
+ model_type: LLM
20
+ gpus_per_node: 4
21
+ num_nodes: 2
22
+ vocab_size: 256000
23
+ max_model_len: 65536
24
+ max_num_seqs: 256
25
+ pipeline_parallelism: true
26
+ enforce_eager: false
27
+ qos: m2
28
+ time: 08:00:00
29
+ partition: a40
30
+ c4ai-command-r-08-2024:
31
+ model_family: c4ai-command-r
32
+ model_variant: 08-2024
33
+ model_type: LLM
34
+ gpus_per_node: 2
35
+ num_nodes: 1
36
+ vocab_size: 256000
37
+ max_model_len: 32768
38
+ max_num_seqs: 256
39
+ pipeline_parallelism: true
40
+ enforce_eager: false
41
+ qos: m2
42
+ time: 08:00:00
43
+ partition: a40
44
+ CodeLlama-7b-hf:
45
+ model_family: CodeLlama
46
+ model_variant: 7b-hf
47
+ model_type: LLM
48
+ gpus_per_node: 1
49
+ num_nodes: 1
50
+ vocab_size: 32000
51
+ max_model_len: 16384
52
+ max_num_seqs: 256
53
+ pipeline_parallelism: true
54
+ enforce_eager: false
55
+ qos: m2
56
+ time: 08:00:00
57
+ partition: a40
58
+ CodeLlama-7b-Instruct-hf:
59
+ model_family: CodeLlama
60
+ model_variant: 7b-Instruct-hf
61
+ model_type: LLM
62
+ gpus_per_node: 1
63
+ num_nodes: 1
64
+ vocab_size: 32000
65
+ max_model_len: 16384
66
+ max_num_seqs: 256
67
+ pipeline_parallelism: true
68
+ enforce_eager: false
69
+ qos: m2
70
+ time: 08:00:00
71
+ partition: a40
72
+ CodeLlama-13b-hf:
73
+ model_family: CodeLlama
74
+ model_variant: 13b-hf
75
+ model_type: LLM
76
+ gpus_per_node: 1
77
+ num_nodes: 1
78
+ vocab_size: 32000
79
+ max_model_len: 16384
80
+ max_num_seqs: 256
81
+ pipeline_parallelism: true
82
+ enforce_eager: false
83
+ qos: m2
84
+ time: 08:00:00
85
+ partition: a40
86
+ CodeLlama-13b-Instruct-hf:
87
+ model_family: CodeLlama
88
+ model_variant: 13b-Instruct-hf
89
+ model_type: LLM
90
+ gpus_per_node: 1
91
+ num_nodes: 1
92
+ vocab_size: 32000
93
+ max_model_len: 16384
94
+ max_num_seqs: 256
95
+ pipeline_parallelism: true
96
+ enforce_eager: false
97
+ qos: m2
98
+ time: 08:00:00
99
+ partition: a40
100
+ CodeLlama-34b-hf:
101
+ model_family: CodeLlama
102
+ model_variant: 34b-hf
103
+ model_type: LLM
104
+ gpus_per_node: 2
105
+ num_nodes: 1
106
+ vocab_size: 32000
107
+ max_model_len: 16384
108
+ max_num_seqs: 256
109
+ pipeline_parallelism: true
110
+ enforce_eager: false
111
+ qos: m2
112
+ time: 08:00:00
113
+ partition: a40
114
+ CodeLlama-34b-Instruct-hf:
115
+ model_family: CodeLlama
116
+ model_variant: 34b-Instruct-hf
117
+ model_type: LLM
118
+ gpus_per_node: 2
119
+ num_nodes: 1
120
+ vocab_size: 32000
121
+ max_model_len: 16384
122
+ max_num_seqs: 256
123
+ pipeline_parallelism: true
124
+ enforce_eager: false
125
+ qos: m2
126
+ time: 08:00:00
127
+ partition: a40
128
+ CodeLlama-70b-hf:
129
+ model_family: CodeLlama
130
+ model_variant: 70b-hf
131
+ model_type: LLM
132
+ gpus_per_node: 4
133
+ num_nodes: 1
134
+ vocab_size: 32000
135
+ max_model_len: 4096
136
+ max_num_seqs: 256
137
+ pipeline_parallelism: true
138
+ enforce_eager: false
139
+ qos: m2
140
+ time: 08:00:00
141
+ partition: a40
142
+ CodeLlama-70b-Instruct-hf:
143
+ model_family: CodeLlama
144
+ model_variant: 70b-Instruct-hf
145
+ model_type: LLM
146
+ gpus_per_node: 4
147
+ num_nodes: 1
148
+ vocab_size: 32000
149
+ max_model_len: 4096
150
+ max_num_seqs: 256
151
+ pipeline_parallelism: true
152
+ enforce_eager: false
153
+ qos: m2
154
+ time: 08:00:00
155
+ partition: a40
156
+ dbrx-instruct:
157
+ model_family: dbrx
158
+ model_variant: instruct
159
+ model_type: LLM
160
+ gpus_per_node: 4
161
+ num_nodes: 2
162
+ vocab_size: 100352
163
+ max_model_len: 32000
164
+ max_num_seqs: 256
165
+ pipeline_parallelism: true
166
+ enforce_eager: false
167
+ qos: m2
168
+ time: 08:00:00
169
+ partition: a40
170
+ gemma-2-9b:
171
+ model_family: gemma-2
172
+ model_variant: 9b
173
+ model_type: LLM
174
+ gpus_per_node: 1
175
+ num_nodes: 1
176
+ vocab_size: 256000
177
+ max_model_len: 4096
178
+ max_num_seqs: 256
179
+ pipeline_parallelism: true
180
+ enforce_eager: false
181
+ qos: m2
182
+ time: 08:00:00
183
+ partition: a40
184
+ gemma-2-9b-it:
185
+ model_family: gemma-2
186
+ model_variant: 9b-it
187
+ model_type: LLM
188
+ gpus_per_node: 1
189
+ num_nodes: 1
190
+ vocab_size: 256000
191
+ max_model_len: 4096
192
+ max_num_seqs: 256
193
+ pipeline_parallelism: true
194
+ enforce_eager: false
195
+ qos: m2
196
+ time: 08:00:00
197
+ partition: a40
198
+ gemma-2-27b:
199
+ model_family: gemma-2
200
+ model_variant: 27b
201
+ model_type: LLM
202
+ gpus_per_node: 2
203
+ num_nodes: 1
204
+ vocab_size: 256000
205
+ max_model_len: 4096
206
+ max_num_seqs: 256
207
+ pipeline_parallelism: true
208
+ enforce_eager: false
209
+ qos: m2
210
+ time: 08:00:00
211
+ partition: a40
212
+ gemma-2-27b-it:
213
+ model_family: gemma-2
214
+ model_variant: 27b-it
215
+ model_type: LLM
216
+ gpus_per_node: 2
217
+ num_nodes: 1
218
+ vocab_size: 256000
219
+ max_model_len: 4096
220
+ max_num_seqs: 256
221
+ pipeline_parallelism: true
222
+ enforce_eager: false
223
+ qos: m2
224
+ time: 08:00:00
225
+ partition: a40
226
+ Llama-2-7b-hf:
227
+ model_family: Llama-2
228
+ model_variant: 7b-hf
229
+ model_type: LLM
230
+ gpus_per_node: 1
231
+ num_nodes: 1
232
+ vocab_size: 32000
233
+ max_model_len: 4096
234
+ max_num_seqs: 256
235
+ pipeline_parallelism: true
236
+ enforce_eager: false
237
+ qos: m2
238
+ time: 08:00:00
239
+ partition: a40
240
+ Llama-2-7b-chat-hf:
241
+ model_family: Llama-2
242
+ model_variant: 7b-chat-hf
243
+ model_type: LLM
244
+ gpus_per_node: 1
245
+ num_nodes: 1
246
+ vocab_size: 32000
247
+ max_model_len: 4096
248
+ max_num_seqs: 256
249
+ pipeline_parallelism: true
250
+ enforce_eager: false
251
+ qos: m2
252
+ time: 08:00:00
253
+ partition: a40
254
+ Llama-2-13b-hf:
255
+ model_family: Llama-2
256
+ model_variant: 13b-hf
257
+ model_type: LLM
258
+ gpus_per_node: 1
259
+ num_nodes: 1
260
+ vocab_size: 32000
261
+ max_model_len: 4096
262
+ max_num_seqs: 256
263
+ pipeline_parallelism: true
264
+ enforce_eager: false
265
+ qos: m2
266
+ time: 08:00:00
267
+ partition: a40
268
+ Llama-2-13b-chat-hf:
269
+ model_family: Llama-2
270
+ model_variant: 13b-chat-hf
271
+ model_type: LLM
272
+ gpus_per_node: 1
273
+ num_nodes: 1
274
+ vocab_size: 32000
275
+ max_model_len: 4096
276
+ max_num_seqs: 256
277
+ pipeline_parallelism: true
278
+ enforce_eager: false
279
+ qos: m2
280
+ time: 08:00:00
281
+ partition: a40
282
+ Llama-2-70b-hf:
283
+ model_family: Llama-2
284
+ model_variant: 70b-hf
285
+ model_type: LLM
286
+ gpus_per_node: 4
287
+ num_nodes: 1
288
+ vocab_size: 32000
289
+ max_model_len: 4096
290
+ max_num_seqs: 256
291
+ pipeline_parallelism: true
292
+ enforce_eager: false
293
+ qos: m2
294
+ time: 08:00:00
295
+ partition: a40
296
+ Llama-2-70b-chat-hf:
297
+ model_family: Llama-2
298
+ model_variant: 70b-chat-hf
299
+ model_type: LLM
300
+ gpus_per_node: 4
301
+ num_nodes: 1
302
+ vocab_size: 32000
303
+ max_model_len: 4096
304
+ max_num_seqs: 256
305
+ pipeline_parallelism: true
306
+ enforce_eager: false
307
+ qos: m2
308
+ time: 08:00:00
309
+ partition: a40
310
+ llava-1.5-7b-hf:
311
+ model_family: llava-1.5
312
+ model_variant: 7b-hf
313
+ model_type: VLM
314
+ gpus_per_node: 1
315
+ num_nodes: 1
316
+ vocab_size: 32000
317
+ max_model_len: 4096
318
+ max_num_seqs: 256
319
+ pipeline_parallelism: true
320
+ enforce_eager: false
321
+ qos: m2
322
+ time: 08:00:00
323
+ partition: a40
324
+ llava-1.5-13b-hf:
325
+ model_family: llava-1.5
326
+ model_variant: 13b-hf
327
+ model_type: VLM
328
+ gpus_per_node: 1
329
+ num_nodes: 1
330
+ vocab_size: 32000
331
+ max_model_len: 4096
332
+ max_num_seqs: 256
333
+ pipeline_parallelism: true
334
+ enforce_eager: false
335
+ qos: m2
336
+ time: 08:00:00
337
+ partition: a40
338
+ llava-v1.6-mistral-7b-hf:
339
+ model_family: llava-v1.6
340
+ model_variant: mistral-7b-hf
341
+ model_type: VLM
342
+ gpus_per_node: 1
343
+ num_nodes: 1
344
+ vocab_size: 32064
345
+ max_model_len: 32768
346
+ max_num_seqs: 256
347
+ pipeline_parallelism: true
348
+ enforce_eager: false
349
+ qos: m2
350
+ time: 08:00:00
351
+ partition: a40
352
+ llava-v1.6-34b-hf:
353
+ model_family: llava-v1.6
354
+ model_variant: 34b-hf
355
+ model_type: VLM
356
+ gpus_per_node: 2
357
+ num_nodes: 1
358
+ vocab_size: 64064
359
+ max_model_len: 4096
360
+ max_num_seqs: 256
361
+ pipeline_parallelism: true
362
+ enforce_eager: false
363
+ qos: m2
364
+ time: 08:00:00
365
+ partition: a40
366
+ Meta-Llama-3-8B:
367
+ model_family: Meta-Llama-3
368
+ model_variant: 8B
369
+ model_type: LLM
370
+ gpus_per_node: 1
371
+ num_nodes: 1
372
+ vocab_size: 128256
373
+ max_model_len: 8192
374
+ max_num_seqs: 256
375
+ pipeline_parallelism: true
376
+ enforce_eager: false
377
+ qos: m2
378
+ time: 08:00:00
379
+ partition: a40
380
+ Meta-Llama-3-8B-Instruct:
381
+ model_family: Meta-Llama-3
382
+ model_variant: 8B-Instruct
383
+ model_type: LLM
384
+ gpus_per_node: 1
385
+ num_nodes: 1
386
+ vocab_size: 128256
387
+ max_model_len: 8192
388
+ max_num_seqs: 256
389
+ pipeline_parallelism: true
390
+ enforce_eager: false
391
+ qos: m2
392
+ time: 08:00:00
393
+ partition: a40
394
+ Meta-Llama-3-70B:
395
+ model_family: Meta-Llama-3
396
+ model_variant: 70B
397
+ model_type: LLM
398
+ gpus_per_node: 4
399
+ num_nodes: 1
400
+ vocab_size: 128256
401
+ max_model_len: 8192
402
+ max_num_seqs: 256
403
+ pipeline_parallelism: true
404
+ enforce_eager: false
405
+ qos: m2
406
+ time: 08:00:00
407
+ partition: a40
408
+ Meta-Llama-3-70B-Instruct:
409
+ model_family: Meta-Llama-3
410
+ model_variant: 70B-Instruct
411
+ model_type: LLM
412
+ gpus_per_node: 4
413
+ num_nodes: 1
414
+ vocab_size: 128256
415
+ max_model_len: 8192
416
+ max_num_seqs: 256
417
+ pipeline_parallelism: true
418
+ enforce_eager: false
419
+ qos: m2
420
+ time: 08:00:00
421
+ partition: a40
422
+ Meta-Llama-3.1-8B:
423
+ model_family: Meta-Llama-3.1
424
+ model_variant: 8B
425
+ model_type: LLM
426
+ gpus_per_node: 1
427
+ num_nodes: 1
428
+ vocab_size: 128256
429
+ max_model_len: 131072
430
+ max_num_seqs: 256
431
+ pipeline_parallelism: true
432
+ enforce_eager: false
433
+ qos: m2
434
+ time: 08:00:00
435
+ partition: a40
436
+ Meta-Llama-3.1-8B-Instruct:
437
+ model_family: Meta-Llama-3.1
438
+ model_variant: 8B-Instruct
439
+ model_type: LLM
440
+ gpus_per_node: 1
441
+ num_nodes: 1
442
+ vocab_size: 128256
443
+ max_model_len: 131072
444
+ max_num_seqs: 256
445
+ pipeline_parallelism: true
446
+ enforce_eager: false
447
+ qos: m2
448
+ time: 08:00:00
449
+ partition: a40
450
+ Meta-Llama-3.1-70B:
451
+ model_family: Meta-Llama-3.1
452
+ model_variant: 70B
453
+ model_type: LLM
454
+ gpus_per_node: 4
455
+ num_nodes: 1
456
+ vocab_size: 128256
457
+ max_model_len: 65536
458
+ max_num_seqs: 256
459
+ pipeline_parallelism: true
460
+ enforce_eager: false
461
+ qos: m2
462
+ time: 08:00:00
463
+ partition: a40
464
+ Meta-Llama-3.1-70B-Instruct:
465
+ model_family: Meta-Llama-3.1
466
+ model_variant: 70B-Instruct
467
+ model_type: LLM
468
+ gpus_per_node: 4
469
+ num_nodes: 1
470
+ vocab_size: 128256
471
+ max_model_len: 65536
472
+ max_num_seqs: 256
473
+ pipeline_parallelism: true
474
+ enforce_eager: false
475
+ qos: m2
476
+ time: 08:00:00
477
+ partition: a40
478
+ Meta-Llama-3.1-405B-Instruct:
479
+ model_family: Meta-Llama-3.1
480
+ model_variant: 405B-Instruct
481
+ model_type: LLM
482
+ gpus_per_node: 4
483
+ num_nodes: 8
484
+ vocab_size: 128256
485
+ max_model_len: 16384
486
+ max_num_seqs: 256
487
+ pipeline_parallelism: true
488
+ enforce_eager: false
489
+ qos: m4
490
+ time: 02:00:00
491
+ partition: a40
492
+ Mistral-7B-v0.1:
493
+ model_family: Mistral
494
+ model_variant: 7B-v0.1
495
+ model_type: LLM
496
+ gpus_per_node: 1
497
+ num_nodes: 1
498
+ vocab_size: 32000
499
+ max_model_len: 32768
500
+ max_num_seqs: 256
501
+ pipeline_parallelism: true
502
+ enforce_eager: false
503
+ qos: m2
504
+ time: 08:00:00
505
+ partition: a40
506
+ Mistral-7B-Instruct-v0.1:
507
+ model_family: Mistral
508
+ model_variant: 7B-Instruct-v0.1
509
+ model_type: LLM
510
+ gpus_per_node: 1
511
+ num_nodes: 1
512
+ vocab_size: 32000
513
+ max_model_len: 32768
514
+ max_num_seqs: 256
515
+ pipeline_parallelism: true
516
+ enforce_eager: false
517
+ qos: m2
518
+ time: 08:00:00
519
+ partition: a40
520
+ Mistral-7B-Instruct-v0.2:
521
+ model_family: Mistral
522
+ model_variant: 7B-Instruct-v0.2
523
+ model_type: LLM
524
+ gpus_per_node: 1
525
+ num_nodes: 1
526
+ vocab_size: 32000
527
+ max_model_len: 32768
528
+ max_num_seqs: 256
529
+ pipeline_parallelism: true
530
+ enforce_eager: false
531
+ qos: m2
532
+ time: 08:00:00
533
+ partition: a40
534
+ Mistral-7B-v0.3:
535
+ model_family: Mistral
536
+ model_variant: 7B-v0.3
537
+ model_type: LLM
538
+ gpus_per_node: 1
539
+ num_nodes: 1
540
+ vocab_size: 32768
541
+ max_model_len: 32768
542
+ max_num_seqs: 256
543
+ pipeline_parallelism: true
544
+ enforce_eager: false
545
+ qos: m2
546
+ time: 08:00:00
547
+ partition: a40
548
+ Mistral-7B-Instruct-v0.3:
549
+ model_family: Mistral
550
+ model_variant: 7B-Instruct-v0.3
551
+ model_type: LLM
552
+ gpus_per_node: 1
553
+ num_nodes: 1
554
+ vocab_size: 32768
555
+ max_model_len: 32768
556
+ max_num_seqs: 256
557
+ pipeline_parallelism: true
558
+ enforce_eager: false
559
+ qos: m2
560
+ time: 08:00:00
561
+ partition: a40
562
+ Mistral-Large-Instruct-2407:
563
+ model_family: Mistral
564
+ model_variant: Large-Instruct-2407
565
+ model_type: LLM
566
+ gpus_per_node: 4
567
+ num_nodes: 2
568
+ vocab_size: 32768
569
+ max_model_len: 32768
570
+ max_num_seqs: 256
571
+ pipeline_parallelism: true
572
+ enforce_eager: false
573
+ qos: m2
574
+ time: 08:00:00
575
+ partition: a40
576
+ Mistral-Large-Instruct-2411:
577
+ model_family: Mistral
578
+ model_variant: Large-Instruct-2411
579
+ model_type: LLM
580
+ gpus_per_node: 4
581
+ num_nodes: 2
582
+ vocab_size: 32768
583
+ max_model_len: 32768
584
+ max_num_seqs: 256
585
+ pipeline_parallelism: true
586
+ enforce_eager: false
587
+ qos: m2
588
+ time: 08:00:00
589
+ partition: a40
590
+ Mixtral-8x7B-Instruct-v0.1:
591
+ model_family: Mixtral
592
+ model_variant: 8x7B-Instruct-v0.1
593
+ model_type: LLM
594
+ gpus_per_node: 4
595
+ num_nodes: 1
596
+ vocab_size: 32000
597
+ max_model_len: 32768
598
+ max_num_seqs: 256
599
+ pipeline_parallelism: true
600
+ enforce_eager: false
601
+ qos: m2
602
+ time: 08:00:00
603
+ partition: a40
604
+ Mixtral-8x22B-v0.1:
605
+ model_family: Mixtral
606
+ model_variant: 8x22B-v0.1
607
+ model_type: LLM
608
+ gpus_per_node: 4
609
+ num_nodes: 2
610
+ vocab_size: 32768
611
+ max_model_len: 65536
612
+ max_num_seqs: 256
613
+ pipeline_parallelism: true
614
+ enforce_eager: false
615
+ qos: m2
616
+ time: 08:00:00
617
+ partition: a40
618
+ Mixtral-8x22B-Instruct-v0.1:
619
+ model_family: Mixtral
620
+ model_variant: 8x22B-Instruct-v0.1
621
+ model_type: LLM
622
+ gpus_per_node: 4
623
+ num_nodes: 2
624
+ vocab_size: 32768
625
+ max_model_len: 65536
626
+ max_num_seqs: 256
627
+ pipeline_parallelism: true
628
+ enforce_eager: false
629
+ qos: m2
630
+ time: 08:00:00
631
+ partition: a40
632
+ Phi-3-medium-128k-instruct:
633
+ model_family: Phi-3
634
+ model_variant: medium-128k-instruct
635
+ model_type: LLM
636
+ gpus_per_node: 2
637
+ num_nodes: 1
638
+ vocab_size: 32064
639
+ max_model_len: 131072
640
+ max_num_seqs: 256
641
+ pipeline_parallelism: true
642
+ enforce_eager: false
643
+ qos: m2
644
+ time: 08:00:00
645
+ partition: a40
646
+ Phi-3-vision-128k-instruct:
647
+ model_family: Phi-3-vision
648
+ model_variant: 128k-instruct
649
+ model_type: VLM
650
+ gpus_per_node: 2
651
+ num_nodes: 1
652
+ vocab_size: 32064
653
+ max_model_len: 65536
654
+ max_num_seqs: 256
655
+ pipeline_parallelism: true
656
+ enforce_eager: false
657
+ qos: m2
658
+ time: 08:00:00
659
+ partition: a40
660
+ Llama3-OpenBioLLM-70B:
661
+ model_family: Llama3-OpenBioLLM
662
+ model_variant: 70B
663
+ model_type: LLM
664
+ gpus_per_node: 4
665
+ num_nodes: 1
666
+ vocab_size: 128256
667
+ max_model_len: 8192
668
+ max_num_seqs: 256
669
+ pipeline_parallelism: true
670
+ enforce_eager: false
671
+ qos: m2
672
+ time: 08:00:00
673
+ partition: a40
674
+ Llama-3.1-Nemotron-70B-Instruct-HF:
675
+ model_family: Llama-3.1-Nemotron
676
+ model_variant: 70B-Instruct-HF
677
+ model_type: LLM
678
+ gpus_per_node: 4
679
+ num_nodes: 1
680
+ vocab_size: 128256
681
+ max_model_len: 65536
682
+ max_num_seqs: 256
683
+ pipeline_parallelism: true
684
+ enforce_eager: false
685
+ qos: m2
686
+ time: 08:00:00
687
+ partition: a40
688
+ Llama-3.2-1B:
689
+ model_family: Llama-3.2
690
+ model_variant: 1B
691
+ model_type: LLM
692
+ gpus_per_node: 1
693
+ num_nodes: 1
694
+ vocab_size: 128256
695
+ max_model_len: 131072
696
+ max_num_seqs: 256
697
+ pipeline_parallelism: true
698
+ enforce_eager: false
699
+ qos: m2
700
+ time: 08:00:00
701
+ partition: a40
702
+ Llama-3.2-1B-Instruct:
703
+ model_family: Llama-3.2
704
+ model_variant: 1B-Instruct
705
+ model_type: LLM
706
+ gpus_per_node: 1
707
+ num_nodes: 1
708
+ vocab_size: 128256
709
+ max_model_len: 131072
710
+ max_num_seqs: 256
711
+ pipeline_parallelism: true
712
+ enforce_eager: false
713
+ qos: m2
714
+ time: 08:00:00
715
+ partition: a40
716
+ Llama-3.2-3B:
717
+ model_family: Llama-3.2
718
+ model_variant: 3B
719
+ model_type: LLM
720
+ gpus_per_node: 1
721
+ num_nodes: 1
722
+ vocab_size: 128256
723
+ max_model_len: 131072
724
+ max_num_seqs: 256
725
+ pipeline_parallelism: true
726
+ enforce_eager: false
727
+ qos: m2
728
+ time: 08:00:00
729
+ partition: a40
730
+ Llama-3.2-3B-Instruct:
731
+ model_family: Llama-3.2
732
+ model_variant: 3B-Instruct
733
+ model_type: LLM
734
+ gpus_per_node: 1
735
+ num_nodes: 1
736
+ vocab_size: 128256
737
+ max_model_len: 131072
738
+ max_num_seqs: 256
739
+ pipeline_parallelism: true
740
+ enforce_eager: false
741
+ qos: m2
742
+ time: 08:00:00
743
+ partition: a40
744
+ Llama-3.2-11B-Vision:
745
+ model_family: Llama-3.2
746
+ model_variant: 11B-Vision
747
+ model_type: VLM
748
+ gpus_per_node: 2
749
+ num_nodes: 1
750
+ vocab_size: 128256
751
+ max_model_len: 4096
752
+ max_num_seqs: 64
753
+ pipeline_parallelism: false
754
+ enforce_eager: true
755
+ qos: m2
756
+ time: 08:00:00
757
+ partition: a40
758
+ Llama-3.2-11B-Vision-Instruct:
759
+ model_family: Llama-3.2
760
+ model_variant: 11B-Vision-Instruct
761
+ model_type: VLM
762
+ gpus_per_node: 2
763
+ num_nodes: 1
764
+ vocab_size: 128256
765
+ max_model_len: 4096
766
+ max_num_seqs: 64
767
+ pipeline_parallelism: false
768
+ enforce_eager: true
769
+ qos: m2
770
+ time: 08:00:00
771
+ partition: a40
772
+ Llama-3.2-90B-Vision:
773
+ model_family: Llama-3.2
774
+ model_variant: 90B-Vision
775
+ model_type: VLM
776
+ gpus_per_node: 4
777
+ num_nodes: 2
778
+ vocab_size: 128256
779
+ max_model_len: 4096
780
+ max_num_seqs: 32
781
+ pipeline_parallelism: false
782
+ enforce_eager: true
783
+ qos: m2
784
+ time: 08:00:00
785
+ partition: a40
786
+ Llama-3.2-90B-Vision-Instruct:
787
+ model_family: Llama-3.2
788
+ model_variant: 90B-Vision-Instruct
789
+ model_type: VLM
790
+ gpus_per_node: 4
791
+ num_nodes: 2
792
+ vocab_size: 128256
793
+ max_model_len: 4096
794
+ max_num_seqs: 32
795
+ pipeline_parallelism: false
796
+ enforce_eager: true
797
+ qos: m2
798
+ time: 08:00:00
799
+ partition: a40
800
+ Qwen2.5-0.5B-Instruct:
801
+ model_family: Qwen2.5
802
+ model_variant: 0.5B-Instruct
803
+ model_type: LLM
804
+ gpus_per_node: 1
805
+ num_nodes: 1
806
+ vocab_size: 152064
807
+ max_model_len: 32768
808
+ max_num_seqs: 256
809
+ pipeline_parallelism: true
810
+ enforce_eager: false
811
+ qos: m2
812
+ time: 08:00:00
813
+ partition: a40
814
+ Qwen2.5-1.5B-Instruct:
815
+ model_family: Qwen2.5
816
+ model_variant: 1.5B-Instruct
817
+ model_type: LLM
818
+ gpus_per_node: 1
819
+ num_nodes: 1
820
+ vocab_size: 152064
821
+ max_model_len: 32768
822
+ max_num_seqs: 256
823
+ pipeline_parallelism: true
824
+ enforce_eager: false
825
+ qos: m2
826
+ time: 08:00:00
827
+ partition: a40
828
+ Qwen2.5-3B-Instruct:
829
+ model_family: Qwen2.5
830
+ model_variant: 3B-Instruct
831
+ model_type: LLM
832
+ gpus_per_node: 1
833
+ num_nodes: 1
834
+ vocab_size: 152064
835
+ max_model_len: 32768
836
+ max_num_seqs: 256
837
+ pipeline_parallelism: true
838
+ enforce_eager: false
839
+ qos: m2
840
+ time: 08:00:00
841
+ partition: a40
842
+ Qwen2.5-7B-Instruct:
843
+ model_family: Qwen2.5
844
+ model_variant: 7B-Instruct
845
+ model_type: LLM
846
+ gpus_per_node: 1
847
+ num_nodes: 1
848
+ vocab_size: 152064
849
+ max_model_len: 32768
850
+ max_num_seqs: 256
851
+ pipeline_parallelism: true
852
+ enforce_eager: false
853
+ qos: m2
854
+ time: 08:00:00
855
+ partition: a40
856
+ Qwen2.5-14B-Instruct:
857
+ model_family: Qwen2.5
858
+ model_variant: 14B-Instruct
859
+ model_type: LLM
860
+ gpus_per_node: 1
861
+ num_nodes: 1
862
+ vocab_size: 152064
863
+ max_model_len: 32768
864
+ max_num_seqs: 256
865
+ pipeline_parallelism: true
866
+ enforce_eager: false
867
+ qos: m2
868
+ time: 08:00:00
869
+ partition: a40
870
+ Qwen2.5-32B-Instruct:
871
+ model_family: Qwen2.5
872
+ model_variant: 32B-Instruct
873
+ model_type: LLM
874
+ gpus_per_node: 2
875
+ num_nodes: 1
876
+ vocab_size: 152064
877
+ max_model_len: 32768
878
+ max_num_seqs: 256
879
+ pipeline_parallelism: true
880
+ enforce_eager: false
881
+ qos: m2
882
+ time: 08:00:00
883
+ partition: a40
884
+ Qwen2.5-72B-Instruct:
885
+ model_family: Qwen2.5
886
+ model_variant: 72B-Instruct
887
+ model_type: LLM
888
+ gpus_per_node: 4
889
+ num_nodes: 1
890
+ vocab_size: 152064
891
+ max_model_len: 16384
892
+ max_num_seqs: 256
893
+ pipeline_parallelism: true
894
+ enforce_eager: false
895
+ qos: m2
896
+ time: 08:00:00
897
+ partition: a40
898
+ Qwen2.5-Math-1.5B-Instruct:
899
+ model_family: Qwen2.5
900
+ model_variant: Math-1.5B-Instruct
901
+ model_type: LLM
902
+ gpus_per_node: 1
903
+ num_nodes: 1
904
+ vocab_size: 152064
905
+ max_model_len: 4096
906
+ max_num_seqs: 256
907
+ pipeline_parallelism: true
908
+ enforce_eager: false
909
+ qos: m2
910
+ time: 08:00:00
911
+ partition: a40
912
+ Qwen2.5-Math-7B-Instruct:
913
+ model_family: Qwen2.5
914
+ model_variant: Math-7B-Instruct
915
+ model_type: LLM
916
+ gpus_per_node: 1
917
+ num_nodes: 1
918
+ vocab_size: 152064
919
+ max_model_len: 4096
920
+ max_num_seqs: 256
921
+ pipeline_parallelism: true
922
+ enforce_eager: false
923
+ qos: m2
924
+ time: 08:00:00
925
+ partition: a40
926
+ Qwen2.5-Math-72B-Instruct:
927
+ model_family: Qwen2.5
928
+ model_variant: Math-72B-Instruct
929
+ model_type: LLM
930
+ gpus_per_node: 4
931
+ num_nodes: 1
932
+ vocab_size: 152064
933
+ max_model_len: 4096
934
+ max_num_seqs: 256
935
+ pipeline_parallelism: true
936
+ enforce_eager: false
937
+ qos: m2
938
+ time: 08:00:00
939
+ partition: a40
940
+ Qwen2.5-Coder-7B-Instruct:
941
+ model_family: Qwen2.5
942
+ model_variant: Coder-7B-Instruct
943
+ model_type: LLM
944
+ gpus_per_node: 1
945
+ num_nodes: 1
946
+ vocab_size: 152064
947
+ max_model_len: 32768
948
+ max_num_seqs: 256
949
+ pipeline_parallelism: true
950
+ enforce_eager: false
951
+ qos: m2
952
+ time: 08:00:00
953
+ partition: a40
954
+ Qwen2.5-Math-RM-72B:
955
+ model_family: Qwen2.5
956
+ model_variant: Math-RM-72B
957
+ model_type: Reward_Modeling
958
+ gpus_per_node: 4
959
+ num_nodes: 1
960
+ vocab_size: 152064
961
+ max_model_len: 4096
962
+ max_num_seqs: 256
963
+ pipeline_parallelism: true
964
+ enforce_eager: false
965
+ qos: m2
966
+ time: 08:00:00
967
+ partition: a40
968
+ Qwen2.5-Math-PRM-7B:
969
+ model_family: Qwen2.5
970
+ model_variant: Math-PRM-7B
971
+ model_type: Reward_Modeling
972
+ gpus_per_node: 1
973
+ num_nodes: 1
974
+ vocab_size: 152064
975
+ max_model_len: 4096
976
+ max_num_seqs: 256
977
+ pipeline_parallelism: true
978
+ enforce_eager: false
979
+ qos: m2
980
+ time: 08:00:00
981
+ partition: a40
982
+ QwQ-32B-Preview:
983
+ model_family: QwQ
984
+ model_variant: 32B-Preview
985
+ model_type: LLM
986
+ gpus_per_node: 2
987
+ num_nodes: 1
988
+ vocab_size: 152064
989
+ max_model_len: 32768
990
+ max_num_seqs: 256
991
+ pipeline_parallelism: true
992
+ enforce_eager: false
993
+ qos: m2
994
+ time: 08:00:00
995
+ partition: a40
996
+ Pixtral-12B-2409:
997
+ model_family: Pixtral
998
+ model_variant: 12B-2409
999
+ model_type: VLM
1000
+ gpus_per_node: 1
1001
+ num_nodes: 1
1002
+ vocab_size: 131072
1003
+ max_model_len: 8192
1004
+ max_num_seqs: 256
1005
+ pipeline_parallelism: true
1006
+ enforce_eager: false
1007
+ qos: m2
1008
+ time: 08:00:00
1009
+ partition: a40
1010
+ e5-mistral-7b-instruct:
1011
+ model_family: e5
1012
+ model_variant: mistral-7b-instruct
1013
+ model_type: Text_Embedding
1014
+ gpus_per_node: 1
1015
+ num_nodes: 1
1016
+ vocab_size: 32000
1017
+ max_model_len: 4096
1018
+ max_num_seqs: 256
1019
+ pipeline_parallelism: true
1020
+ enforce_eager: false
1021
+ qos: m2
1022
+ time: 08:00:00
1023
+ partition: a40
1024
+ bge-base-en-v1.5:
1025
+ model_family: bge
1026
+ model_variant: base-en-v1.5
1027
+ model_type: Text_Embedding
1028
+ gpus_per_node: 1
1029
+ num_nodes: 1
1030
+ vocab_size: 30522
1031
+ max_model_len: 512
1032
+ max_num_seqs: 256
1033
+ pipeline_parallelism: true
1034
+ enforce_eager: false
1035
+ qos: m2
1036
+ time: 08:00:00
1037
+ partition: a40
1038
+ all-MiniLM-L6-v2:
1039
+ model_family: all-MiniLM
1040
+ model_variant: L6-v2
1041
+ model_type: Text_Embedding
1042
+ gpus_per_node: 1
1043
+ num_nodes: 1
1044
+ vocab_size: 30522
1045
+ max_model_len: 512
1046
+ max_num_seqs: 256
1047
+ pipeline_parallelism: true
1048
+ enforce_eager: false
1049
+ qos: m2
1050
+ time: 08:00:00
1051
+ partition: a40
1052
+ Llama-3.3-70B-Instruct:
1053
+ model_family: Llama-3.3
1054
+ model_variant: 70B-Instruct
1055
+ model_type: LLM
1056
+ gpus_per_node: 4
1057
+ num_nodes: 1
1058
+ vocab_size: 128256
1059
+ max_model_len: 65536
1060
+ max_num_seqs: 256
1061
+ pipeline_parallelism: true
1062
+ enforce_eager: false
1063
+ qos: m2
1064
+ time: 08:00:00
1065
+ partition: a40
1066
+ InternVL2_5-26B:
1067
+ model_family: InternVL2_5
1068
+ model_variant: 26B
1069
+ model_type: VLM
1070
+ gpus_per_node: 2
1071
+ num_nodes: 1
1072
+ vocab_size: 92553
1073
+ max_model_len: 32768
1074
+ max_num_seqs: 256
1075
+ pipeline_parallelism: true
1076
+ enforce_eager: false
1077
+ qos: m2
1078
+ time: 08:00:00
1079
+ partition: a40
1080
+ InternVL2_5-38B:
1081
+ model_family: InternVL2_5
1082
+ model_variant: 38B
1083
+ model_type: VLM
1084
+ gpus_per_node: 4
1085
+ num_nodes: 1
1086
+ vocab_size: 92553
1087
+ max_model_len: 32768
1088
+ max_num_seqs: 256
1089
+ pipeline_parallelism: true
1090
+ enforce_eager: false
1091
+ qos: m2
1092
+ time: 08:00:00
1093
+ partition: a40
1094
+ Aya-Expanse-32B:
1095
+ model_family: Aya-Expanse
1096
+ model_variant: 32B
1097
+ model_type: LLM
1098
+ gpus_per_node: 2
1099
+ num_nodes: 1
1100
+ vocab_size: 256000
1101
+ max_model_len: 8192
1102
+ max_num_seqs: 256
1103
+ pipeline_parallelism: true
1104
+ enforce_eager: false
1105
+ qos: m2
1106
+ time: 08:00:00
1107
+ partition: a40
1108
+ DeepSeek-R1-Distill-Llama-70B:
1109
+ model_family: DeepSeek-R1
1110
+ model_variant: Distill-Llama-70B
1111
+ model_type: LLM
1112
+ gpus_per_node: 4
1113
+ num_nodes: 2
1114
+ vocab_size: 128256
1115
+ max_model_len: 131072
1116
+ max_num_seqs: 256
1117
+ pipeline_parallelism: true
1118
+ enforce_eager: false
1119
+ qos: m2
1120
+ time: 08:00:00
1121
+ partition: a40
1122
+ DeepSeek-R1-Distill-Llama-8B:
1123
+ model_family: DeepSeek-R1
1124
+ model_variant: Distill-Llama-8B
1125
+ model_type: LLM
1126
+ gpus_per_node: 1
1127
+ num_nodes: 1
1128
+ vocab_size: 128256
1129
+ max_model_len: 131072
1130
+ max_num_seqs: 256
1131
+ pipeline_parallelism: true
1132
+ enforce_eager: false
1133
+ qos: m2
1134
+ time: 08:00:00
1135
+ partition: a40
1136
+ DeepSeek-R1-Distill-Qwen-32B:
1137
+ model_family: DeepSeek-R1
1138
+ model_variant: Distill-Qwen-32B
1139
+ model_type: LLM
1140
+ gpus_per_node: 4
1141
+ num_nodes: 1
1142
+ vocab_size: 152064
1143
+ max_model_len: 131072
1144
+ max_num_seqs: 256
1145
+ pipeline_parallelism: true
1146
+ enforce_eager: false
1147
+ qos: m2
1148
+ time: 08:00:00
1149
+ partition: a40
1150
+ DeepSeek-R1-Distill-Qwen-14B:
1151
+ model_family: DeepSeek-R1
1152
+ model_variant: Distill-Qwen-14B
1153
+ model_type: LLM
1154
+ gpus_per_node: 2
1155
+ num_nodes: 1
1156
+ vocab_size: 152064
1157
+ max_model_len: 131072
1158
+ max_num_seqs: 256
1159
+ pipeline_parallelism: true
1160
+ enforce_eager: false
1161
+ qos: m2
1162
+ time: 08:00:00
1163
+ partition: a40
1164
+ DeepSeek-R1-Distill-Qwen-7B:
1165
+ model_family: DeepSeek-R1
1166
+ model_variant: Distill-Qwen-7B
1167
+ model_type: LLM
1168
+ gpus_per_node: 1
1169
+ num_nodes: 1
1170
+ vocab_size: 152064
1171
+ max_model_len: 131072
1172
+ max_num_seqs: 256
1173
+ pipeline_parallelism: true
1174
+ enforce_eager: false
1175
+ qos: m2
1176
+ time: 08:00:00
1177
+ partition: a40
1178
+ DeepSeek-R1-Distill-Qwen-1.5B:
1179
+ model_family: DeepSeek-R1
1180
+ model_variant: Distill-Qwen-1.5B
1181
+ model_type: LLM
1182
+ gpus_per_node: 1
1183
+ num_nodes: 1
1184
+ vocab_size: 152064
1185
+ max_model_len: 131072
1186
+ max_num_seqs: 256
1187
+ pipeline_parallelism: true
1188
+ enforce_eager: false
1189
+ qos: m2
1190
+ time: 08:00:00
1191
+ partition: a40
1192
+ Phi-3.5-vision-instruct:
1193
+ model_family: Phi-3.5-vision
1194
+ model_variant: instruct
1195
+ model_type: VLM
1196
+ gpus_per_node: 2
1197
+ num_nodes: 1
1198
+ vocab_size: 32064
1199
+ max_model_len: 65536
1200
+ max_num_seqs: 256
1201
+ pipeline_parallelism: true
1202
+ enforce_eager: false
1203
+ qos: m2
1204
+ time: 08:00:00
1205
+ partition: a40
1206
+ InternVL2_5-8B:
1207
+ model_family: InternVL2_5
1208
+ model_variant: 8B
1209
+ model_type: VLM
1210
+ gpus_per_node: 1
1211
+ num_nodes: 1
1212
+ vocab_size: 92553
1213
+ max_model_len: 32768
1214
+ max_num_seqs: 256
1215
+ pipeline_parallelism: true
1216
+ enforce_eager: false
1217
+ qos: m2
1218
+ time: 08:00:00
1219
+ partition: a40
1220
+ glm-4v-9b:
1221
+ model_family: glm-4v
1222
+ model_variant: 9b
1223
+ model_type: VLM
1224
+ gpus_per_node: 1
1225
+ num_nodes: 1
1226
+ vocab_size: 151552
1227
+ max_model_len: 8192
1228
+ max_num_seqs: 256
1229
+ pipeline_parallelism: true
1230
+ enforce_eager: false
1231
+ qos: m2
1232
+ time: 08:00:00
1233
+ partition: a40
1234
+ Molmo-7B-D-0924:
1235
+ model_family: Molmo
1236
+ model_variant: 7B-D-0924
1237
+ model_type: VLM
1238
+ gpus_per_node: 1
1239
+ num_nodes: 1
1240
+ vocab_size: 152064
1241
+ max_model_len: 4096
1242
+ max_num_seqs: 256
1243
+ pipeline_parallelism: true
1244
+ enforce_eager: false
1245
+ qos: m2
1246
+ time: 08:00:00
1247
+ partition: a40
1248
+ deepseek-vl2:
1249
+ model_family: deepseek-vl2
1250
+ model_type: VLM
1251
+ gpus_per_node: 2
1252
+ num_nodes: 1
1253
+ vocab_size: 129280
1254
+ max_model_len: 4096
1255
+ max_num_seqs: 256
1256
+ pipeline_parallelism: true
1257
+ enforce_eager: false
1258
+ qos: m2
1259
+ time: 08:00:00
1260
+ partition: a40
1261
+ deepseek-vl2-small:
1262
+ model_family: deepseek-vl2
1263
+ model_variant: small
1264
+ model_type: VLM
1265
+ gpus_per_node: 1
1266
+ num_nodes: 1
1267
+ vocab_size: 129280
1268
+ max_model_len: 4096
1269
+ max_num_seqs: 256
1270
+ pipeline_parallelism: true
1271
+ enforce_eager: false
1272
+ qos: m2
1273
+ time: 08:00:00
1274
+ partition: a40