vec-inf 0.7.3__py3-none-any.whl → 0.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,12 +6,14 @@ models:
6
6
  gpus_per_node: 4
7
7
  num_nodes: 2
8
8
  vocab_size: 256000
9
- time: 08:00:00
10
- resource_type: l40s
11
9
  vllm_args:
12
10
  --pipeline-parallel-size: 2
13
11
  --tensor-parallel-size: 4
14
12
  --max-model-len: 65536
13
+ sglang_args:
14
+ --pipeline-parallel-size: 2
15
+ --tensor-parallel-size: 4
16
+ --context-length: 65536
15
17
  c4ai-command-r-08-2024:
16
18
  model_family: c4ai-command-r
17
19
  model_variant: 08-2024
@@ -19,11 +21,12 @@ models:
19
21
  gpus_per_node: 2
20
22
  num_nodes: 1
21
23
  vocab_size: 256000
22
- time: 08:00:00
23
- resource_type: l40s
24
24
  vllm_args:
25
25
  --tensor-parallel-size: 2
26
26
  --max-model-len: 32768
27
+ sglang_args:
28
+ --tensor-parallel-size: 2
29
+ --context-length: 32768
27
30
  CodeLlama-7b-hf:
28
31
  model_family: CodeLlama
29
32
  model_variant: 7b-hf
@@ -31,10 +34,6 @@ models:
31
34
  gpus_per_node: 1
32
35
  num_nodes: 1
33
36
  vocab_size: 32000
34
- time: 08:00:00
35
- resource_type: l40s
36
- vllm_args:
37
- --max-model-len: 16384
38
37
  CodeLlama-7b-Instruct-hf:
39
38
  model_family: CodeLlama
40
39
  model_variant: 7b-Instruct-hf
@@ -42,10 +41,6 @@ models:
42
41
  gpus_per_node: 1
43
42
  num_nodes: 1
44
43
  vocab_size: 32000
45
- time: 08:00:00
46
- resource_type: l40s
47
- vllm_args:
48
- --max-model-len: 16384
49
44
  CodeLlama-13b-hf:
50
45
  model_family: CodeLlama
51
46
  model_variant: 13b-hf
@@ -53,10 +48,6 @@ models:
53
48
  gpus_per_node: 1
54
49
  num_nodes: 1
55
50
  vocab_size: 32000
56
- time: 08:00:00
57
- resource_type: l40s
58
- vllm_args:
59
- --max-model-len: 16384
60
51
  CodeLlama-13b-Instruct-hf:
61
52
  model_family: CodeLlama
62
53
  model_variant: 13b-Instruct-hf
@@ -64,10 +55,6 @@ models:
64
55
  gpus_per_node: 1
65
56
  num_nodes: 1
66
57
  vocab_size: 32000
67
- time: 08:00:00
68
- resource_type: l40s
69
- vllm_args:
70
- --max-model-len: 16384
71
58
  CodeLlama-34b-hf:
72
59
  model_family: CodeLlama
73
60
  model_variant: 34b-hf
@@ -75,11 +62,10 @@ models:
75
62
  gpus_per_node: 2
76
63
  num_nodes: 1
77
64
  vocab_size: 32000
78
- time: 08:00:00
79
- resource_type: l40s
80
65
  vllm_args:
81
66
  --tensor-parallel-size: 2
82
- --max-model-len: 16384
67
+ sglang_args:
68
+ --tensor-parallel-size: 2
83
69
  CodeLlama-34b-Instruct-hf:
84
70
  model_family: CodeLlama
85
71
  model_variant: 34b-Instruct-hf
@@ -87,11 +73,10 @@ models:
87
73
  gpus_per_node: 2
88
74
  num_nodes: 1
89
75
  vocab_size: 32000
90
- time: 08:00:00
91
- resource_type: l40s
92
76
  vllm_args:
93
77
  --tensor-parallel-size: 2
94
- --max-model-len: 16384
78
+ sglang_args:
79
+ --tensor-parallel-size: 2
95
80
  CodeLlama-70b-hf:
96
81
  model_family: CodeLlama
97
82
  model_variant: 70b-hf
@@ -99,11 +84,10 @@ models:
99
84
  gpus_per_node: 4
100
85
  num_nodes: 1
101
86
  vocab_size: 32016
102
- time: 08:00:00
103
- resource_type: l40s
104
87
  vllm_args:
105
88
  --tensor-parallel-size: 4
106
- --max-model-len: 4096
89
+ sglang_args:
90
+ --tensor-parallel-size: 4
107
91
  CodeLlama-70b-Instruct-hf:
108
92
  model_family: CodeLlama
109
93
  model_variant: 70b-Instruct-hf
@@ -111,11 +95,10 @@ models:
111
95
  gpus_per_node: 4
112
96
  num_nodes: 1
113
97
  vocab_size: 32016
114
- time: 08:00:00
115
- resource_type: l40s
116
98
  vllm_args:
117
99
  --tensor-parallel-size: 4
118
- --max-model-len: 4096
100
+ sglang_args:
101
+ --tensor-parallel-size: 4
119
102
  gemma-2-2b-it:
120
103
  model_family: gemma-2
121
104
  model_variant: 2b-it
@@ -123,10 +106,6 @@ models:
123
106
  gpus_per_node: 1
124
107
  num_nodes: 1
125
108
  vocab_size: 256000
126
- time: 08:00:00
127
- resource_type: l40s
128
- vllm_args:
129
- --max-model-len: 4096
130
109
  gemma-2-9b:
131
110
  model_family: gemma-2
132
111
  model_variant: 9b
@@ -134,10 +113,6 @@ models:
134
113
  gpus_per_node: 1
135
114
  num_nodes: 1
136
115
  vocab_size: 256000
137
- time: 08:00:00
138
- resource_type: l40s
139
- vllm_args:
140
- --max-model-len: 4096
141
116
  gemma-2-9b-it:
142
117
  model_family: gemma-2
143
118
  model_variant: 9b-it
@@ -145,10 +120,6 @@ models:
145
120
  gpus_per_node: 1
146
121
  num_nodes: 1
147
122
  vocab_size: 256000
148
- time: 08:00:00
149
- resource_type: l40s
150
- vllm_args:
151
- --max-model-len: 4096
152
123
  gemma-2-27b:
153
124
  model_family: gemma-2
154
125
  model_variant: 27b
@@ -156,11 +127,10 @@ models:
156
127
  gpus_per_node: 2
157
128
  num_nodes: 1
158
129
  vocab_size: 256000
159
- time: 08:00:00
160
- resource_type: l40s
161
130
  vllm_args:
162
131
  --tensor-parallel-size: 2
163
- --max-model-len: 4096
132
+ sglang_args:
133
+ --tensor-parallel-size: 2
164
134
  gemma-2-27b-it:
165
135
  model_family: gemma-2
166
136
  model_variant: 27b-it
@@ -168,11 +138,10 @@ models:
168
138
  gpus_per_node: 2
169
139
  num_nodes: 1
170
140
  vocab_size: 256000
171
- time: 08:00:00
172
- resource_type: l40s
173
141
  vllm_args:
174
142
  --tensor-parallel-size: 2
175
- --max-model-len: 4096
143
+ sglang_args:
144
+ --tensor-parallel-size: 2
176
145
  Llama-2-7b-hf:
177
146
  model_family: Llama-2
178
147
  model_variant: 7b-hf
@@ -180,10 +149,6 @@ models:
180
149
  gpus_per_node: 1
181
150
  num_nodes: 1
182
151
  vocab_size: 32000
183
- time: 08:00:00
184
- resource_type: l40s
185
- vllm_args:
186
- --max-model-len: 4096
187
152
  Llama-2-7b-chat-hf:
188
153
  model_family: Llama-2
189
154
  model_variant: 7b-chat-hf
@@ -191,10 +156,6 @@ models:
191
156
  gpus_per_node: 1
192
157
  num_nodes: 1
193
158
  vocab_size: 32000
194
- time: 08:00:00
195
- resource_type: l40s
196
- vllm_args:
197
- --max-model-len: 4096
198
159
  Llama-2-13b-hf:
199
160
  model_family: Llama-2
200
161
  model_variant: 13b-hf
@@ -202,10 +163,6 @@ models:
202
163
  gpus_per_node: 1
203
164
  num_nodes: 1
204
165
  vocab_size: 32000
205
- time: 08:00:00
206
- resource_type: l40s
207
- vllm_args:
208
- --max-model-len: 4096
209
166
  Llama-2-13b-chat-hf:
210
167
  model_family: Llama-2
211
168
  model_variant: 13b-chat-hf
@@ -213,22 +170,6 @@ models:
213
170
  gpus_per_node: 1
214
171
  num_nodes: 1
215
172
  vocab_size: 32000
216
- time: 08:00:00
217
- resource_type: l40s
218
- vllm_args:
219
- --max-model-len: 4096
220
- Llama-2-70b-hf:
221
- model_family: Llama-2
222
- model_variant: 70b-hf
223
- model_type: LLM
224
- gpus_per_node: 4
225
- num_nodes: 1
226
- vocab_size: 32000
227
- time: 08:00:00
228
- resource_type: l40s
229
- vllm_args:
230
- --tensor-parallel-size: 4
231
- --max-model-len: 4096
232
173
  Llama-2-70b-chat-hf:
233
174
  model_family: Llama-2
234
175
  model_variant: 70b-chat-hf
@@ -236,11 +177,10 @@ models:
236
177
  gpus_per_node: 4
237
178
  num_nodes: 1
238
179
  vocab_size: 32000
239
- time: 08:00:00
240
- resource_type: l40s
241
180
  vllm_args:
242
181
  --tensor-parallel-size: 4
243
- --max-model-len: 4096
182
+ sglang_args:
183
+ --tensor-parallel-size: 4
244
184
  llava-1.5-7b-hf:
245
185
  model_family: llava-1.5
246
186
  model_variant: 7b-hf
@@ -248,10 +188,6 @@ models:
248
188
  gpus_per_node: 1
249
189
  num_nodes: 1
250
190
  vocab_size: 32000
251
- time: 08:00:00
252
- resource_type: l40s
253
- vllm_args:
254
- --max-model-len: 4096
255
191
  llava-1.5-13b-hf:
256
192
  model_family: llava-1.5
257
193
  model_variant: 13b-hf
@@ -259,10 +195,6 @@ models:
259
195
  gpus_per_node: 1
260
196
  num_nodes: 1
261
197
  vocab_size: 32000
262
- time: 08:00:00
263
- resource_type: l40s
264
- vllm_args:
265
- --max-model-len: 4096
266
198
  llava-v1.6-mistral-7b-hf:
267
199
  model_family: llava-v1.6
268
200
  model_variant: mistral-7b-hf
@@ -270,10 +202,6 @@ models:
270
202
  gpus_per_node: 1
271
203
  num_nodes: 1
272
204
  vocab_size: 32064
273
- time: 08:00:00
274
- resource_type: l40s
275
- vllm_args:
276
- --max-model-len: 32768
277
205
  llava-v1.6-34b-hf:
278
206
  model_family: llava-v1.6
279
207
  model_variant: 34b-hf
@@ -281,11 +209,10 @@ models:
281
209
  gpus_per_node: 2
282
210
  num_nodes: 1
283
211
  vocab_size: 64064
284
- time: 08:00:00
285
- resource_type: l40s
286
212
  vllm_args:
287
213
  --tensor-parallel-size: 2
288
- --max-model-len: 4096
214
+ sglang_args:
215
+ --tensor-parallel-size: 2
289
216
  Meta-Llama-3-8B:
290
217
  model_family: Meta-Llama-3
291
218
  model_variant: 8B
@@ -293,10 +220,6 @@ models:
293
220
  gpus_per_node: 1
294
221
  num_nodes: 1
295
222
  vocab_size: 128256
296
- time: 08:00:00
297
- resource_type: l40s
298
- vllm_args:
299
- --max-model-len: 8192
300
223
  Meta-Llama-3-8B-Instruct:
301
224
  model_family: Meta-Llama-3
302
225
  model_variant: 8B-Instruct
@@ -304,10 +227,6 @@ models:
304
227
  gpus_per_node: 1
305
228
  num_nodes: 1
306
229
  vocab_size: 128256
307
- time: 08:00:00
308
- resource_type: l40s
309
- vllm_args:
310
- --max-model-len: 8192
311
230
  Meta-Llama-3-70B:
312
231
  model_family: Meta-Llama-3
313
232
  model_variant: 70B
@@ -315,11 +234,10 @@ models:
315
234
  gpus_per_node: 4
316
235
  num_nodes: 1
317
236
  vocab_size: 128256
318
- time: 08:00:00
319
- resource_type: l40s
320
237
  vllm_args:
321
238
  --tensor-parallel-size: 4
322
- --max-model-len: 8192
239
+ sglang_args:
240
+ --tensor-parallel-size: 4
323
241
  Meta-Llama-3-70B-Instruct:
324
242
  model_family: Meta-Llama-3
325
243
  model_variant: 70B-Instruct
@@ -327,11 +245,10 @@ models:
327
245
  gpus_per_node: 4
328
246
  num_nodes: 1
329
247
  vocab_size: 128256
330
- time: 08:00:00
331
- resource_type: l40s
332
248
  vllm_args:
333
249
  --tensor-parallel-size: 4
334
- --max-model-len: 8192
250
+ sglang_args:
251
+ --tensor-parallel-size: 4
335
252
  Meta-Llama-3.1-8B:
336
253
  model_family: Meta-Llama-3.1
337
254
  model_variant: 8B
@@ -339,10 +256,6 @@ models:
339
256
  gpus_per_node: 1
340
257
  num_nodes: 1
341
258
  vocab_size: 128256
342
- time: 08:00:00
343
- resource_type: l40s
344
- vllm_args:
345
- --max-model-len: 131072
346
259
  Meta-Llama-3.1-8B-Instruct:
347
260
  model_family: Meta-Llama-3.1
348
261
  model_variant: 8B-Instruct
@@ -350,10 +263,6 @@ models:
350
263
  gpus_per_node: 1
351
264
  num_nodes: 1
352
265
  vocab_size: 128256
353
- time: 08:00:00
354
- resource_type: l40s
355
- vllm_args:
356
- --max-model-len: 131072
357
266
  Meta-Llama-3.1-70B:
358
267
  model_family: Meta-Llama-3.1
359
268
  model_variant: 70B
@@ -361,11 +270,12 @@ models:
361
270
  gpus_per_node: 4
362
271
  num_nodes: 1
363
272
  vocab_size: 128256
364
- time: 08:00:00
365
- resource_type: l40s
366
273
  vllm_args:
367
274
  --tensor-parallel-size: 4
368
275
  --max-model-len: 65536
276
+ sglang_args:
277
+ --tensor-parallel-size: 4
278
+ --context-length: 65536
369
279
  Meta-Llama-3.1-70B-Instruct:
370
280
  model_family: Meta-Llama-3.1
371
281
  model_variant: 70B-Instruct
@@ -373,11 +283,12 @@ models:
373
283
  gpus_per_node: 4
374
284
  num_nodes: 1
375
285
  vocab_size: 128256
376
- time: 08:00:00
377
- resource_type: l40s
378
286
  vllm_args:
379
287
  --tensor-parallel-size: 4
380
288
  --max-model-len: 65536
289
+ sglang_args:
290
+ --tensor-parallel-size: 4
291
+ --context-length: 65536
381
292
  Meta-Llama-3.1-405B-Instruct:
382
293
  model_family: Meta-Llama-3.1
383
294
  model_variant: 405B-Instruct
@@ -385,12 +296,14 @@ models:
385
296
  gpus_per_node: 4
386
297
  num_nodes: 8
387
298
  vocab_size: 128256
388
- time: 08:00:00
389
- resource_type: l40s
390
299
  vllm_args:
391
300
  --pipeline-parallel-size: 8
392
301
  --tensor-parallel-size: 4
393
302
  --max-model-len: 16384
303
+ sglang_args:
304
+ --pipeline-parallel-size: 8
305
+ --tensor-parallel-size: 4
306
+ --context-length: 16384
394
307
  Mistral-7B-Instruct-v0.1:
395
308
  model_family: Mistral
396
309
  model_variant: 7B-Instruct-v0.1
@@ -398,10 +311,6 @@ models:
398
311
  gpus_per_node: 1
399
312
  num_nodes: 1
400
313
  vocab_size: 32000
401
- time: 08:00:00
402
- resource_type: l40s
403
- vllm_args:
404
- --max-model-len: 32768
405
314
  Mistral-7B-Instruct-v0.2:
406
315
  model_family: Mistral
407
316
  model_variant: 7B-Instruct-v0.2
@@ -409,10 +318,6 @@ models:
409
318
  gpus_per_node: 1
410
319
  num_nodes: 1
411
320
  vocab_size: 32000
412
- time: 08:00:00
413
- resource_type: l40s
414
- vllm_args:
415
- --max-model-len: 32768
416
321
  Mistral-7B-v0.3:
417
322
  model_family: Mistral
418
323
  model_variant: 7B-v0.3
@@ -420,10 +325,6 @@ models:
420
325
  gpus_per_node: 1
421
326
  num_nodes: 1
422
327
  vocab_size: 32768
423
- time: 08:00:00
424
- resource_type: l40s
425
- vllm_args:
426
- --max-model-len: 32768
427
328
  Mistral-7B-Instruct-v0.3:
428
329
  model_family: Mistral
429
330
  model_variant: 7B-Instruct-v0.3
@@ -431,10 +332,6 @@ models:
431
332
  gpus_per_node: 1
432
333
  num_nodes: 1
433
334
  vocab_size: 32768
434
- time: 08:00:00
435
- resource_type: l40s
436
- vllm_args:
437
- --max-model-len: 32768
438
335
  Mistral-Large-Instruct-2407:
439
336
  model_family: Mistral
440
337
  model_variant: Large-Instruct-2407
@@ -442,12 +339,14 @@ models:
442
339
  gpus_per_node: 4
443
340
  num_nodes: 2
444
341
  vocab_size: 32768
445
- time: 08:00:00
446
- resource_type: l40s
447
342
  vllm_args:
448
343
  --pipeline-parallel-size: 2
449
344
  --tensor-parallel-size: 4
450
345
  --max-model-len: 32768
346
+ sglang_args:
347
+ --pipeline-parallel-size: 2
348
+ --tensor-parallel-size: 4
349
+ --context-length: 32768
451
350
  Mistral-Large-Instruct-2411:
452
351
  model_family: Mistral
453
352
  model_variant: Large-Instruct-2411
@@ -455,12 +354,14 @@ models:
455
354
  gpus_per_node: 4
456
355
  num_nodes: 2
457
356
  vocab_size: 32768
458
- time: 08:00:00
459
- resource_type: l40s
460
357
  vllm_args:
461
358
  --pipeline-parallel-size: 2
462
359
  --tensor-parallel-size: 4
463
360
  --max-model-len: 32768
361
+ sglang_args:
362
+ --pipeline-parallel-size: 2
363
+ --tensor-parallel-size: 4
364
+ --context-length: 32768
464
365
  Mixtral-8x7B-Instruct-v0.1:
465
366
  model_family: Mixtral
466
367
  model_variant: 8x7B-Instruct-v0.1
@@ -468,11 +369,10 @@ models:
468
369
  gpus_per_node: 4
469
370
  num_nodes: 1
470
371
  vocab_size: 32000
471
- time: 08:00:00
472
- resource_type: l40s
473
372
  vllm_args:
474
373
  --tensor-parallel-size: 4
475
- --max-model-len: 32768
374
+ sglang_args:
375
+ --tensor-parallel-size: 4
476
376
  Mixtral-8x22B-v0.1:
477
377
  model_family: Mixtral
478
378
  model_variant: 8x22B-v0.1
@@ -480,12 +380,12 @@ models:
480
380
  gpus_per_node: 4
481
381
  num_nodes: 2
482
382
  vocab_size: 32768
483
- time: 08:00:00
484
- resource_type: l40s
485
383
  vllm_args:
486
384
  --pipeline-parallel-size: 2
487
385
  --tensor-parallel-size: 4
488
- --max-model-len: 65536
386
+ sglang_args:
387
+ --pipeline-parallel-size: 2
388
+ --tensor-parallel-size: 4
489
389
  Mixtral-8x22B-Instruct-v0.1:
490
390
  model_family: Mixtral
491
391
  model_variant: 8x22B-Instruct-v0.1
@@ -493,12 +393,12 @@ models:
493
393
  gpus_per_node: 4
494
394
  num_nodes: 2
495
395
  vocab_size: 32768
496
- time: 08:00:00
497
- resource_type: l40s
498
396
  vllm_args:
499
397
  --pipeline-parallel-size: 2
500
398
  --tensor-parallel-size: 4
501
- --max-model-len: 65536
399
+ sglang_args:
400
+ --pipeline-parallel-size: 2
401
+ --tensor-parallel-size: 4
502
402
  Phi-3-medium-128k-instruct:
503
403
  model_family: Phi-3
504
404
  model_variant: medium-128k-instruct
@@ -506,11 +406,10 @@ models:
506
406
  gpus_per_node: 2
507
407
  num_nodes: 1
508
408
  vocab_size: 32064
509
- time: 08:00:00
510
- resource_type: l40s
511
409
  vllm_args:
512
410
  --tensor-parallel-size: 2
513
- --max-model-len: 131072
411
+ sglang_args:
412
+ --tensor-parallel-size: 2
514
413
  Phi-3-vision-128k-instruct:
515
414
  model_family: Phi-3-vision
516
415
  model_variant: 128k-instruct
@@ -518,11 +417,12 @@ models:
518
417
  gpus_per_node: 2
519
418
  num_nodes: 1
520
419
  vocab_size: 32064
521
- time: 08:00:00
522
- resource_type: l40s
523
420
  vllm_args:
524
421
  --tensor-parallel-size: 2
525
422
  --max-model-len: 65536
423
+ sglang_args:
424
+ --tensor-parallel-size: 2
425
+ --context-length: 65536
526
426
  Llama-3.1-Nemotron-70B-Instruct-HF:
527
427
  model_family: Llama-3.1-Nemotron
528
428
  model_variant: 70B-Instruct-HF
@@ -530,11 +430,12 @@ models:
530
430
  gpus_per_node: 4
531
431
  num_nodes: 1
532
432
  vocab_size: 128256
533
- time: 08:00:00
534
- resource_type: l40s
535
433
  vllm_args:
536
434
  --tensor-parallel-size: 4
537
435
  --max-model-len: 65536
436
+ sglang_args:
437
+ --tensor-parallel-size: 4
438
+ --context-length: 65536
538
439
  Llama-3.2-1B:
539
440
  model_family: Llama-3.2
540
441
  model_variant: 1B
@@ -542,10 +443,6 @@ models:
542
443
  gpus_per_node: 1
543
444
  num_nodes: 1
544
445
  vocab_size: 128256
545
- time: 08:00:00
546
- resource_type: l40s
547
- vllm_args:
548
- --max-model-len: 131072
549
446
  Llama-3.2-1B-Instruct:
550
447
  model_family: Llama-3.2
551
448
  model_variant: 1B-Instruct
@@ -553,10 +450,6 @@ models:
553
450
  gpus_per_node: 1
554
451
  num_nodes: 1
555
452
  vocab_size: 128256
556
- time: 08:00:00
557
- resource_type: l40s
558
- vllm_args:
559
- --max-model-len: 131072
560
453
  Llama-3.2-3B:
561
454
  model_family: Llama-3.2
562
455
  model_variant: 3B
@@ -564,10 +457,6 @@ models:
564
457
  gpus_per_node: 1
565
458
  num_nodes: 1
566
459
  vocab_size: 128256
567
- time: 08:00:00
568
- resource_type: l40s
569
- vllm_args:
570
- --max-model-len: 131072
571
460
  Llama-3.2-3B-Instruct:
572
461
  model_family: Llama-3.2
573
462
  model_variant: 3B-Instruct
@@ -575,24 +464,6 @@ models:
575
464
  gpus_per_node: 1
576
465
  num_nodes: 1
577
466
  vocab_size: 128256
578
- time: 08:00:00
579
- resource_type: l40s
580
- vllm_args:
581
- --max-model-len: 131072
582
- Llama-3.2-11B-Vision:
583
- model_family: Llama-3.2
584
- model_variant: 11B-Vision
585
- model_type: VLM
586
- gpus_per_node: 2
587
- num_nodes: 1
588
- vocab_size: 128256
589
- time: 08:00:00
590
- resource_type: l40s
591
- vllm_args:
592
- --tensor-parallel-size: 2
593
- --max-model-len: 4096
594
- --max-num-seqs: 64
595
- --enforce-eager: true
596
467
  Llama-3.2-11B-Vision-Instruct:
597
468
  model_family: Llama-3.2
598
469
  model_variant: 11B-Vision-Instruct
@@ -600,27 +471,9 @@ models:
600
471
  gpus_per_node: 2
601
472
  num_nodes: 1
602
473
  vocab_size: 128256
603
- time: 08:00:00
604
- resource_type: l40s
605
- vllm_args:
474
+ engine: sglang
475
+ sglang_args:
606
476
  --tensor-parallel-size: 2
607
- --max-model-len: 4096
608
- --max-num-seqs: 64
609
- --enforce-eager: true
610
- Llama-3.2-90B-Vision:
611
- model_family: Llama-3.2
612
- model_variant: 90B-Vision
613
- model_type: VLM
614
- gpus_per_node: 4
615
- num_nodes: 2
616
- vocab_size: 128256
617
- time: 08:00:00
618
- resource_type: l40s
619
- vllm_args:
620
- --tensor-parallel-size: 8
621
- --max-model-len: 4096
622
- --max-num-seqs: 32
623
- --enforce-eager: true
624
477
  Llama-3.2-90B-Vision-Instruct:
625
478
  model_family: Llama-3.2
626
479
  model_variant: 90B-Vision-Instruct
@@ -628,13 +481,9 @@ models:
628
481
  gpus_per_node: 4
629
482
  num_nodes: 2
630
483
  vocab_size: 128256
631
- time: 08:00:00
632
- resource_type: l40s
633
- vllm_args:
484
+ engine: sglang
485
+ sglang_args:
634
486
  --tensor-parallel-size: 8
635
- --max-model-len: 4096
636
- --max-num-seqs: 32
637
- --enforce-eager: true
638
487
  Qwen2.5-0.5B-Instruct:
639
488
  model_family: Qwen2.5
640
489
  model_variant: 0.5B-Instruct
@@ -642,10 +491,6 @@ models:
642
491
  gpus_per_node: 1
643
492
  num_nodes: 1
644
493
  vocab_size: 152064
645
- time: 08:00:00
646
- resource_type: l40s
647
- vllm_args:
648
- --max-model-len: 32768
649
494
  Qwen2.5-1.5B-Instruct:
650
495
  model_family: Qwen2.5
651
496
  model_variant: 1.5B-Instruct
@@ -653,10 +498,6 @@ models:
653
498
  gpus_per_node: 1
654
499
  num_nodes: 1
655
500
  vocab_size: 152064
656
- time: 08:00:00
657
- resource_type: l40s
658
- vllm_args:
659
- --max-model-len: 32768
660
501
  Qwen2.5-3B-Instruct:
661
502
  model_family: Qwen2.5
662
503
  model_variant: 3B-Instruct
@@ -664,10 +505,6 @@ models:
664
505
  gpus_per_node: 1
665
506
  num_nodes: 1
666
507
  vocab_size: 152064
667
- time: 08:00:00
668
- resource_type: l40s
669
- vllm_args:
670
- --max-model-len: 32768
671
508
  Qwen2.5-7B-Instruct:
672
509
  model_family: Qwen2.5
673
510
  model_variant: 7B-Instruct
@@ -675,10 +512,6 @@ models:
675
512
  gpus_per_node: 1
676
513
  num_nodes: 1
677
514
  vocab_size: 152064
678
- time: 08:00:00
679
- resource_type: l40s
680
- vllm_args:
681
- --max-model-len: 32768
682
515
  Qwen2.5-14B-Instruct:
683
516
  model_family: Qwen2.5
684
517
  model_variant: 14B-Instruct
@@ -686,10 +519,6 @@ models:
686
519
  gpus_per_node: 1
687
520
  num_nodes: 1
688
521
  vocab_size: 152064
689
- time: 08:00:00
690
- resource_type: l40s
691
- vllm_args:
692
- --max-model-len: 32768
693
522
  Qwen2.5-32B-Instruct:
694
523
  model_family: Qwen2.5
695
524
  model_variant: 32B-Instruct
@@ -697,11 +526,10 @@ models:
697
526
  gpus_per_node: 2
698
527
  num_nodes: 1
699
528
  vocab_size: 152064
700
- time: 08:00:00
701
- resource_type: l40s
702
529
  vllm_args:
703
530
  --tensor-parallel-size: 2
704
- --max-model-len: 32768
531
+ sglang_args:
532
+ --tensor-parallel-size: 2
705
533
  Qwen2.5-72B-Instruct:
706
534
  model_family: Qwen2.5
707
535
  model_variant: 72B-Instruct
@@ -709,79 +537,78 @@ models:
709
537
  gpus_per_node: 4
710
538
  num_nodes: 1
711
539
  vocab_size: 152064
712
- time: 08:00:00
713
- resource_type: l40s
714
540
  vllm_args:
715
541
  --tensor-parallel-size: 4
716
- --max-model-len: 16384
542
+ sglang_args:
543
+ --tensor-parallel-size: 4
717
544
  Qwen2.5-Math-1.5B-Instruct:
718
- model_family: Qwen2.5
719
- model_variant: Math-1.5B-Instruct
545
+ model_family: Qwen2.5-Math
546
+ model_variant: 1.5B-Instruct
720
547
  model_type: LLM
721
548
  gpus_per_node: 1
722
549
  num_nodes: 1
723
550
  vocab_size: 152064
724
- time: 08:00:00
725
- resource_type: l40s
726
- vllm_args:
727
- --max-model-len: 4096
728
551
  Qwen2.5-Math-7B-Instruct:
729
- model_family: Qwen2.5
730
- model_variant: Math-7B-Instruct
552
+ model_family: Qwen2.5-Math
553
+ model_variant: 7B-Instruct
731
554
  model_type: LLM
732
555
  gpus_per_node: 1
733
556
  num_nodes: 1
734
557
  vocab_size: 152064
735
- time: 08:00:00
736
- resource_type: l40s
737
- vllm_args:
738
- --max-model-len: 4096
739
558
  Qwen2.5-Math-72B-Instruct:
740
- model_family: Qwen2.5
741
- model_variant: Math-72B-Instruct
559
+ model_family: Qwen2.5-Math
560
+ model_variant: 72B-Instruct
742
561
  model_type: LLM
743
562
  gpus_per_node: 4
744
563
  num_nodes: 1
745
564
  vocab_size: 152064
746
- time: 08:00:00
747
- resource_type: l40s
748
565
  vllm_args:
749
566
  --tensor-parallel-size: 4
750
- --max-model-len: 4096
567
+ sglang_args:
568
+ --tensor-parallel-size: 4
569
+ Qwen2.5-Coder-3B-Instruct:
570
+ model_family: Qwen2.5-Coder
571
+ model_variant: 3B-Instruct
572
+ model_type: LLM
573
+ gpus_per_node: 1
574
+ num_nodes: 1
575
+ vocab_size: 152064
751
576
  Qwen2.5-Coder-7B-Instruct:
752
- model_family: Qwen2.5
753
- model_variant: Coder-7B-Instruct
577
+ model_family: Qwen2.5-Coder
578
+ model_variant: 7B-Instruct
754
579
  model_type: LLM
755
580
  gpus_per_node: 1
756
581
  num_nodes: 1
757
582
  vocab_size: 152064
758
- time: 08:00:00
759
- resource_type: l40s
760
- vllm_args:
761
- --max-model-len: 32768
762
583
  Qwen2.5-Math-RM-72B:
763
- model_family: Qwen2.5
764
- model_variant: Math-RM-72B
584
+ model_family: Qwen2.5-Math-RM
585
+ model_variant: 72B
765
586
  model_type: Reward_Modeling
766
587
  gpus_per_node: 4
767
588
  num_nodes: 1
768
589
  vocab_size: 152064
769
- time: 08:00:00
770
- resource_type: l40s
771
590
  vllm_args:
772
591
  --tensor-parallel-size: 4
773
- --max-model-len: 4096
592
+ sglang_args:
593
+ --tensor-parallel-size: 4
774
594
  Qwen2.5-Math-PRM-7B:
775
- model_family: Qwen2.5
776
- model_variant: Math-PRM-7B
595
+ model_family: Qwen2.5-Math-PRM
596
+ model_variant: 7B
777
597
  model_type: Reward_Modeling
778
598
  gpus_per_node: 1
779
599
  num_nodes: 1
780
600
  vocab_size: 152064
781
- time: 08:00:00
782
- resource_type: l40s
601
+ Qwen2.5-VL-7B-Instruct:
602
+ model_family: Qwen2.5-VL
603
+ model_variant: 7B-Instruct
604
+ model_type: VLM
605
+ gpus_per_node: 1
606
+ num_nodes: 1
607
+ vocab_size: 152064
783
608
  vllm_args:
784
- --max-model-len: 4096
609
+ --max-model-len: 32768
610
+ sglang_args:
611
+ --context-length: 32768
785
612
  QwQ-32B:
786
613
  model_family: QwQ
787
614
  model_variant: 32B
@@ -789,11 +616,12 @@ models:
789
616
  gpus_per_node: 2
790
617
  num_nodes: 1
791
618
  vocab_size: 152064
792
- time: 08:00:00
793
- resource_type: l40s
794
619
  vllm_args:
795
620
  --tensor-parallel-size: 2
796
621
  --max-model-len: 32768
622
+ sglang_args:
623
+ --tensor-parallel-size: 2
624
+ --context-length: 32768
797
625
  Pixtral-12B-2409:
798
626
  model_family: Pixtral
799
627
  model_variant: 12B-2409
@@ -801,10 +629,10 @@ models:
801
629
  gpus_per_node: 1
802
630
  num_nodes: 1
803
631
  vocab_size: 131072
804
- time: 08:00:00
805
- resource_type: l40s
806
632
  vllm_args:
807
633
  --max-model-len: 8192
634
+ sglang_args:
635
+ --context-length: 8192
808
636
  e5-mistral-7b-instruct:
809
637
  model_family: e5
810
638
  model_variant: mistral-7b-instruct
@@ -812,10 +640,6 @@ models:
812
640
  gpus_per_node: 1
813
641
  num_nodes: 1
814
642
  vocab_size: 32000
815
- time: 08:00:00
816
- resource_type: l40s
817
- vllm_args:
818
- --max-model-len: 4096
819
643
  bge-base-en-v1.5:
820
644
  model_family: bge
821
645
  model_variant: base-en-v1.5
@@ -823,10 +647,6 @@ models:
823
647
  gpus_per_node: 1
824
648
  num_nodes: 1
825
649
  vocab_size: 30522
826
- time: 08:00:00
827
- resource_type: l40s
828
- vllm_args:
829
- --max-model-len: 512
830
650
  all-MiniLM-L6-v2:
831
651
  model_family: all-MiniLM
832
652
  model_variant: L6-v2
@@ -834,10 +654,6 @@ models:
834
654
  gpus_per_node: 1
835
655
  num_nodes: 1
836
656
  vocab_size: 30522
837
- time: 08:00:00
838
- resource_type: l40s
839
- vllm_args:
840
- --max-model-len: 512
841
657
  Llama-3.3-70B-Instruct:
842
658
  model_family: Llama-3.3
843
659
  model_variant: 70B-Instruct
@@ -845,11 +661,21 @@ models:
845
661
  gpus_per_node: 4
846
662
  num_nodes: 1
847
663
  vocab_size: 128256
848
- time: 08:00:00
849
- resource_type: l40s
850
664
  vllm_args:
851
665
  --tensor-parallel-size: 4
852
666
  --max-model-len: 65536
667
+ sglang_args:
668
+ --tensor-parallel-size: 4
669
+ --context-length: 65536
670
+ InternVL2_5-8B:
671
+ model_family: InternVL2_5
672
+ model_variant: 8B
673
+ model_type: VLM
674
+ gpus_per_node: 1
675
+ num_nodes: 1
676
+ vocab_size: 92553
677
+ vllm_args:
678
+ --trust-remote-code: true
853
679
  InternVL2_5-26B:
854
680
  model_family: InternVL2_5
855
681
  model_variant: 26B
@@ -857,11 +683,11 @@ models:
857
683
  gpus_per_node: 2
858
684
  num_nodes: 1
859
685
  vocab_size: 92553
860
- time: 08:00:00
861
- resource_type: l40s
862
686
  vllm_args:
863
687
  --tensor-parallel-size: 2
864
- --max-model-len: 32768
688
+ --trust-remote-code: true
689
+ sglang_args:
690
+ --tensor-parallel-size: 2
865
691
  InternVL2_5-38B:
866
692
  model_family: InternVL2_5
867
693
  model_variant: 38B
@@ -869,23 +695,22 @@ models:
869
695
  gpus_per_node: 4
870
696
  num_nodes: 1
871
697
  vocab_size: 92553
872
- time: 08:00:00
873
- resource_type: l40s
874
698
  vllm_args:
875
699
  --tensor-parallel-size: 4
876
- --max-model-len: 32768
877
- Aya-Expanse-32B:
878
- model_family: Aya-Expanse
879
- model_variant: 32B
700
+ --trust-remote-code: true
701
+ sglang_args:
702
+ --tensor-parallel-size: 4
703
+ aya-expanse-32b:
704
+ model_family: aya-expanse
705
+ model_variant: 32b
880
706
  model_type: LLM
881
707
  gpus_per_node: 2
882
708
  num_nodes: 1
883
709
  vocab_size: 256000
884
- time: 08:00:00
885
- resource_type: l40s
886
710
  vllm_args:
887
711
  --tensor-parallel-size: 2
888
- --max-model-len: 8192
712
+ sglang_args:
713
+ --tensor-parallel-size: 2
889
714
  DeepSeek-R1-Distill-Llama-70B:
890
715
  model_family: DeepSeek-R1
891
716
  model_variant: Distill-Llama-70B
@@ -893,11 +718,12 @@ models:
893
718
  gpus_per_node: 4
894
719
  num_nodes: 1
895
720
  vocab_size: 128256
896
- time: 08:00:00
897
- resource_type: l40s
898
721
  vllm_args:
899
722
  --tensor-parallel-size: 4
900
723
  --max-model-len: 65536
724
+ sglang_args:
725
+ --tensor-parallel-size: 4
726
+ --context-length: 65536
901
727
  DeepSeek-R1-Distill-Llama-8B:
902
728
  model_family: DeepSeek-R1
903
729
  model_variant: Distill-Llama-8B
@@ -905,10 +731,6 @@ models:
905
731
  gpus_per_node: 1
906
732
  num_nodes: 1
907
733
  vocab_size: 128256
908
- time: 08:00:00
909
- resource_type: l40s
910
- vllm_args:
911
- --max-model-len: 131072
912
734
  DeepSeek-R1-Distill-Qwen-32B:
913
735
  model_family: DeepSeek-R1
914
736
  model_variant: Distill-Qwen-32B
@@ -916,11 +738,12 @@ models:
916
738
  gpus_per_node: 2
917
739
  num_nodes: 1
918
740
  vocab_size: 152064
919
- time: 08:00:00
920
- resource_type: l40s
921
741
  vllm_args:
922
742
  --tensor-parallel-size: 2
923
743
  --max-model-len: 65536
744
+ sglang_args:
745
+ --tensor-parallel-size: 2
746
+ --context-length: 65536
924
747
  DeepSeek-R1-Distill-Qwen-14B:
925
748
  model_family: DeepSeek-R1
926
749
  model_variant: Distill-Qwen-14B
@@ -928,10 +751,10 @@ models:
928
751
  gpus_per_node: 1
929
752
  num_nodes: 1
930
753
  vocab_size: 152064
931
- time: 08:00:00
932
- resource_type: l40s
933
754
  vllm_args:
934
755
  --max-model-len: 65536
756
+ sglang_args:
757
+ --context-length: 65536
935
758
  DeepSeek-R1-Distill-Qwen-7B:
936
759
  model_family: DeepSeek-R1
937
760
  model_variant: Distill-Qwen-7B
@@ -939,10 +762,6 @@ models:
939
762
  gpus_per_node: 1
940
763
  num_nodes: 1
941
764
  vocab_size: 152064
942
- time: 08:00:00
943
- resource_type: l40s
944
- vllm_args:
945
- --max-model-len: 131072
946
765
  DeepSeek-R1-Distill-Qwen-1.5B:
947
766
  model_family: DeepSeek-R1
948
767
  model_variant: Distill-Qwen-1.5B
@@ -950,10 +769,6 @@ models:
950
769
  gpus_per_node: 1
951
770
  num_nodes: 1
952
771
  vocab_size: 152064
953
- time: 08:00:00
954
- resource_type: l40s
955
- vllm_args:
956
- --max-model-len: 131072
957
772
  Phi-3.5-vision-instruct:
958
773
  model_family: Phi-3.5-vision
959
774
  model_variant: instruct
@@ -961,22 +776,12 @@ models:
961
776
  gpus_per_node: 2
962
777
  num_nodes: 1
963
778
  vocab_size: 32064
964
- time: 08:00:00
965
- resource_type: l40s
966
779
  vllm_args:
967
780
  --tensor-parallel-size: 2
968
781
  --max-model-len: 65536
969
- InternVL2_5-8B:
970
- model_family: InternVL2_5
971
- model_variant: 8B
972
- model_type: VLM
973
- gpus_per_node: 1
974
- num_nodes: 1
975
- vocab_size: 92553
976
- time: 08:00:00
977
- resource_type: l40s
978
- vllm_args:
979
- --max-model-len: 32768
782
+ sglang_args:
783
+ --tensor-parallel-size: 2
784
+ --context-length: 65536
980
785
  glm-4v-9b:
981
786
  model_family: glm-4v
982
787
  model_variant: 9b
@@ -984,10 +789,6 @@ models:
984
789
  gpus_per_node: 1
985
790
  num_nodes: 1
986
791
  vocab_size: 151552
987
- time: 08:00:00
988
- resource_type: l40s
989
- vllm_args:
990
- --max-model-len: 8192
991
792
  Molmo-7B-D-0924:
992
793
  model_family: Molmo
993
794
  model_variant: 7B-D-0924
@@ -995,21 +796,16 @@ models:
995
796
  gpus_per_node: 1
996
797
  num_nodes: 1
997
798
  vocab_size: 152064
998
- time: 08:00:00
999
- resource_type: l40s
1000
- vllm_args:
1001
- --max-model-len: 4096
1002
799
  deepseek-vl2:
1003
800
  model_family: deepseek-vl2
1004
801
  model_type: VLM
1005
802
  gpus_per_node: 2
1006
803
  num_nodes: 1
1007
804
  vocab_size: 129280
1008
- time: 08:00:00
1009
- resource_type: l40s
1010
805
  vllm_args:
1011
806
  --tensor-parallel-size: 2
1012
- --max-model-len: 4096
807
+ sglang_args:
808
+ --tensor-parallel-size: 2
1013
809
  deepseek-vl2-small:
1014
810
  model_family: deepseek-vl2
1015
811
  model_variant: small
@@ -1017,10 +813,13 @@ models:
1017
813
  gpus_per_node: 1
1018
814
  num_nodes: 1
1019
815
  vocab_size: 129280
1020
- time: 08:00:00
1021
- resource_type: l40s
1022
- vllm_args:
1023
- --max-model-len: 4096
816
+ Qwen3-0.6B:
817
+ model_family: Qwen3
818
+ model_variant: 0.6B
819
+ model_type: LLM
820
+ gpus_per_node: 1
821
+ num_nodes: 1
822
+ vocab_size: 151936
1024
823
  Qwen3-8B:
1025
824
  model_family: Qwen3
1026
825
  model_variant: 8B
@@ -1028,10 +827,6 @@ models:
1028
827
  gpus_per_node: 1
1029
828
  num_nodes: 1
1030
829
  vocab_size: 151936
1031
- time: 08:00:00
1032
- resource_type: l40s
1033
- vllm_args:
1034
- --max-model-len: 40960
1035
830
  Qwen3-14B:
1036
831
  model_family: Qwen3
1037
832
  model_variant: 14B
@@ -1039,10 +834,6 @@ models:
1039
834
  gpus_per_node: 1
1040
835
  num_nodes: 1
1041
836
  vocab_size: 151936
1042
- time: 08:00:00
1043
- resource_type: l40s
1044
- vllm_args:
1045
- --max-model-len: 40960
1046
837
  Qwen3-32B:
1047
838
  model_family: Qwen3
1048
839
  model_variant: 32B
@@ -1050,11 +841,10 @@ models:
1050
841
  gpus_per_node: 2
1051
842
  num_nodes: 1
1052
843
  vocab_size: 151936
1053
- time: 08:00:00
1054
- resource_type: l40s
1055
844
  vllm_args:
1056
845
  --tensor-parallel-size: 2
1057
- --max-model-len: 40960
846
+ sglang_args:
847
+ --tensor-parallel-size: 2
1058
848
  gpt-oss-120b:
1059
849
  model_family: gpt-oss
1060
850
  model_variant: 120b
@@ -1062,8 +852,77 @@ models:
1062
852
  gpus_per_node: 2
1063
853
  num_nodes: 1
1064
854
  vocab_size: 201088
855
+ vllm_args:
856
+ --tensor-parallel-size: 2
857
+ sglang_args:
858
+ --tensor-parallel-size: 2
859
+ Llama-4-Maverick-17B-128E-Instruct:
860
+ model_family: Llama-4
861
+ model_variant: Maverick-17B-128E-Instruct
862
+ model_type: VLM
863
+ gpus_per_node: 8
864
+ num_nodes: 2
865
+ resource_type: h100
866
+ cpus_per_task: 6
867
+ mem-per-node: 60G
868
+ vocab_size: 202048
1065
869
  time: 08:00:00
1066
- resource_type: l40s
870
+ vllm_args:
871
+ --tensor-parallel-size: 8
872
+ --pipeline-parallel-size: 2
873
+ sglang_args:
874
+ --tensor-parallel-size: 8
875
+ --pipeline-parallel-size: 2
876
+ medgemma-4b-it:
877
+ model_family: medgemma
878
+ model_variant: 4b-it
879
+ model_type: VLM
880
+ gpus_per_node: 1
881
+ num_nodes: 1
882
+ vocab_size: 262208
883
+ medgemma-27b-it:
884
+ model_family: medgemma
885
+ model_variant: 27b-it
886
+ model_type: VLM
887
+ gpus_per_node: 2
888
+ num_nodes: 1
889
+ vocab_size: 262208
1067
890
  vllm_args:
1068
891
  --tensor-parallel-size: 2
1069
- --max-model-len: 32768
892
+ sglang_args:
893
+ --tensor-parallel-size: 2
894
+ Kimi-K2-Instruct:
895
+ model_family: Kimi-K2
896
+ model_variant: Instruct
897
+ model_type: LLM
898
+ gpus_per_node: 8
899
+ num_nodes: 2
900
+ resource_type: h100
901
+ cpus_per_task: 6
902
+ mem-per-node: 60G
903
+ vocab_size: 163840
904
+ vllm_args:
905
+ --tensor-parallel-size: 8
906
+ --pipeline-parallel-size: 2
907
+ sglang_args:
908
+ --tensor-parallel-size: 8
909
+ --pipeline-parallel-size: 2
910
+ Kimi-K2.5:
911
+ model_family: Kimi-K2.5
912
+ model_type: LLM
913
+ gpus_per_node: 8
914
+ num_nodes: 1
915
+ resource_type: h100
916
+ cpus_per_task: 6
917
+ mem-per-node: 60G
918
+ vocab_size: 163840
919
+ vllm_args:
920
+ --tensor-parallel-size: 8
921
+ sglang_args:
922
+ --tensor-parallel-size: 8
923
+ whisper-large-v3:
924
+ model_family: whisper-large-v3
925
+ model_type: Audio
926
+ gpus_per_node: 1
927
+ num_nodes: 1
928
+ vocab_size: 51866