vec-inf 0.4.0.tar.gz → 0.4.0.post1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vec_inf-0.4.0 → vec_inf-0.4.0.post1}/PKG-INFO +4 -3
- {vec_inf-0.4.0 → vec_inf-0.4.0.post1}/README.md +2 -1
- {vec_inf-0.4.0 → vec_inf-0.4.0.post1}/pyproject.toml +2 -2
- {vec_inf-0.4.0 → vec_inf-0.4.0.post1}/vec_inf/launch_server.sh +2 -2
- vec_inf-0.4.0.post1/vec_inf/models/README.md +203 -0
- {vec_inf-0.4.0 → vec_inf-0.4.0.post1}/vec_inf/models/models.csv +0 -1
- vec_inf-0.4.0/vec_inf/models/README.md +0 -106
- {vec_inf-0.4.0 → vec_inf-0.4.0.post1}/LICENSE +0 -0
- {vec_inf-0.4.0 → vec_inf-0.4.0.post1}/vec_inf/README.md +0 -0
- {vec_inf-0.4.0 → vec_inf-0.4.0.post1}/vec_inf/__init__.py +0 -0
- {vec_inf-0.4.0 → vec_inf-0.4.0.post1}/vec_inf/cli/__init__.py +0 -0
- {vec_inf-0.4.0 → vec_inf-0.4.0.post1}/vec_inf/cli/_cli.py +0 -0
- {vec_inf-0.4.0 → vec_inf-0.4.0.post1}/vec_inf/cli/_utils.py +0 -0
- {vec_inf-0.4.0 → vec_inf-0.4.0.post1}/vec_inf/find_port.sh +0 -0
- {vec_inf-0.4.0 → vec_inf-0.4.0.post1}/vec_inf/multinode_vllm.slurm +0 -0
- {vec_inf-0.4.0 → vec_inf-0.4.0.post1}/vec_inf/vllm.slurm +0 -0
**{vec_inf-0.4.0 → vec_inf-0.4.0.post1}/PKG-INFO**

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vec-inf
-Version: 0.4.0
+Version: 0.4.0.post1
 Summary: Efficient LLM inference on Slurm clusters using vLLM.
 License: MIT
 Author: Marshall Wang
```
```diff
@@ -16,7 +16,7 @@ Provides-Extra: dev
 Requires-Dist: click (>=8.1.0,<9.0.0)
 Requires-Dist: cupy-cuda12x (==12.1.0) ; extra == "dev"
 Requires-Dist: numpy (>=1.24.0,<2.0.0)
-Requires-Dist:
+Requires-Dist: polars (>=1.15.0,<2.0.0)
 Requires-Dist: ray (>=2.9.3,<3.0.0) ; extra == "dev"
 Requires-Dist: requests (>=2.31.0,<3.0.0)
 Requires-Dist: rich (>=13.7.0,<14.0.0)
```
````diff
@@ -94,7 +94,8 @@ You call view the full list of available models by running the `list` command:
 ```bash
 vec-inf list
 ```
-<img width="
+<img width="940" alt="list_img" src="https://github.com/user-attachments/assets/8cf901c4-404c-4398-a52f-0486f00747a3">
+
 
 You can also view the default setup for a specific supported model by providing the model name, for example `Meta-Llama-3.1-70B-Instruct`:
 ```bash
````
**{vec_inf-0.4.0 → vec_inf-0.4.0.post1}/README.md**

````diff
@@ -68,7 +68,8 @@ You call view the full list of available models by running the `list` command:
 ```bash
 vec-inf list
 ```
-<img width="
+<img width="940" alt="list_img" src="https://github.com/user-attachments/assets/8cf901c4-404c-4398-a52f-0486f00747a3">
+
 
 You can also view the default setup for a specific supported model by providing the model name, for example `Meta-Llama-3.1-70B-Instruct`:
 ```bash
````
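The two hunks above (the PKG-INFO long description and README.md carry the same text) replace a truncated `<img>` tag with the full `list_img` screenshot illustrating the `list` command. For reference, a minimal sketch of the CLI calls the quoted README text describes; the second form, passing the model name to `list`, is an assumption inferred from the prose about viewing a model's default setup:

```bash
# Show the full table of supported models (the "list_img" screenshot).
vec-inf list

# Show the default setup for one model (assumed syntax, inferred from
# the README's Meta-Llama-3.1-70B-Instruct example).
vec-inf list Meta-Llama-3.1-70B-Instruct
```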
**{vec_inf-0.4.0 → vec_inf-0.4.0.post1}/pyproject.toml**

```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "vec-inf"
-version = "0.4.0"
+version = "0.4.0.post1"
 description = "Efficient LLM inference on Slurm clusters using vLLM."
 authors = ["Marshall Wang <marshall.wang@vectorinstitute.ai>"]
 license = "MIT license"
```
```diff
@@ -11,7 +11,7 @@ python = "^3.10"
 requests = "^2.31.0"
 click = "^8.1.0"
 rich = "^13.7.0"
-
+polars = "^1.15.0"
 numpy = "^1.24.0"
 vllm = { version = "^0.6.0", optional = true }
 vllm-nccl-cu12 = { version = ">=2.18,<2.19", optional = true }
```
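The `pyproject.toml` changes mirror the PKG-INFO ones: the version becomes the PEP 440 post-release `0.4.0.post1`, which signals a packaging-level fix on top of 0.4.0 rather than new code, and `polars` is added as a runtime dependency. A minimal sketch for installing exactly this build:

```bash
# Pin the post-release explicitly; a plain `pip install vec-inf` would
# also resolve to it, since pip orders 0.4.0.post1 after 0.4.0.
pip install vec-inf==0.4.0.post1
```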
**{vec_inf-0.4.0 → vec_inf-0.4.0.post1}/vec_inf/launch_server.sh**

```diff
@@ -50,7 +50,7 @@ export MODEL_WEIGHTS_PARENT_DIR=$model_weights_parent_dir
 
 if [ -n "$max_num_seqs" ]; then
     export VLLM_MAX_NUM_SEQS=$max_num_seqs
-else
+else
     export VLLM_MAX_NUM_SEQS=256
 fi
 
```
```diff
@@ -75,7 +75,7 @@ fi
 mkdir -p $LOG_DIR
 
 # Model and entrypoint configuration. API Server URL (host, port) are set automatically based on the
-# SLURM job
+# SLURM job
 export SRC_DIR="$(dirname "$0")"
 export MODEL_DIR="${SRC_DIR}/models/${MODEL_FAMILY}"
 
```
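Both `launch_server.sh` hunks are whitespace-only: the `else` branch and the trailing comment are reflowed, and the logic is untouched. The `max_num_seqs` fallback touched by the first hunk could equally be written with bash parameter expansion; a sketch of that equivalent form (not what the script actually uses):

```bash
# Equivalent one-liner: use $max_num_seqs when set and non-empty,
# otherwise fall back to the script's default of 256.
export VLLM_MAX_NUM_SEQS="${max_num_seqs:-256}"
```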
**vec_inf-0.4.0.post1/vec_inf/models/README.md (new file)**

```diff
@@ -0,0 +1,203 @@
+# Available Models
+More profiling metrics coming soon!
+
+## Text Generation Models
+
+### [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus) | 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
+| [`c4ai-command-r-plus-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
+| [`c4ai-command-r-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
+
+### [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`CodeLlama-7b-hf`](https://huggingface.co/meta-llama/CodeLlama-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
+| [`CodeLlama-7b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-7b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
+| [`CodeLlama-13b-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
+| [`CodeLlama-13b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
+| [`CodeLlama-34b-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
+| [`CodeLlama-34b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-Instruct-hf) | 2x a40 | - tokens/s | - tokens/s |
+| [`CodeLlama-70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
+| [`CodeLlama-70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
+
+### [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct) | 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
+
+### [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`gemma-2-9b`](https://huggingface.co/google/gemma-2-9b) | 1x a40 | - tokens/s | - tokens/s |
+| [`gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it) | 1x a40 | - tokens/s | - tokens/s |
+| [`gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
+| [`gemma-2-27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |
+
+### [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
+
+| Variant | Suggested resource allocation |
+|:----------:|:----------:|
+| [`Llama-2-7b-hf`](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 1x a40 |
+| [`Llama-2-7b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1x a40 |
+| [`Llama-2-13b-hf`](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 1x a40 |
+| [`Llama-2-13b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 1x a40 |
+| [`Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
+| [`Llama-2-70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
+
+### [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Meta-Llama-3-8B`](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 1x a40 | 222 tokens/s | 1811 tokens/s |
+| [`Meta-Llama-3-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | 1x a40 | 371 tokens/s | 1990 tokens/s |
+| [`Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
+| [`Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |
+
+### [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Meta-Llama-3.1-8B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) | 1x a40 | - tokens/s | - tokens/s |
+| [`Meta-Llama-3.1-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Meta-Llama-3.1-70B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) | 4x a40 | - tokens/s | - tokens/s |
+| [`Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
+| [`Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 32x a40 (8 nodes, 4 a40/node) | - tokens/s | - tokens/s |
+
+### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Llama-3.2-1B`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 1x a40 | - tokens/s | - tokens/s |
+| [`Llama-3.2-1B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Llama-3.2-3B`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 1x a40 | - tokens/s | - tokens/s |
+| [`Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+
+### [Mistral AI: Mistral](https://huggingface.co/mistralai)
+
+| Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1) | 1x a40 | - tokens/s | - tokens/s|
+| [`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) | 1x a40 | - tokens/s | - tokens/s|
+| [`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2) | 1x a40 | - tokens/s | - tokens/s|
+| [`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3) | 1x a40 | - tokens/s | - tokens/s |
+| [`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | 1x a40 | - tokens/s | - tokens/s|
+| [`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
+| [`Mistral-Large-Instruct-2411`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2411) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
+
+### [Mistral AI: Mixtral](https://huggingface.co/mistralai)
+
+| Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 4x a40 | 222 tokens/s | 1543 tokens/s |
+| [`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
+| [`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
+
+### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
+
+### [Aaditya Ura: Llama3-OpenBioLLM](https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Llama3-OpenBioLLM-70B`](https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B) | 4x a40 | - tokens/s | - tokens/s |
+
+### [Nvidia: Llama-3.1-Nemotron](https://huggingface.co/collections/nvidia/llama-31-nemotron-70b-670e93cd366feea16abc13d8)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Llama-3.1-Nemotron-70B-Instruct-HF`](https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF) | 4x a40 | - tokens/s | - tokens/s |
+
+### [Qwen: Qwen2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Qwen2.5-0.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-3B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-14B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-32B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
+
+### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
+
+### [Qwen: Qwen2.5-Coder](https://huggingface.co/collections/Qwen/qwen25-coder-66eaa22e6f99801bf65b0c2f)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Qwen2.5-Coder-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+
+### [Qwen: QwQ](https://huggingface.co/collections/Qwen/qwq-674762b79b75eac01735070a)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`QwQ-32B-Preview`](https://huggingface.co/Qwen/QwQ-32B-Preview) | 2x a40 | - tokens/s | - tokens/s |
+
+## Vision Language Models
+
+### [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
+| [`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
+
+### [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
+| [`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
+
+### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
+
+### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Llama-3.2-11B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 2x a40 | - tokens/s | - tokens/s |
+| [`Llama-3.2-11B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
+| [`Llama-3.2-90B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
+| [`Llama-3.2-90B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
+
+**NOTE**: `MllamaForConditionalGeneration` currently doesn't support pipeline parallelsim, to save memory, maximum number of requests is reduced and enforce eager mode is on.
+
+### [Mistral: Pixtral](https://huggingface.co/mistralai)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Pixtral-12B-2409`](https://huggingface.co/mistralai/Pixtral-12B-2409) | 1x a40 | - tokens/s | - tokens/s |
+
+## Text Embedding Models
+
+### [Liang Wang: e5](https://huggingface.co/intfloat)
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`e5-mistral-7b-instruct`](https://huggingface.co/intfloat/e5-mistral-7b-instruct) | 1x a40 | - tokens/s | - tokens/s |
+
+## Reward Modeling Models
+
+### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Qwen2.5-Math-RM-72B`](https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B) | 4x a40 | - tokens/s | - tokens/s |
```
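The NOTE near the end of the new README says `MllamaForConditionalGeneration` does not support pipeline parallelism, so the Llama 3.2 Vision models run with a reduced request limit and eager mode forced. As a rough illustration only, a vLLM 0.6.x-style server invocation with those two mitigations; the diff does not show the exact values vec-inf uses, so the numbers below are placeholders:

```bash
# Hypothetical invocation mirroring the README note: tensor parallelism
# only, eager mode forced, and request concurrency capped (placeholder).
python -m vllm.entrypoints.openai.api_server \
    --model meta-llama/Llama-3.2-11B-Vision-Instruct \
    --tensor-parallel-size 2 \
    --enforce-eager \
    --max-num-seqs 32
```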
**{vec_inf-0.4.0 → vec_inf-0.4.0.post1}/vec_inf/models/models.csv**

```diff
@@ -70,5 +70,4 @@ Qwen2.5-Coder-7B-Instruct,Qwen2.5,Coder-7B-Instruct,LLM,1,1,152064,32768,256,tru
 Qwen2.5-Math-RM-72B,Qwen2.5,Math-RM-72B,Reward Modeling,4,1,152064,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
 QwQ-32B-Preview,QwQ,32B-Preview,LLM,2,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
 Pixtral-12B-2409,Pixtral,12B-2409,VLM,1,1,131072,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
-bge-multilingual-gemma2,bge,multilingual-gemma2,Text Embedding,1,1,256002,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
 e5-mistral-7b-instruct,e5,mistral-7b-instruct,Text Embedding,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
```
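The `models.csv` hunk drops the `bge-multilingual-gemma2` row and shows the file's flat layout: one model per line, with the model name as the first comma-separated field. A small lookup sketch under that assumption, using the path from the file list above:

```bash
# Print the configuration row for one model; the name is assumed to be
# the first CSV field, as the rows in this hunk suggest.
grep '^e5-mistral-7b-instruct,' vec_inf/models/models.csv
```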
**vec_inf-0.4.0/vec_inf/models/README.md (removed)**

```diff
@@ -1,106 +0,0 @@
-# Available Models
-More profiling metrics coming soon!
-
-## [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-|[`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus)| 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
-
-## [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`CodeLlama-7b-hf`](https://huggingface.co/meta-llama/CodeLlama-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-7b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-7b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-13b-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-13b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-34b-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-34b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-Instruct-hf) | 2x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
-
-## [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-|[`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct)| 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
-
-## [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`gemma-2-9b`](https://huggingface.co/google/gemma-2-9b) | 1x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it) | 1x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |
-
-## [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-|[`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
-|[`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf)| 1x a40 | - tokens/s | - tokens/s |
-
-## [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-|[`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
-|[`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)| 2x a40 | - tokens/s | - tokens/s |
-
-## [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
-
-| Variant | Suggested resource allocation |
-|:----------:|:----------:|
-| [`Llama-2-7b-hf`](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 1x a40 |
-| [`Llama-2-7b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1x a40 |
-| [`Llama-2-13b-hf`](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 1x a40 |
-| [`Llama-2-13b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 1x a40 |
-| [`Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
-| [`Llama-2-70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
-
-## [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Meta-Llama-3-8B`](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 1x a40 | 222 tokens/s | 1811 tokens/s |
-| [`Meta-Llama-3-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | 1x a40 | 371 tokens/s | 1990 tokens/s |
-| [`Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
-| [`Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |
-
-## [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Meta-Llama-3.1-8B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) | 1x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-70B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) | 4x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 32x a40 (8 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-## [Mistral AI: Mistral](https://huggingface.co/mistralai)
-
-| Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-|[`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1)| 1x a40 | - tokens/s | - tokens/s|
-|[`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)| 1x a40 | - tokens/s | - tokens/s|
-|[`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2)| 1x a40 | - tokens/s | - tokens/s|
-|[`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3)| 1x a40 | - tokens/s | - tokens/s |
-|[`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)| 1x a40 | - tokens/s | - tokens/s|
-|[`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407)| 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
-
-## [Mistral AI: Mixtral](https://huggingface.co/mistralai)
-
-| Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-|[`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)| 4x a40 | 222 tokens/s | 1543 tokens/s |
-|[`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
-|[`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
-
-## [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
-| [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
```
The nine remaining files in the list above (LICENSE, vec_inf/README.md, vec_inf/__init__.py, vec_inf/cli/__init__.py, vec_inf/cli/_cli.py, vec_inf/cli/_utils.py, vec_inf/find_port.sh, vec_inf/multinode_vllm.slurm, and vec_inf/vllm.slurm) are without changes.