vec-inf 0.4.0__py3-none-any.whl → 0.4.0.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vec_inf/launch_server.sh +2 -2
- vec_inf/models/README.md +132 -35
- vec_inf/models/models.csv +0 -1
- {vec_inf-0.4.0.dist-info → vec_inf-0.4.0.post1.dist-info}/METADATA +4 -3
- vec_inf-0.4.0.post1.dist-info/RECORD +16 -0
- vec_inf-0.4.0.dist-info/RECORD +0 -16
- {vec_inf-0.4.0.dist-info → vec_inf-0.4.0.post1.dist-info}/LICENSE +0 -0
- {vec_inf-0.4.0.dist-info → vec_inf-0.4.0.post1.dist-info}/WHEEL +0 -0
- {vec_inf-0.4.0.dist-info → vec_inf-0.4.0.post1.dist-info}/entry_points.txt +0 -0
vec_inf/launch_server.sh
CHANGED
|
@@ -50,7 +50,7 @@ export MODEL_WEIGHTS_PARENT_DIR=$model_weights_parent_dir
|
|
|
50
50
|
|
|
51
51
|
if [ -n "$max_num_seqs" ]; then
|
|
52
52
|
export VLLM_MAX_NUM_SEQS=$max_num_seqs
|
|
53
|
-
else
|
|
53
|
+
else
|
|
54
54
|
export VLLM_MAX_NUM_SEQS=256
|
|
55
55
|
fi
|
|
56
56
|
|
|
@@ -75,7 +75,7 @@ fi
|
|
|
75
75
|
mkdir -p $LOG_DIR
|
|
76
76
|
|
|
77
77
|
# Model and entrypoint configuration. API Server URL (host, port) are set automatically based on the
|
|
78
|
-
# SLURM job
|
|
78
|
+
# SLURM job
|
|
79
79
|
export SRC_DIR="$(dirname "$0")"
|
|
80
80
|
export MODEL_DIR="${SRC_DIR}/models/${MODEL_FAMILY}"
|
|
81
81
|
|
vec_inf/models/README.md
CHANGED
|
@@ -1,13 +1,17 @@
|
|
|
1
1
|
# Available Models
|
|
2
2
|
More profiling metrics coming soon!
|
|
3
3
|
|
|
4
|
-
##
|
|
4
|
+
## Text Generation Models
|
|
5
|
+
|
|
6
|
+
### [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
|
|
5
7
|
|
|
6
8
|
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
7
9
|
|:----------:|:----------:|:----------:|:----------:|
|
|
8
|
-
|[`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus)| 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
|
|
10
|
+
| [`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus) | 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
|
|
11
|
+
| [`c4ai-command-r-plus-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
|
|
12
|
+
| [`c4ai-command-r-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
|
|
9
13
|
|
|
10
|
-
|
|
14
|
+
### [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
|
|
11
15
|
|
|
12
16
|
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
13
17
|
|:----------:|:----------:|:----------:|:----------:|
|
|
@@ -20,13 +24,13 @@ More profiling metrics coming soon!
|
|
|
20
24
|
| [`CodeLlama-70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
|
|
21
25
|
| [`CodeLlama-70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
|
|
22
26
|
|
|
23
|
-
|
|
27
|
+
### [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
|
|
24
28
|
|
|
25
29
|
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
26
30
|
|:----------:|:----------:|:----------:|:----------:|
|
|
27
|
-
|[`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct)| 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
|
|
31
|
+
| [`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct) | 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
|
|
28
32
|
|
|
29
|
-
|
|
33
|
+
### [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
|
|
30
34
|
|
|
31
35
|
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
32
36
|
|:----------:|:----------:|:----------:|:----------:|
|
|
@@ -35,21 +39,7 @@ More profiling metrics coming soon!
|
|
|
35
39
|
| [`gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
|
|
36
40
|
| [`gemma-2-27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |
|
|
37
41
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
41
|
-
|:----------:|:----------:|:----------:|:----------:|
|
|
42
|
-
|[`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
|
|
43
|
-
|[`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf)| 1x a40 | - tokens/s | - tokens/s |
|
|
44
|
-
|
|
45
|
-
## [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
|
|
46
|
-
|
|
47
|
-
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
48
|
-
|:----------:|:----------:|:----------:|:----------:|
|
|
49
|
-
|[`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
|
|
50
|
-
|[`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)| 2x a40 | - tokens/s | - tokens/s |
|
|
51
|
-
|
|
52
|
-
## [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
|
|
42
|
+
### [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
|
|
53
43
|
|
|
54
44
|
| Variant | Suggested resource allocation |
|
|
55
45
|
|:----------:|:----------:|
|
|
@@ -60,7 +50,7 @@ More profiling metrics coming soon!
|
|
|
60
50
|
| [`Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
|
|
61
51
|
| [`Llama-2-70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
|
|
62
52
|
|
|
63
|
-
|
|
53
|
+
### [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
|
|
64
54
|
|
|
65
55
|
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
66
56
|
|:----------:|:----------:|:----------:|:----------:|
|
|
@@ -69,7 +59,7 @@ More profiling metrics coming soon!
|
|
|
69
59
|
| [`Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
|
|
70
60
|
| [`Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |
|
|
71
61
|
|
|
72
|
-
|
|
62
|
+
### [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
|
|
73
63
|
|
|
74
64
|
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
75
65
|
|:----------:|:----------:|:----------:|:----------:|
|
|
@@ -79,28 +69,135 @@ More profiling metrics coming soon!
|
|
|
79
69
|
| [`Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
|
|
80
70
|
| [`Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 32x a40 (8 nodes, 4 a40/node) | - tokens/s | - tokens/s |
|
|
81
71
|
|
|
82
|
-
|
|
72
|
+
### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
|
|
73
|
+
|
|
74
|
+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
75
|
+
|:----------:|:----------:|:----------:|:----------:|
|
|
76
|
+
| [`Llama-3.2-1B`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 1x a40 | - tokens/s | - tokens/s |
|
|
77
|
+
| [`Llama-3.2-1B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
|
|
78
|
+
| [`Llama-3.2-3B`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 1x a40 | - tokens/s | - tokens/s |
|
|
79
|
+
| [`Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
|
|
80
|
+
|
|
81
|
+
### [Mistral AI: Mistral](https://huggingface.co/mistralai)
|
|
83
82
|
|
|
84
83
|
| Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
85
84
|
|:----------:|:----------:|:----------:|:----------:|
|
|
86
|
-
|[`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1)| 1x a40 | - tokens/s | - tokens/s|
|
|
87
|
-
|[`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)| 1x a40 | - tokens/s | - tokens/s|
|
|
88
|
-
|[`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2)| 1x a40 | - tokens/s | - tokens/s|
|
|
89
|
-
|[`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3)| 1x a40 | - tokens/s | - tokens/s |
|
|
90
|
-
|[`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)| 1x a40 | - tokens/s | - tokens/s|
|
|
91
|
-
|[`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407)| 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
|
|
85
|
+
| [`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1) | 1x a40 | - tokens/s | - tokens/s|
|
|
86
|
+
| [`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) | 1x a40 | - tokens/s | - tokens/s|
|
|
87
|
+
| [`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 1x a40 | - tokens/s | - tokens/s|
|
|
88
|
+
| [`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3) | 1x a40 | - tokens/s | - tokens/s |
|
|
89
|
+
| [`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | 1x a40 | - tokens/s | - tokens/s|
|
|
90
|
+
| [`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
|
|
91
|
+
| [`Mistral-Large-Instruct-2411`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2411) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
|
|
92
92
|
|
|
93
|
-
|
|
93
|
+
### [Mistral AI: Mixtral](https://huggingface.co/mistralai)
|
|
94
94
|
|
|
95
95
|
| Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
96
96
|
|:----------:|:----------:|:----------:|:----------:|
|
|
97
|
-
|[`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)| 4x a40 | 222 tokens/s | 1543 tokens/s |
|
|
98
|
-
|[`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
|
|
99
|
-
|[`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
|
|
97
|
+
| [`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 4x a40 | 222 tokens/s | 1543 tokens/s |
|
|
98
|
+
| [`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
|
|
99
|
+
| [`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
|
|
100
100
|
|
|
101
|
-
|
|
101
|
+
### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
|
|
102
102
|
|
|
103
103
|
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
104
104
|
|:----------:|:----------:|:----------:|:----------:|
|
|
105
105
|
| [`Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
|
|
106
|
+
|
|
107
|
+
### [Aaditya Ura: Llama3-OpenBioLLM](https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B)
|
|
108
|
+
|
|
109
|
+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
110
|
+
|:----------:|:----------:|:----------:|:----------:|
|
|
111
|
+
| [`Llama3-OpenBioLLM-70B`](https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B) | 4x a40 | - tokens/s | - tokens/s |
|
|
112
|
+
|
|
113
|
+
### [Nvidia: Llama-3.1-Nemotron](https://huggingface.co/collections/nvidia/llama-31-nemotron-70b-670e93cd366feea16abc13d8)
|
|
114
|
+
|
|
115
|
+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
116
|
+
|:----------:|:----------:|:----------:|:----------:|
|
|
117
|
+
| [`Llama-3.1-Nemotron-70B-Instruct-HF`](https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF) | 4x a40 | - tokens/s | - tokens/s |
|
|
118
|
+
|
|
119
|
+
### [Qwen: Qwen2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e)
|
|
120
|
+
|
|
121
|
+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
122
|
+
|:----------:|:----------:|:----------:|:----------:|
|
|
123
|
+
| [`Qwen2.5-0.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
|
|
124
|
+
| [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
|
|
125
|
+
| [`Qwen2.5-3B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
|
|
126
|
+
| [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
|
|
127
|
+
| [`Qwen2.5-14B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
|
|
128
|
+
| [`Qwen2.5-32B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
|
|
129
|
+
| [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
|
|
130
|
+
|
|
131
|
+
### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
|
|
132
|
+
|
|
133
|
+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
134
|
+
|:----------:|:----------:|:----------:|:----------:|
|
|
135
|
+
| [`Qwen2.5-Math-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
|
|
136
|
+
| [`Qwen2.5-Math-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
|
|
137
|
+
| [`Qwen2.5-Math-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
|
|
138
|
+
|
|
139
|
+
### [Qwen: Qwen2.5-Coder](https://huggingface.co/collections/Qwen/qwen25-coder-66eaa22e6f99801bf65b0c2f)
|
|
140
|
+
|
|
141
|
+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
142
|
+
|:----------:|:----------:|:----------:|:----------:|
|
|
143
|
+
| [`Qwen2.5-Coder-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
|
|
144
|
+
|
|
145
|
+
### [Qwen: QwQ](https://huggingface.co/collections/Qwen/qwq-674762b79b75eac01735070a)
|
|
146
|
+
|
|
147
|
+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
148
|
+
|:----------:|:----------:|:----------:|:----------:|
|
|
149
|
+
| [`QwQ-32B-Preview`](https://huggingface.co/Qwen/QwQ-32B-Preview) | 2x a40 | - tokens/s | - tokens/s |
|
|
150
|
+
|
|
151
|
+
## Vision Language Models
|
|
152
|
+
|
|
153
|
+
### [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
|
|
154
|
+
|
|
155
|
+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
156
|
+
|:----------:|:----------:|:----------:|:----------:|
|
|
157
|
+
| [`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
|
|
158
|
+
| [`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
|
|
159
|
+
|
|
160
|
+
### [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
|
|
161
|
+
|
|
162
|
+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
163
|
+
|:----------:|:----------:|:----------:|:----------:|
|
|
164
|
+
| [`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
|
|
165
|
+
| [`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
|
|
166
|
+
|
|
167
|
+
### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
|
|
168
|
+
|
|
169
|
+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
170
|
+
|:----------:|:----------:|:----------:|:----------:|
|
|
106
171
|
| [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
|
|
172
|
+
|
|
173
|
+
### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
|
|
174
|
+
|
|
175
|
+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
176
|
+
|:----------:|:----------:|:----------:|:----------:|
|
|
177
|
+
| [`Llama-3.2-11B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision) | 2x a40 | - tokens/s | - tokens/s |
|
|
178
|
+
| [`Llama-3.2-11B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) | 2x a40 | - tokens/s | - tokens/s |
|
|
179
|
+
| [`Llama-3.2-90B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
|
|
180
|
+
| [`Llama-3.2-90B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
|
|
181
|
+
|
|
182
|
+
**NOTE**: `MllamaForConditionalGeneration` currently doesn't support pipeline parallelism. To save memory, the maximum number of requests is reduced and enforce-eager mode is turned on.
|
|
183
|
+
|
|
184
|
+
### [Mistral: Pixtral](https://huggingface.co/mistralai)
|
|
185
|
+
|
|
186
|
+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
187
|
+
|:----------:|:----------:|:----------:|:----------:|
|
|
188
|
+
| [`Pixtral-12B-2409`](https://huggingface.co/mistralai/Pixtral-12B-2409) | 1x a40 | - tokens/s | - tokens/s |
|
|
189
|
+
|
|
190
|
+
## Text Embedding Models
|
|
191
|
+
|
|
192
|
+
### [Liang Wang: e5](https://huggingface.co/intfloat)
|
|
193
|
+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
194
|
+
|:----------:|:----------:|:----------:|:----------:|
|
|
195
|
+
| [`e5-mistral-7b-instruct`](https://huggingface.co/intfloat/e5-mistral-7b-instruct) | 1x a40 | - tokens/s | - tokens/s |
|
|
196
|
+
|
|
197
|
+
## Reward Modeling Models
|
|
198
|
+
|
|
199
|
+
### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
|
|
200
|
+
|
|
201
|
+
| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
|
|
202
|
+
|:----------:|:----------:|:----------:|:----------:|
|
|
203
|
+
| [`Qwen2.5-Math-RM-72B`](https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B) | 4x a40 | - tokens/s | - tokens/s |
|
vec_inf/models/models.csv
CHANGED
|
@@ -70,5 +70,4 @@ Qwen2.5-Coder-7B-Instruct,Qwen2.5,Coder-7B-Instruct,LLM,1,1,152064,32768,256,tru
|
|
|
70
70
|
Qwen2.5-Math-RM-72B,Qwen2.5,Math-RM-72B,Reward Modeling,4,1,152064,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
|
|
71
71
|
QwQ-32B-Preview,QwQ,32B-Preview,LLM,2,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
|
|
72
72
|
Pixtral-12B-2409,Pixtral,12B-2409,VLM,1,1,131072,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
|
|
73
|
-
bge-multilingual-gemma2,bge,multilingual-gemma2,Text Embedding,1,1,256002,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
|
|
74
73
|
e5-mistral-7b-instruct,e5,mistral-7b-instruct,Text Embedding,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: vec-inf
|
|
3
|
-
Version: 0.4.0
|
|
3
|
+
Version: 0.4.0.post1
|
|
4
4
|
Summary: Efficient LLM inference on Slurm clusters using vLLM.
|
|
5
5
|
License: MIT
|
|
6
6
|
Author: Marshall Wang
|
|
@@ -16,7 +16,7 @@ Provides-Extra: dev
|
|
|
16
16
|
Requires-Dist: click (>=8.1.0,<9.0.0)
|
|
17
17
|
Requires-Dist: cupy-cuda12x (==12.1.0) ; extra == "dev"
|
|
18
18
|
Requires-Dist: numpy (>=1.24.0,<2.0.0)
|
|
19
|
-
Requires-Dist:
|
|
19
|
+
Requires-Dist: polars (>=1.15.0,<2.0.0)
|
|
20
20
|
Requires-Dist: ray (>=2.9.3,<3.0.0) ; extra == "dev"
|
|
21
21
|
Requires-Dist: requests (>=2.31.0,<3.0.0)
|
|
22
22
|
Requires-Dist: rich (>=13.7.0,<14.0.0)
|
|
@@ -94,7 +94,8 @@ You call view the full list of available models by running the `list` command:
|
|
|
94
94
|
```bash
|
|
95
95
|
vec-inf list
|
|
96
96
|
```
|
|
97
|
-
<img width="
|
|
97
|
+
<img width="940" alt="list_img" src="https://github.com/user-attachments/assets/8cf901c4-404c-4398-a52f-0486f00747a3">
|
|
98
|
+
|
|
98
99
|
|
|
99
100
|
You can also view the default setup for a specific supported model by providing the model name, for example `Meta-Llama-3.1-70B-Instruct`:
|
|
100
101
|
```bash
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
vec_inf/README.md,sha256=dxX0xKfwLioG0mJ2YFv5JJ5q1m5NlWBrVBOap1wuHfQ,624
|
|
2
|
+
vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
+
vec_inf/cli/_cli.py,sha256=TRaY-QSBQ_do9b4R6Pl7fyDlrfuMN8Z8HH_xOCKkVJA,12585
|
|
5
|
+
vec_inf/cli/_utils.py,sha256=sQqi7JdPOb7gfW4EVsXY2yhLUo8xWqxoY1spQ53bag4,4845
|
|
6
|
+
vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
|
|
7
|
+
vec_inf/launch_server.sh,sha256=gFovqXuYiQ8bEc6O31WTMDuBoNj7opB5iVfnCDhz2Nw,4165
|
|
8
|
+
vec_inf/models/README.md,sha256=YNEVTWliHehCpJTq2SXAidqgFl6CWL6GUOnAPksDYFE,14844
|
|
9
|
+
vec_inf/models/models.csv,sha256=f_cNeM7L0-4pgZqYfWilQd12-WVec2IVk6dRq5BE4mE,9875
|
|
10
|
+
vec_inf/multinode_vllm.slurm,sha256=tg0WgLRdpRFD-oT05aucOpe6h2TZiTyYJFTMqSIj-HQ,4154
|
|
11
|
+
vec_inf/vllm.slurm,sha256=lMgBI7r9jUVVhSIdrUH2DdC-Bxz0eyQ8vuB5uwOzWt0,1847
|
|
12
|
+
vec_inf-0.4.0.post1.dist-info/LICENSE,sha256=mq8zeqpvVSF1EsxmydeXcokt8XnEIfSofYn66S2-cJI,1073
|
|
13
|
+
vec_inf-0.4.0.post1.dist-info/METADATA,sha256=Q6KhU-ggnR9FB5YUjWrPwy2MSd_c9GCFXAQqT9YXZOw,7032
|
|
14
|
+
vec_inf-0.4.0.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
|
15
|
+
vec_inf-0.4.0.post1.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
|
|
16
|
+
vec_inf-0.4.0.post1.dist-info/RECORD,,
|
vec_inf-0.4.0.dist-info/RECORD
DELETED
|
@@ -1,16 +0,0 @@
|
|
|
1
|
-
vec_inf/README.md,sha256=dxX0xKfwLioG0mJ2YFv5JJ5q1m5NlWBrVBOap1wuHfQ,624
|
|
2
|
-
vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
-
vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
-
vec_inf/cli/_cli.py,sha256=TRaY-QSBQ_do9b4R6Pl7fyDlrfuMN8Z8HH_xOCKkVJA,12585
|
|
5
|
-
vec_inf/cli/_utils.py,sha256=sQqi7JdPOb7gfW4EVsXY2yhLUo8xWqxoY1spQ53bag4,4845
|
|
6
|
-
vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
|
|
7
|
-
vec_inf/launch_server.sh,sha256=3-esdDzfuG0qSOPhrZHgx2nQ9GEiaI2tjTPw7VrdMuQ,4167
|
|
8
|
-
vec_inf/models/README.md,sha256=n9I8HsIHCafz0G9k1OFwkraK9J-OY92v6M3z42a-Nho,8146
|
|
9
|
-
vec_inf/models/models.csv,sha256=CK2NDHgdkwx5qpaduuYy9KhcHhS0z60quSeV_KtWx9c,10025
|
|
10
|
-
vec_inf/multinode_vllm.slurm,sha256=tg0WgLRdpRFD-oT05aucOpe6h2TZiTyYJFTMqSIj-HQ,4154
|
|
11
|
-
vec_inf/vllm.slurm,sha256=lMgBI7r9jUVVhSIdrUH2DdC-Bxz0eyQ8vuB5uwOzWt0,1847
|
|
12
|
-
vec_inf-0.4.0.dist-info/LICENSE,sha256=mq8zeqpvVSF1EsxmydeXcokt8XnEIfSofYn66S2-cJI,1073
|
|
13
|
-
vec_inf-0.4.0.dist-info/METADATA,sha256=X-zLib_6dTZT9ZvrIBoQThImgpJSkgTFBL12oi-Dt1A,7025
|
|
14
|
-
vec_inf-0.4.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
|
15
|
-
vec_inf-0.4.0.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
|
|
16
|
-
vec_inf-0.4.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|