vec-inf 0.4.0__py3-none-any.whl → 0.4.0.post1__py3-none-any.whl

vec_inf/launch_server.sh CHANGED
@@ -50,7 +50,7 @@ export MODEL_WEIGHTS_PARENT_DIR=$model_weights_parent_dir
 
 if [ -n "$max_num_seqs" ]; then
     export VLLM_MAX_NUM_SEQS=$max_num_seqs
-else
+else
     export VLLM_MAX_NUM_SEQS=256
 fi
 
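The change in this hunk is apparently whitespace-only (the `else` line is otherwise unchanged). For context, the block exports `VLLM_MAX_NUM_SEQS`, falling back to 256 when `max_num_seqs` is unset or empty. A minimal equivalent sketch (not part of the package) using shell parameter expansion:

```bash
# Fall back to 256 when max_num_seqs is unset or empty; ":-" substitutes
# the default without modifying max_num_seqs itself, matching the -n test.
export VLLM_MAX_NUM_SEQS="${max_num_seqs:-256}"
```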
@@ -75,7 +75,7 @@ fi
 mkdir -p $LOG_DIR
 
 # Model and entrypoint configuration. API Server URL (host, port) are set automatically based on the
-# SLURM job
+# SLURM job
 export SRC_DIR="$(dirname "$0")"
 export MODEL_DIR="${SRC_DIR}/models/${MODEL_FAMILY}"
 
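This hunk is also a whitespace-only change to the comment. The exports beneath it derive the script and model directories; a hedged illustration with hypothetical paths (none of these values come from the package):

```bash
# Hypothetical values for illustration only.
script_path=/opt/vec-inf/launch_server.sh   # stand-in for "$0"
MODEL_FAMILY=Llama-3.1                      # example model family
SRC_DIR="$(dirname "$script_path")"         # -> /opt/vec-inf
MODEL_DIR="${SRC_DIR}/models/${MODEL_FAMILY}"
echo "$MODEL_DIR"                           # -> /opt/vec-inf/models/Llama-3.1
```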
vec_inf/models/README.md CHANGED
@@ -1,13 +1,17 @@
 # Available Models
 More profiling metrics coming soon!
 
-## [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
+## Text Generation Models
+
+### [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
-|[`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus)| 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
+| [`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus) | 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
+| [`c4ai-command-r-plus-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
+| [`c4ai-command-r-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
 
-## [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
+### [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
@@ -20,13 +24,13 @@ More profiling metrics coming soon!
 | [`CodeLlama-70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
 | [`CodeLlama-70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
 
-## [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
+### [Databricks: DBRX](https://huggingface.co/collections/databricks/dbrx-6601c0852a0cdd3c59f71962)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
-|[`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct)| 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
+| [`dbrx-instruct`](https://huggingface.co/databricks/dbrx-instruct) | 8x a40 (2 nodes, 4 a40/node) | 107 tokens/s | 904 tokens/s |
 
-## [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
+### [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
@@ -35,21 +39,7 @@ More profiling metrics coming soon!
 | [`gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
 | [`gemma-2-27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |
 
-## [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-|[`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
-|[`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf)| 1x a40 | - tokens/s | - tokens/s |
-
-## [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-|[`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)| 1x a40 | - tokens/s | - tokens/s |
-|[`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)| 2x a40 | - tokens/s | - tokens/s |
-
-## [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
+### [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
 
 | Variant | Suggested resource allocation |
 |:----------:|:----------:|
@@ -60,7 +50,7 @@ More profiling metrics coming soon!
 | [`Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
 | [`Llama-2-70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
 
-## [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
+### [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
@@ -69,7 +59,7 @@ More profiling metrics coming soon!
 | [`Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
 | [`Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |
 
-## [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
+### [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
@@ -79,28 +69,135 @@ More profiling metrics coming soon!
 | [`Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
 | [`Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 32x a40 (8 nodes, 4 a40/node) | - tokens/s | - tokens/s |
 
-## [Mistral AI: Mistral](https://huggingface.co/mistralai)
+### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Llama-3.2-1B`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 1x a40 | - tokens/s | - tokens/s |
+| [`Llama-3.2-1B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Llama-3.2-3B`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 1x a40 | - tokens/s | - tokens/s |
+| [`Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+
+### [Mistral AI: Mistral](https://huggingface.co/mistralai)
 
 | Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
-|[`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1)| 1x a40 | - tokens/s | - tokens/s|
-|[`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)| 1x a40 | - tokens/s | - tokens/s|
-|[`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2)| 1x a40 | - tokens/s | - tokens/s|
-|[`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3)| 1x a40 | - tokens/s | - tokens/s |
-|[`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3)| 1x a40 | - tokens/s | - tokens/s|
-|[`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407)| 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
+| [`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1) | 1x a40 | - tokens/s | - tokens/s|
+| [`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) | 1x a40 | - tokens/s | - tokens/s|
+| [`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) | 1x a40 | - tokens/s | - tokens/s|
+| [`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3) | 1x a40 | - tokens/s | - tokens/s |
+| [`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | 1x a40 | - tokens/s | - tokens/s|
+| [`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
+| [`Mistral-Large-Instruct-2411`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2411) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s|
 
-## [Mistral AI: Mixtral](https://huggingface.co/mistralai)
+### [Mistral AI: Mixtral](https://huggingface.co/mistralai)
 
 | Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
-|[`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)| 4x a40 | 222 tokens/s | 1543 tokens/s |
-|[`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
-|[`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1)| 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
+| [`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 4x a40 | 222 tokens/s | 1543 tokens/s |
+| [`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s|
+| [`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s|
 
-## [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
+### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
 
 | Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
 |:----------:|:----------:|:----------:|:----------:|
 | [`Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
+
+### [Aaditya Ura: Llama3-OpenBioLLM](https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Llama3-OpenBioLLM-70B`](https://huggingface.co/aaditya/Llama3-OpenBioLLM-70B) | 4x a40 | - tokens/s | - tokens/s |
+
+### [Nvidia: Llama-3.1-Nemotron](https://huggingface.co/collections/nvidia/llama-31-nemotron-70b-670e93cd366feea16abc13d8)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Llama-3.1-Nemotron-70B-Instruct-HF`](https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF) | 4x a40 | - tokens/s | - tokens/s |
+
+### [Qwen: Qwen2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Qwen2.5-0.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-3B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-14B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-32B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
+
+### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Qwen2.5-Math-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-Math-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+| [`Qwen2.5-Math-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
+
+### [Qwen: Qwen2.5-Coder](https://huggingface.co/collections/Qwen/qwen25-coder-66eaa22e6f99801bf65b0c2f)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Qwen2.5-Coder-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
+
+### [Qwen: QwQ](https://huggingface.co/collections/Qwen/qwq-674762b79b75eac01735070a)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`QwQ-32B-Preview`](https://huggingface.co/Qwen/QwQ-32B-Preview) | 2x a40 | - tokens/s | - tokens/s |
+
+## Vision Language Models
+
+### [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
+| [`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
+
+### [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
+| [`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
+
+### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
 | [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
+
+### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Llama-3.2-11B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision) | 2x a40 | - tokens/s | - tokens/s |
+| [`Llama-3.2-11B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) | 2x a40 | - tokens/s | - tokens/s |
+| [`Llama-3.2-90B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
+| [`Llama-3.2-90B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
+
+**NOTE**: `MllamaForConditionalGeneration` doesn't currently support pipeline parallelism; to save memory, the maximum number of requests is reduced and eager mode is enforced.
+
+### [Mistral: Pixtral](https://huggingface.co/mistralai)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Pixtral-12B-2409`](https://huggingface.co/mistralai/Pixtral-12B-2409) | 1x a40 | - tokens/s | - tokens/s |
+
+## Text Embedding Models
+
+### [Liang Wang: e5](https://huggingface.co/intfloat)
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`e5-mistral-7b-instruct`](https://huggingface.co/intfloat/e5-mistral-7b-instruct) | 1x a40 | - tokens/s | - tokens/s |
+
+## Reward Modeling Models
+
+### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
+
+| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
+|:----------:|:----------:|:----------:|:----------:|
+| [`Qwen2.5-Math-RM-72B`](https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B) | 4x a40 | - tokens/s | - tokens/s |
vec_inf/models/models.csv CHANGED
@@ -70,5 +70,4 @@ Qwen2.5-Coder-7B-Instruct,Qwen2.5,Coder-7B-Instruct,LLM,1,1,152064,32768,256,tru
 Qwen2.5-Math-RM-72B,Qwen2.5,Math-RM-72B,Reward Modeling,4,1,152064,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
 QwQ-32B-Preview,QwQ,32B-Preview,LLM,2,1,152064,32768,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
 Pixtral-12B-2409,Pixtral,12B-2409,VLM,1,1,131072,8192,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
-bge-multilingual-gemma2,bge,multilingual-gemma2,Text Embedding,1,1,256002,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
 e5-mistral-7b-instruct,e5,mistral-7b-instruct,Text Embedding,1,1,32000,4096,256,true,false,m2,08:00:00,a40,auto,singularity,default,/model-weights
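The only change here removes the `bge-multilingual-gemma2` row. Judging by the visible rows, field 1 is the model name and field 4 the model type (`LLM`, `VLM`, `Text Embedding`, `Reward Modeling`); the remaining columns are not labeled in this diff. A hedged sketch for filtering the table by type:

```bash
# List model names (field 1) whose type (field 4) is "Text Embedding".
awk -F',' '$4 == "Text Embedding" { print $1 }' vec_inf/models/models.csv
# After this change, bge-multilingual-gemma2 no longer appears in the output.
```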
vec_inf-0.4.0.dist-info/METADATA → vec_inf-0.4.0.post1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: vec-inf
-Version: 0.4.0
+Version: 0.4.0.post1
 Summary: Efficient LLM inference on Slurm clusters using vLLM.
 License: MIT
 Author: Marshall Wang
@@ -16,7 +16,7 @@ Provides-Extra: dev
 Requires-Dist: click (>=8.1.0,<9.0.0)
 Requires-Dist: cupy-cuda12x (==12.1.0) ; extra == "dev"
 Requires-Dist: numpy (>=1.24.0,<2.0.0)
-Requires-Dist: pandas (>=1.15.0,<2.0.0)
+Requires-Dist: polars (>=1.15.0,<2.0.0)
 Requires-Dist: ray (>=2.9.3,<3.0.0) ; extra == "dev"
 Requires-Dist: requests (>=2.31.0,<3.0.0)
 Requires-Dist: rich (>=13.7.0,<14.0.0)
@@ -94,7 +94,8 @@ You can view the full list of available models by running the `list` command:
 ```bash
 vec-inf list
 ```
-<img width="900" alt="list_img" src="https://github.com/user-attachments/assets/7cb2b2ac-d30c-48a8-b773-f648c27d9de2">
+<img width="940" alt="list_img" src="https://github.com/user-attachments/assets/8cf901c4-404c-4398-a52f-0486f00747a3">
+
 
 You can also view the default setup for a specific supported model by providing the model name, for example `Meta-Llama-3.1-70B-Instruct`:
 ```bash
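# Hedged guess only: the hunk is truncated at the opening fence above, so the
# actual example is not shown. Based on the preceding sentence, it presumably
# passes the model name as an argument:
vec-inf list Meta-Llama-3.1-70B-Instruct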
vec_inf-0.4.0.post1.dist-info/RECORD ADDED
@@ -0,0 +1,16 @@
+vec_inf/README.md,sha256=dxX0xKfwLioG0mJ2YFv5JJ5q1m5NlWBrVBOap1wuHfQ,624
+vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+vec_inf/cli/_cli.py,sha256=TRaY-QSBQ_do9b4R6Pl7fyDlrfuMN8Z8HH_xOCKkVJA,12585
+vec_inf/cli/_utils.py,sha256=sQqi7JdPOb7gfW4EVsXY2yhLUo8xWqxoY1spQ53bag4,4845
+vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
+vec_inf/launch_server.sh,sha256=gFovqXuYiQ8bEc6O31WTMDuBoNj7opB5iVfnCDhz2Nw,4165
+vec_inf/models/README.md,sha256=YNEVTWliHehCpJTq2SXAidqgFl6CWL6GUOnAPksDYFE,14844
+vec_inf/models/models.csv,sha256=f_cNeM7L0-4pgZqYfWilQd12-WVec2IVk6dRq5BE4mE,9875
+vec_inf/multinode_vllm.slurm,sha256=tg0WgLRdpRFD-oT05aucOpe6h2TZiTyYJFTMqSIj-HQ,4154
+vec_inf/vllm.slurm,sha256=lMgBI7r9jUVVhSIdrUH2DdC-Bxz0eyQ8vuB5uwOzWt0,1847
+vec_inf-0.4.0.post1.dist-info/LICENSE,sha256=mq8zeqpvVSF1EsxmydeXcokt8XnEIfSofYn66S2-cJI,1073
+vec_inf-0.4.0.post1.dist-info/METADATA,sha256=Q6KhU-ggnR9FB5YUjWrPwy2MSd_c9GCFXAQqT9YXZOw,7032
+vec_inf-0.4.0.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+vec_inf-0.4.0.post1.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
+vec_inf-0.4.0.post1.dist-info/RECORD,,
vec_inf-0.4.0.dist-info/RECORD REMOVED
@@ -1,16 +0,0 @@
-vec_inf/README.md,sha256=dxX0xKfwLioG0mJ2YFv5JJ5q1m5NlWBrVBOap1wuHfQ,624
-vec_inf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vec_inf/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-vec_inf/cli/_cli.py,sha256=TRaY-QSBQ_do9b4R6Pl7fyDlrfuMN8Z8HH_xOCKkVJA,12585
-vec_inf/cli/_utils.py,sha256=sQqi7JdPOb7gfW4EVsXY2yhLUo8xWqxoY1spQ53bag4,4845
-vec_inf/find_port.sh,sha256=bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I,1088
-vec_inf/launch_server.sh,sha256=3-esdDzfuG0qSOPhrZHgx2nQ9GEiaI2tjTPw7VrdMuQ,4167
-vec_inf/models/README.md,sha256=n9I8HsIHCafz0G9k1OFwkraK9J-OY92v6M3z42a-Nho,8146
-vec_inf/models/models.csv,sha256=CK2NDHgdkwx5qpaduuYy9KhcHhS0z60quSeV_KtWx9c,10025
-vec_inf/multinode_vllm.slurm,sha256=tg0WgLRdpRFD-oT05aucOpe6h2TZiTyYJFTMqSIj-HQ,4154
-vec_inf/vllm.slurm,sha256=lMgBI7r9jUVVhSIdrUH2DdC-Bxz0eyQ8vuB5uwOzWt0,1847
-vec_inf-0.4.0.dist-info/LICENSE,sha256=mq8zeqpvVSF1EsxmydeXcokt8XnEIfSofYn66S2-cJI,1073
-vec_inf-0.4.0.dist-info/METADATA,sha256=X-zLib_6dTZT9ZvrIBoQThImgpJSkgTFBL12oi-Dt1A,7025
-vec_inf-0.4.0.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-vec_inf-0.4.0.dist-info/entry_points.txt,sha256=JF4uXsj1H4XacxaBw9f0KN0P0qDzmp7K_1zTEBDappo,48
-vec_inf-0.4.0.dist-info/RECORD,,
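Both RECORD files list `path,sha256=<digest>,size` triples, where the digest is an unpadded urlsafe-base64 SHA-256, per the wheel RECORD convention. A hedged sketch for recomputing one of the digests above, run from the installed site-packages root:

```bash
# Recompute the RECORD-style digest of an installed file and compare it against
# the entry above (expected: bGQ6LYSFVSsfDIGatrSg5YvddbZfaPL0R-Bjo4KYD6I).
sha256sum vec_inf/find_port.sh | cut -d' ' -f1 \
  | xxd -r -p | base64 | tr '+/' '-_' | tr -d '='
```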