vec-inf 0.6.1__py3-none-any.whl → 0.7.1__py3-none-any.whl

vec_inf/client/config.py CHANGED
@@ -5,18 +5,19 @@ configurations, including hardware requirements and model specifications.
 """
 
 from pathlib import Path
-from typing import Any, Optional, Union, cast
+from typing import Any, Optional, Union
 
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import Literal
 
-from vec_inf.client.slurm_vars import (
+from vec_inf.client._slurm_vars import (
     DEFAULT_ARGS,
     MAX_CPUS_PER_TASK,
     MAX_GPUS_PER_NODE,
     MAX_NUM_NODES,
     PARTITION,
     QOS,
+    RESOURCE_TYPE,
 )
 
 
@@ -47,14 +48,18 @@ class ModelConfig(BaseModel):
         Memory allocation per node in GB format (e.g., '32G')
     vocab_size : int
         Size of the model's vocabulary (1-1,000,000)
-    account : Optional[str], optional
+    account : str, optional
         Charge resources used by this job to specified account.
+    work_dir : str, optional
+        Set working directory for the batch job
     qos : Union[QOS, str], optional
         Quality of Service tier for job scheduling
     time : str, optional
         Time limit for the job in HH:MM:SS format
     partition : Union[PARTITION, str], optional
-        GPU partition type for job scheduling
+        Slurm partition for job scheduling
+    resource_type : Union[RESOURCE_TYPE, str], optional
+        Type of resource to request for the job
     venv : str, optional
        Virtual environment or container system to use
     log_dir : Path, optional
@@ -83,13 +88,13 @@ class ModelConfig(BaseModel):
     )
     num_nodes: int = Field(..., gt=0, le=MAX_NUM_NODES, description="Number of nodes")
     cpus_per_task: int = Field(
-        default=cast(int, DEFAULT_ARGS["cpus_per_task"]),
+        default=int(DEFAULT_ARGS["cpus_per_task"]),
         gt=0,
         le=MAX_CPUS_PER_TASK,
         description="CPUs per task",
     )
     mem_per_node: str = Field(
-        default=cast(str, DEFAULT_ARGS["mem_per_node"]),
+        default=DEFAULT_ARGS["mem_per_node"],
         pattern=r"^\d{1,4}G$",
         description="Memory per node",
     )
@@ -97,42 +102,58 @@ class ModelConfig(BaseModel):
     account: Optional[str] = Field(
         default=None, description="Account name for job scheduling"
     )
-    qos: Union[QOS, str] = Field(
-        default=cast(str, DEFAULT_ARGS["qos"]), description="Quality of Service tier"
+    work_dir: Optional[str] = Field(
+        default=None, description="Working directory for the job"
+    )
+    qos: Optional[Union[QOS, str]] = Field(
+        default=DEFAULT_ARGS["qos"] if DEFAULT_ARGS["qos"] != "" else None,
+        description="Quality of Service tier",
     )
     time: str = Field(
-        default=cast(str, DEFAULT_ARGS["time"]),
+        default=DEFAULT_ARGS["time"],
         pattern=r"^\d{2}:\d{2}:\d{2}$",
         description="HH:MM:SS time limit",
     )
-    partition: Union[PARTITION, str] = Field(
-        default=cast(str, DEFAULT_ARGS["partition"]), description="GPU partition type"
+    partition: Optional[Union[PARTITION, str]] = Field(
+        default=DEFAULT_ARGS["partition"] if DEFAULT_ARGS["partition"] != "" else None,
+        description="GPU partition type",
+    )
+    resource_type: Optional[Union[RESOURCE_TYPE, str]] = Field(
+        default=DEFAULT_ARGS["resource_type"]
+        if DEFAULT_ARGS["resource_type"] != ""
+        else None,
+        description="Resource type",
     )
     exclude: Optional[str] = Field(
-        default=None,
+        default=DEFAULT_ARGS["exclude"],
         description="Exclude certain nodes from the resources granted to the job",
     )
-    node_list: Optional[str] = Field(
-        default=None, description="Request a specific list of nodes for deployment"
+    nodelist: Optional[str] = Field(
+        default=DEFAULT_ARGS["nodelist"],
+        description="Request a specific list of nodes for deployment",
     )
     bind: Optional[str] = Field(
-        default=None, description="Additional binds for the singularity container"
+        default=DEFAULT_ARGS["bind"],
+        description="Additional binds for the container",
     )
     venv: str = Field(
-        default="singularity", description="Virtual environment/container system"
+        default=DEFAULT_ARGS["venv"],
+        description="Virtual environment/container system",
     )
     log_dir: Path = Field(
-        default=Path(cast(str, DEFAULT_ARGS["log_dir"])),
+        default=Path(DEFAULT_ARGS["log_dir"]),
         description="Log directory path",
     )
     model_weights_parent_dir: Path = Field(
-        default=Path(cast(str, DEFAULT_ARGS["model_weights_parent_dir"])),
+        default=Path(DEFAULT_ARGS["model_weights_parent_dir"]),
        description="Base directory for model weights",
     )
     vllm_args: Optional[dict[str, Any]] = Field(
         default={}, description="vLLM engine arguments"
     )
-
+    env: Optional[dict[str, Any]] = Field(
+        default={}, description="Environment variables to be set"
+    )
     model_config = ConfigDict(
         extra="forbid", str_strip_whitespace=True, validate_default=True, frozen=True
     )
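Taken together, these changes replace hard-coded scheduler defaults (for example, the old `venv` default of `"singularity"`) with cluster-supplied values from `_slurm_vars.DEFAULT_ARGS`, where an empty string stands for "no default on this cluster". A minimal sketch of that normalization pattern, using a hypothetical `DEFAULT_ARGS` dict rather than the real `vec_inf.client._slurm_vars` module:

```python
from typing import Optional

# Hypothetical stand-in for vec_inf.client._slurm_vars.DEFAULT_ARGS; the
# empty strings mirror the placeholders in environment.yaml further below.
DEFAULT_ARGS: dict[str, str] = {
    "qos": "",                # this cluster defines no default QOS
    "partition": "",          # this cluster defines no default partition
    "resource_type": "l40s",  # this cluster supplies a concrete default
}


def default_or_none(key: str) -> Optional[str]:
    """Normalize an empty-string default to None, as the new Field defaults do."""
    value = DEFAULT_ARGS[key]
    return value if value != "" else None


assert default_or_none("qos") is None
assert default_or_none("resource_type") == "l40s"
```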
vec_inf/client/models.py CHANGED
@@ -82,7 +82,7 @@ class LaunchResponse:
 
     Parameters
     ----------
-    slurm_job_id : int
+    slurm_job_id : str
         ID of the launched SLURM job
     model_name : str
         Name of the launched model
@@ -92,12 +92,37 @@ class LaunchResponse:
         Raw output from the launch command (hidden from repr)
     """
 
-    slurm_job_id: int
+    slurm_job_id: str
     model_name: str
     config: dict[str, Any]
     raw_output: str = field(repr=False)
 
 
+@dataclass
+class BatchLaunchResponse:
+    """Response from launching multiple models in batch mode.
+
+    Parameters
+    ----------
+    slurm_job_id : str
+        ID of the launched SLURM job
+    slurm_job_name : str
+        Name of the launched SLURM job
+    model_names : list[str]
+        Names of the launched models
+    config : dict[str, Any]
+        Configuration used for the launch
+    raw_output : str
+        Raw output from the launch command (hidden from repr)
+    """
+
+    slurm_job_id: str
+    slurm_job_name: str
+    model_names: list[str]
+    config: dict[str, Any]
+    raw_output: str = field(repr=False)
+
+
 @dataclass
 class StatusResponse:
     """Response from checking a model's status.
@@ -106,6 +131,8 @@ class StatusResponse:
     ----------
     model_name : str
         Name of the model
+    log_dir : str
+        Path to the SLURM log directory
     server_status : ModelStatus
         Current status of the server
     job_state : Union[str, ModelStatus]
@@ -121,6 +148,7 @@ class StatusResponse:
     """
 
     model_name: str
+    log_dir: str
     server_status: ModelStatus
     job_state: Union[str, ModelStatus]
     raw_output: str = field(repr=False)
@@ -160,12 +188,16 @@ class LaunchOptions:
         Specific variant/version of the model
     partition : str, optional
         SLURM partition to use
+    resource_type : str, optional
+        Type of resource to request for the job
     num_nodes : int, optional
         Number of nodes to allocate
     gpus_per_node : int, optional
         Number of GPUs per node
     account : str, optional
         Account name for job scheduling
+    work_dir : str, optional
+        Set working directory for the batch job
     qos : str, optional
         Quality of Service level
     time : str, optional
@@ -175,7 +207,7 @@ class LaunchOptions:
     node_list : str, optional
         Request a specific list of nodes for deployment
     bind : str, optional
-        Additional binds for the singularity container
+        Additional binds for the container as a comma separated list of bind paths
     vocab_size : int, optional
         Size of model vocabulary
     data_type : str, optional
@@ -188,17 +220,23 @@ class LaunchOptions:
         Parent directory containing model weights
     vllm_args : str, optional
         Additional arguments for vLLM
+    env : str, optional
+        Environment variables to be set
+    config : str, optional
+        Path to custom model config yaml
     """
 
     model_family: Optional[str] = None
     model_variant: Optional[str] = None
     partition: Optional[str] = None
+    resource_type: Optional[str] = None
     num_nodes: Optional[int] = None
     gpus_per_node: Optional[int] = None
     account: Optional[str] = None
+    work_dir: Optional[str] = None
     qos: Optional[str] = None
     exclude: Optional[str] = None
-    node_list: Optional[str] = None
+    nodelist: Optional[str] = None
     bind: Optional[str] = None
     time: Optional[str] = None
     vocab_size: Optional[int] = None
@@ -207,6 +245,8 @@ class LaunchOptions:
     log_dir: Optional[str] = None
     model_weights_parent_dir: Optional[str] = None
     vllm_args: Optional[str] = None
+    env: Optional[str] = None
+    config: Optional[str] = None
 
 
 @dataclass
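Two practical consequences of this file's changes: `slurm_job_id` is now a string on the response types, and callers that passed `node_list=` to `LaunchOptions` must now pass `nodelist=`. A quick sketch of the new `BatchLaunchResponse`; the class and field types come from the diff above, while every value is invented for illustration:

```python
from vec_inf.client.models import BatchLaunchResponse

# All field values below are hypothetical; note slurm_job_id is a str, not an int.
response = BatchLaunchResponse(
    slurm_job_id="1234567",
    slurm_job_name="BATCH-job",  # hypothetical batch job name
    model_names=["model-a", "model-b"],
    config={"num_nodes": 1, "gpus_per_node": 2},
    raw_output="Submitted batch job 1234567",
)

# raw_output is declared with field(repr=False), so it is omitted here:
print(response)
```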
vec_inf/config/README.md CHANGED
@@ -1,245 +1,6 @@
-# Available Models
-More profiling metrics coming soon!
+# Configs
 
-## Text Generation Models
+* [`environment.yaml`](environment.yaml): Configuration for the Slurm cluster environment, including image paths, resource availability, and default values.
+* [`models.yaml`](models.yaml): Configuration for launching model inference servers, including Slurm parameters as well as `vllm serve` arguments.
 
-### [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus) | 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
-| [`c4ai-command-r-plus-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-| [`c4ai-command-r-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-### [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`CodeLlama-7b-hf`](https://huggingface.co/meta-llama/CodeLlama-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-7b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-7b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-13b-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-13b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-34b-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-34b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-Instruct-hf) | 2x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`gemma-2-9b`](https://huggingface.co/google/gemma-2-9b) | 1x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it) | 1x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
-
-| Variant | Suggested resource allocation |
-|:----------:|:----------:|
-| [`Llama-2-7b-hf`](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 1x a40 |
-| [`Llama-2-7b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1x a40 |
-| [`Llama-2-13b-hf`](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 1x a40 |
-| [`Llama-2-13b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 1x a40 |
-| [`Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
-| [`Llama-2-70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
-
-### [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Meta-Llama-3-8B`](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 1x a40 | 222 tokens/s | 1811 tokens/s |
-| [`Meta-Llama-3-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | 1x a40 | 371 tokens/s | 1990 tokens/s |
-| [`Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
-| [`Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |
-
-### [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Meta-Llama-3.1-8B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) | 1x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-70B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) | 4x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 32x a40 (8 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Llama-3.2-1B`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 1x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-1B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-3B`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 1x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-
-### [Mistral AI: Mistral](https://huggingface.co/mistralai)
-
-| Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-| [`Mistral-Large-Instruct-2411`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2411) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-### [Mistral AI: Mixtral](https://huggingface.co/mistralai)
-
-| Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 4x a40 | 222 tokens/s | 1543 tokens/s |
-| [`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s |
-| [`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s |
-
-### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Nvidia: Llama-3.1-Nemotron](https://huggingface.co/collections/nvidia/llama-31-nemotron-70b-670e93cd366feea16abc13d8)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Llama-3.1-Nemotron-70B-Instruct-HF`](https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: Qwen2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-0.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-3B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-14B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-32B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: Qwen2.5-Coder](https://huggingface.co/collections/Qwen/qwen25-coder-66eaa22e6f99801bf65b0c2f)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-Coder-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: QwQ](https://huggingface.co/collections/Qwen/qwq-674762b79b75eac01735070a)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`QwQ-32B-Preview`](https://huggingface.co/Qwen/QwQ-32B-Preview) | 2x a40 | - tokens/s | - tokens/s |
-
-### [DeepSeek-R1: Distilled Models](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`DeepSeek-R1-Distill-Llama-8B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) | 1x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Llama-70B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) | 4x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-1.5B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) | 1x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-7B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | 1x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-14B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) | 2x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-32B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) | 4x a40 | - tokens/s | - tokens/s |
-
-
-## Vision Language Models
-
-### [allenai: Molmo](https://huggingface.co/collections/allenai/molmo-66f379e6fe3b8ef090a8ca19)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Molmo-7B-D-0924`](https://huggingface.co/allenai/Molmo-7B-D-0924) | 1x a40 | - tokens/s | - tokens/s |
-
-
-### [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
-
-### [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
-| [`Phi-3.5-vision-instruct`](https://huggingface.co/microsoft/Phi-3.5-vision-instruct) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Llama-3.2-11B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 2x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-11B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-90B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-| [`Llama-3.2-90B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-**NOTE**: `MllamaForConditionalGeneration` currently doesn't support pipeline parallelism; to save memory, the maximum number of requests is reduced and enforce-eager mode is enabled.
-
-### [Mistral: Pixtral](https://huggingface.co/mistralai)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Pixtral-12B-2409`](https://huggingface.co/mistralai/Pixtral-12B-2409) | 1x a40 | - tokens/s | - tokens/s |
-
-### [OpenGVLab: InternVL2.5](https://huggingface.co/collections/OpenGVLab/internvl25-673e1019b66e2218f68d7c1c)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`InternVL2_5-8B`](https://huggingface.co/OpenGVLab/InternVL2_5-8B) | 1x a40 | - tokens/s | - tokens/s |
-| [`InternVL2_5-26B`](https://huggingface.co/OpenGVLab/InternVL2_5-26B) | 2x a40 | - tokens/s | - tokens/s |
-| [`InternVL2_5-38B`](https://huggingface.co/OpenGVLab/InternVL2_5-38B) | 4x a40 | - tokens/s | - tokens/s |
-
-### [THUDM: GLM-4](https://huggingface.co/collections/THUDM/glm-4-665fcf188c414b03c2f7e3b7)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`glm-4v-9b`](https://huggingface.co/THUDM/glm-4v-9b) | 1x a40 | - tokens/s | - tokens/s |
-
-### [DeepSeek: DeepSeek-VL2](https://huggingface.co/collections/deepseek-ai/deepseek-vl2-675c22accc456d3beb4613ab)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`deepseek-vl2`](https://huggingface.co/deepseek-ai/deepseek-vl2) | 2x a40 | - tokens/s | - tokens/s |
-| [`deepseek-vl2-small`](https://huggingface.co/deepseek-ai/deepseek-vl2-small) | 1x a40 | - tokens/s | - tokens/s |
-
-
-## Text Embedding Models
-
-### [Liang Wang: e5](https://huggingface.co/intfloat)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`e5-mistral-7b-instruct`](https://huggingface.co/intfloat/e5-mistral-7b-instruct) | 1x a40 | - tokens/s | - tokens/s |
-
-### [BAAI: bge](https://huggingface.co/BAAI)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`bge-base-en-v1.5`](https://huggingface.co/BAAI/bge-base-en-v1.5) | 1x A40 | - tokens/s | - tokens/s |
-
-### [Sentence Transformers: MiniLM](https://huggingface.co/sentence-transformers)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | 1x A40 | - tokens/s | - tokens/s |
-
-
-
-## Reward Modeling Models
-
-### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-Math-RM-72B`](https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B) | 4x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-Math-PRM-7B`](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B) | 1x a40 | - tokens/s | - tokens/s |
+**NOTE**: These configs act as last-resort fallbacks in the `vec-inf` package; they are updated to match the latest cached config on the Vector Killarney cluster with each new package version release.
vec_inf/config/environment.yaml ADDED
@@ -0,0 +1,35 @@
+paths:
+  image_path: "/model-weights/vec-inf-shared/vector-inference_latest.sif"
+
+containerization:
+  module_load_cmd: "module load apptainer"
+  module_name: "apptainer"
+
+limits:
+  max_gpus_per_node: 8
+  max_num_nodes: 178
+  max_cpus_per_task: 64
+
+allowed_values:
+  qos: []
+  partition: []
+  resource_type: ["l40s", "h100"]
+
+required_args:
+  account: "VEC_INF_ACCOUNT"
+  work_dir: "VEC_INF_WORK_DIR"
+
+default_args:
+  cpus_per_task: "16"
+  mem_per_node: "64G"
+  time: "08:00:00"
+  qos: ""
+  partition: ""
+  resource_type: ""
+  exclude: ""
+  nodelist: ""
+  bind: ""
+  venv: "apptainer"
+  data_type: "auto"
+  log_dir: "~/.vec-inf-logs"
+  model_weights_parent_dir: "/model-weights"
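The empty strings under `default_args` are what the `!= ""` checks in `config.py` above normalize to `None`, and the `VEC_INF_*` values under `required_args` appear to name environment variables the user must set; treating them that way is an assumption in the sketch below, which otherwise touches only keys present in the file:

```python
import os
from pathlib import Path

import yaml  # PyYAML

# Sketch of consuming environment.yaml. Reading required_args values as
# environment-variable names is an assumption based on the VEC_INF_* naming.
cfg = yaml.safe_load(Path("vec_inf/config/environment.yaml").read_text())

max_gpus = cfg["limits"]["max_gpus_per_node"]            # 8
resource_types = cfg["allowed_values"]["resource_type"]  # ["l40s", "h100"]

# An empty-string default means "not configured on this cluster".
qos = cfg["default_args"]["qos"] or None                 # None

account = os.environ.get(cfg["required_args"]["account"])  # reads $VEC_INF_ACCOUNT
```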