PyPI - vec-inf - Versions diffs - 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl - Mend

vec-inf 0.5.0 (py3-none-any.whl) → 0.6.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

vec_inf/README.md +3 -3
vec_inf/cli/_cli.py +214 -104
vec_inf/cli/_helper.py +289 -564
vec_inf/cli/_utils.py +26 -150
vec_inf/cli/_vars.py +32 -0
vec_inf/client/__init__.py +31 -0
vec_inf/client/_client_vars.py +213 -0
vec_inf/client/_exceptions.py +37 -0
vec_inf/client/_helper.py +674 -0
vec_inf/client/_slurm_script_generator.py +179 -0
vec_inf/client/_utils.py +287 -0
vec_inf/client/api.py +302 -0
vec_inf/client/config.py +128 -0
vec_inf/client/models.py +225 -0
vec_inf/client/slurm_vars.py +49 -0
vec_inf/config/README.md +0 -12
vec_inf/config/models.yaml +417 -391
{vec_inf-0.5.0.dist-info → vec_inf-0.6.0.dist-info}/METADATA +44 -61
vec_inf-0.6.0.dist-info/RECORD +25 -0
vec_inf/cli/_config.py +0 -87
vec_inf/multinode_vllm.slurm +0 -154
vec_inf/vllm.slurm +0 -90
vec_inf-0.5.0.dist-info/RECORD +0 -17
{vec_inf-0.5.0.dist-info → vec_inf-0.6.0.dist-info}/WHEEL +0 -0
{vec_inf-0.5.0.dist-info → vec_inf-0.6.0.dist-info}/entry_points.txt +0 -0
{vec_inf-0.5.0.dist-info → vec_inf-0.6.0.dist-info}/licenses/LICENSE +0 -0

vec_inf/config/models.yaml CHANGED Viewed

@@ -6,13 +6,15 @@ models:
     gpus_per_node: 4
     num_nodes: 2
     vocab_size: 256000
-    max_model_len: 8192
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --pipeline-parallel-size: 2
+      --tensor-parallel-size: 4
+      --max-model-len: 8192
+      --max-num-seqs: 256
+      --compilation-config: 3
   c4ai-command-r-plus-08-2024:
     model_family: c4ai-command-r
     model_variant: plus-08-2024
@@ -20,13 +22,15 @@ models:
     gpus_per_node: 4
     num_nodes: 2
     vocab_size: 256000
-    max_model_len: 65536
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --pipeline-parallel-size: 2
+      --tensor-parallel-size: 4
+      --max-model-len: 65536
+      --max-num-seqs: 256
+      --compilation-config: 3
   c4ai-command-r-08-2024:
     model_family: c4ai-command-r
     model_variant: 08-2024
@@ -34,13 +38,14 @@ models:
     gpus_per_node: 2
     num_nodes: 1
     vocab_size: 256000
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   CodeLlama-7b-hf:
     model_family: CodeLlama
     model_variant: 7b-hf
@@ -48,13 +53,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 16384
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 16384
+      --max-num-seqs: 256
+      --compilation-config: 3
   CodeLlama-7b-Instruct-hf:
     model_family: CodeLlama
     model_variant: 7b-Instruct-hf
@@ -62,13 +67,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 16384
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 16384
+      --max-num-seqs: 256
+      --compilation-config: 3
   CodeLlama-13b-hf:
     model_family: CodeLlama
     model_variant: 13b-hf
@@ -76,13 +81,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 16384
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 16384
+      --max-num-seqs: 256
+      --compilation-config: 3
   CodeLlama-13b-Instruct-hf:
     model_family: CodeLlama
     model_variant: 13b-Instruct-hf
@@ -90,13 +95,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 16384
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 16384
+      --max-num-seqs: 256
+      --compilation-config: 3
   CodeLlama-34b-hf:
     model_family: CodeLlama
     model_variant: 34b-hf
@@ -104,13 +109,14 @@ models:
     gpus_per_node: 2
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 16384
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 16384
+      --max-num-seqs: 256
+      --compilation-config: 3
   CodeLlama-34b-Instruct-hf:
     model_family: CodeLlama
     model_variant: 34b-Instruct-hf
@@ -118,55 +124,44 @@ models:
     gpus_per_node: 2
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 16384
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 16384
+      --max-num-seqs: 256
+      --compilation-config: 3
   CodeLlama-70b-hf:
     model_family: CodeLlama
     model_variant: 70b-hf
     model_type: LLM
     gpus_per_node: 4
     num_nodes: 1
-    vocab_size: 32000
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
+    vocab_size: 32016
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 4
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   CodeLlama-70b-Instruct-hf:
     model_family: CodeLlama
     model_variant: 70b-Instruct-hf
     model_type: LLM
     gpus_per_node: 4
     num_nodes: 1
-    vocab_size: 32000
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
-    qos: m2
-    time: 08:00:00
-    partition: a40
-  dbrx-instruct:
-    model_family: dbrx
-    model_variant: instruct
-    model_type: LLM
-    gpus_per_node: 4
-    num_nodes: 2
-    vocab_size: 100352
-    max_model_len: 32000
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
+    vocab_size: 32016
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 4
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   gemma-2-9b:
     model_family: gemma-2
     model_variant: 9b
@@ -174,13 +169,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 256000
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   gemma-2-9b-it:
     model_family: gemma-2
     model_variant: 9b-it
@@ -188,13 +183,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 256000
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   gemma-2-27b:
     model_family: gemma-2
     model_variant: 27b
@@ -202,13 +197,14 @@ models:
     gpus_per_node: 2
     num_nodes: 1
     vocab_size: 256000
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   gemma-2-27b-it:
     model_family: gemma-2
     model_variant: 27b-it
@@ -216,13 +212,14 @@ models:
     gpus_per_node: 2
     num_nodes: 1
     vocab_size: 256000
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   Llama-2-7b-hf:
     model_family: Llama-2
     model_variant: 7b-hf
@@ -230,13 +227,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   Llama-2-7b-chat-hf:
     model_family: Llama-2
     model_variant: 7b-chat-hf
@@ -244,13 +241,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   Llama-2-13b-hf:
     model_family: Llama-2
     model_variant: 13b-hf
@@ -258,13 +255,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   Llama-2-13b-chat-hf:
     model_family: Llama-2
     model_variant: 13b-chat-hf
@@ -272,13 +269,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   Llama-2-70b-hf:
     model_family: Llama-2
     model_variant: 70b-hf
@@ -286,13 +283,14 @@ models:
     gpus_per_node: 4
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 4
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   Llama-2-70b-chat-hf:
     model_family: Llama-2
     model_variant: 70b-chat-hf
@@ -300,13 +298,14 @@ models:
     gpus_per_node: 4
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 4
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   llava-1.5-7b-hf:
     model_family: llava-1.5
     model_variant: 7b-hf
@@ -314,13 +313,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   llava-1.5-13b-hf:
     model_family: llava-1.5
     model_variant: 13b-hf
@@ -328,13 +327,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   llava-v1.6-mistral-7b-hf:
     model_family: llava-v1.6
     model_variant: mistral-7b-hf
@@ -342,13 +341,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 32064
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   llava-v1.6-34b-hf:
     model_family: llava-v1.6
     model_variant: 34b-hf
@@ -356,13 +355,14 @@ models:
     gpus_per_node: 2
     num_nodes: 1
     vocab_size: 64064
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   Meta-Llama-3-8B:
     model_family: Meta-Llama-3
     model_variant: 8B
@@ -370,13 +370,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 8192
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 8192
+      --max-num-seqs: 256
+      --compilation-config: 3
   Meta-Llama-3-8B-Instruct:
     model_family: Meta-Llama-3
     model_variant: 8B-Instruct
@@ -384,13 +384,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 8192
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 8192
+      --max-num-seqs: 256
+      --compilation-config: 3
   Meta-Llama-3-70B:
     model_family: Meta-Llama-3
     model_variant: 70B
@@ -398,13 +398,14 @@ models:
     gpus_per_node: 4
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 8192
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 4
+      --max-model-len: 8192
+      --max-num-seqs: 256
+      --compilation-config: 3
   Meta-Llama-3-70B-Instruct:
     model_family: Meta-Llama-3
     model_variant: 70B-Instruct
@@ -412,13 +413,14 @@ models:
     gpus_per_node: 4
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 8192
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 4
+      --max-model-len: 8192
+      --max-num-seqs: 256
+      --compilation-config: 3
   Meta-Llama-3.1-8B:
     model_family: Meta-Llama-3.1
     model_variant: 8B
@@ -426,13 +428,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 131072
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 131072
+      --max-num-seqs: 256
+      --compilation-config: 3
   Meta-Llama-3.1-8B-Instruct:
     model_family: Meta-Llama-3.1
     model_variant: 8B-Instruct
@@ -440,13 +442,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 131072
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 131072
+      --max-num-seqs: 256
+      --compilation-config: 3
   Meta-Llama-3.1-70B:
     model_family: Meta-Llama-3.1
     model_variant: 70B
@@ -454,13 +456,14 @@ models:
     gpus_per_node: 4
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 65536
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 4
+      --max-model-len: 65536
+      --max-num-seqs: 256
+      --compilation-config: 3
   Meta-Llama-3.1-70B-Instruct:
     model_family: Meta-Llama-3.1
     model_variant: 70B-Instruct
@@ -468,13 +471,14 @@ models:
     gpus_per_node: 4
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 65536
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 4
+      --max-model-len: 65536
+      --max-num-seqs: 256
+      --compilation-config: 3
   Meta-Llama-3.1-405B-Instruct:
     model_family: Meta-Llama-3.1
     model_variant: 405B-Instruct
@@ -482,27 +486,15 @@ models:
     gpus_per_node: 4
     num_nodes: 8
     vocab_size: 128256
-    max_model_len: 16384
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m4
     time: 02:00:00
     partition: a40
-  Mistral-7B-v0.1:
-    model_family: Mistral
-    model_variant: 7B-v0.1
-    model_type: LLM
-    gpus_per_node: 1
-    num_nodes: 1
-    vocab_size: 32000
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
-    qos: m2
-    time: 08:00:00
-    partition: a40
+    vllm_args:
+      --pipeline-parallel-size: 8
+      --tensor-parallel-size: 4
+      --max-model-len: 16384
+      --max-num-seqs: 256
+      --compilation-config: 3
   Mistral-7B-Instruct-v0.1:
     model_family: Mistral
     model_variant: 7B-Instruct-v0.1
@@ -510,13 +502,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   Mistral-7B-Instruct-v0.2:
     model_family: Mistral
     model_variant: 7B-Instruct-v0.2
@@ -524,13 +516,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   Mistral-7B-v0.3:
     model_family: Mistral
     model_variant: 7B-v0.3
@@ -538,13 +530,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 32768
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   Mistral-7B-Instruct-v0.3:
     model_family: Mistral
     model_variant: 7B-Instruct-v0.3
@@ -552,13 +544,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 32768
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   Mistral-Large-Instruct-2407:
     model_family: Mistral
     model_variant: Large-Instruct-2407
@@ -566,13 +558,15 @@ models:
     gpus_per_node: 4
     num_nodes: 2
     vocab_size: 32768
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --pipeline-parallel-size: 2
+      --tensor-parallel-size: 4
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   Mistral-Large-Instruct-2411:
     model_family: Mistral
     model_variant: Large-Instruct-2411
@@ -580,13 +574,15 @@ models:
     gpus_per_node: 4
     num_nodes: 2
     vocab_size: 32768
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --pipeline-parallel-size: 2
+      --tensor-parallel-size: 4
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   Mixtral-8x7B-Instruct-v0.1:
     model_family: Mixtral
     model_variant: 8x7B-Instruct-v0.1
@@ -594,13 +590,14 @@ models:
     gpus_per_node: 4
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 4
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   Mixtral-8x22B-v0.1:
     model_family: Mixtral
     model_variant: 8x22B-v0.1
@@ -608,13 +605,15 @@ models:
     gpus_per_node: 4
     num_nodes: 2
     vocab_size: 32768
-    max_model_len: 65536
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --pipeline-parallel-size: 2
+      --tensor-parallel-size: 4
+      --max-model-len: 65536
+      --max-num-seqs: 256
+      --compilation-config: 3
   Mixtral-8x22B-Instruct-v0.1:
     model_family: Mixtral
     model_variant: 8x22B-Instruct-v0.1
@@ -622,13 +621,15 @@ models:
     gpus_per_node: 4
     num_nodes: 2
     vocab_size: 32768
-    max_model_len: 65536
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --pipeline-parallel-size: 2
+      --tensor-parallel-size: 4
+      --max-model-len: 65536
+      --max-num-seqs: 256
+      --compilation-config: 3
   Phi-3-medium-128k-instruct:
     model_family: Phi-3
     model_variant: medium-128k-instruct
@@ -636,13 +637,14 @@ models:
     gpus_per_node: 2
     num_nodes: 1
     vocab_size: 32064
-    max_model_len: 131072
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 131072
+      --max-num-seqs: 256
+      --compilation-config: 3
   Phi-3-vision-128k-instruct:
     model_family: Phi-3-vision
     model_variant: 128k-instruct
@@ -650,13 +652,14 @@ models:
     gpus_per_node: 2
     num_nodes: 1
     vocab_size: 32064
-    max_model_len: 65536
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 65536
+      --max-num-seqs: 256
+      --compilation-config: 3
   Llama3-OpenBioLLM-70B:
     model_family: Llama3-OpenBioLLM
     model_variant: 70B
@@ -664,13 +667,14 @@ models:
     gpus_per_node: 4
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 8192
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 4
+      --max-model-len: 8192
+      --max-num-seqs: 256
+      --compilation-config: 3
   Llama-3.1-Nemotron-70B-Instruct-HF:
     model_family: Llama-3.1-Nemotron
     model_variant: 70B-Instruct-HF
@@ -678,13 +682,14 @@ models:
     gpus_per_node: 4
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 65536
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 4
+      --max-model-len: 65536
+      --max-num-seqs: 256
+      --compilation-config: 3
   Llama-3.2-1B:
     model_family: Llama-3.2
     model_variant: 1B
@@ -692,13 +697,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 131072
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 131072
+      --max-num-seqs: 256
+      --compilation-config: 3
   Llama-3.2-1B-Instruct:
     model_family: Llama-3.2
     model_variant: 1B-Instruct
@@ -706,13 +711,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 131072
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 131072
+      --max-num-seqs: 256
+      --compilation-config: 3
   Llama-3.2-3B:
     model_family: Llama-3.2
     model_variant: 3B
@@ -720,13 +725,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 131072
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 131072
+      --max-num-seqs: 256
+      --compilation-config: 3
   Llama-3.2-3B-Instruct:
     model_family: Llama-3.2
     model_variant: 3B-Instruct
@@ -734,13 +739,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 131072
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 131072
+      --max-num-seqs: 256
+      --compilation-config: 3
   Llama-3.2-11B-Vision:
     model_family: Llama-3.2
     model_variant: 11B-Vision
@@ -748,13 +753,15 @@ models:
     gpus_per_node: 2
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 4096
-    max_num_seqs: 64
-    pipeline_parallelism: false
-    enforce_eager: true
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 4096
+      --max-num-seqs: 64
+      --compilation-config: 3
+      --enforce-eager: true
   Llama-3.2-11B-Vision-Instruct:
     model_family: Llama-3.2
     model_variant: 11B-Vision-Instruct
@@ -762,13 +769,15 @@ models:
     gpus_per_node: 2
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 4096
-    max_num_seqs: 64
-    pipeline_parallelism: false
-    enforce_eager: true
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 4096
+      --max-num-seqs: 64
+      --compilation-config: 3
+      --enforce-eager: true
   Llama-3.2-90B-Vision:
     model_family: Llama-3.2
     model_variant: 90B-Vision
@@ -776,13 +785,15 @@ models:
     gpus_per_node: 4
     num_nodes: 2
     vocab_size: 128256
-    max_model_len: 4096
-    max_num_seqs: 32
-    pipeline_parallelism: false
-    enforce_eager: true
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 8
+      --max-model-len: 4096
+      --max-num-seqs: 32
+      --compilation-config: 3
+      --enforce-eager: true
   Llama-3.2-90B-Vision-Instruct:
     model_family: Llama-3.2
     model_variant: 90B-Vision-Instruct
@@ -790,13 +801,15 @@ models:
     gpus_per_node: 4
     num_nodes: 2
     vocab_size: 128256
-    max_model_len: 4096
-    max_num_seqs: 32
-    pipeline_parallelism: false
-    enforce_eager: true
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 8
+      --max-model-len: 4096
+      --max-num-seqs: 32
+      --compilation-config: 3
+      --enforce-eager: true
   Qwen2.5-0.5B-Instruct:
     model_family: Qwen2.5
     model_variant: 0.5B-Instruct
@@ -804,13 +817,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   Qwen2.5-1.5B-Instruct:
     model_family: Qwen2.5
     model_variant: 1.5B-Instruct
@@ -818,13 +831,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   Qwen2.5-3B-Instruct:
     model_family: Qwen2.5
     model_variant: 3B-Instruct
@@ -832,13 +845,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   Qwen2.5-7B-Instruct:
     model_family: Qwen2.5
     model_variant: 7B-Instruct
@@ -846,13 +859,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   Qwen2.5-14B-Instruct:
     model_family: Qwen2.5
     model_variant: 14B-Instruct
@@ -860,13 +873,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   Qwen2.5-32B-Instruct:
     model_family: Qwen2.5
     model_variant: 32B-Instruct
@@ -874,13 +887,14 @@ models:
     gpus_per_node: 2
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   Qwen2.5-72B-Instruct:
     model_family: Qwen2.5
     model_variant: 72B-Instruct
@@ -888,13 +902,14 @@ models:
     gpus_per_node: 4
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 16384
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 4
+      --max-model-len: 16384
+      --max-num-seqs: 256
+      --compilation-config: 3
   Qwen2.5-Math-1.5B-Instruct:
     model_family: Qwen2.5
     model_variant: Math-1.5B-Instruct
@@ -902,13 +917,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   Qwen2.5-Math-7B-Instruct:
     model_family: Qwen2.5
     model_variant: Math-7B-Instruct
@@ -916,13 +931,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   Qwen2.5-Math-72B-Instruct:
     model_family: Qwen2.5
     model_variant: Math-72B-Instruct
@@ -930,13 +945,14 @@ models:
     gpus_per_node: 4
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 4
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   Qwen2.5-Coder-7B-Instruct:
     model_family: Qwen2.5
     model_variant: Coder-7B-Instruct
@@ -944,13 +960,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   Qwen2.5-Math-RM-72B:
     model_family: Qwen2.5
     model_variant: Math-RM-72B
@@ -958,13 +974,14 @@ models:
     gpus_per_node: 4
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 4
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   Qwen2.5-Math-PRM-7B:
     model_family: Qwen2.5
     model_variant: Math-PRM-7B
@@ -972,13 +989,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   QwQ-32B-Preview:
     model_family: QwQ
     model_variant: 32B-Preview
@@ -986,13 +1003,14 @@ models:
     gpus_per_node: 2
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   Pixtral-12B-2409:
     model_family: Pixtral
     model_variant: 12B-2409
@@ -1000,13 +1018,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 131072
-    max_model_len: 8192
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 8192
+      --max-num-seqs: 256
+      --compilation-config: 3
   e5-mistral-7b-instruct:
     model_family: e5
     model_variant: mistral-7b-instruct
@@ -1014,13 +1032,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 32000
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   bge-base-en-v1.5:
     model_family: bge
     model_variant: base-en-v1.5
@@ -1028,13 +1046,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 30522
-    max_model_len: 512
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 512
+      --max-num-seqs: 256
+      --compilation-config: 3
   all-MiniLM-L6-v2:
     model_family: all-MiniLM
     model_variant: L6-v2
@@ -1042,13 +1060,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 30522
-    max_model_len: 512
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 512
+      --max-num-seqs: 256
+      --compilation-config: 3
   Llama-3.3-70B-Instruct:
     model_family: Llama-3.3
     model_variant: 70B-Instruct
@@ -1056,13 +1074,14 @@ models:
     gpus_per_node: 4
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 65536
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 4
+      --max-model-len: 65536
+      --max-num-seqs: 256
+      --compilation-config: 3
   InternVL2_5-26B:
     model_family: InternVL2_5
     model_variant: 26B
@@ -1070,13 +1089,14 @@ models:
     gpus_per_node: 2
     num_nodes: 1
     vocab_size: 92553
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   InternVL2_5-38B:
     model_family: InternVL2_5
     model_variant: 38B
@@ -1084,13 +1104,14 @@ models:
     gpus_per_node: 4
     num_nodes: 1
     vocab_size: 92553
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 4
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   Aya-Expanse-32B:
     model_family: Aya-Expanse
     model_variant: 32B
@@ -1098,69 +1119,72 @@ models:
     gpus_per_node: 2
     num_nodes: 1
     vocab_size: 256000
-    max_model_len: 8192
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 8192
+      --max-num-seqs: 256
+      --compilation-config: 3
   DeepSeek-R1-Distill-Llama-70B:
     model_family: DeepSeek-R1
-    model_variant: 'Distill-Llama-70B '
+    model_variant: Distill-Llama-70B
     model_type: LLM
     gpus_per_node: 4
-    num_nodes: 2
+    num_nodes: 1
     vocab_size: 128256
-    max_model_len: 131072
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 4
+      --max-model-len: 65536
+      --max-num-seqs: 256
+      --compilation-config: 3
   DeepSeek-R1-Distill-Llama-8B:
     model_family: DeepSeek-R1
-    model_variant: 'Distill-Llama-8B '
+    model_variant: Distill-Llama-8B
     model_type: LLM
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 128256
-    max_model_len: 131072
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 131072
+      --max-num-seqs: 256
+      --compilation-config: 3
   DeepSeek-R1-Distill-Qwen-32B:
     model_family: DeepSeek-R1
     model_variant: Distill-Qwen-32B
     model_type: LLM
-    gpus_per_node: 4
+    gpus_per_node: 2
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 131072
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 65536
+      --max-num-seqs: 256
+      --compilation-config: 3
   DeepSeek-R1-Distill-Qwen-14B:
     model_family: DeepSeek-R1
     model_variant: Distill-Qwen-14B
     model_type: LLM
-    gpus_per_node: 2
+    gpus_per_node: 1
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 131072
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 65536
+      --max-num-seqs: 256
+      --compilation-config: 3
   DeepSeek-R1-Distill-Qwen-7B:
     model_family: DeepSeek-R1
     model_variant: Distill-Qwen-7B
@@ -1168,13 +1192,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 131072
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 131072
+      --max-num-seqs: 256
+      --compilation-config: 3
   DeepSeek-R1-Distill-Qwen-1.5B:
     model_family: DeepSeek-R1
     model_variant: Distill-Qwen-1.5B
@@ -1182,13 +1206,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 131072
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 131072
+      --max-num-seqs: 256
+      --compilation-config: 3
   Phi-3.5-vision-instruct:
     model_family: Phi-3.5-vision
     model_variant: instruct
@@ -1196,13 +1220,14 @@ models:
     gpus_per_node: 2
     num_nodes: 1
     vocab_size: 32064
-    max_model_len: 65536
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 65536
+      --max-num-seqs: 256
+      --compilation-config: 3
   InternVL2_5-8B:
     model_family: InternVL2_5
     model_variant: 8B
@@ -1210,13 +1235,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 92553
-    max_model_len: 32768
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 32768
+      --max-num-seqs: 256
+      --compilation-config: 3
   glm-4v-9b:
     model_family: glm-4v
     model_variant: 9b
@@ -1224,13 +1249,13 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 151552
-    max_model_len: 8192
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 8192
+      --max-num-seqs: 256
+      --compilation-config: 3
   Molmo-7B-D-0924:
     model_family: Molmo
     model_variant: 7B-D-0924
@@ -1238,26 +1263,27 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 152064
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   deepseek-vl2:
     model_family: deepseek-vl2
     model_type: VLM
     gpus_per_node: 2
     num_nodes: 1
     vocab_size: 129280
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --tensor-parallel-size: 2
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3
   deepseek-vl2-small:
     model_family: deepseek-vl2
     model_variant: small
@@ -1265,10 +1291,10 @@ models:
     gpus_per_node: 1
     num_nodes: 1
     vocab_size: 129280
-    max_model_len: 4096
-    max_num_seqs: 256
-    pipeline_parallelism: true
-    enforce_eager: false
     qos: m2
     time: 08:00:00
     partition: a40
+    vllm_args:
+      --max-model-len: 4096
+      --max-num-seqs: 256
+      --compilation-config: 3

vec-inf 0.5.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

vec-inf 0.5.0py3-none-any.whl → 0.6.0py3-none-any.whl