vec-inf 0.6.1__py3-none-any.whl → 0.7.1__py3-none-any.whl

vec_inf/client/config.py CHANGED
@@ -5,18 +5,19 @@ configurations, including hardware requirements and model specifications.
 """
 
 from pathlib import Path
-from typing import Any, Optional, Union, cast
+from typing import Any, Optional, Union
 
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import Literal
 
-from vec_inf.client.slurm_vars import (
+from vec_inf.client._slurm_vars import (
     DEFAULT_ARGS,
     MAX_CPUS_PER_TASK,
     MAX_GPUS_PER_NODE,
     MAX_NUM_NODES,
     PARTITION,
     QOS,
+    RESOURCE_TYPE,
 )
 
 
@@ -47,14 +48,18 @@ class ModelConfig(BaseModel):
         Memory allocation per node in GB format (e.g., '32G')
     vocab_size : int
         Size of the model's vocabulary (1-1,000,000)
-    account : Optional[str], optional
+    account : str, optional
         Charge resources used by this job to specified account.
+    work_dir : str, optional
+        Set working directory for the batch job
     qos : Union[QOS, str], optional
         Quality of Service tier for job scheduling
     time : str, optional
         Time limit for the job in HH:MM:SS format
     partition : Union[PARTITION, str], optional
-        GPU partition type for job scheduling
+        Slurm partition for job scheduling
+    resource_type : Union[RESOURCE_TYPE, str], optional
+        Type of resource to request for the job
     venv : str, optional
        Virtual environment or container system to use
     log_dir : Path, optional
@@ -83,13 +88,13 @@ class ModelConfig(BaseModel):
     )
     num_nodes: int = Field(..., gt=0, le=MAX_NUM_NODES, description="Number of nodes")
     cpus_per_task: int = Field(
-        default=cast(int, DEFAULT_ARGS["cpus_per_task"]),
+        default=int(DEFAULT_ARGS["cpus_per_task"]),
         gt=0,
         le=MAX_CPUS_PER_TASK,
         description="CPUs per task",
     )
     mem_per_node: str = Field(
-        default=cast(str, DEFAULT_ARGS["mem_per_node"]),
+        default=DEFAULT_ARGS["mem_per_node"],
         pattern=r"^\d{1,4}G$",
         description="Memory per node",
     )
@@ -97,42 +102,58 @@ class ModelConfig(BaseModel):
     account: Optional[str] = Field(
         default=None, description="Account name for job scheduling"
     )
-    qos: Union[QOS, str] = Field(
-        default=cast(str, DEFAULT_ARGS["qos"]), description="Quality of Service tier"
+    work_dir: Optional[str] = Field(
+        default=None, description="Working directory for the job"
+    )
+    qos: Optional[Union[QOS, str]] = Field(
+        default=DEFAULT_ARGS["qos"] if DEFAULT_ARGS["qos"] != "" else None,
+        description="Quality of Service tier",
     )
     time: str = Field(
-        default=cast(str, DEFAULT_ARGS["time"]),
+        default=DEFAULT_ARGS["time"],
         pattern=r"^\d{2}:\d{2}:\d{2}$",
         description="HH:MM:SS time limit",
     )
-    partition: Union[PARTITION, str] = Field(
-        default=cast(str, DEFAULT_ARGS["partition"]), description="GPU partition type"
+    partition: Optional[Union[PARTITION, str]] = Field(
+        default=DEFAULT_ARGS["partition"] if DEFAULT_ARGS["partition"] != "" else None,
+        description="GPU partition type",
+    )
+    resource_type: Optional[Union[RESOURCE_TYPE, str]] = Field(
+        default=DEFAULT_ARGS["resource_type"]
+        if DEFAULT_ARGS["resource_type"] != ""
+        else None,
+        description="Resource type",
     )
     exclude: Optional[str] = Field(
-        default=None,
+        default=DEFAULT_ARGS["exclude"],
         description="Exclude certain nodes from the resources granted to the job",
     )
-    node_list: Optional[str] = Field(
-        default=None, description="Request a specific list of nodes for deployment"
+    nodelist: Optional[str] = Field(
+        default=DEFAULT_ARGS["nodelist"],
+        description="Request a specific list of nodes for deployment",
     )
     bind: Optional[str] = Field(
-        default=None, description="Additional binds for the singularity container"
+        default=DEFAULT_ARGS["bind"],
+        description="Additional binds for the container",
     )
     venv: str = Field(
-        default="singularity", description="Virtual environment/container system"
+        default=DEFAULT_ARGS["venv"],
+        description="Virtual environment/container system",
     )
     log_dir: Path = Field(
-        default=Path(cast(str, DEFAULT_ARGS["log_dir"])),
+        default=Path(DEFAULT_ARGS["log_dir"]),
         description="Log directory path",
     )
     model_weights_parent_dir: Path = Field(
-        default=Path(cast(str, DEFAULT_ARGS["model_weights_parent_dir"])),
+        default=Path(DEFAULT_ARGS["model_weights_parent_dir"]),
        description="Base directory for model weights",
     )
     vllm_args: Optional[dict[str, Any]] = Field(
         default={}, description="vLLM engine arguments"
     )
-
+    env: Optional[dict[str, Any]] = Field(
+        default={}, description="Environment variables to be set"
+    )
     model_config = ConfigDict(
         extra="forbid", str_strip_whitespace=True, validate_default=True, frozen=True
     )
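Taken together, these changes replace hard-coded scheduler defaults (for example, the old `venv` default of `"singularity"`) with cluster-supplied values from `_slurm_vars.DEFAULT_ARGS`, where an empty string stands for "no default on this cluster". A minimal sketch of that normalization pattern, using a hypothetical `DEFAULT_ARGS` dict rather than the real `vec_inf.client._slurm_vars` module:

```python
from typing import Optional

# Hypothetical stand-in for vec_inf.client._slurm_vars.DEFAULT_ARGS; the
# empty strings mirror the placeholders in environment.yaml further below.
DEFAULT_ARGS: dict[str, str] = {
    "qos": "",                # this cluster defines no default QOS
    "partition": "",          # this cluster defines no default partition
    "resource_type": "l40s",  # this cluster supplies a concrete default
}


def default_or_none(key: str) -> Optional[str]:
    """Normalize an empty-string default to None, as the new Field defaults do."""
    value = DEFAULT_ARGS[key]
    return value if value != "" else None


assert default_or_none("qos") is None
assert default_or_none("resource_type") == "l40s"
```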
vec_inf/client/models.py CHANGED
@@ -82,7 +82,7 @@ class LaunchResponse:
 
     Parameters
     ----------
-    slurm_job_id : int
+    slurm_job_id : str
         ID of the launched SLURM job
     model_name : str
         Name of the launched model
@@ -92,12 +92,37 @@ class LaunchResponse:
         Raw output from the launch command (hidden from repr)
     """
 
-    slurm_job_id: int
+    slurm_job_id: str
     model_name: str
     config: dict[str, Any]
     raw_output: str = field(repr=False)
 
 
+@dataclass
+class BatchLaunchResponse:
+    """Response from launching multiple models in batch mode.
+
+    Parameters
+    ----------
+    slurm_job_id : str
+        ID of the launched SLURM job
+    slurm_job_name : str
+        Name of the launched SLURM job
+    model_names : list[str]
+        Names of the launched models
+    config : dict[str, Any]
+        Configuration used for the launch
+    raw_output : str
+        Raw output from the launch command (hidden from repr)
+    """
+
+    slurm_job_id: str
+    slurm_job_name: str
+    model_names: list[str]
+    config: dict[str, Any]
+    raw_output: str = field(repr=False)
+
+
 @dataclass
 class StatusResponse:
     """Response from checking a model's status.
@@ -106,6 +131,8 @@ class StatusResponse:
     ----------
     model_name : str
         Name of the model
+    log_dir : str
+        Path to the SLURM log directory
     server_status : ModelStatus
         Current status of the server
     job_state : Union[str, ModelStatus]
@@ -121,6 +148,7 @@ class StatusResponse:
     """
 
     model_name: str
+    log_dir: str
     server_status: ModelStatus
     job_state: Union[str, ModelStatus]
     raw_output: str = field(repr=False)
@@ -160,12 +188,16 @@ class LaunchOptions:
         Specific variant/version of the model
     partition : str, optional
         SLURM partition to use
+    resource_type : str, optional
+        Type of resource to request for the job
     num_nodes : int, optional
         Number of nodes to allocate
     gpus_per_node : int, optional
         Number of GPUs per node
     account : str, optional
         Account name for job scheduling
+    work_dir : str, optional
+        Set working directory for the batch job
     qos : str, optional
         Quality of Service level
     time : str, optional
@@ -175,7 +207,7 @@ class LaunchOptions:
     node_list : str, optional
         Request a specific list of nodes for deployment
     bind : str, optional
-        Additional binds for the singularity container
+        Additional binds for the container as a comma separated list of bind paths
     vocab_size : int, optional
         Size of model vocabulary
     data_type : str, optional
@@ -188,17 +220,23 @@ class LaunchOptions:
         Parent directory containing model weights
     vllm_args : str, optional
         Additional arguments for vLLM
+    env : str, optional
+        Environment variables to be set
+    config : str, optional
+        Path to custom model config yaml
     """
 
     model_family: Optional[str] = None
     model_variant: Optional[str] = None
     partition: Optional[str] = None
+    resource_type: Optional[str] = None
     num_nodes: Optional[int] = None
     gpus_per_node: Optional[int] = None
     account: Optional[str] = None
+    work_dir: Optional[str] = None
     qos: Optional[str] = None
     exclude: Optional[str] = None
-    node_list: Optional[str] = None
+    nodelist: Optional[str] = None
     bind: Optional[str] = None
     time: Optional[str] = None
     vocab_size: Optional[int] = None
@@ -207,6 +245,8 @@ class LaunchOptions:
     log_dir: Optional[str] = None
     model_weights_parent_dir: Optional[str] = None
     vllm_args: Optional[str] = None
+    env: Optional[str] = None
+    config: Optional[str] = None
 
 
 @dataclass
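Two practical consequences of this file's changes: `slurm_job_id` is now a string on the response types, and callers that passed `node_list=` to `LaunchOptions` must now pass `nodelist=`. A quick sketch of the new `BatchLaunchResponse`; the class and field types come from the diff above, while every value is invented for illustration:

```python
from vec_inf.client.models import BatchLaunchResponse

# All field values below are hypothetical; note slurm_job_id is a str, not an int.
response = BatchLaunchResponse(
    slurm_job_id="1234567",
    slurm_job_name="BATCH-job",  # hypothetical batch job name
    model_names=["model-a", "model-b"],
    config={"num_nodes": 1, "gpus_per_node": 2},
    raw_output="Submitted batch job 1234567",
)

# raw_output is declared with field(repr=False), so it is omitted here:
print(response)
```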
vec_inf/config/README.md CHANGED
@@ -1,245 +1,6 @@
-# Available Models
-More profiling metrics coming soon!
+# Configs
 
-## Text Generation Models
+* [`environment.yaml`](environment.yaml): Configuration for the Slurm cluster environment, including image paths, resource availability, and default values.
+* [`models.yaml`](models.yaml): Configuration for launching model inference servers, including Slurm parameters as well as `vllm serve` arguments.
 
-### [Cohere for AI: Command R](https://huggingface.co/collections/CohereForAI/c4ai-command-r-plus-660ec4c34f7a69c50ce7f7b9)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`c4ai-command-r-plus`](https://huggingface.co/CohereForAI/c4ai-command-r-plus) | 8x a40 (2 nodes, 4 a40/node) | 412 tokens/s | 541 tokens/s |
-| [`c4ai-command-r-plus-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-plus-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-| [`c4ai-command-r-08-2024`](https://huggingface.co/CohereForAI/c4ai-command-r-08-2024) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-### [Code Llama](https://huggingface.co/collections/meta-llama/code-llama-family-661da32d0a9d678b6f55b933)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`CodeLlama-7b-hf`](https://huggingface.co/meta-llama/CodeLlama-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-7b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-7b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-13b-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-13b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-13b-Instruct-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-34b-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-34b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-34b-Instruct-hf) | 2x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-70b-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-hf) | 4x a40 | - tokens/s | - tokens/s |
-| [`CodeLlama-70b-Instruct-hf`](https://huggingface.co/meta-llama/CodeLlama-70b-Instruct-hf) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Google: Gemma 2](https://huggingface.co/collections/google/gemma-2-release-667d6600fd5220e7b967f315)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`gemma-2-9b`](https://huggingface.co/google/gemma-2-9b) | 1x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-9b-it`](https://huggingface.co/google/gemma-2-9b-it) | 1x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-27b`](https://huggingface.co/google/gemma-2-27b) | 2x a40 | - tokens/s | - tokens/s |
-| [`gemma-2-27b-it`](https://huggingface.co/google/gemma-2-27b-it) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Meta: Llama 2](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b)
-
-| Variant | Suggested resource allocation |
-|:----------:|:----------:|
-| [`Llama-2-7b-hf`](https://huggingface.co/meta-llama/Llama-2-7b-hf) | 1x a40 |
-| [`Llama-2-7b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) | 1x a40 |
-| [`Llama-2-13b-hf`](https://huggingface.co/meta-llama/Llama-2-13b-hf) | 1x a40 |
-| [`Llama-2-13b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) | 1x a40 |
-| [`Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf) | 4x a40 |
-| [`Llama-2-70b-chat-hf`](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) | 4x a40 |
-
-### [Meta: Llama 3](https://huggingface.co/collections/meta-llama/meta-llama-3-66214712577ca38149ebb2b6)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Meta-Llama-3-8B`](https://huggingface.co/meta-llama/Meta-Llama-3-8B) | 1x a40 | 222 tokens/s | 1811 tokens/s |
-| [`Meta-Llama-3-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | 1x a40 | 371 tokens/s | 1990 tokens/s |
-| [`Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B) | 4x a40 | 81 tokens/s | 618 tokens/s |
-| [`Meta-Llama-3-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct) | 4x a40 | 301 tokens/s | 660 tokens/s |
-
-### [Meta: Llama 3.1](https://huggingface.co/collections/meta-llama/llama-31-669fc079a0c406a149a5738f)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Meta-Llama-3.1-8B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B) | 1x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-8B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-70B`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B) | 4x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-70B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
-| [`Meta-Llama-3.1-405B-Instruct`](https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct) | 32x a40 (8 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Llama-3.2-1B`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 1x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-1B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-3B`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 1x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-
-### [Mistral AI: Mistral](https://huggingface.co/mistralai)
-
-| Variant (Mistral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-7B-Instruct-v0.2`](https://huggingface.co/mistralai/Mistral-7B-v0.2) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-7B-v0.3`](https://huggingface.co/mistralai/Mistral-7B-v0.3) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-7B-Instruct-v0.3`](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3) | 1x a40 | - tokens/s | - tokens/s |
-| [`Mistral-Large-Instruct-2407`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2407) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-| [`Mistral-Large-Instruct-2411`](https://huggingface.co/mistralai/Mistral-Large-Instruct-2411) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-### [Mistral AI: Mixtral](https://huggingface.co/mistralai)
-
-| Variant (Mixtral) | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Mixtral-8x7B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | 4x a40 | 222 tokens/s | 1543 tokens/s |
-| [`Mixtral-8x22B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 145 tokens/s | 827 tokens/s |
-| [`Mixtral-8x22B-Instruct-v0.1`](https://huggingface.co/mistralai/Mixtral-8x22B-Instruct-v0.1) | 8x a40 (2 nodes, 4 a40/node) | 95 tokens/s | 803 tokens/s |
-
-### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Phi-3-medium-128k-instruct`](https://huggingface.co/microsoft/Phi-3-medium-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Nvidia: Llama-3.1-Nemotron](https://huggingface.co/collections/nvidia/llama-31-nemotron-70b-670e93cd366feea16abc13d8)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Llama-3.1-Nemotron-70B-Instruct-HF`](https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Instruct-HF) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: Qwen2.5](https://huggingface.co/collections/Qwen/qwen25-66e81a666513e518adb90d9e)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-0.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-3B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-14B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-32B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-1.5B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-72B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Math-72B-Instruct) | 4x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: Qwen2.5-Coder](https://huggingface.co/collections/Qwen/qwen25-coder-66eaa22e6f99801bf65b0c2f)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-Coder-7B-Instruct`](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) | 1x a40 | - tokens/s | - tokens/s |
-
-### [Qwen: QwQ](https://huggingface.co/collections/Qwen/qwq-674762b79b75eac01735070a)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`QwQ-32B-Preview`](https://huggingface.co/Qwen/QwQ-32B-Preview) | 2x a40 | - tokens/s | - tokens/s |
-
-### [DeepSeek-R1: Distilled Models](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`DeepSeek-R1-Distill-Llama-8B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B) | 1x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Llama-70B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-70B) | 4x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-1.5B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B) | 1x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-7B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B) | 1x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-14B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-14B) | 2x a40 | - tokens/s | - tokens/s |
-| [`DeepSeek-R1-Distill-Qwen-32B`](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) | 4x a40 | - tokens/s | - tokens/s |
-
-
-## Vision Language Models
-
-### [allenai: Molmo](https://huggingface.co/collections/allenai/molmo-66f379e6fe3b8ef090a8ca19)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Molmo-7B-D-0924`](https://huggingface.co/allenai/Molmo-7B-D-0924) | 1x a40 | - tokens/s | - tokens/s |
-
-
-### [LLaVa-1.5](https://huggingface.co/collections/llava-hf/llava-15-65f762d5b6941db5c2ba07e0)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`llava-1.5-7b-hf`](https://huggingface.co/llava-hf/llava-1.5-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`llava-1.5-13b-hf`](https://huggingface.co/llava-hf/llava-1.5-13b-hf) | 1x a40 | - tokens/s | - tokens/s |
-
-### [LLaVa-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`llava-v1.6-mistral-7b-hf`](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) | 1x a40 | - tokens/s | - tokens/s |
-| [`llava-v1.6-34b-hf`](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Microsoft: Phi 3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Phi-3-vision-128k-instruct`](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct) | 2x a40 | - tokens/s | - tokens/s |
-| [`Phi-3.5-vision-instruct`](https://huggingface.co/microsoft/Phi-3.5-vision-instruct) | 2x a40 | - tokens/s | - tokens/s |
-
-### [Meta: Llama 3.2](https://huggingface.co/collections/meta-llama/llama-32-66f448ffc8c32f949b04c8cf)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Llama-3.2-11B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-1B) | 2x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-11B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | 2x a40 | - tokens/s | - tokens/s |
-| [`Llama-3.2-90B-Vision`](https://huggingface.co/meta-llama/Llama-3.2-3B) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-| [`Llama-3.2-90B-Vision-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | 8x a40 (2 nodes, 4 a40/node) | - tokens/s | - tokens/s |
-
-**NOTE**: `MllamaForConditionalGeneration` currently doesn't support pipeline parallelism; to save memory, the maximum number of requests is reduced and enforce-eager mode is enabled.
-
-### [Mistral: Pixtral](https://huggingface.co/mistralai)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Pixtral-12B-2409`](https://huggingface.co/mistralai/Pixtral-12B-2409) | 1x a40 | - tokens/s | - tokens/s |
-
-### [OpenGVLab: InternVL2.5](https://huggingface.co/collections/OpenGVLab/internvl25-673e1019b66e2218f68d7c1c)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`InternVL2_5-8B`](https://huggingface.co/OpenGVLab/InternVL2_5-8B) | 1x a40 | - tokens/s | - tokens/s |
-| [`InternVL2_5-26B`](https://huggingface.co/OpenGVLab/InternVL2_5-26B) | 2x a40 | - tokens/s | - tokens/s |
-| [`InternVL2_5-38B`](https://huggingface.co/OpenGVLab/InternVL2_5-38B) | 4x a40 | - tokens/s | - tokens/s |
-
-### [THUDM: GLM-4](https://huggingface.co/collections/THUDM/glm-4-665fcf188c414b03c2f7e3b7)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`glm-4v-9b`](https://huggingface.co/THUDM/glm-4v-9b) | 1x a40 | - tokens/s | - tokens/s |
-
-### [DeepSeek: DeepSeek-VL2](https://huggingface.co/collections/deepseek-ai/deepseek-vl2-675c22accc456d3beb4613ab)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`deepseek-vl2`](https://huggingface.co/deepseek-ai/deepseek-vl2) | 2x a40 | - tokens/s | - tokens/s |
-| [`deepseek-vl2-small`](https://huggingface.co/deepseek-ai/deepseek-vl2-small) | 1x a40 | - tokens/s | - tokens/s |
-
-
-## Text Embedding Models
-
-### [Liang Wang: e5](https://huggingface.co/intfloat)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`e5-mistral-7b-instruct`](https://huggingface.co/intfloat/e5-mistral-7b-instruct) | 1x a40 | - tokens/s | - tokens/s |
-
-### [BAAI: bge](https://huggingface.co/BAAI)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`bge-base-en-v1.5`](https://huggingface.co/BAAI/bge-base-en-v1.5) | 1x A40 | - tokens/s | - tokens/s |
-
-### [Sentence Transformers: MiniLM](https://huggingface.co/sentence-transformers)
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`all-MiniLM-L6-v2`](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | 1x A40 | - tokens/s | - tokens/s |
-
-
-
-## Reward Modeling Models
-
-### [Qwen: Qwen2.5-Math](https://huggingface.co/collections/Qwen/qwen25-math-66eaa240a1b7d5ee65f1da3e)
-
-| Variant | Suggested resource allocation | Avg prompt throughput | Avg generation throughput |
-|:----------:|:----------:|:----------:|:----------:|
-| [`Qwen2.5-Math-RM-72B`](https://huggingface.co/Qwen/Qwen2.5-Math-RM-72B) | 4x a40 | - tokens/s | - tokens/s |
-| [`Qwen2.5-Math-PRM-7B`](https://huggingface.co/Qwen/Qwen2.5-Math-PRM-7B) | 1x a40 | - tokens/s | - tokens/s |
+**NOTE**: These configs act as last-resort fallbacks in the `vec-inf` package; they are updated to match the latest cached config on the Vector Killarney cluster with each new package version release.
vec_inf/config/environment.yaml ADDED
@@ -0,0 +1,35 @@
+paths:
+  image_path: "/model-weights/vec-inf-shared/vector-inference_latest.sif"
+
+containerization:
+  module_load_cmd: "module load apptainer"
+  module_name: "apptainer"
+
+limits:
+  max_gpus_per_node: 8
+  max_num_nodes: 178
+  max_cpus_per_task: 64
+
+allowed_values:
+  qos: []
+  partition: []
+  resource_type: ["l40s", "h100"]
+
+required_args:
+  account: "VEC_INF_ACCOUNT"
+  work_dir: "VEC_INF_WORK_DIR"
+
+default_args:
+  cpus_per_task: "16"
+  mem_per_node: "64G"
+  time: "08:00:00"
+  qos: ""
+  partition: ""
+  resource_type: ""
+  exclude: ""
+  nodelist: ""
+  bind: ""
+  venv: "apptainer"
+  data_type: "auto"
+  log_dir: "~/.vec-inf-logs"
+  model_weights_parent_dir: "/model-weights"
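The empty strings under `default_args` are what the `!= ""` checks in `config.py` above normalize to `None`, and the `VEC_INF_*` values under `required_args` appear to name environment variables the user must set; treating them that way is an assumption in the sketch below, which otherwise touches only keys present in the file:

```python
import os
from pathlib import Path

import yaml  # PyYAML

# Sketch of consuming environment.yaml. Reading required_args values as
# environment-variable names is an assumption based on the VEC_INF_* naming.
cfg = yaml.safe_load(Path("vec_inf/config/environment.yaml").read_text())

max_gpus = cfg["limits"]["max_gpus_per_node"]            # 8
resource_types = cfg["allowed_values"]["resource_type"]  # ["l40s", "h100"]

# An empty-string default means "not configured on this cluster".
qos = cfg["default_args"]["qos"] or None                 # None

account = os.environ.get(cfg["required_args"]["account"])  # reads $VEC_INF_ACCOUNT
```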