vec-inf 0.7.3-py3-none-any.whl → 0.8.0-py3-none-any.whl

vec_inf/cli/_cli.py CHANGED
@@ -132,10 +132,20 @@ def cli() -> None:
     type=str,
     help="Path to parent directory containing model weights",
 )
+@click.option(
+    "--engine",
+    type=str,
+    help="Inference engine to use, supports 'vllm' and 'sglang'",
+)
 @click.option(
     "--vllm-args",
     type=str,
-    help="vLLM engine arguments to be set, use the format as specified in vLLM documentation and separate arguments with commas, e.g. --vllm-args '--max-model-len=8192,--max-num-seqs=256,--enable-prefix-caching'",
+    help="vLLM engine arguments to be set, use the format as specified in vLLM serve documentation and separate arguments with commas, e.g. --vllm-args '--max-model-len=8192,--max-num-seqs=256,--enable-prefix-caching'",
+)
+@click.option(
+    "--sglang-args",
+    type=str,
+    help="SGLang engine arguments to be set, use the format as specified in SGLang Server Arguments documentation and separate arguments with commas, e.g. --sglang-args '--context-length=8192,--mem-fraction-static=0.85'",
 )
 @click.option(
     "--json-mode",
@@ -150,7 +160,7 @@ def cli() -> None:
 @click.option(
     "--config",
     type=str,
-    help="Path to a model config yaml file to use in place of the default",
+    help="Path to a model config yaml file to use in place of the default, you can also set VEC_INF_MODEL_CONFIG to the path to the model config file",
 )
 def launch(
     model_name: str,
@@ -201,7 +211,9 @@ def launch(
     - model_weights_parent_dir : str, optional
         Path to model weights directory
     - vllm_args : str, optional
-        vLLM engine arguments
+        vllm engine arguments
+    - sglang_args : str, optional
+        sglang engine arguments
     - env : str, optional
         Environment variables
     - config : str, optional
@@ -229,6 +241,10 @@ def launch(
     if json_mode:
         click.echo(json.dumps(launch_response.config))
     else:
+        if launch_response.config.get("engine_inferred"):
+            CONSOLE.print(
+                "Warning: Inference engine inferred from engine-specific args"
+            )
         launch_formatter = LaunchResponseFormatter(
             model_name, launch_response.config
         )
vec_inf/cli/_helper.py CHANGED
@@ -15,7 +15,7 @@ from rich.panel import Panel
 from rich.table import Table
 
 from vec_inf.cli._utils import create_table
-from vec_inf.cli._vars import MODEL_TYPE_COLORS, MODEL_TYPE_PRIORITY
+from vec_inf.cli._vars import ENGINE_NAME_MAP, MODEL_TYPE_COLORS, MODEL_TYPE_PRIORITY
 from vec_inf.client import ModelConfig, ModelInfo, StatusResponse
 
 
@@ -49,11 +49,12 @@ class LaunchResponseFormatter:
         if self.params.get(key):
             table.add_row(label, self.params[key])
 
-    def _add_vllm_config(self, table: Table) -> None:
-        """Add vLLM configuration details to the table."""
-        if self.params.get("vllm_args"):
-            table.add_row("vLLM Arguments:", style="magenta")
-            for arg, value in self.params["vllm_args"].items():
+    def _add_engine_config(self, table: Table) -> None:
+        """Add inference engine configuration details to the table."""
+        if self.params.get("engine_args"):
+            engine_name = ENGINE_NAME_MAP[self.params["engine"]]
+            table.add_row(f"{engine_name} Arguments:", style="magenta")
+            for arg, value in self.params["engine_args"].items():
                 table.add_row(f" {arg}:", str(value))
 
     def _add_env_vars(self, table: Table) -> None:
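`_add_vllm_config` is generalized into `_add_engine_config`, which resolves the display name through `ENGINE_NAME_MAP` before listing the engine arguments. A standalone sketch of the resulting rendering, with the map and params inlined as stand-ins for what a launch response provides:

```python
# Standalone sketch; ENGINE_NAME_MAP and params are inlined stand-ins.
from rich.console import Console
from rich.table import Table

ENGINE_NAME_MAP = {"vllm": "vLLM", "sglang": "SGLang"}
params = {
    "engine": "sglang",
    "engine_args": {"--context-length": "8192", "--mem-fraction-static": "0.85"},
}

table = Table(show_header=False)
table.add_column("Key")
table.add_column("Value")
table.add_row(f"{ENGINE_NAME_MAP[params['engine']]} Arguments:", style="magenta")
for arg, value in params["engine_args"].items():
    table.add_row(f" {arg}:", str(value))
Console().print(table)
```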
@@ -111,9 +112,10 @@ class LaunchResponseFormatter:
             str(Path(self.params["model_weights_parent_dir"], self.model_name)),
         )
         table.add_row("Log Directory", self.params["log_dir"])
+        table.add_row("Inference Engine", ENGINE_NAME_MAP[self.params["engine"]])
 
         # Add configuration details
-        self._add_vllm_config(table)
+        self._add_engine_config(table)
         self._add_env_vars(table)
         self._add_bind_paths(table)
 
@@ -185,6 +187,10 @@ class BatchLaunchResponseFormatter:
         table.add_row(
             "Memory/Node", f" {self.params['models'][model_name]['mem_per_node']}"
         )
+        table.add_row(
+            "Inference Engine",
+            f" {ENGINE_NAME_MAP[self.params['models'][model_name]['engine']]}",
+        )
 
         return table
 
@@ -479,14 +485,19 @@ class ListCmdDisplay:
             )
             return json.dumps(config_dict, indent=4)
 
+        excluded_list = ["venv", "log_dir"]
+
         table = create_table(key_title="Model Config", value_title="Value")
         for field, value in config.model_dump().items():
-            if field not in {"venv", "log_dir", "vllm_args"} and value:
+            if "args" in field:
+                if not value:
+                    continue
+                engine_name = ENGINE_NAME_MAP[field.split("_")[0]]
+                table.add_row(f"{engine_name} Arguments:", style="magenta")
+                for engine_arg, engine_value in value.items():
+                    table.add_row(f" {engine_arg}:", str(engine_value))
+            elif field not in excluded_list and value:
                 table.add_row(field, str(value))
-            if field == "vllm_args":
-                table.add_row("vLLM Arguments:", style="magenta")
-                for vllm_arg, vllm_value in value.items():
-                    table.add_row(f" {vllm_arg}:", str(vllm_value))
         return table
 
     def _format_all_models_output(
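The rewritten loop treats any `*_args` config field as engine arguments and derives the section heading from the field name itself:

```python
# How the display name is derived from a config field name in the loop above
# (map values reproduced from vec_inf/cli/_vars.py).
ENGINE_NAME_MAP = {"vllm": "vLLM", "sglang": "SGLang"}

for field in ("vllm_args", "sglang_args"):
    print(field, "->", ENGINE_NAME_MAP[field.split("_")[0]])
# vllm_args -> vLLM
# sglang_args -> SGLang
```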
vec_inf/cli/_vars.py CHANGED
@@ -1,32 +1,47 @@
 """Constants for CLI rendering.
 
-This module defines constant mappings for model type priorities and colors
+This module defines mappings for model type priorities, colors, and engine name mappings
 used in the CLI display formatting.
+"""
 
-Constants
----------
-MODEL_TYPE_PRIORITY : dict
-    Mapping of model types to their display priority (lower numbers shown first)
+from typing import get_args
 
-MODEL_TYPE_COLORS : dict
-    Mapping of model types to their display colors in Rich
+from vec_inf.client._slurm_vars import MODEL_TYPES
 
-Notes
------
-These constants are used primarily by the ListCmdDisplay class to ensure
-consistent sorting and color coding of different model types in the CLI output.
-"""
 
-MODEL_TYPE_PRIORITY = {
-    "LLM": 0,
-    "VLM": 1,
-    "Text_Embedding": 2,
-    "Reward_Modeling": 3,
-}
+# Extract model type values from the Literal type
+_MODEL_TYPES = get_args(MODEL_TYPES)
+
+# Rich color options (prioritizing current colors, with fallbacks for additional types)
+_RICH_COLORS = [
+    "cyan",
+    "bright_blue",
+    "purple",
+    "bright_magenta",
+    "green",
+    "yellow",
+    "bright_green",
+    "bright_yellow",
+    "red",
+    "bright_red",
+    "blue",
+    "magenta",
+    "bright_cyan",
+    "white",
+    "bright_white",
+]
 
+# Mapping of model types to their display priority (lower numbers shown first)
+MODEL_TYPE_PRIORITY = {model_type: idx for idx, model_type in enumerate(_MODEL_TYPES)}
+
+# Mapping of model types to their display colors in Rich
 MODEL_TYPE_COLORS = {
-    "LLM": "cyan",
-    "VLM": "bright_blue",
-    "Text_Embedding": "purple",
-    "Reward_Modeling": "bright_magenta",
+    model_type: _RICH_COLORS[idx % len(_RICH_COLORS)]
+    for idx, model_type in enumerate(_MODEL_TYPES)
+}
+
+# Inference engine choice and name mapping
+ENGINE_NAME_MAP = {
+    "vllm": "vLLM",
+    "sglang": "SGLang",
 }
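Both mappings are now derived from the `MODEL_TYPES` Literal rather than hand-maintained, so adding a model type in `_slurm_vars` automatically picks up a priority and a color. A self-contained sketch (the Literal here is a stand-in for the real one in `vec_inf.client._slurm_vars`, using the four types from the deleted dicts):

```python
# Stand-in for vec_inf.client._slurm_vars.MODEL_TYPES.
from typing import Literal, get_args

MODEL_TYPES = Literal["LLM", "VLM", "Text_Embedding", "Reward_Modeling"]

_MODEL_TYPES = get_args(MODEL_TYPES)
_RICH_COLORS = ["cyan", "bright_blue", "purple", "bright_magenta"]

# Priority follows declaration order; colors cycle if types outnumber colors.
MODEL_TYPE_PRIORITY = {m: i for i, m in enumerate(_MODEL_TYPES)}
MODEL_TYPE_COLORS = {
    m: _RICH_COLORS[i % len(_RICH_COLORS)] for i, m in enumerate(_MODEL_TYPES)
}

print(MODEL_TYPE_PRIORITY)
# {'LLM': 0, 'VLM': 1, 'Text_Embedding': 2, 'Reward_Modeling': 3}
```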
vec_inf/client/_client_vars.py CHANGED
@@ -49,7 +49,7 @@ SLURM_JOB_CONFIG_ARGS = {
     "time": "time",
     "nodes": "num_nodes",
     "exclude": "exclude",
-    "nodelist": "node_list",
+    "nodelist": "nodelist",
     "gres": "gres",
     "cpus-per-task": "cpus_per_task",
     "mem": "mem_per_node",
@@ -61,13 +61,43 @@
 VLLM_SHORT_TO_LONG_MAP = {
     "-tp": "--tensor-parallel-size",
     "-pp": "--pipeline-parallel-size",
+    "-n": "--nnodes",
+    "-r": "--node-rank",
+    "-dcp": "--decode-context-parallel-size",
+    "-pcp": "--prefill-context-parallel-size",
     "-dp": "--data-parallel-size",
+    "-dpn": "--data-parallel-rank",
+    "-dpr": "--data-parallel-start-rank",
     "-dpl": "--data-parallel-size-local",
     "-dpa": "--data-parallel-address",
     "-dpp": "--data-parallel-rpc-port",
+    "-dpb": "--data-parallel-backend",
+    "-dph": "--data-parallel-hybrid-lb",
+    "-dpe": "--data-parallel-external-lb",
     "-O": "--compilation-config",
     "-q": "--quantization",
 }
 
+# SGLang engine args mapping between short and long names
+SGLANG_SHORT_TO_LONG_MAP = {
+    "--tp": "--tensor-parallel-size",
+    "--tp-size": "--tensor-parallel-size",
+    "--pp": "--pipeline-parallel-size",
+    "--pp-size": "--pipeline-parallel-size",
+    "--dp": "--data-parallel-size",
+    "--dp-size": "--data-parallel-size",
+    "--ep": "--expert-parallel-size",
+    "--ep-size": "--expert-parallel-size",
+}
+
+# Mapping of engine short names to their argument mappings
+ENGINE_SHORT_TO_LONG_MAP = {
+    "vllm": VLLM_SHORT_TO_LONG_MAP,
+    "sglang": SGLANG_SHORT_TO_LONG_MAP,
+}
+
 # Required matching arguments for batch mode
 BATCH_MODE_REQUIRED_MATCHING_ARGS = ["venv", "log_dir"]
+
+# Supported engines
+SUPPORTED_ENGINES = ["vllm", "sglang"]
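Short-flag normalization is now a two-level lookup: first by engine, then by flag. For example:

```python
# Two-level normalization lookup (maps abbreviated here so the snippet runs
# standalone; the full tables are defined above).
VLLM_SHORT_TO_LONG_MAP = {"-tp": "--tensor-parallel-size"}
SGLANG_SHORT_TO_LONG_MAP = {"--tp": "--tensor-parallel-size"}
ENGINE_SHORT_TO_LONG_MAP = {
    "vllm": VLLM_SHORT_TO_LONG_MAP,
    "sglang": SGLANG_SHORT_TO_LONG_MAP,
}

for engine, flag in (("vllm", "-tp"), ("sglang", "--tp")):
    print(engine, flag, "->", ENGINE_SHORT_TO_LONG_MAP[engine].get(flag, flag))
# vllm -tp -> --tensor-parallel-size
# sglang --tp -> --tensor-parallel-size
```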
vec_inf/client/_helper.py CHANGED
@@ -17,9 +17,10 @@ import requests
 import vec_inf.client._utils as utils
 from vec_inf.client._client_vars import (
     BATCH_MODE_REQUIRED_MATCHING_ARGS,
+    ENGINE_SHORT_TO_LONG_MAP,
     KEY_METRICS,
     SRC_DIR,
-    VLLM_SHORT_TO_LONG_MAP,
+    SUPPORTED_ENGINES,
 )
 from vec_inf.client._exceptions import (
     MissingRequiredFieldsError,
@@ -63,6 +64,7 @@ class ModelLauncher:
         self.slurm_job_id = ""
         self.slurm_script_path = Path("")
         self.model_config = self._get_model_configuration(self.kwargs.get("config"))
+        self.engine = ""
         self.params = self._get_launch_params()
 
     def _warn(self, message: str) -> None:
@@ -137,32 +139,38 @@ class ModelLauncher:
                 f"not found at expected path '{model_weights_path}'"
             )
 
-    def _process_vllm_args(self, arg_string: str) -> dict[str, Any]:
-        """Process the vllm_args string into a dictionary.
+    def _process_engine_args(
+        self, arg_string: str, engine_choice: str
+    ) -> dict[str, Any]:
+        """Process the engine_args string into a dictionary.
 
         Parameters
         ----------
         arg_string : str
-            Comma-separated string of vLLM arguments
+            Comma-separated string of inference engine arguments
 
         Returns
         -------
         dict[str, Any]
-            Processed vLLM arguments as key-value pairs
+            Processed inference engine arguments as key-value pairs
         """
-        vllm_args: dict[str, str | bool] = {}
+        engine_args: dict[str, str | bool] = {}
+        engine_arg_map = ENGINE_SHORT_TO_LONG_MAP[engine_choice]
+
         for arg in arg_string.split(","):
             if "=" in arg:
                 key, value = arg.split("=")
-                if key.strip() in VLLM_SHORT_TO_LONG_MAP:
-                    key = VLLM_SHORT_TO_LONG_MAP[key.strip()]
-                vllm_args[key.strip()] = value.strip()
+                if key.strip() in engine_arg_map:
+                    key = engine_arg_map[key.strip()]
+                engine_args[key.strip()] = value.strip()
             elif "-O" in arg.strip():
-                key = VLLM_SHORT_TO_LONG_MAP["-O"]
-                vllm_args[key] = arg.strip()[2:].strip()
+                if engine_choice != "vllm":
+                    raise ValueError("-O is only supported for vLLM")
+                key = engine_arg_map["-O"]
+                engine_args[key] = arg.strip()[2:].strip()
             else:
-                vllm_args[arg.strip()] = True
-        return vllm_args
+                engine_args[arg.strip()] = True
+        return engine_args
 
     def _process_env_vars(self, env_arg: str) -> dict[str, str]:
         """Process the env string into a dictionary of environment variables.
@@ -196,6 +204,63 @@
                 print(f"WARNING: Could not parse env var: {line}")
         return env_vars
 
+    def _engine_check_override(self, params: dict[str, Any]) -> None:
+        """Check for engine override in CLI args and warn user.
+
+        Parameters
+        ----------
+        params : dict[str, Any]
+            Dictionary of launch parameters to check
+        """
+
+        def overwrite_engine_args(params: dict[str, Any]) -> None:
+            engine_args = self._process_engine_args(
+                self.kwargs[f"{self.engine}_args"], self.engine
+            )
+            for key, value in engine_args.items():
+                params["engine_args"][key] = value
+            del self.kwargs[f"{self.engine}_args"]
+
+        # Infer engine name from engine-specific args if provided
+        extracted_engine = ""
+        for engine in SUPPORTED_ENGINES:
+            if self.kwargs.get(f"{engine}_args"):
+                if not extracted_engine:
+                    extracted_engine = engine
+                else:
+                    raise ValueError(
+                        "Cannot provide engine-specific args for multiple engines, please choose one"
+                    )
+        # Check for mismatch between provided engine arg and engine-specific args
+        input_engine = self.kwargs.get("engine", "")
+
+        if input_engine and extracted_engine:
+            if input_engine != extracted_engine:
+                raise ValueError(
+                    f"Mismatch between provided engine '{input_engine}' and engine-specific args '{extracted_engine}'"
+                )
+            self.engine = input_engine
+            params["engine_args"] = params[f"{self.engine}_args"]
+            overwrite_engine_args(params)
+        elif input_engine:
+            # Only engine arg in CLI, use default engine args from config
+            self.engine = input_engine
+            params["engine_args"] = params[f"{self.engine}_args"]
+        elif extracted_engine:
+            # Only engine-specific args in CLI, infer engine and warn user
+            self.engine = extracted_engine
+            params["engine_inferred"] = True
+            params["engine_args"] = params[f"{self.engine}_args"]
+            overwrite_engine_args(params)
+        else:
+            # No engine-related args in CLI, use defaults from config
+            self.engine = params.get("engine", "vllm")
+            params["engine_args"] = params[f"{self.engine}_args"]
+
+        # Remove $ENGINE_NAME_args from params as they won't get populated to sjob json.
+        for engine in SUPPORTED_ENGINES:
+            del params[f"{engine}_args"]
+
     def _apply_cli_overrides(self, params: dict[str, Any]) -> None:
         """Apply CLI argument overrides to params.
 
@@ -204,11 +269,7 @@
         params : dict[str, Any]
             Dictionary of launch parameters to override
         """
-        if self.kwargs.get("vllm_args"):
-            vllm_args = self._process_vllm_args(self.kwargs["vllm_args"])
-            for key, value in vllm_args.items():
-                params["vllm_args"][key] = value
-            del self.kwargs["vllm_args"]
+        self._engine_check_override(params)
 
         if self.kwargs.get("env"):
             env_vars = self._process_env_vars(self.kwargs["env"])
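`_apply_cli_overrides` now delegates to `_engine_check_override`, whose precedence is: an explicit `--engine` wins (and must agree with any engine-specific args); otherwise the engine is inferred from whichever `*_args` was given, flagging `engine_inferred` so the CLI can warn; otherwise the config default applies. A minimal pure-function sketch of that ordering (no params mutation or kwargs cleanup):

```python
# Pure-function sketch of the resolution order in _engine_check_override.
def resolve_engine(
    input_engine: str, vllm_args: str, sglang_args: str, config_engine: str = "vllm"
) -> tuple[str, bool]:
    """Return (engine, inferred) following the same precedence rules."""
    provided = [e for e, a in (("vllm", vllm_args), ("sglang", sglang_args)) if a]
    if len(provided) > 1:
        raise ValueError("Cannot provide engine-specific args for multiple engines")
    extracted = provided[0] if provided else ""
    if input_engine and extracted and input_engine != extracted:
        raise ValueError(f"Mismatch between '{input_engine}' and '{extracted}' args")
    if input_engine:
        return input_engine, False
    if extracted:
        return extracted, True  # inferred -> the CLI prints a warning
    return config_engine, False


print(resolve_engine("", "", "--context-length=8192"))  # ('sglang', True)
print(resolve_engine("vllm", "-tp=2", ""))              # ('vllm', False)
```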
@@ -241,7 +302,7 @@
         """
         if (
             int(params["gpus_per_node"]) > 1
-            and params["vllm_args"].get("--tensor-parallel-size") is None
+            and params["engine_args"].get("--tensor-parallel-size") is None
         ):
             raise MissingRequiredFieldsError(
                 "--tensor-parallel-size is required when gpus_per_node > 1"
@@ -252,8 +313,8 @@
             raise ValueError("Total number of GPUs requested must be a power of two")
 
         total_parallel_sizes = int(
-            params["vllm_args"].get("--tensor-parallel-size", "1")
-        ) * int(params["vllm_args"].get("--pipeline-parallel-size", "1"))
+            params["engine_args"].get("--tensor-parallel-size", "1")
+        ) * int(params["engine_args"].get("--pipeline-parallel-size", "1"))
         if total_gpus_requested != total_parallel_sizes:
             raise ValueError(
                 "Mismatch between total number of GPUs requested and parallelization settings"
@@ -312,7 +373,8 @@
 
         # Convert path to string for JSON serialization
         for field in params:
-            if field in ["vllm_args", "env"]:
+            # Keep structured fields (dicts/bools) intact
+            if field in ["engine_args", "env", "engine_inferred"]:
                 continue
             params[field] = str(params[field])
 
@@ -370,7 +432,7 @@
 
         # Replace venv with image path if using container
         if self.params["venv"] == CONTAINER_MODULE_NAME:
-            self.params["venv"] = IMAGE_PATH
+            self.params["venv"] = IMAGE_PATH[self.params["engine"]]
 
         with job_json.open("w") as file:
             json.dump(self.params, file, indent=4)
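Indexing `IMAGE_PATH` by engine implies the constant changed from a single path to a per-engine mapping; its new definition is not part of this diff. A plausible shape, with placeholder paths:

```python
# Assumed shape only; the actual definition is not shown in this diff, and
# the container image paths are placeholders.
IMAGE_PATH = {
    "vllm": "/opt/images/vllm-latest.sif",
    "sglang": "/opt/images/sglang-latest.sif",
}
```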
@@ -453,6 +515,53 @@ class BatchModelLauncher:
 
         return model_configs_dict
 
+    def _validate_resource_and_parallel_settings(
+        self,
+        config: ModelConfig,
+        model_engine_args: dict[str, Any] | None,
+        model_name: str,
+    ) -> None:
+        """Validate resource allocation and parallelization settings for each model.
+
+        Parameters
+        ----------
+        config : ModelConfig
+            Configuration of the model to validate
+        model_engine_args : dict[str, Any] | None
+            Inference engine arguments of the model to validate
+        model_name : str
+            Name of the model to validate
+
+        Raises
+        ------
+        MissingRequiredFieldsError
+            If tensor parallel size is not specified when using multiple GPUs
+        ValueError
+            If total # of GPUs requested is not a power of two
+            If mismatch between total # of GPUs requested and parallelization settings
+        """
+        if (
+            int(config.gpus_per_node) > 1
+            and (model_engine_args or {}).get("--tensor-parallel-size") is None
+        ):
+            raise MissingRequiredFieldsError(
+                f"--tensor-parallel-size is required when gpus_per_node > 1, check your configuration for {model_name}"
+            )
+
+        total_gpus_requested = int(config.gpus_per_node) * int(config.num_nodes)
+        if not utils.is_power_of_two(total_gpus_requested):
+            raise ValueError(
+                f"Total number of GPUs requested must be a power of two, check your configuration for {model_name}"
+            )
+
+        total_parallel_sizes = int(
+            (model_engine_args or {}).get("--tensor-parallel-size", "1")
+        ) * int((model_engine_args or {}).get("--pipeline-parallel-size", "1"))
+        if total_gpus_requested != total_parallel_sizes:
+            raise ValueError(
+                f"Mismatch between total number of GPUs requested and parallelization settings, check your configuration for {model_name}"
+            )
+
     def _get_launch_params(
         self, account: Optional[str] = None, work_dir: Optional[str] = None
     ) -> dict[str, Any]:
@@ -483,28 +592,15 @@
             params["models"][model_name] = config.model_dump(exclude_none=True)
             params["models"][model_name]["het_group_id"] = i
 
-            # Validate resource allocation and parallelization settings
-            if (
-                int(config.gpus_per_node) > 1
-                and (config.vllm_args or {}).get("--tensor-parallel-size") is None
-            ):
-                raise MissingRequiredFieldsError(
-                    f"--tensor-parallel-size is required when gpus_per_node > 1, check your configuration for {model_name}"
-                )
-
-            total_gpus_requested = int(config.gpus_per_node) * int(config.num_nodes)
-            if not utils.is_power_of_two(total_gpus_requested):
-                raise ValueError(
-                    f"Total number of GPUs requested must be a power of two, check your configuration for {model_name}"
-                )
+            model_engine_args = getattr(config, f"{config.engine}_args", None)
+            params["models"][model_name]["engine_args"] = model_engine_args
+            for engine in SUPPORTED_ENGINES:
+                del params["models"][model_name][f"{engine}_args"]
 
-            total_parallel_sizes = int(
-                (config.vllm_args or {}).get("--tensor-parallel-size", "1")
-            ) * int((config.vllm_args or {}).get("--pipeline-parallel-size", "1"))
-            if total_gpus_requested != total_parallel_sizes:
-                raise ValueError(
-                    f"Mismatch between total number of GPUs requested and parallelization settings, check your configuration for {model_name}"
-                )
+            # Validate resource allocation and parallelization settings
+            self._validate_resource_and_parallel_settings(
+                config, model_engine_args, model_name
+            )
 
             # Convert gpus_per_node and resource_type to gres
             params["models"][model_name]["gres"] = (