vec-inf 0.7.1__py3-none-any.whl → 0.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -57,6 +57,8 @@ class SlurmScriptTemplate(TypedDict):
57
57
  Commands for container setup
58
58
  imports : str
59
59
  Import statements and source commands
60
+ bind_path : str
61
+ Bind path environment variable for the container
60
62
  container_command : str
61
63
  Template for container execution command
62
64
  activate_venv : str
@@ -74,7 +76,7 @@ class SlurmScriptTemplate(TypedDict):
74
76
  shebang: ShebangConfig
75
77
  container_setup: list[str]
76
78
  imports: str
77
- env_vars: list[str]
79
+ bind_path: str
78
80
  container_command: str
79
81
  activate_venv: str
80
82
  server_setup: ServerSetupConfig
@@ -96,10 +98,8 @@ SLURM_SCRIPT_TEMPLATE: SlurmScriptTemplate = {
96
98
  f"{CONTAINER_MODULE_NAME} exec {IMAGE_PATH} ray stop",
97
99
  ],
98
100
  "imports": "source {src_dir}/find_port.sh",
99
- "env_vars": [
100
- f"export {CONTAINER_MODULE_NAME}_BINDPATH=${CONTAINER_MODULE_NAME}_BINDPATH,$(echo /dev/infiniband* | sed -e 's/ /,/g')"
101
- ],
102
- "container_command": f"{CONTAINER_MODULE_NAME} exec --nv {{env_str}} --bind {{model_weights_path}}{{additional_binds}} --containall {IMAGE_PATH} \\",
101
+ "bind_path": f"export {CONTAINER_MODULE_NAME.upper()}_BINDPATH=${CONTAINER_MODULE_NAME.upper()}_BINDPATH,/dev,/tmp,{{model_weights_path}}{{additional_binds}}",
102
+ "container_command": f"{CONTAINER_MODULE_NAME} exec --nv {{env_str}} --containall {IMAGE_PATH} \\",
103
103
  "activate_venv": "source {venv}/bin/activate",
104
104
  "server_setup": {
105
105
  "single_node": [
@@ -112,6 +112,23 @@ SLURM_SCRIPT_TEMPLATE: SlurmScriptTemplate = {
112
112
  "nodes_array=($nodes)",
113
113
  "head_node=${{nodes_array[0]}}",
114
114
  'head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)',
115
+ "\n# Check for RDMA devices and set environment variable accordingly",
116
+ "if ! command -v ibv_devices >/dev/null 2>&1; then",
117
+ ' echo "ibv_devices not found; forcing TCP. (No RDMA userland on host?)"',
118
+ " export NCCL_IB_DISABLE=1",
119
+ ' export NCCL_ENV_ARG="--env NCCL_IB_DISABLE=1"',
120
+ "else",
121
+ " # Pick GID index based on link layer (IB vs RoCE)",
122
+ ' if ibv_devinfo 2>/dev/null | grep -q "link_layer:.*Ethernet"; then',
123
+ " # RoCEv2 typically needs a nonzero GID index; 3 is common, try 2 if your fabric uses it",
124
+ " export NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-3}}",
125
+ ' export NCCL_ENV_ARG="--env NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-3}}"',
126
+ " else",
127
+ " # Native InfiniBand => GID 0",
128
+ " export NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-0}}",
129
+ ' export NCCL_ENV_ARG="--env NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-0}}"',
130
+ " fi",
131
+ "fi",
115
132
  "\n# Start Ray head node",
116
133
  "head_node_port=$(find_available_port $head_node_ip 8080 65535)",
117
134
  "ray_head=$head_node_ip:$head_node_port",
@@ -198,8 +215,8 @@ class BatchModelLaunchScriptTemplate(TypedDict):
198
215
  Shebang line for the script
199
216
  container_setup : list[str]
200
217
  Commands for container setup
201
- env_vars : list[str]
202
- Environment variables to set
218
+ bind_path : str
219
+ Bind path environment variable for the container
203
220
  server_address_setup : list[str]
204
221
  Commands to setup the server address
205
222
  launch_cmd : list[str]
@@ -210,7 +227,7 @@ class BatchModelLaunchScriptTemplate(TypedDict):
210
227
 
211
228
  shebang: str
212
229
  container_setup: str
213
- env_vars: list[str]
230
+ bind_path: str
214
231
  server_address_setup: list[str]
215
232
  write_to_json: list[str]
216
233
  launch_cmd: list[str]
@@ -220,9 +237,7 @@ class BatchModelLaunchScriptTemplate(TypedDict):
220
237
  BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE: BatchModelLaunchScriptTemplate = {
221
238
  "shebang": "#!/bin/bash\n",
222
239
  "container_setup": f"{CONTAINER_LOAD_CMD}\n",
223
- "env_vars": [
224
- f"export {CONTAINER_MODULE_NAME}_BINDPATH=${CONTAINER_MODULE_NAME}_BINDPATH,$(echo /dev/infiniband* | sed -e 's/ /,/g')"
225
- ],
240
+ "bind_path": f"export {CONTAINER_MODULE_NAME.upper()}_BINDPATH=${CONTAINER_MODULE_NAME.upper()}_BINDPATH,/dev,/tmp,{{model_weights_path}}{{additional_binds}}",
226
241
  "server_address_setup": [
227
242
  "source {src_dir}/find_port.sh",
228
243
  "head_node_ip=${{SLURMD_NODENAME}}",
@@ -238,7 +253,7 @@ BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE: BatchModelLaunchScriptTemplate = {
238
253
  ' "$json_path" > temp_{model_name}.json \\',
239
254
  ' && mv temp_{model_name}.json "$json_path"\n',
240
255
  ],
241
- "container_command": f"{CONTAINER_MODULE_NAME} exec --nv --bind {{model_weights_path}}{{additional_binds}} --containall {IMAGE_PATH} \\",
256
+ "container_command": f"{CONTAINER_MODULE_NAME} exec --nv --containall {IMAGE_PATH} \\",
242
257
  "launch_cmd": [
243
258
  "vllm serve {model_weights_path} \\",
244
259
  " --served-model-name {model_name} \\",
vec_inf/client/_utils.py CHANGED
@@ -108,15 +108,64 @@ def is_server_running(
108
108
  if isinstance(log_content, str):
109
109
  return log_content
110
110
 
111
- status: Union[str, tuple[ModelStatus, str]] = ModelStatus.LAUNCHING
111
+ # Patterns that indicate fatal errors (not just warnings)
112
+ fatal_error_patterns = [
113
+ "traceback",
114
+ "exception",
115
+ "fatal error",
116
+ "critical error",
117
+ "failed to",
118
+ "could not",
119
+ "unable to",
120
+ "error:",
121
+ ]
122
+
123
+ # Patterns to ignore (non-fatal warnings/info messages)
124
+ ignore_patterns = [
125
+ "deprecated",
126
+ "futurewarning",
127
+ "userwarning",
128
+ "deprecationwarning",
129
+ "slurmstepd: error:", # SLURM cancellation messages (often after server started)
130
+ ]
131
+
132
+ ready_signature_found = False
133
+ fatal_error_line = None
112
134
 
113
135
  for line in log_content:
114
- if "error" in line.lower():
115
- status = (ModelStatus.FAILED, line.strip("\n"))
136
+ line_lower = line.lower()
137
+
138
+ # Check for ready signature first - if found, server is running
116
139
  if MODEL_READY_SIGNATURE in line:
117
- status = "RUNNING"
140
+ ready_signature_found = True
141
+ # Continue checking to see if there are errors after startup
142
+
143
+ # Check for fatal errors (only if we haven't seen ready signature yet)
144
+ if not ready_signature_found:
145
+ # Skip lines that match ignore patterns
146
+ if any(ignore_pattern in line_lower for ignore_pattern in ignore_patterns):
147
+ continue
148
+
149
+ # Check for fatal error patterns
150
+ for pattern in fatal_error_patterns:
151
+ if pattern in line_lower:
152
+ # Additional check: skip if it's part of a warning message
153
+ # (warnings often contain "error:" but aren't fatal)
154
+ if "warning" in line_lower and "error:" in line_lower:
155
+ continue
156
+ fatal_error_line = line.strip("\n")
157
+ break
158
+
159
+ # If we found a fatal error, mark as failed
160
+ if fatal_error_line:
161
+ return (ModelStatus.FAILED, fatal_error_line)
162
+
163
+ # If ready signature was found and no fatal errors, server is running
164
+ if ready_signature_found:
165
+ return "RUNNING"
118
166
 
119
- return status
167
+ # Otherwise, still launching
168
+ return ModelStatus.LAUNCHING
120
169
 
121
170
 
122
171
  def get_base_url(slurm_job_name: str, slurm_job_id: str, log_dir: str) -> str:
@@ -387,7 +436,7 @@ def find_matching_dirs(
387
436
  return matched
388
437
 
389
438
 
390
- def check_required_fields(params: dict[str, Any]) -> None:
439
+ def check_required_fields(params: dict[str, Any]) -> dict[str, Any]:
391
440
  """Check for required fields without default vals and their corresponding env vars.
392
441
 
393
442
  Parameters
@@ -395,12 +444,15 @@ def check_required_fields(params: dict[str, Any]) -> None:
395
444
  params : dict[str, Any]
396
445
  Dictionary of parameters to check.
397
446
  """
447
+ env_overrides = {}
398
448
  for arg in REQUIRED_ARGS:
399
449
  if not params.get(arg):
400
450
  default_value = os.getenv(REQUIRED_ARGS[arg])
401
451
  if default_value:
402
452
  params[arg] = default_value
453
+ env_overrides[arg] = default_value
403
454
  else:
404
455
  raise MissingRequiredFieldsError(
405
456
  f"{arg} is required, please set it in the command arguments or environment variables"
406
457
  )
458
+ return env_overrides
vec_inf/client/api.py CHANGED
@@ -10,7 +10,9 @@ vec_inf.client._helper : Helper classes for model inference server management
10
10
  vec_inf.client.models : Data models for API responses
11
11
  """
12
12
 
13
+ import re
13
14
  import shutil
15
+ import subprocess
14
16
  import time
15
17
  import warnings
16
18
  from pathlib import Path
@@ -81,7 +83,7 @@ class VecInfClient:
81
83
 
82
84
  def __init__(self) -> None:
83
85
  """Initialize the Vector Inference client."""
84
- pass
86
+ self._metrics_collectors: dict[str, PerformanceMetricsCollector] = {}
85
87
 
86
88
  def list_models(self) -> list[ModelInfo]:
87
89
  """List all available models.
@@ -181,6 +183,51 @@ class VecInfClient:
181
183
  )
182
184
  return model_launcher.launch()
183
185
 
186
+ def fetch_running_jobs(self) -> list[str]:
187
+ """
188
+ Fetch the list of running vec-inf job IDs for the current user.
189
+
190
+ Returns
191
+ -------
192
+ list[str]
193
+ List of matching job IDs; empty list if no running jobs match.
194
+ """
195
+ try:
196
+ res = subprocess.run(
197
+ ["squeue", "--me", "--noheader"],
198
+ capture_output=True,
199
+ text=True,
200
+ check=True,
201
+ )
202
+ job_ids = [
203
+ ln.strip().split()[0] for ln in res.stdout.splitlines() if ln.strip()
204
+ ]
205
+
206
+ if not job_ids:
207
+ return []
208
+
209
+ # For each job, fetch the full JobName and filter by suffix
210
+ matching_ids = []
211
+ for jid in job_ids:
212
+ try:
213
+ sctl = subprocess.run(
214
+ ["scontrol", "show", "job", "-o", jid],
215
+ capture_output=True,
216
+ text=True,
217
+ check=True,
218
+ )
219
+ m = re.search(r"\bJobName=([^\s]+)", sctl.stdout)
220
+ if m and m.group(1).endswith("-vec-inf"):
221
+ matching_ids.append(jid)
222
+ except subprocess.CalledProcessError:
223
+ # Job might have finished between squeue and scontrol; skip
224
+ continue
225
+
226
+ return matching_ids
227
+
228
+ except subprocess.CalledProcessError as e:
229
+ raise SlurmJobError(f"Error running slurm command: {e}") from e
230
+
184
231
  def get_status(self, slurm_job_id: str) -> StatusResponse:
185
232
  """Get the status of a running model.
186
233
 
@@ -218,7 +265,13 @@ class VecInfClient:
218
265
  - Performance metrics or error message
219
266
  - Timestamp of collection
220
267
  """
221
- performance_metrics_collector = PerformanceMetricsCollector(slurm_job_id)
268
+ # Use cached collector to preserve state between calls to compute throughput
269
+ if slurm_job_id not in self._metrics_collectors:
270
+ self._metrics_collectors[slurm_job_id] = PerformanceMetricsCollector(
271
+ slurm_job_id
272
+ )
273
+
274
+ performance_metrics_collector = self._metrics_collectors[slurm_job_id]
222
275
 
223
276
  metrics: Union[dict[str, float], str]
224
277
  if not performance_metrics_collector.metrics_url.startswith("http"):
vec_inf/client/models.py CHANGED
@@ -194,6 +194,10 @@ class LaunchOptions:
194
194
  Number of nodes to allocate
195
195
  gpus_per_node : int, optional
196
196
  Number of GPUs per node
197
+ cpus_per_task : int, optional
198
+ Number of CPUs per task
199
+ mem_per_node : str, optional
200
+ Memory per node
197
201
  account : str, optional
198
202
  Account name for job scheduling
199
203
  work_dir : str, optional
@@ -232,6 +236,8 @@ class LaunchOptions:
232
236
  resource_type: Optional[str] = None
233
237
  num_nodes: Optional[int] = None
234
238
  gpus_per_node: Optional[int] = None
239
+ cpus_per_task: Optional[int] = None
240
+ mem_per_node: Optional[str] = None
235
241
  account: Optional[str] = None
236
242
  work_dir: Optional[str] = None
237
243
  qos: Optional[str] = None