vec-inf 0.7.1__py3-none-any.whl → 0.7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -57,6 +57,8 @@ class SlurmScriptTemplate(TypedDict):
57
57
  Commands for container setup
58
58
  imports : str
59
59
  Import statements and source commands
60
+ bind_path : str
61
+ Bind path environment variable for the container
60
62
  container_command : str
61
63
  Template for container execution command
62
64
  activate_venv : str
@@ -74,7 +76,7 @@ class SlurmScriptTemplate(TypedDict):
74
76
  shebang: ShebangConfig
75
77
  container_setup: list[str]
76
78
  imports: str
77
- env_vars: list[str]
79
+ bind_path: str
78
80
  container_command: str
79
81
  activate_venv: str
80
82
  server_setup: ServerSetupConfig
@@ -96,10 +98,8 @@ SLURM_SCRIPT_TEMPLATE: SlurmScriptTemplate = {
96
98
  f"{CONTAINER_MODULE_NAME} exec {IMAGE_PATH} ray stop",
97
99
  ],
98
100
  "imports": "source {src_dir}/find_port.sh",
99
- "env_vars": [
100
- f"export {CONTAINER_MODULE_NAME}_BINDPATH=${CONTAINER_MODULE_NAME}_BINDPATH,$(echo /dev/infiniband* | sed -e 's/ /,/g')"
101
- ],
102
- "container_command": f"{CONTAINER_MODULE_NAME} exec --nv {{env_str}} --bind {{model_weights_path}}{{additional_binds}} --containall {IMAGE_PATH} \\",
101
+ "bind_path": f"export {CONTAINER_MODULE_NAME.upper()}_BINDPATH=${CONTAINER_MODULE_NAME.upper()}_BINDPATH,/dev,/tmp,{{model_weights_path}}{{additional_binds}}",
102
+ "container_command": f"{CONTAINER_MODULE_NAME} exec --nv {{env_str}} --containall {IMAGE_PATH} \\",
103
103
  "activate_venv": "source {venv}/bin/activate",
104
104
  "server_setup": {
105
105
  "single_node": [
@@ -112,6 +112,23 @@ SLURM_SCRIPT_TEMPLATE: SlurmScriptTemplate = {
112
112
  "nodes_array=($nodes)",
113
113
  "head_node=${{nodes_array[0]}}",
114
114
  'head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)',
115
+ "\n# Check for RDMA devices and set environment variable accordingly",
116
+ "if ! command -v ibv_devices >/dev/null 2>&1; then",
117
+ ' echo "ibv_devices not found; forcing TCP. (No RDMA userland on host?)"',
118
+ " export NCCL_IB_DISABLE=1",
119
+ ' export NCCL_ENV_ARG="--env NCCL_IB_DISABLE=1"',
120
+ "else",
121
+ " # Pick GID index based on link layer (IB vs RoCE)",
122
+ ' if ibv_devinfo 2>/dev/null | grep -q "link_layer:.*Ethernet"; then',
123
+ " # RoCEv2 typically needs a nonzero GID index; 3 is common, try 2 if your fabric uses it",
124
+ " export NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-3}}",
125
+ ' export NCCL_ENV_ARG="--env NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-3}}"',
126
+ " else",
127
+ " # Native InfiniBand => GID 0",
128
+ " export NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-0}}",
129
+ ' export NCCL_ENV_ARG="--env NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-0}}"',
130
+ " fi",
131
+ "fi",
115
132
  "\n# Start Ray head node",
116
133
  "head_node_port=$(find_available_port $head_node_ip 8080 65535)",
117
134
  "ray_head=$head_node_ip:$head_node_port",
@@ -198,8 +215,8 @@ class BatchModelLaunchScriptTemplate(TypedDict):
198
215
  Shebang line for the script
199
216
  container_setup : list[str]
200
217
  Commands for container setup
201
- env_vars : list[str]
202
- Environment variables to set
218
+ bind_path : str
219
+ Bind path environment variable for the container
203
220
  server_address_setup : list[str]
204
221
  Commands to setup the server address
205
222
  launch_cmd : list[str]
@@ -210,7 +227,7 @@ class BatchModelLaunchScriptTemplate(TypedDict):
210
227
 
211
228
  shebang: str
212
229
  container_setup: str
213
- env_vars: list[str]
230
+ bind_path: str
214
231
  server_address_setup: list[str]
215
232
  write_to_json: list[str]
216
233
  launch_cmd: list[str]
@@ -220,9 +237,7 @@ class BatchModelLaunchScriptTemplate(TypedDict):
220
237
  BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE: BatchModelLaunchScriptTemplate = {
221
238
  "shebang": "#!/bin/bash\n",
222
239
  "container_setup": f"{CONTAINER_LOAD_CMD}\n",
223
- "env_vars": [
224
- f"export {CONTAINER_MODULE_NAME}_BINDPATH=${CONTAINER_MODULE_NAME}_BINDPATH,$(echo /dev/infiniband* | sed -e 's/ /,/g')"
225
- ],
240
+ "bind_path": f"export {CONTAINER_MODULE_NAME.upper()}_BINDPATH=${CONTAINER_MODULE_NAME.upper()}_BINDPATH,/dev,/tmp,{{model_weights_path}}{{additional_binds}}",
226
241
  "server_address_setup": [
227
242
  "source {src_dir}/find_port.sh",
228
243
  "head_node_ip=${{SLURMD_NODENAME}}",
@@ -238,7 +253,7 @@ BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE: BatchModelLaunchScriptTemplate = {
238
253
  ' "$json_path" > temp_{model_name}.json \\',
239
254
  ' && mv temp_{model_name}.json "$json_path"\n',
240
255
  ],
241
- "container_command": f"{CONTAINER_MODULE_NAME} exec --nv --bind {{model_weights_path}}{{additional_binds}} --containall {IMAGE_PATH} \\",
256
+ "container_command": f"{CONTAINER_MODULE_NAME} exec --nv --containall {IMAGE_PATH} \\",
242
257
  "launch_cmd": [
243
258
  "vllm serve {model_weights_path} \\",
244
259
  " --served-model-name {model_name} \\",
vec_inf/client/_utils.py CHANGED
@@ -108,15 +108,64 @@ def is_server_running(
108
108
  if isinstance(log_content, str):
109
109
  return log_content
110
110
 
111
- status: Union[str, tuple[ModelStatus, str]] = ModelStatus.LAUNCHING
111
+ # Patterns that indicate fatal errors (not just warnings)
112
+ fatal_error_patterns = [
113
+ "traceback",
114
+ "exception",
115
+ "fatal error",
116
+ "critical error",
117
+ "failed to",
118
+ "could not",
119
+ "unable to",
120
+ "error:",
121
+ ]
122
+
123
+ # Patterns to ignore (non-fatal warnings/info messages)
124
+ ignore_patterns = [
125
+ "deprecated",
126
+ "futurewarning",
127
+ "userwarning",
128
+ "deprecationwarning",
129
+ "slurmstepd: error:", # SLURM cancellation messages (often after server started)
130
+ ]
131
+
132
+ ready_signature_found = False
133
+ fatal_error_line = None
112
134
 
113
135
  for line in log_content:
114
- if "error" in line.lower():
115
- status = (ModelStatus.FAILED, line.strip("\n"))
136
+ line_lower = line.lower()
137
+
138
+ # Check for ready signature first - if found, server is running
116
139
  if MODEL_READY_SIGNATURE in line:
117
- status = "RUNNING"
140
+ ready_signature_found = True
141
+ # Continue checking to see if there are errors after startup
142
+
143
+ # Check for fatal errors (only if we haven't seen ready signature yet)
144
+ if not ready_signature_found:
145
+ # Skip lines that match ignore patterns
146
+ if any(ignore_pattern in line_lower for ignore_pattern in ignore_patterns):
147
+ continue
148
+
149
+ # Check for fatal error patterns
150
+ for pattern in fatal_error_patterns:
151
+ if pattern in line_lower:
152
+ # Additional check: skip if it's part of a warning message
153
+ # (warnings often contain "error:" but aren't fatal)
154
+ if "warning" in line_lower and "error:" in line_lower:
155
+ continue
156
+ fatal_error_line = line.strip("\n")
157
+ break
158
+
159
+ # If we found a fatal error, mark as failed
160
+ if fatal_error_line:
161
+ return (ModelStatus.FAILED, fatal_error_line)
162
+
163
+ # If ready signature was found and no fatal errors, server is running
164
+ if ready_signature_found:
165
+ return "RUNNING"
118
166
 
119
- return status
167
+ # Otherwise, still launching
168
+ return ModelStatus.LAUNCHING
120
169
 
121
170
 
122
171
  def get_base_url(slurm_job_name: str, slurm_job_id: str, log_dir: str) -> str:
@@ -387,7 +436,7 @@ def find_matching_dirs(
387
436
  return matched
388
437
 
389
438
 
390
- def check_required_fields(params: dict[str, Any]) -> None:
439
+ def check_required_fields(params: dict[str, Any]) -> dict[str, Any]:
391
440
  """Check for required fields without default vals and their corresponding env vars.
392
441
 
393
442
  Parameters
@@ -395,12 +444,15 @@ def check_required_fields(params: dict[str, Any]) -> None:
395
444
  params : dict[str, Any]
396
445
  Dictionary of parameters to check.
397
446
  """
447
+ env_overrides = {}
398
448
  for arg in REQUIRED_ARGS:
399
449
  if not params.get(arg):
400
450
  default_value = os.getenv(REQUIRED_ARGS[arg])
401
451
  if default_value:
402
452
  params[arg] = default_value
453
+ env_overrides[arg] = default_value
403
454
  else:
404
455
  raise MissingRequiredFieldsError(
405
456
  f"{arg} is required, please set it in the command arguments or environment variables"
406
457
  )
458
+ return env_overrides
vec_inf/client/api.py CHANGED
@@ -10,7 +10,9 @@ vec_inf.client._helper : Helper classes for model inference server management
10
10
  vec_inf.client.models : Data models for API responses
11
11
  """
12
12
 
13
+ import re
13
14
  import shutil
15
+ import subprocess
14
16
  import time
15
17
  import warnings
16
18
  from pathlib import Path
@@ -81,7 +83,7 @@ class VecInfClient:
81
83
 
82
84
  def __init__(self) -> None:
83
85
  """Initialize the Vector Inference client."""
84
- pass
86
+ self._metrics_collectors: dict[str, PerformanceMetricsCollector] = {}
85
87
 
86
88
  def list_models(self) -> list[ModelInfo]:
87
89
  """List all available models.
@@ -181,6 +183,51 @@ class VecInfClient:
181
183
  )
182
184
  return model_launcher.launch()
183
185
 
186
+ def fetch_running_jobs(self) -> list[str]:
187
+ """
188
+ Fetch the list of running vec-inf job IDs for the current user.
189
+
190
+ Returns
191
+ -------
192
+ list[str]
193
+ List of matching job IDs; empty list if no running jobs match.
194
+ """
195
+ try:
196
+ res = subprocess.run(
197
+ ["squeue", "--me", "--noheader"],
198
+ capture_output=True,
199
+ text=True,
200
+ check=True,
201
+ )
202
+ job_ids = [
203
+ ln.strip().split()[0] for ln in res.stdout.splitlines() if ln.strip()
204
+ ]
205
+
206
+ if not job_ids:
207
+ return []
208
+
209
+ # For each job, fetch the full JobName and filter by suffix
210
+ matching_ids = []
211
+ for jid in job_ids:
212
+ try:
213
+ sctl = subprocess.run(
214
+ ["scontrol", "show", "job", "-o", jid],
215
+ capture_output=True,
216
+ text=True,
217
+ check=True,
218
+ )
219
+ m = re.search(r"\bJobName=([^\s]+)", sctl.stdout)
220
+ if m and m.group(1).endswith("-vec-inf"):
221
+ matching_ids.append(jid)
222
+ except subprocess.CalledProcessError:
223
+ # Job might have finished between squeue and scontrol; skip
224
+ continue
225
+
226
+ return matching_ids
227
+
228
+ except subprocess.CalledProcessError as e:
229
+ raise SlurmJobError(f"Error running slurm command: {e}") from e
230
+
184
231
  def get_status(self, slurm_job_id: str) -> StatusResponse:
185
232
  """Get the status of a running model.
186
233
 
@@ -218,7 +265,13 @@ class VecInfClient:
218
265
  - Performance metrics or error message
219
266
  - Timestamp of collection
220
267
  """
221
- performance_metrics_collector = PerformanceMetricsCollector(slurm_job_id)
268
+ # Use cached collector to preserve state between calls to compute throughput
269
+ if slurm_job_id not in self._metrics_collectors:
270
+ self._metrics_collectors[slurm_job_id] = PerformanceMetricsCollector(
271
+ slurm_job_id
272
+ )
273
+
274
+ performance_metrics_collector = self._metrics_collectors[slurm_job_id]
222
275
 
223
276
  metrics: Union[dict[str, float], str]
224
277
  if not performance_metrics_collector.metrics_url.startswith("http"):
vec_inf/client/models.py CHANGED
@@ -194,6 +194,10 @@ class LaunchOptions:
194
194
  Number of nodes to allocate
195
195
  gpus_per_node : int, optional
196
196
  Number of GPUs per node
197
+ cpus_per_task : int, optional
198
+ Number of CPUs per task
199
+ mem_per_node : str, optional
200
+ Memory per node
197
201
  account : str, optional
198
202
  Account name for job scheduling
199
203
  work_dir : str, optional
@@ -232,6 +236,8 @@ class LaunchOptions:
232
236
  resource_type: Optional[str] = None
233
237
  num_nodes: Optional[int] = None
234
238
  gpus_per_node: Optional[int] = None
239
+ cpus_per_task: Optional[int] = None
240
+ mem_per_node: Optional[str] = None
235
241
  account: Optional[str] = None
236
242
  work_dir: Optional[str] = None
237
243
  qos: Optional[str] = None