vec-inf 0.7.1__py3-none-any.whl → 0.7.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vec_inf/README.md +2 -1
- vec_inf/cli/_cli.py +39 -10
- vec_inf/cli/_helper.py +100 -19
- vec_inf/client/_helper.py +80 -31
- vec_inf/client/_slurm_script_generator.py +58 -30
- vec_inf/client/_slurm_templates.py +27 -12
- vec_inf/client/_utils.py +58 -6
- vec_inf/client/api.py +55 -2
- vec_inf/client/models.py +6 -0
- vec_inf/config/models.yaml +47 -99
- vec_inf/find_port.sh +10 -1
- {vec_inf-0.7.1.dist-info → vec_inf-0.7.3.dist-info}/METADATA +7 -6
- vec_inf-0.7.3.dist-info/RECORD +27 -0
- {vec_inf-0.7.1.dist-info → vec_inf-0.7.3.dist-info}/WHEEL +1 -1
- vec_inf-0.7.1.dist-info/RECORD +0 -27
- {vec_inf-0.7.1.dist-info → vec_inf-0.7.3.dist-info}/entry_points.txt +0 -0
- {vec_inf-0.7.1.dist-info → vec_inf-0.7.3.dist-info}/licenses/LICENSE +0 -0
|
@@ -57,6 +57,8 @@ class SlurmScriptTemplate(TypedDict):
|
|
|
57
57
|
Commands for container setup
|
|
58
58
|
imports : str
|
|
59
59
|
Import statements and source commands
|
|
60
|
+
bind_path : str
|
|
61
|
+
Bind path environment variable for the container
|
|
60
62
|
container_command : str
|
|
61
63
|
Template for container execution command
|
|
62
64
|
activate_venv : str
|
|
@@ -74,7 +76,7 @@ class SlurmScriptTemplate(TypedDict):
|
|
|
74
76
|
shebang: ShebangConfig
|
|
75
77
|
container_setup: list[str]
|
|
76
78
|
imports: str
|
|
77
|
-
|
|
79
|
+
bind_path: str
|
|
78
80
|
container_command: str
|
|
79
81
|
activate_venv: str
|
|
80
82
|
server_setup: ServerSetupConfig
|
|
@@ -96,10 +98,8 @@ SLURM_SCRIPT_TEMPLATE: SlurmScriptTemplate = {
|
|
|
96
98
|
f"{CONTAINER_MODULE_NAME} exec {IMAGE_PATH} ray stop",
|
|
97
99
|
],
|
|
98
100
|
"imports": "source {src_dir}/find_port.sh",
|
|
99
|
-
"
|
|
100
|
-
|
|
101
|
-
],
|
|
102
|
-
"container_command": f"{CONTAINER_MODULE_NAME} exec --nv {{env_str}} --bind {{model_weights_path}}{{additional_binds}} --containall {IMAGE_PATH} \\",
|
|
101
|
+
"bind_path": f"export {CONTAINER_MODULE_NAME.upper()}_BINDPATH=${CONTAINER_MODULE_NAME.upper()}_BINDPATH,/dev,/tmp,{{model_weights_path}}{{additional_binds}}",
|
|
102
|
+
"container_command": f"{CONTAINER_MODULE_NAME} exec --nv {{env_str}} --containall {IMAGE_PATH} \\",
|
|
103
103
|
"activate_venv": "source {venv}/bin/activate",
|
|
104
104
|
"server_setup": {
|
|
105
105
|
"single_node": [
|
|
@@ -112,6 +112,23 @@ SLURM_SCRIPT_TEMPLATE: SlurmScriptTemplate = {
|
|
|
112
112
|
"nodes_array=($nodes)",
|
|
113
113
|
"head_node=${{nodes_array[0]}}",
|
|
114
114
|
'head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)',
|
|
115
|
+
"\n# Check for RDMA devices and set environment variable accordingly",
|
|
116
|
+
"if ! command -v ibv_devices >/dev/null 2>&1; then",
|
|
117
|
+
' echo "ibv_devices not found; forcing TCP. (No RDMA userland on host?)"',
|
|
118
|
+
" export NCCL_IB_DISABLE=1",
|
|
119
|
+
' export NCCL_ENV_ARG="--env NCCL_IB_DISABLE=1"',
|
|
120
|
+
"else",
|
|
121
|
+
" # Pick GID index based on link layer (IB vs RoCE)",
|
|
122
|
+
' if ibv_devinfo 2>/dev/null | grep -q "link_layer:.*Ethernet"; then',
|
|
123
|
+
" # RoCEv2 typically needs a nonzero GID index; 3 is common, try 2 if your fabric uses it",
|
|
124
|
+
" export NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-3}}",
|
|
125
|
+
' export NCCL_ENV_ARG="--env NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-3}}"',
|
|
126
|
+
" else",
|
|
127
|
+
" # Native InfiniBand => GID 0",
|
|
128
|
+
" export NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-0}}",
|
|
129
|
+
' export NCCL_ENV_ARG="--env NCCL_IB_GID_INDEX={{NCCL_IB_GID_INDEX:-0}}"',
|
|
130
|
+
" fi",
|
|
131
|
+
"fi",
|
|
115
132
|
"\n# Start Ray head node",
|
|
116
133
|
"head_node_port=$(find_available_port $head_node_ip 8080 65535)",
|
|
117
134
|
"ray_head=$head_node_ip:$head_node_port",
|
|
@@ -198,8 +215,8 @@ class BatchModelLaunchScriptTemplate(TypedDict):
|
|
|
198
215
|
Shebang line for the script
|
|
199
216
|
container_setup : list[str]
|
|
200
217
|
Commands for container setup
|
|
201
|
-
|
|
202
|
-
|
|
218
|
+
bind_path : str
|
|
219
|
+
Bind path environment variable for the container
|
|
203
220
|
server_address_setup : list[str]
|
|
204
221
|
Commands to setup the server address
|
|
205
222
|
launch_cmd : list[str]
|
|
@@ -210,7 +227,7 @@ class BatchModelLaunchScriptTemplate(TypedDict):
|
|
|
210
227
|
|
|
211
228
|
shebang: str
|
|
212
229
|
container_setup: str
|
|
213
|
-
|
|
230
|
+
bind_path: str
|
|
214
231
|
server_address_setup: list[str]
|
|
215
232
|
write_to_json: list[str]
|
|
216
233
|
launch_cmd: list[str]
|
|
@@ -220,9 +237,7 @@ class BatchModelLaunchScriptTemplate(TypedDict):
|
|
|
220
237
|
BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE: BatchModelLaunchScriptTemplate = {
|
|
221
238
|
"shebang": "#!/bin/bash\n",
|
|
222
239
|
"container_setup": f"{CONTAINER_LOAD_CMD}\n",
|
|
223
|
-
"
|
|
224
|
-
f"export {CONTAINER_MODULE_NAME}_BINDPATH=${CONTAINER_MODULE_NAME}_BINDPATH,$(echo /dev/infiniband* | sed -e 's/ /,/g')"
|
|
225
|
-
],
|
|
240
|
+
"bind_path": f"export {CONTAINER_MODULE_NAME.upper()}_BINDPATH=${CONTAINER_MODULE_NAME.upper()}_BINDPATH,/dev,/tmp,{{model_weights_path}}{{additional_binds}}",
|
|
226
241
|
"server_address_setup": [
|
|
227
242
|
"source {src_dir}/find_port.sh",
|
|
228
243
|
"head_node_ip=${{SLURMD_NODENAME}}",
|
|
@@ -238,7 +253,7 @@ BATCH_MODEL_LAUNCH_SCRIPT_TEMPLATE: BatchModelLaunchScriptTemplate = {
|
|
|
238
253
|
' "$json_path" > temp_{model_name}.json \\',
|
|
239
254
|
' && mv temp_{model_name}.json "$json_path"\n',
|
|
240
255
|
],
|
|
241
|
-
"container_command": f"{CONTAINER_MODULE_NAME} exec --nv --
|
|
256
|
+
"container_command": f"{CONTAINER_MODULE_NAME} exec --nv --containall {IMAGE_PATH} \\",
|
|
242
257
|
"launch_cmd": [
|
|
243
258
|
"vllm serve {model_weights_path} \\",
|
|
244
259
|
" --served-model-name {model_name} \\",
|
vec_inf/client/_utils.py
CHANGED
|
@@ -108,15 +108,64 @@ def is_server_running(
|
|
|
108
108
|
if isinstance(log_content, str):
|
|
109
109
|
return log_content
|
|
110
110
|
|
|
111
|
-
|
|
111
|
+
# Patterns that indicate fatal errors (not just warnings)
|
|
112
|
+
fatal_error_patterns = [
|
|
113
|
+
"traceback",
|
|
114
|
+
"exception",
|
|
115
|
+
"fatal error",
|
|
116
|
+
"critical error",
|
|
117
|
+
"failed to",
|
|
118
|
+
"could not",
|
|
119
|
+
"unable to",
|
|
120
|
+
"error:",
|
|
121
|
+
]
|
|
122
|
+
|
|
123
|
+
# Patterns to ignore (non-fatal warnings/info messages)
|
|
124
|
+
ignore_patterns = [
|
|
125
|
+
"deprecated",
|
|
126
|
+
"futurewarning",
|
|
127
|
+
"userwarning",
|
|
128
|
+
"deprecationwarning",
|
|
129
|
+
"slurmstepd: error:", # SLURM cancellation messages (often after server started)
|
|
130
|
+
]
|
|
131
|
+
|
|
132
|
+
ready_signature_found = False
|
|
133
|
+
fatal_error_line = None
|
|
112
134
|
|
|
113
135
|
for line in log_content:
|
|
114
|
-
|
|
115
|
-
|
|
136
|
+
line_lower = line.lower()
|
|
137
|
+
|
|
138
|
+
# Check for ready signature first - if found, server is running
|
|
116
139
|
if MODEL_READY_SIGNATURE in line:
|
|
117
|
-
|
|
140
|
+
ready_signature_found = True
|
|
141
|
+
# Keep scanning the remaining lines; errors logged after startup are not treated as fatal
|
|
142
|
+
|
|
143
|
+
# Check for fatal errors (only if we haven't seen ready signature yet)
|
|
144
|
+
if not ready_signature_found:
|
|
145
|
+
# Skip lines that match ignore patterns
|
|
146
|
+
if any(ignore_pattern in line_lower for ignore_pattern in ignore_patterns):
|
|
147
|
+
continue
|
|
148
|
+
|
|
149
|
+
# Check for fatal error patterns
|
|
150
|
+
for pattern in fatal_error_patterns:
|
|
151
|
+
if pattern in line_lower:
|
|
152
|
+
# Additional check: skip if it's part of a warning message
|
|
153
|
+
# (warnings often contain "error:" but aren't fatal)
|
|
154
|
+
if "warning" in line_lower and "error:" in line_lower:
|
|
155
|
+
continue
|
|
156
|
+
fatal_error_line = line.strip("\n")
|
|
157
|
+
break
|
|
158
|
+
|
|
159
|
+
# If we found a fatal error, mark as failed
|
|
160
|
+
if fatal_error_line:
|
|
161
|
+
return (ModelStatus.FAILED, fatal_error_line)
|
|
162
|
+
|
|
163
|
+
# If ready signature was found and no fatal errors, server is running
|
|
164
|
+
if ready_signature_found:
|
|
165
|
+
return "RUNNING"
|
|
118
166
|
|
|
119
|
-
|
|
167
|
+
# Otherwise, still launching
|
|
168
|
+
return ModelStatus.LAUNCHING
|
|
120
169
|
|
|
121
170
|
|
|
122
171
|
def get_base_url(slurm_job_name: str, slurm_job_id: str, log_dir: str) -> str:
|
|
@@ -387,7 +436,7 @@ def find_matching_dirs(
|
|
|
387
436
|
return matched
|
|
388
437
|
|
|
389
438
|
|
|
390
|
-
def check_required_fields(params: dict[str, Any]) ->
|
|
439
|
+
def check_required_fields(params: dict[str, Any]) -> dict[str, Any]:
|
|
391
440
|
"""Check for required fields without default vals and their corresponding env vars.
|
|
392
441
|
|
|
393
442
|
Parameters
|
|
@@ -395,12 +444,15 @@ def check_required_fields(params: dict[str, Any]) -> None:
|
|
|
395
444
|
params : dict[str, Any]
|
|
396
445
|
Dictionary of parameters to check.
|
|
397
446
|
"""
|
|
447
|
+
env_overrides = {}
|
|
398
448
|
for arg in REQUIRED_ARGS:
|
|
399
449
|
if not params.get(arg):
|
|
400
450
|
default_value = os.getenv(REQUIRED_ARGS[arg])
|
|
401
451
|
if default_value:
|
|
402
452
|
params[arg] = default_value
|
|
453
|
+
env_overrides[arg] = default_value
|
|
403
454
|
else:
|
|
404
455
|
raise MissingRequiredFieldsError(
|
|
405
456
|
f"{arg} is required, please set it in the command arguments or environment variables"
|
|
406
457
|
)
|
|
458
|
+
return env_overrides
|
vec_inf/client/api.py
CHANGED
|
@@ -10,7 +10,9 @@ vec_inf.client._helper : Helper classes for model inference server management
|
|
|
10
10
|
vec_inf.client.models : Data models for API responses
|
|
11
11
|
"""
|
|
12
12
|
|
|
13
|
+
import re
|
|
13
14
|
import shutil
|
|
15
|
+
import subprocess
|
|
14
16
|
import time
|
|
15
17
|
import warnings
|
|
16
18
|
from pathlib import Path
|
|
@@ -81,7 +83,7 @@ class VecInfClient:
|
|
|
81
83
|
|
|
82
84
|
def __init__(self) -> None:
|
|
83
85
|
"""Initialize the Vector Inference client."""
|
|
84
|
-
|
|
86
|
+
self._metrics_collectors: dict[str, PerformanceMetricsCollector] = {}
|
|
85
87
|
|
|
86
88
|
def list_models(self) -> list[ModelInfo]:
|
|
87
89
|
"""List all available models.
|
|
@@ -181,6 +183,51 @@ class VecInfClient:
|
|
|
181
183
|
)
|
|
182
184
|
return model_launcher.launch()
|
|
183
185
|
|
|
186
|
+
def fetch_running_jobs(self) -> list[str]:
|
|
187
|
+
"""
|
|
188
|
+
Fetch the list of running vec-inf job IDs for the current user.
|
|
189
|
+
|
|
190
|
+
Returns
|
|
191
|
+
-------
|
|
192
|
+
list[str]
|
|
193
|
+
List of matching job IDs; empty list if no matching jobs are found.
|
|
194
|
+
"""
|
|
195
|
+
try:
|
|
196
|
+
res = subprocess.run(
|
|
197
|
+
["squeue", "--me", "--noheader"],
|
|
198
|
+
capture_output=True,
|
|
199
|
+
text=True,
|
|
200
|
+
check=True,
|
|
201
|
+
)
|
|
202
|
+
job_ids = [
|
|
203
|
+
ln.strip().split()[0] for ln in res.stdout.splitlines() if ln.strip()
|
|
204
|
+
]
|
|
205
|
+
|
|
206
|
+
if not job_ids:
|
|
207
|
+
return []
|
|
208
|
+
|
|
209
|
+
# For each job, fetch the full JobName and filter by suffix
|
|
210
|
+
matching_ids = []
|
|
211
|
+
for jid in job_ids:
|
|
212
|
+
try:
|
|
213
|
+
sctl = subprocess.run(
|
|
214
|
+
["scontrol", "show", "job", "-o", jid],
|
|
215
|
+
capture_output=True,
|
|
216
|
+
text=True,
|
|
217
|
+
check=True,
|
|
218
|
+
)
|
|
219
|
+
m = re.search(r"\bJobName=([^\s]+)", sctl.stdout)
|
|
220
|
+
if m and m.group(1).endswith("-vec-inf"):
|
|
221
|
+
matching_ids.append(jid)
|
|
222
|
+
except subprocess.CalledProcessError:
|
|
223
|
+
# Job might have finished between squeue and scontrol; skip
|
|
224
|
+
continue
|
|
225
|
+
|
|
226
|
+
return matching_ids
|
|
227
|
+
|
|
228
|
+
except subprocess.CalledProcessError as e:
|
|
229
|
+
raise SlurmJobError(f"Error running slurm command: {e}") from e
|
|
230
|
+
|
|
184
231
|
def get_status(self, slurm_job_id: str) -> StatusResponse:
|
|
185
232
|
"""Get the status of a running model.
|
|
186
233
|
|
|
@@ -218,7 +265,13 @@ class VecInfClient:
|
|
|
218
265
|
- Performance metrics or error message
|
|
219
266
|
- Timestamp of collection
|
|
220
267
|
"""
|
|
221
|
-
|
|
268
|
+
# Use cached collector to preserve state between calls to compute throughput
|
|
269
|
+
if slurm_job_id not in self._metrics_collectors:
|
|
270
|
+
self._metrics_collectors[slurm_job_id] = PerformanceMetricsCollector(
|
|
271
|
+
slurm_job_id
|
|
272
|
+
)
|
|
273
|
+
|
|
274
|
+
performance_metrics_collector = self._metrics_collectors[slurm_job_id]
|
|
222
275
|
|
|
223
276
|
metrics: Union[dict[str, float], str]
|
|
224
277
|
if not performance_metrics_collector.metrics_url.startswith("http"):
|
vec_inf/client/models.py
CHANGED
|
@@ -194,6 +194,10 @@ class LaunchOptions:
|
|
|
194
194
|
Number of nodes to allocate
|
|
195
195
|
gpus_per_node : int, optional
|
|
196
196
|
Number of GPUs per node
|
|
197
|
+
cpus_per_task : int, optional
|
|
198
|
+
Number of CPUs per task
|
|
199
|
+
mem_per_node : str, optional
|
|
200
|
+
Memory per node
|
|
197
201
|
account : str, optional
|
|
198
202
|
Account name for job scheduling
|
|
199
203
|
work_dir : str, optional
|
|
@@ -232,6 +236,8 @@ class LaunchOptions:
|
|
|
232
236
|
resource_type: Optional[str] = None
|
|
233
237
|
num_nodes: Optional[int] = None
|
|
234
238
|
gpus_per_node: Optional[int] = None
|
|
239
|
+
cpus_per_task: Optional[int] = None
|
|
240
|
+
mem_per_node: Optional[str] = None
|
|
235
241
|
account: Optional[str] = None
|
|
236
242
|
work_dir: Optional[str] = None
|
|
237
243
|
qos: Optional[str] = None
|