skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only and reflects the changes between those versions.
- sky/__init__.py +4 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +478 -0
- sky/backends/backend_utils.py +45 -4
- sky/backends/cloud_vm_ray_backend.py +32 -33
- sky/backends/task_codegen.py +340 -2
- sky/catalog/__init__.py +0 -3
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +14 -3
- sky/client/cli/command.py +329 -22
- sky/client/sdk.py +56 -2
- sky/clouds/__init__.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +2 -1
- sky/clouds/vast.py +10 -0
- sky/core.py +128 -36
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-abfcac9c137aa543.js → [cluster]-a7565f586ef86467.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-9e5d47818b9bdadd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-c0b5935149902e6f.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aed0ea19df7cf961.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-9faf940b253e3e06.js → [pool]-8d0f4655400b4eb9.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-2072b48b617989c9.js → jobs-e5a98f17f8513a96.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{users-f42674164aa73423.js → users-2f7646eb77785a2c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-ef19d49c6d0e8500.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-531b2f8c4bf89f82.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-64e05f17bf2cf8ce.js → webpack-fba3de387ff6bb08.js} +1 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +16 -2
- sky/global_user_state.py +3 -3
- sky/models.py +2 -0
- sky/optimizer.py +6 -5
- sky/provision/__init__.py +1 -0
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +42 -6
- sky/provision/provisioner.py +15 -6
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +10 -6
- sky/serve/server/impl.py +1 -1
- sky/server/constants.py +1 -1
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +12 -1
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +5 -1
- sky/server/requests/serializers/encoders.py +17 -0
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/server.py +78 -8
- sky/server/server_utils.py +30 -0
- sky/setup_files/dependencies.py +2 -0
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +34 -9
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +2 -1
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +8 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/users/model.conf +1 -1
- sky/users/permission.py +24 -1
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/command_runner.py +197 -5
- sky/utils/command_runner.pyi +27 -4
- sky/utils/common_utils.py +18 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/schemas.py +31 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +48 -36
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/RECORD +125 -107
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{1141-e6aa9ab418717c59.js → 1141-9c810f01ff4f398a.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{3800-7b45f9fbb6308557.js → 3800-b589397dc09c5b4e.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/top_level.txt +0 -0
sky/skylet/constants.py
CHANGED
@@ -25,6 +25,7 @@ SKY_RUNTIME_DIR_ENV_VAR_KEY = 'SKY_RUNTIME_DIR'
 # them be in $HOME makes it more convenient.
 SKY_LOGS_DIRECTORY = '~/sky_logs'
 SKY_REMOTE_WORKDIR = '~/sky_workdir'
+SKY_TEMPLATES_DIRECTORY = '~/sky_templates'
 SKY_IGNORE_FILE = '.skyignore'
 GIT_IGNORE_FILE = '.gitignore'

@@ -67,10 +68,23 @@ SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
 # #!/opt/conda/bin/python3
 SKY_RAY_CMD = (f'{SKY_PYTHON_CMD} $([ -s {SKY_RAY_PATH_FILE} ] && '
                f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)')
+
+# Use $(which env) to find env, falling back to /usr/bin/env if which is
+# unavailable. This works around a Slurm quirk where srun's execvp() doesn't
+# check execute permissions, failing when $HOME/.local/bin/env (non-executable,
+# from uv installation) shadows /usr/bin/env.
+SKY_SLURM_UNSET_PYTHONPATH = ('$(which env 2>/dev/null || echo /usr/bin/env) '
+                              '-u PYTHONPATH')
+SKY_SLURM_PYTHON_CMD = (f'{SKY_SLURM_UNSET_PYTHONPATH} '
+                        f'$({SKY_GET_PYTHON_PATH_CMD})')
+
 # Separate env for SkyPilot runtime dependencies.
 SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime'
 SKY_REMOTE_PYTHON_ENV: str = f'{SKY_RUNTIME_DIR}/{SKY_REMOTE_PYTHON_ENV_NAME}'
 ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
+# Place the conda root in the runtime directory, as installing to $HOME
+# on an NFS takes too long (1-2m slower).
+SKY_CONDA_ROOT = f'{SKY_RUNTIME_DIR}/miniconda3'
 # uv is used for venv and pip, much faster than python implementations.
 SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
 # set UV_SYSTEM_PYTHON to false in case the
@@ -162,6 +176,10 @@ DISABLE_GPU_ECC_COMMAND = (
     '{ sudo reboot || echo "Failed to reboot. ECC mode may not be disabled"; } '
     '|| true; ')

+SETUP_SKY_DIRS_COMMANDS = (f'mkdir -p ~/sky_workdir && '
+                           f'mkdir -p ~/.sky/sky_app && '
+                           f'mkdir -p {SKY_RUNTIME_DIR}/.sky;')
+
 # Install conda on the remote cluster if it is not already installed.
 # We use conda with python 3.10 to be consistent across multiple clouds with
 # best effort.
@@ -178,8 +196,9 @@ CONDA_INSTALLATION_COMMANDS = (
     # because for some images, conda is already installed, but not initialized.
     # In this case, we need to initialize conda and set auto_activate_base to
     # true.
-    '{
-    '
+    '{ '
+    f'bash Miniconda3-Linux.sh -b -p "{SKY_CONDA_ROOT}" || true; '
+    f'eval "$({SKY_CONDA_ROOT}/bin/conda shell.bash hook)" && conda init && '
     # Caller should replace {conda_auto_activate} with either true or false.
     'conda config --set auto_activate_base {conda_auto_activate} && '
     'conda activate base; }; '
@@ -222,7 +241,7 @@ _sky_version = str(version.parse(sky.__version__))
 RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} {SKY_RAY_CMD} status'
 RAY_INSTALLATION_COMMANDS = (
     f'{SKY_UV_INSTALL_CMD};'
-    '
+    f'{SETUP_SKY_DIRS_COMMANDS}'
     # Print the PATH in provision.log to help debug PATH issues.
     'echo PATH=$PATH; '
     # Install setuptools<=69.5.1 to avoid the issue with the latest setuptools
@@ -256,7 +275,7 @@ RAY_INSTALLATION_COMMANDS = (
     #
     # Here, we add ~/.local/bin to the end of the PATH to make sure the issues
     # mentioned above are resolved.
-    'export PATH=$PATH
+    f'export PATH=$PATH:{SKY_RUNTIME_DIR}/.local/bin; '
     # Writes ray path to file if it does not exist or the file is empty.
     f'[ -s {SKY_RAY_PATH_FILE} ] || '
     f'{{ {SKY_UV_RUN_CMD} '
@@ -264,18 +283,23 @@ RAY_INSTALLATION_COMMANDS = (

 # Copy SkyPilot templates from the installed wheel to ~/sky_templates.
 # This must run after the skypilot wheel is installed.
+# Note: We remove ~/sky_templates first to avoid import conflicts where Python
+# would import from ~/sky_templates instead of site-packages (because
+# sky_templates itself is a package), leading to src == dst error when
+# launching on an existing cluster.
 COPY_SKYPILOT_TEMPLATES_COMMANDS = (
+    f'rm -rf {SKY_TEMPLATES_DIRECTORY}; '
     f'{ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
     f'{SKY_PYTHON_CMD} -c \''
     'import sky_templates, shutil, os; '
     'src = os.path.dirname(sky_templates.__file__); '
-    'dst = os.path.expanduser(\"
+    f'dst = os.path.expanduser(\"{SKY_TEMPLATES_DIRECTORY}\"); '
     'print(f\"Copying templates from {src} to {dst}...\"); '
-    'shutil.copytree(src, dst
+    'shutil.copytree(src, dst); '
     'print(f\"Templates copied successfully\")\'; '
     # Make scripts executable.
-    'find
-    '-exec chmod +x {}
+    f'find {SKY_TEMPLATES_DIRECTORY} -type f ! -name "*.py" ! -name "*.md" '
+    '-exec chmod +x {} + ; ')

 SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
     f'{SKY_UV_INSTALL_CMD};'
@@ -438,6 +462,7 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
     ('gcp', 'enable_gvnic'),
     ('gcp', 'enable_gpu_direct'),
     ('gcp', 'placement_policy'),
+    ('vast', 'secure_only'),
     ('active_workspace',),
 ]
 # When overriding the SkyPilot configs on the API server with the client one,
@@ -532,7 +557,7 @@ CATALOG_SCHEMA_VERSION = 'v8'
 CATALOG_DIR = '~/.sky/catalogs'
 ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
               'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
-              'paperspace', 'primeintellect', 'do', 'nebius', 'ssh',
+              'paperspace', 'primeintellect', 'do', 'nebius', 'ssh', 'slurm',
               'hyperbolic', 'seeweb', 'shadeform')
 # END constants used for service catalog.
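For illustration, a minimal sketch of how the two new Slurm constants above expand into a single interpreter command; SKY_GET_PYTHON_PATH_CMD is stubbed out here as an assumption, since its definition is not part of this diff:

# Sketch only: mirrors SKY_SLURM_UNSET_PYTHONPATH / SKY_SLURM_PYTHON_CMD from
# the diff above. SKY_GET_PYTHON_PATH_CMD is a hypothetical stand-in.
SKY_GET_PYTHON_PATH_CMD = 'cat ~/.sky/python_path 2> /dev/null'

SKY_SLURM_UNSET_PYTHONPATH = ('$(which env 2>/dev/null || echo /usr/bin/env) '
                              '-u PYTHONPATH')
SKY_SLURM_PYTHON_CMD = (f'{SKY_SLURM_UNSET_PYTHONPATH} '
                        f'$({SKY_GET_PYTHON_PATH_CMD})')

print(SKY_SLURM_PYTHON_CMD)
# $(which env 2>/dev/null || echo /usr/bin/env) -u PYTHONPATH $(cat ~/.sky/python_path 2> /dev/null)

The 'env -u PYTHONPATH' wrapper drops any inherited PYTHONPATH before invoking the remote Python, per the srun/execvp() comment in the diff above.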
sky/skylet/events.py
CHANGED
@@ -236,7 +236,7 @@ class AutostopEvent(SkyletEvent):
                 RAY_PROVISIONER_SKYPILOT_TERMINATOR):
             logger.info('Using new provisioner to stop the cluster.')
             self._stop_cluster_with_new_provisioner(autostop_config, config,
-                                                    provider_name)
+                                                    provider_name, cloud)
             return
         logger.info('Not using new provisioner to stop the cluster. '
                     f'Cloud of this cluster: {provider_name}')
@@ -314,7 +314,8 @@ class AutostopEvent(SkyletEvent):
         raise NotImplementedError

     def _stop_cluster_with_new_provisioner(self, autostop_config,
-                                           cluster_config, provider_name
+                                           cluster_config, provider_name,
+                                           cloud):
         # pylint: disable=import-outside-toplevel
         from sky import provision as provision_lib
         autostop_lib.set_autostopping_started()
@@ -334,8 +335,13 @@ class AutostopEvent(SkyletEvent):

         # Stop the ray autoscaler to avoid scaling up, during
         # stopping/terminating of the cluster.
-
-
+        if not cloud.uses_ray():
+            logger.info('Skipping ray stop as cloud does not use Ray.')
+        else:
+            logger.info('Stopping the ray cluster.')
+            subprocess.run(f'{constants.SKY_RAY_CMD} stop',
+                           shell=True,
+                           check=True)

         operation_fn = provision_lib.stop_instances
         if autostop_config.down:
sky/skylet/executor/__init__.py
ADDED
@@ -0,0 +1 @@
+"""Task Executors"""
sky/skylet/executor/slurm.py
ADDED
@@ -0,0 +1,189 @@
+"""Slurm distributed task executor for SkyPilot.
+
+This module is invoked on each Slurm compute node via:
+    srun python -m sky.skylet.executor.slurm --script=... --log-dir=...
+"""
+import argparse
+import json
+import os
+import pathlib
+import socket
+import subprocess
+import sys
+import time
+
+import colorama
+
+from sky.skylet.log_lib import run_bash_command_with_log
+
+
+def _get_ip_address() -> str:
+    """Get the IP address of the current node."""
+    ip_result = subprocess.run(['hostname', '-I'],
+                               capture_output=True,
+                               text=True,
+                               check=False)
+    return ip_result.stdout.strip().split(
+    )[0] if ip_result.returncode == 0 else 'unknown'
+
+
+def _get_job_node_ips() -> str:
+    """Get IPs of all nodes in the current Slurm job."""
+    nodelist = os.environ.get('SLURM_JOB_NODELIST', '')
+    assert nodelist, 'SLURM_JOB_NODELIST is not set'
+
+    # Expand compressed nodelist (e.g., "node[1-3,5]"
+    # -> "node1\nnode2\nnode3\nnode5")
+    result = subprocess.run(['scontrol', 'show', 'hostnames', nodelist],
+                            capture_output=True,
+                            text=True,
+                            check=False)
+    if result.returncode != 0:
+        raise RuntimeError(f'Failed to get hostnames for: {nodelist}')
+
+    hostnames = result.stdout.strip().split('\n')
+    ips = []
+    for hostname in hostnames:
+        try:
+            ip = socket.gethostbyname(hostname)
+            ips.append(ip)
+        except socket.gaierror as e:
+            raise RuntimeError('Failed to get IP for hostname: '
+                               f'{hostname}') from e
+
+    return '\n'.join(ips)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='SkyPilot Slurm task runner for distributed execution')
+    parser.add_argument('--script', help='User script (inline, shell-quoted)')
+    parser.add_argument('--script-path',
+                        help='Path to script file (if too long for inline)')
+    parser.add_argument('--env-vars',
+                        default='{}',
+                        help='JSON-encoded environment variables')
+    parser.add_argument('--log-dir',
+                        required=True,
+                        help='Directory for log files')
+    parser.add_argument('--cluster-num-nodes',
+                        type=int,
+                        required=True,
+                        help='Total number of nodes in the cluster')
+    parser.add_argument('--cluster-ips',
+                        required=True,
+                        help='Comma-separated list of cluster node IPs')
+    parser.add_argument('--task-name',
+                        default=None,
+                        help='Task name for single-node log prefix')
+    parser.add_argument(
+        '--is-setup',
+        action='store_true',
+        help=
+        'Whether this is a setup command (affects logging prefix and filename)')
+    parser.add_argument('--alloc-signal-file',
+                        help='Path to allocation signal file')
+    parser.add_argument('--setup-done-signal-file',
+                        help='Path to setup-done signal file')
+    args = parser.parse_args()
+
+    assert args.script is not None or args.script_path is not None, (
+        'Either '
+        '--script or --script-path must be provided')
+
+    # Task rank, different from index of the node in the cluster.
+    rank = int(os.environ['SLURM_PROCID'])
+    num_nodes = int(os.environ.get('SLURM_NNODES', 1))
+    is_single_node_cluster = (args.cluster_num_nodes == 1)
+
+    # Determine node index from IP (like Ray's cluster_ips_to_node_id)
+    cluster_ips = args.cluster_ips.split(',')
+    ip_addr = _get_ip_address()
+    try:
+        node_idx = cluster_ips.index(ip_addr)
+    except ValueError as e:
+        raise RuntimeError(f'IP address {ip_addr} not found in '
+                           f'cluster IPs: {cluster_ips}') from e
+    node_name = 'head' if node_idx == 0 else f'worker{node_idx}'
+
+    # Log files are written to a shared filesystem, so each node must use a
+    # unique filename to avoid collisions.
+    if args.is_setup:
+        # TODO(kevin): This is inconsistent with other clouds, where it is
+        # simply called 'setup.log'. On Slurm that is obviously not possible,
+        # since the ~/sky_logs directory is shared by all nodes, so
+        # 'setup.log' will be overwritten by other nodes.
+        # Perhaps we should apply this naming convention to other clouds.
+        log_filename = f'setup-{node_name}.log'
+    elif is_single_node_cluster:
+        log_filename = 'run.log'
+    else:
+        log_filename = f'{rank}-{node_name}.log'
+    log_path = os.path.join(args.log_dir, log_filename)
+
+    if args.script_path:
+        with open(args.script_path, 'r', encoding='utf-8') as f:
+            script = f.read()
+    else:
+        script = args.script
+
+    # Parse env vars and add SKYPILOT environment variables
+    env_vars = json.loads(args.env_vars)
+    if not args.is_setup:
+        # For setup, env vars are set in CloudVmRayBackend._setup.
+        env_vars['SKYPILOT_NODE_RANK'] = str(rank)
+        env_vars['SKYPILOT_NUM_NODES'] = str(num_nodes)
+        env_vars['SKYPILOT_NODE_IPS'] = _get_job_node_ips()
+
+    # Signal file coordination for setup/run synchronization
+    # Rank 0 touches the allocation signal to indicate resources acquired
+    if args.alloc_signal_file is not None and rank == 0:
+        pathlib.Path(args.alloc_signal_file).touch()
+
+    # Wait for setup to complete.
+    while args.setup_done_signal_file is not None and not os.path.exists(
+            args.setup_done_signal_file):
+        time.sleep(0.1)
+
+    # Build log prefix
+    # For setup on head: (setup pid={pid})
+    # For setup on workers: (setup pid={pid}, ip=1.2.3.4)
+    # For single-node cluster: (task_name, pid={pid})
+    # For multi-node on head: (head, rank=0, pid={pid})
+    # For multi-node on workers: (worker1, rank=1, pid={pid}, ip=1.2.3.4)
+    # The {pid} placeholder will be replaced by run_with_log
+    if args.is_setup:
+        # Setup prefix: head (node_idx=0) shows no IP, workers show IP
+        if node_idx == 0:
+            prefix = (f'{colorama.Fore.CYAN}(setup pid={{pid}})'
+                      f'{colorama.Style.RESET_ALL} ')
+        else:
+            prefix = (f'{colorama.Fore.CYAN}(setup pid={{pid}}, ip={ip_addr})'
+                      f'{colorama.Style.RESET_ALL} ')
+    elif is_single_node_cluster:
+        # Single-node cluster: use task name
+        name_str = args.task_name if args.task_name else 'task'
+        prefix = (f'{colorama.Fore.CYAN}({name_str}, pid={{pid}})'
+                  f'{colorama.Style.RESET_ALL} ')
+    else:
+        # Multi-node cluster: head (node_idx=0) shows no IP, workers show IP
+        if node_idx == 0:
+            prefix = (
+                f'{colorama.Fore.CYAN}({node_name}, rank={rank}, pid={{pid}})'
+                f'{colorama.Style.RESET_ALL} ')
+        else:
+            prefix = (f'{colorama.Fore.CYAN}'
+                      f'({node_name}, rank={rank}, pid={{pid}}, ip={ip_addr})'
+                      f'{colorama.Style.RESET_ALL} ')

+    returncode = run_bash_command_with_log(script,
+                                           log_path,
+                                           env_vars=env_vars,
+                                           stream_logs=True,
+                                           streaming_prefix=prefix)
+
+    sys.exit(returncode)
+
+
+if __name__ == '__main__':
+    main()
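As the module docstring notes, this executor is launched on every node of a Slurm allocation via srun. A small, hypothetical launch sketch for a 2-node job follows; all flag values, paths, and IPs are illustrative and not taken from this diff:

import shlex
import subprocess

# Hypothetical values for illustration only.
log_dir = '~/sky_logs/sky-2025-12-10-example'
cluster_ips = '10.0.0.1,10.0.0.2'
user_script = 'echo "rank $SKYPILOT_NODE_RANK of $SKYPILOT_NUM_NODES"'

cmd = ('srun --ntasks-per-node=1 '
       'python -m sky.skylet.executor.slurm '
       f'--script={shlex.quote(user_script)} '
       f'--log-dir={shlex.quote(log_dir)} '
       '--cluster-num-nodes=2 '
       f'--cluster-ips={cluster_ips}')
subprocess.run(cmd, shell=True, check=False)

On each node, the executor derives its rank from SLURM_PROCID, maps its own IP to a node index via --cluster-ips, and writes to a per-node log file under --log-dir.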
sky/skylet/job_lib.py
CHANGED
@@ -1273,4 +1273,5 @@ class JobLibCodeGen:
     def _build(cls, code: List[str]) -> str:
         code = cls._PREFIX + code
         code = ';'.join(code)
-        return f'{constants.
+        return (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
+                f'{constants.SKY_PYTHON_CMD} -u -c {shlex.quote(code)}')
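The rewritten _build activates the SkyPilot runtime virtualenv before passing the joined code to python -c, quoting it with shlex. A standalone sketch of that composition, with the two constants inlined as assumptions:

import shlex

# Assumed stand-ins for constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV and
# constants.SKY_PYTHON_CMD; the real values live in sky/skylet/constants.py.
activate_env = 'source ~/skypilot-runtime/bin/activate'
python_cmd = 'python3'

code = ['import os', "print('job submitted')"]
joined = ';'.join(code)
command = f'{activate_env}; {python_cmd} -u -c {shlex.quote(joined)}'
print(command)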
sky/skylet/log_lib.py
CHANGED
@@ -172,7 +172,7 @@ def run_with_log(
     streaming_prefix: Optional[str] = None,
     log_cmd: bool = False,
     **kwargs,
-) -> Union[int, Tuple[int, str, str]]:
+) -> Union[int, Tuple[int, str, str], Tuple[int, int]]:
     """Runs a command and logs its output to a file.

     Args:
@@ -183,6 +183,8 @@
         process_stream: Whether to post-process the stdout/stderr of the
             command, such as replacing or skipping lines on the fly. If
             enabled, lines are printed only when '\r' or '\n' is found.
+        streaming_prefix: Optional prefix for each log line. Can contain {pid}
+            placeholder which will be replaced with the subprocess PID.

     Returns the returncode or returncode, stdout and stderr of the command.
     Note that the stdout and stderr is already decoded.
@@ -228,6 +230,13 @@
             # For backward compatibility, do not specify use_kill_pg by
             # default.
             subprocess_utils.kill_process_daemon(proc.pid)
+
+        # Format streaming_prefix with subprocess PID if it contains {pid}
+        formatted_streaming_prefix = streaming_prefix
+        if streaming_prefix and '{pid}' in streaming_prefix:
+            formatted_streaming_prefix = streaming_prefix.format(
+                pid=proc.pid)
+
         stdout = ''
         stderr = ''
         stdout_stream_handler = None
@@ -256,7 +265,7 @@
                 line_processor=line_processor,
                 # Replace CRLF when the output is logged to driver by ray.
                 replace_crlf=with_ray,
-                streaming_prefix=
+                streaming_prefix=formatted_streaming_prefix,
             )
             stdout_stream_handler = functools.partial(
                 _handle_io_stream,
@@ -349,7 +358,8 @@ def run_bash_command_with_log(bash_command: str,
                               log_path: str,
                               env_vars: Optional[Dict[str, str]] = None,
                               stream_logs: bool = False,
-                              with_ray: bool = False
+                              with_ray: bool = False,
+                              streaming_prefix: Optional[str] = None):
     with tempfile.NamedTemporaryFile('w', prefix='sky_app_',
                                      delete=False) as fp:
         bash_command = make_task_bash_script(bash_command, env_vars=env_vars)
@@ -364,6 +374,7 @@ def run_bash_command_with_log(bash_command: str,
                             log_path,
                             stream_logs=stream_logs,
                             with_ray=with_ray,
+                            streaming_prefix=streaming_prefix,
                             shell=True)


@@ -372,9 +383,14 @@ def run_bash_command_with_log_and_return_pid(
         log_path: str,
         env_vars: Optional[Dict[str, str]] = None,
         stream_logs: bool = False,
-        with_ray: bool = False
-
-
+        with_ray: bool = False,
+        streaming_prefix: Optional[str] = None):
+    return_code = run_bash_command_with_log(bash_command,
+                                            log_path,
+                                            env_vars,
+                                            stream_logs,
+                                            with_ray,
+                                            streaming_prefix=streaming_prefix)
     return {'return_code': return_code, 'pid': os.getpid()}


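A brief usage sketch of the new streaming_prefix parameter: the '{pid}' placeholder is substituted with the spawned subprocess's PID inside run_with_log before log lines are streamed. The prefix text and paths below are illustrative only:

from sky.skylet import log_lib

# Illustrative only: prefix each streamed log line with a worker name and the
# subprocess PID; '{pid}' is filled in by run_with_log.
returncode = log_lib.run_bash_command_with_log(
    'echo hello from worker1',
    log_path='/tmp/sky_example_run.log',
    stream_logs=True,
    streaming_prefix='(worker1, pid={pid}) ')
print('exit code:', returncode)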
sky/skylet/log_lib.pyi
CHANGED
@@ -68,7 +68,7 @@ def run_with_log(cmd: Union[List[str], str],
                  process_stream: bool = ...,
                  line_processor: Optional[log_utils.LineProcessor] = ...,
                  streaming_prefix: Optional[str] = ...,
-
+                 log_cmd: bool = ...,
                  **kwargs) -> int:
     ...

@@ -87,7 +87,7 @@ def run_with_log(cmd: Union[List[str], str],
                  process_stream: bool = ...,
                  line_processor: Optional[log_utils.LineProcessor] = ...,
                  streaming_prefix: Optional[str] = ...,
-
+                 log_cmd: bool = ...,
                  **kwargs) -> Tuple[int, str, str]:
     ...

@@ -106,8 +106,8 @@ def run_with_log(cmd: Union[List[str], str],
                  process_stream: bool = ...,
                  line_processor: Optional[log_utils.LineProcessor] = ...,
                  streaming_prefix: Optional[str] = ...,
-
-                 **kwargs) ->
+                 log_cmd: bool = ...,
+                 **kwargs) -> Tuple[int, int]:
     ...


@@ -125,7 +125,8 @@ def run_bash_command_with_log(bash_command: str,
                               log_path: str,
                               env_vars: Optional[Dict[str, str]] = ...,
                               stream_logs: bool = ...,
-                              with_ray: bool =
+                              with_ray: bool = ...,
+                              streaming_prefix: Optional[str] = ...) -> int:
     ...


@@ -134,7 +135,8 @@ def run_bash_command_with_log_and_return_pid(
         log_path: str,
         env_vars: Optional[Dict[str, str]] = ...,
         stream_logs: bool = ...,
-        with_ray: bool =
+        with_ray: bool = ...,
+        streaming_prefix: Optional[str] = ...) -> Dict[str, Union[int, str]]:
     ...


sky/skylet/skylet.py
CHANGED
@@ -48,8 +48,12 @@ def start_grpc_server(port: int = constants.SKYLET_GRPC_PORT) -> grpc.Server:
     # putting it here for visibility.
     # TODO(kevin): Determine the optimal max number of threads.
     max_workers = min(32, (os.cpu_count() or 1) + 4)
+    # There's only a single skylet process per cluster, so disable
+    # SO_REUSEPORT to raise an error if the port is already in use.
+    options = (('grpc.so_reuseport', 0),)
     server = grpc.server(
-        concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
+        concurrent.futures.ThreadPoolExecutor(max_workers=max_workers),
+        options=options)

     autostopv1_pb2_grpc.add_AutostopServiceServicer_to_server(
         services.AutostopServiceImpl(), server)
sky/skylet/subprocess_daemon.py
CHANGED
@@ -110,7 +110,8 @@ def kill_process_tree(process: psutil.Process,


 def main():
-
+    daemonize()
+
     parser = argparse.ArgumentParser()
     parser.add_argument('--parent-pid', type=int, required=True)
     parser.add_argument('--proc-pid', type=int, required=True)
sky/ssh_node_pools/constants.py
ADDED
@@ -0,0 +1,12 @@
+"""Constants for SSH Node Pools"""
+# pylint: disable=line-too-long
+import os
+
+DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
+SSH_CONFIG_PATH = os.path.expanduser('~/.ssh/config')
+NODE_POOLS_INFO_DIR = os.path.expanduser('~/.sky/ssh_node_pools_info')
+NODE_POOLS_KEY_DIR = os.path.expanduser('~/.sky/ssh_keys')
+DEFAULT_SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
+
+# TODO (kyuds): make this configurable?
+K3S_TOKEN = 'mytoken'  # Any string can be used as the token
sky/ssh_node_pools/core.py
CHANGED
@@ -1,10 +1,15 @@
 """SSH Node Pool management core functionality."""
 import os
 from pathlib import Path
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional, Tuple

 import yaml

+from sky import clouds
+from sky.ssh_node_pools import constants
+from sky.ssh_node_pools import deploy
+from sky.usage import usage_lib
+from sky.utils import common_utils
 from sky.utils import yaml_utils


@@ -12,8 +17,8 @@ class SSHNodePoolManager:
     """Manager for SSH Node Pool configurations."""

     def __init__(self):
-        self.config_path = Path.
-        self.keys_dir = Path.
+        self.config_path = Path(constants.DEFAULT_SSH_NODE_POOLS_PATH)
+        self.keys_dir = Path(constants.NODE_POOLS_KEY_DIR)
         self.keys_dir.mkdir(parents=True, exist_ok=True)

     def get_all_pools(self) -> Dict[str, Any]:
@@ -133,3 +138,35 @@ def list_ssh_keys() -> List[str]:
     """List available SSH keys."""
     manager = SSHNodePoolManager()
     return manager.list_ssh_keys()
+
+
+@usage_lib.entrypoint
+def ssh_up(infra: Optional[str] = None, cleanup: bool = False) -> None:
+    """Deploys or tears down a Kubernetes cluster on SSH targets.
+
+    Args:
+        infra: Name of the cluster configuration in ssh_node_pools.yaml.
+            If None, the first cluster in the file is used.
+        cleanup: If True, clean up the cluster instead of deploying.
+    """
+    deploy.run(cleanup=cleanup, infra=infra)
+
+
+@usage_lib.entrypoint
+def ssh_status(context_name: str) -> Tuple[bool, str]:
+    """Check the status of an SSH Node Pool context.
+
+    Args:
+        context_name: The SSH context name (e.g., 'ssh-my-cluster')
+
+    Returns:
+        Tuple[bool, str]: (is_ready, reason)
+        - is_ready: True if the SSH Node Pool is ready, False otherwise
+        - reason: Explanation of the status
+    """
+    try:
+        is_ready, reason = clouds.SSH.check_single_context(context_name)
+        return is_ready, reason
+    except Exception as e:  # pylint: disable=broad-except
+        return False, ('Failed to check SSH context: '
+                       f'{common_utils.format_exception(e)}')