skypilot-nightly 1.0.0.dev20251210__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +4 -2
- sky/adaptors/slurm.py +159 -72
- sky/backends/backend_utils.py +52 -10
- sky/backends/cloud_vm_ray_backend.py +192 -32
- sky/backends/task_codegen.py +40 -2
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +0 -7
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +11 -8
- sky/client/cli/command.py +106 -54
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +8 -0
- sky/client/sdk_async.py +9 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +44 -12
- sky/clouds/ssh.py +1 -1
- sky/clouds/vast.py +30 -17
- sky/core.py +69 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/{9353-8369df1cf105221c.js → 9353-7ad6bd01858556f1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-9e5d47818b9bdadd.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-ef19d49c6d0e8500.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-96e0f298308da7e2.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +29 -4
- sky/global_user_state.py +108 -16
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +9 -0
- sky/optimizer.py +2 -1
- sky/provision/__init__.py +11 -9
- sky/provision/kubernetes/utils.py +122 -15
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +2 -1
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/instance.py +75 -29
- sky/provision/slurm/utils.py +213 -107
- sky/provision/vast/utils.py +1 -0
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +16 -0
- sky/server/requests/payloads.py +18 -0
- sky/server/requests/request_names.py +2 -0
- sky/server/requests/requests.py +28 -10
- sky/server/requests/serializers/encoders.py +5 -0
- sky/server/requests/serializers/return_value_serializers.py +14 -4
- sky/server/server.py +434 -107
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +21 -10
- sky/sky_logging.py +2 -1
- sky/skylet/constants.py +22 -5
- sky/skylet/executor/slurm.py +4 -6
- sky/skylet/job_lib.py +89 -4
- sky/skylet/services.py +18 -3
- sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh +62 -0
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/templates/kubernetes-ray.yml.j2 +4 -6
- sky/templates/slurm-ray.yml.j2 +32 -2
- sky/templates/websocket_proxy.py +18 -41
- sky/users/permission.py +61 -51
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +256 -94
- sky/utils/command_runner.pyi +16 -0
- sky/utils/common_utils.py +30 -29
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +63 -20
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +53 -57
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +172 -162
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-b589397dc09c5b4e.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-12c559ec4d81fdbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-d187cd0413d72475.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-cb4da3abe08ebf19.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +0 -1
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +0 -3
- /sky/dashboard/out/_next/static/{KYAhEFa3FTfq4JyKVgo-s → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/plugins/{[...slug]-4f46050ca065d8f8.js → [...slug]-449a9f5a3bb20fb3.js} +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251210.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/setup_files/MANIFEST.in
CHANGED
|
@@ -15,6 +15,7 @@ include sky/jobs/dashboard/templates/*
|
|
|
15
15
|
include sky/jobs/dashboard/static/*
|
|
16
16
|
include sky/templates/*
|
|
17
17
|
include sky/utils/kubernetes/*
|
|
18
|
+
include sky/ssh_node_pools/deploy/tunnel/*
|
|
18
19
|
include sky/server/html/*
|
|
19
20
|
recursive-include sky/dashboard/out *
|
|
20
21
|
include sky/users/*.conf
|
sky/setup_files/dependencies.py
CHANGED
|
@@ -144,9 +144,11 @@ aws_dependencies = [
|
|
|
144
144
|
'awscli>=1.27.10',
|
|
145
145
|
'botocore>=1.29.10',
|
|
146
146
|
'boto3>=1.26.1',
|
|
147
|
-
# NOTE:
|
|
148
|
-
#
|
|
149
|
-
|
|
147
|
+
# NOTE: colorama is a dependency of awscli. We pin it to match the
|
|
148
|
+
# version constraint in awscli (<0.4.7) to prevent potential conflicts
|
|
149
|
+
# with other packages like ray, which might otherwise install a newer
|
|
150
|
+
# version.
|
|
151
|
+
'colorama<0.4.7',
|
|
150
152
|
]
|
|
151
153
|
|
|
152
154
|
# Kubernetes 32.0.0 has an authentication bug:
|
|
@@ -204,12 +206,21 @@ cloud_dependencies: Dict[str, List[str]] = {
|
|
|
204
206
|
'ssh': kubernetes_dependencies,
|
|
205
207
|
# For the container registry auth api. Reference:
|
|
206
208
|
# https://github.com/runpod/runpod-python/releases/tag/1.6.1
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
209
|
+
'runpod': [
|
|
210
|
+
# For the container registry auth api. Reference:
|
|
211
|
+
# https://github.com/runpod/runpod-python/releases/tag/1.6.1
|
|
212
|
+
'runpod>=1.6.1',
|
|
213
|
+
# RunPod needs a TOML parser to read ~/.runpod/config.toml. On Python
|
|
214
|
+
# 3.11+ stdlib provides tomllib; on lower versions we depend on tomli
|
|
215
|
+
# explicitly. Instead of installing tomli conditionally, we install it
|
|
216
|
+
# unconditionally. This is because the conditional installation of tomli does
|
|
217
|
+
# not work with controller package installation code.
|
|
218
|
+
'tomli',
|
|
219
|
+
# runpod installs aiodns (via aiohttp[speedups]), which is incompatible
|
|
220
|
+
# with pycares 5.0.0 due to deprecations.
|
|
221
|
+
# See https://github.com/aio-libs/aiodns/issues/214
|
|
222
|
+
'pycares<5',
|
|
223
|
+
],
|
|
213
224
|
'fluidstack': [], # No dependencies needed for fluidstack
|
|
214
225
|
'cudo': ['cudo-compute>=0.1.10'],
|
|
215
226
|
'paperspace': [], # No dependencies needed for paperspace
|
|
@@ -235,7 +246,7 @@ cloud_dependencies: Dict[str, List[str]] = {
|
|
|
235
246
|
'hyperbolic': [], # No dependencies needed for hyperbolic
|
|
236
247
|
'seeweb': ['ecsapi==0.4.0'],
|
|
237
248
|
'shadeform': [], # No dependencies needed for shadeform
|
|
238
|
-
'slurm': [],
|
|
249
|
+
'slurm': ['python-hostlist'],
|
|
239
250
|
}
|
|
240
251
|
|
|
241
252
|
# Calculate which clouds should be included in the [all] installation.
|
sky/sky_logging.py
CHANGED
|
@@ -15,7 +15,8 @@ from sky.utils import env_options
|
|
|
15
15
|
from sky.utils import rich_utils
|
|
16
16
|
|
|
17
17
|
# UX: Should we show logging prefixes and some extra information in optimizer?
|
|
18
|
-
_FORMAT = '%(levelname).1s %(asctime)s
|
|
18
|
+
_FORMAT = ('%(levelname).1s %(asctime)s.%(msecs)03d PID=%(process)d '
|
|
19
|
+
'%(filename)s:%(lineno)d] %(message)s')
|
|
19
20
|
_DATE_FORMAT = '%m-%d %H:%M:%S'
|
|
20
21
|
_SENSITIVE_LOGGER = ['sky.provisioner', 'sky.optimizer']
|
|
21
22
|
|
sky/skylet/constants.py
CHANGED
|
@@ -20,6 +20,7 @@ SKY_RUNTIME_DIR = '${SKY_RUNTIME_DIR:-$HOME}'
|
|
|
20
20
|
# os.path.expanduser(os.environ.get(SKY_RUNTIME_DIR_ENV_VAR_KEY, '~')),
|
|
21
21
|
# '.sky/jobs.db')
|
|
22
22
|
SKY_RUNTIME_DIR_ENV_VAR_KEY = 'SKY_RUNTIME_DIR'
|
|
23
|
+
SKY_CLUSTER_NAME_ENV_VAR_KEY = 'SKY_CLUSTER_NAME'
|
|
23
24
|
# We keep sky_logs and sky_workdir in $HOME, because
|
|
24
25
|
# these are artifacts that users can access, and having
|
|
25
26
|
# them be in $HOME makes it more convenient.
|
|
@@ -46,7 +47,19 @@ SKY_REMOTE_RAY_PORT_FILE = '.sky/ray_port.json'
|
|
|
46
47
|
SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot'
|
|
47
48
|
SKY_REMOTE_RAY_VERSION = '2.9.3'
|
|
48
49
|
|
|
49
|
-
|
|
50
|
+
# To avoid user image causing issue with the SkyPilot runtime, we run SkyPilot
|
|
51
|
+
# commands with the following prefix:
|
|
52
|
+
# 1. env -u PYTHONPATH: unset PYTHONPATH to avoid any package specified in
|
|
53
|
+
# PYTHONPATH interfering with the SkyPilot runtime.
|
|
54
|
+
# 2. env -C $HOME: set the execution directory to $HOME to avoid the case when
|
|
55
|
+
# a user's WORKDIR in Dockerfile is a Python site-packages directory. Python
|
|
56
|
+
# adds CWD to the beginning of sys.path, so if WORKDIR contains packages (e.g.,
|
|
57
|
+
# compiled for a different Python version), imports will fail with errors like
|
|
58
|
+
# "ModuleNotFoundError: No module named 'rpds.rpds'".
|
|
59
|
+
#
|
|
60
|
+
# TODO(zhwu): Switch -C $HOME to PYTHONSAFEPATH=1, once we moved our runtime to
|
|
61
|
+
# Python 3.11 for a more robust setup.
|
|
62
|
+
SKY_UNSET_PYTHONPATH_AND_SET_CWD = 'env -u PYTHONPATH -C $HOME'
|
|
50
63
|
# We store the absolute path of the python executable (/opt/conda/bin/python3)
|
|
51
64
|
# in this file, so that any future internal commands that need to use python
|
|
52
65
|
# can use this path. This is useful for the case where the user has a custom
|
|
@@ -58,7 +71,8 @@ SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
|
|
|
58
71
|
f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
|
|
59
72
|
'which python3')
|
|
60
73
|
# Python executable, e.g., /opt/conda/bin/python3
|
|
61
|
-
SKY_PYTHON_CMD = f'{
|
|
74
|
+
SKY_PYTHON_CMD = (f'{SKY_UNSET_PYTHONPATH_AND_SET_CWD} '
|
|
75
|
+
f'$({SKY_GET_PYTHON_PATH_CMD})')
|
|
62
76
|
# Prefer SKY_UV_PIP_CMD, which is faster.
|
|
63
77
|
# TODO(cooperc): remove remaining usage (GCP TPU setup).
|
|
64
78
|
SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
|
|
@@ -91,7 +105,7 @@ SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
|
|
|
91
105
|
# user provided docker image set it to true.
|
|
92
106
|
# unset PYTHONPATH in case the user provided docker image set it.
|
|
93
107
|
SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
|
|
94
|
-
f'{
|
|
108
|
+
f'{SKY_UNSET_PYTHONPATH_AND_SET_CWD} {SKY_UV_INSTALL_DIR}/uv')
|
|
95
109
|
# This won't reinstall uv if it's already installed, so it's safe to re-run.
|
|
96
110
|
SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
|
|
97
111
|
'curl -LsSf https://astral.sh/uv/install.sh '
|
|
@@ -130,7 +144,7 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
|
|
|
130
144
|
# cluster yaml is updated.
|
|
131
145
|
#
|
|
132
146
|
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
|
|
133
|
-
SKYLET_VERSION = '
|
|
147
|
+
SKYLET_VERSION = '29'
|
|
134
148
|
# The version of the lib files that skylet/jobs use. Whenever there is an API
|
|
135
149
|
# change for the job_lib or log_lib, we need to bump this version, so that the
|
|
136
150
|
# user can be notified to update their SkyPilot version on the remote cluster.
|
|
@@ -462,7 +476,7 @@ OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
|
|
|
462
476
|
('gcp', 'enable_gvnic'),
|
|
463
477
|
('gcp', 'enable_gpu_direct'),
|
|
464
478
|
('gcp', 'placement_policy'),
|
|
465
|
-
('vast', '
|
|
479
|
+
('vast', 'datacenter_only'),
|
|
466
480
|
('active_workspace',),
|
|
467
481
|
]
|
|
468
482
|
# When overriding the SkyPilot configs on the API server with the client one,
|
|
@@ -523,6 +537,9 @@ SKY_USER_FILE_PATH = '~/.sky/generated'
|
|
|
523
537
|
ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
|
|
524
538
|
OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'
|
|
525
539
|
IS_SKYPILOT_SERVE_CONTROLLER = 'IS_SKYPILOT_SERVE_CONTROLLER'
|
|
540
|
+
# Environment variable that is set to 'true' if rolling update strategy is
|
|
541
|
+
# enabled for the API server deployment.
|
|
542
|
+
SKYPILOT_ROLLING_UPDATE_ENABLED = 'SKYPILOT_ROLLING_UPDATE_ENABLED'
|
|
526
543
|
|
|
527
544
|
SERVE_OVERRIDE_CONCURRENT_LAUNCHES = (
|
|
528
545
|
f'{SKYPILOT_ENV_VAR_PREFIX}SERVE_OVERRIDE_CONCURRENT_LAUNCHES')
|
sky/skylet/executor/slurm.py
CHANGED
|
@@ -19,12 +19,10 @@ from sky.skylet.log_lib import run_bash_command_with_log
|
|
|
19
19
|
|
|
20
20
|
def _get_ip_address() -> str:
|
|
21
21
|
"""Get the IP address of the current node."""
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
return ip_result.stdout.strip().split(
|
|
27
|
-
)[0] if ip_result.returncode == 0 else 'unknown'
|
|
22
|
+
# Use socket.gethostbyname to be consistent with _get_job_node_ips(),
|
|
23
|
+
# which resolves hostnames the same way. Using `hostname -I` can return
|
|
24
|
+
# Docker bridge IPs (172.17.x.x) first, causing IP mismatch errors.
|
|
25
|
+
return socket.gethostbyname(socket.gethostname())
|
|
28
26
|
|
|
29
27
|
|
|
30
28
|
def _get_job_node_ips() -> str:
|
sky/skylet/job_lib.py
CHANGED
|
@@ -66,6 +66,7 @@ class JobInfoLoc(enum.IntEnum):
|
|
|
66
66
|
PID = 9
|
|
67
67
|
LOG_PATH = 10
|
|
68
68
|
METADATA = 11
|
|
69
|
+
EXIT_CODES = 12
|
|
69
70
|
|
|
70
71
|
|
|
71
72
|
def create_table(cursor, conn):
|
|
@@ -124,6 +125,8 @@ def create_table(cursor, conn):
|
|
|
124
125
|
'metadata',
|
|
125
126
|
'TEXT DEFAULT \'{}\'',
|
|
126
127
|
value_to_replace_existing_entries='{}')
|
|
128
|
+
db_utils.add_column_to_table(cursor, conn, 'jobs', 'exit_codes',
|
|
129
|
+
'TEXT DEFAULT NULL')
|
|
127
130
|
conn.commit()
|
|
128
131
|
|
|
129
132
|
|
|
@@ -388,10 +391,16 @@ def add_job(job_name: str,
|
|
|
388
391
|
assert _DB is not None
|
|
389
392
|
job_submitted_at = time.time()
|
|
390
393
|
# job_id will autoincrement with the null value
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
394
|
+
if int(constants.SKYLET_VERSION) >= 28:
|
|
395
|
+
_DB.cursor.execute(
|
|
396
|
+
'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?, null)', # pylint: disable=line-too-long
|
|
397
|
+
(job_name, username, job_submitted_at, JobStatus.INIT.value,
|
|
398
|
+
run_timestamp, None, resources_str, metadata))
|
|
399
|
+
else:
|
|
400
|
+
_DB.cursor.execute(
|
|
401
|
+
'INSERT INTO jobs VALUES (null, ?, ?, ?, ?, ?, ?, null, ?, 0, null, ?)', # pylint: disable=line-too-long
|
|
402
|
+
(job_name, username, job_submitted_at, JobStatus.INIT.value,
|
|
403
|
+
run_timestamp, None, resources_str, metadata))
|
|
395
404
|
_DB.conn.commit()
|
|
396
405
|
rows = _DB.cursor.execute('SELECT job_id FROM jobs WHERE run_timestamp=(?)',
|
|
397
406
|
(run_timestamp,))
|
|
@@ -468,6 +477,41 @@ def set_status(job_id: int, status: JobStatus) -> None:
|
|
|
468
477
|
_set_status_no_lock(job_id, status)
|
|
469
478
|
|
|
470
479
|
|
|
480
|
+
@init_db
|
|
481
|
+
def set_exit_codes(job_id: int, exit_codes: List[int]) -> None:
|
|
482
|
+
"""Set exit codes for a job as comma-separated string.
|
|
483
|
+
|
|
484
|
+
Args:
|
|
485
|
+
job_id: The job ID to update.
|
|
486
|
+
exit_codes: A list of exit codes to store.
|
|
487
|
+
"""
|
|
488
|
+
assert _DB is not None
|
|
489
|
+
exit_codes_str = ','.join(str(code) for code in exit_codes)
|
|
490
|
+
with filelock.FileLock(_get_lock_path(job_id)):
|
|
491
|
+
_DB.cursor.execute('UPDATE jobs SET exit_codes=(?) WHERE job_id=(?)',
|
|
492
|
+
(exit_codes_str, job_id))
|
|
493
|
+
_DB.conn.commit()
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
@init_db
|
|
497
|
+
def get_exit_codes(job_id: int) -> Optional[List[int]]:
|
|
498
|
+
"""Get exit codes for a job from comma-separated string.
|
|
499
|
+
|
|
500
|
+
Args:
|
|
501
|
+
job_id: The job ID to retrieve exit codes for.
|
|
502
|
+
|
|
503
|
+
Returns:
|
|
504
|
+
A list of exit codes, or None if not found.
|
|
505
|
+
"""
|
|
506
|
+
assert _DB is not None
|
|
507
|
+
rows = _DB.cursor.execute('SELECT exit_codes FROM jobs WHERE job_id=(?)',
|
|
508
|
+
(job_id,))
|
|
509
|
+
row = rows.fetchone()
|
|
510
|
+
if row is None or row[0] is None:
|
|
511
|
+
return None
|
|
512
|
+
return [int(code) for code in row[0].split(',')]
|
|
513
|
+
|
|
514
|
+
|
|
471
515
|
@init_db
|
|
472
516
|
def set_job_started(job_id: int) -> None:
|
|
473
517
|
# TODO(mraheja): remove pylint disabling when filelock version updated.
|
|
@@ -506,6 +550,20 @@ def get_status(job_id: int) -> Optional[JobStatus]:
|
|
|
506
550
|
return get_status_no_lock(job_id)
|
|
507
551
|
|
|
508
552
|
|
|
553
|
+
def wait_for_job_completion(job_id: int, poll_interval: float = 1.0) -> None:
|
|
554
|
+
"""Wait for a job to reach a terminal state.
|
|
555
|
+
|
|
556
|
+
Args:
|
|
557
|
+
job_id: The job ID to wait for.
|
|
558
|
+
poll_interval: How often to poll the job status in seconds.
|
|
559
|
+
"""
|
|
560
|
+
while True:
|
|
561
|
+
status = get_status(job_id)
|
|
562
|
+
if status is None or status.is_terminal():
|
|
563
|
+
break
|
|
564
|
+
time.sleep(poll_interval)
|
|
565
|
+
|
|
566
|
+
|
|
509
567
|
@init_db
|
|
510
568
|
def get_statuses_payload(job_ids: List[Optional[int]]) -> str:
|
|
511
569
|
return message_utils.encode_payload(get_statuses(job_ids))
|
|
@@ -674,6 +732,14 @@ def _get_records_from_rows(rows) -> List[Dict[str, Any]]:
|
|
|
674
732
|
'pid': row[JobInfoLoc.PID.value],
|
|
675
733
|
'metadata': json.loads(row[JobInfoLoc.METADATA.value]),
|
|
676
734
|
})
|
|
735
|
+
if int(constants.SKYLET_VERSION) >= 28:
|
|
736
|
+
exit_code_str = row[JobInfoLoc.EXIT_CODES.value]
|
|
737
|
+
if not isinstance(exit_code_str, str):
|
|
738
|
+
records[-1]['exit_codes'] = None
|
|
739
|
+
else:
|
|
740
|
+
records[-1]['exit_codes'] = ([
|
|
741
|
+
int(code) for code in exit_code_str.split(',')
|
|
742
|
+
])
|
|
677
743
|
return records
|
|
678
744
|
|
|
679
745
|
|
|
@@ -1152,6 +1218,15 @@ class JobLibCodeGen:
|
|
|
1152
1218
|
]
|
|
1153
1219
|
return cls._build(code)
|
|
1154
1220
|
|
|
1221
|
+
@classmethod
|
|
1222
|
+
def wait_for_job(cls, job_id: int) -> str:
|
|
1223
|
+
code = [
|
|
1224
|
+
# TODO(kevin): backward compatibility, remove in 0.13.0.
|
|
1225
|
+
(f'job_lib.wait_for_job_completion({job_id!r}) if '
|
|
1226
|
+
'hasattr(job_lib, "wait_for_job_completion") else None'),
|
|
1227
|
+
]
|
|
1228
|
+
return cls._build(code)
|
|
1229
|
+
|
|
1155
1230
|
@classmethod
|
|
1156
1231
|
def update_status(cls) -> str:
|
|
1157
1232
|
code = ['job_lib.update_status()']
|
|
@@ -1269,6 +1344,16 @@ class JobLibCodeGen:
|
|
|
1269
1344
|
]
|
|
1270
1345
|
return cls._build(code)
|
|
1271
1346
|
|
|
1347
|
+
@classmethod
|
|
1348
|
+
def get_job_exit_codes(cls, job_id: Optional[int] = None) -> str:
|
|
1349
|
+
"""Generate shell command to retrieve exit codes."""
|
|
1350
|
+
code = [
|
|
1351
|
+
f'job_id = {job_id} if {job_id} is not None else job_lib.get_latest_job_id()', # pylint: disable=line-too-long
|
|
1352
|
+
'exit_codes = job_lib.get_exit_codes(job_id) if job_id is not None and int(constants.SKYLET_VERSION) >= 28 else {}', # pylint: disable=line-too-long
|
|
1353
|
+
'print(exit_codes, flush=True)',
|
|
1354
|
+
]
|
|
1355
|
+
return cls._build(code)
|
|
1356
|
+
|
|
1272
1357
|
@classmethod
|
|
1273
1358
|
def _build(cls, code: List[str]) -> str:
|
|
1274
1359
|
code = cls._PREFIX + code
|
sky/skylet/services.py
CHANGED
|
@@ -197,12 +197,11 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
|
|
|
197
197
|
f.write(request.codegen)
|
|
198
198
|
os.chmod(script_path, 0o755)
|
|
199
199
|
|
|
200
|
-
cd = f'cd {constants.SKY_REMOTE_WORKDIR}'
|
|
201
200
|
job_submit_cmd = (
|
|
202
201
|
# JOB_CMD_IDENTIFIER is used for identifying the process
|
|
203
202
|
# retrieved with pid is the same driver process.
|
|
204
203
|
f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
|
|
205
|
-
f'{
|
|
204
|
+
f'{constants.SKY_PYTHON_CMD} -u {script_path}'
|
|
206
205
|
# Do not use &>, which is not POSIX and may not work.
|
|
207
206
|
# Note that the order of ">filename 2>&1" matters.
|
|
208
207
|
f' > {remote_log_path} 2>&1')
|
|
@@ -387,6 +386,21 @@ class JobsServiceImpl(jobsv1_pb2_grpc.JobsServiceServicer):
|
|
|
387
386
|
except Exception as e: # pylint: disable=broad-except
|
|
388
387
|
context.abort(grpc.StatusCode.INTERNAL, str(e))
|
|
389
388
|
|
|
389
|
+
def GetJobExitCodes( # type: ignore[return]
|
|
390
|
+
self, request: jobsv1_pb2.GetJobExitCodesRequest,
|
|
391
|
+
context: grpc.ServicerContext
|
|
392
|
+
) -> jobsv1_pb2.GetJobExitCodesResponse:
|
|
393
|
+
try:
|
|
394
|
+
job_id = request.job_id if request.HasField(
|
|
395
|
+
'job_id') else job_lib.get_latest_job_id()
|
|
396
|
+
exit_codes: Optional[List[int]] = None
|
|
397
|
+
if job_id:
|
|
398
|
+
exit_codes_list = job_lib.get_exit_codes(job_id)
|
|
399
|
+
exit_codes = exit_codes_list if exit_codes_list else []
|
|
400
|
+
return jobsv1_pb2.GetJobExitCodesResponse(exit_codes=exit_codes)
|
|
401
|
+
except Exception as e: # pylint: disable=broad-except
|
|
402
|
+
context.abort(grpc.StatusCode.INTERNAL, str(e))
|
|
403
|
+
|
|
390
404
|
|
|
391
405
|
class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
|
|
392
406
|
):
|
|
@@ -488,7 +502,8 @@ class ManagedJobsServiceImpl(managed_jobsv1_pb2_grpc.ManagedJobsServiceServicer
|
|
|
488
502
|
entrypoint=job.get('entrypoint'),
|
|
489
503
|
metadata=converted_metadata,
|
|
490
504
|
pool=job.get('pool'),
|
|
491
|
-
pool_hash=job.get('pool_hash')
|
|
505
|
+
pool_hash=job.get('pool_hash'),
|
|
506
|
+
links=job.get('links'))
|
|
492
507
|
jobs_info.append(job_info)
|
|
493
508
|
|
|
494
509
|
return managed_jobsv1_pb2.GetJobTableResponse(
|
sky/ssh_node_pools/deploy/tunnel/cleanup-tunnel.sh
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# cleanup-tunnel.sh - Script to clean up SSH tunnels for a Kubernetes context
|
|
3
|
+
|
|
4
|
+
# Usage: cleanup-tunnel.sh CONTEXT_NAME
|
|
5
|
+
|
|
6
|
+
CONTEXT="${1:-default}"
|
|
7
|
+
TUNNEL_DIR="$HOME/.sky/ssh_node_pools_info"
|
|
8
|
+
PID_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.pid"
|
|
9
|
+
LOG_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.log"
|
|
10
|
+
LOCK_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.lock"
|
|
11
|
+
|
|
12
|
+
# Get the port from kubeconfig if available
|
|
13
|
+
KUBE_PORT=$(kubectl config view --minify --context="$CONTEXT" -o jsonpath='{.clusters[0].cluster.server}' 2>/dev/null | grep -o ":[0-9]\+" | tr -d ":" || echo "")
|
|
14
|
+
|
|
15
|
+
if [[ -z "$KUBE_PORT" ]]; then
|
|
16
|
+
# Default to 6443 if we can't determine the port
|
|
17
|
+
KUBE_PORT=6443
|
|
18
|
+
echo "$(date): Could not determine port from kubeconfig, using default port $KUBE_PORT" >> "$LOG_FILE"
|
|
19
|
+
else
|
|
20
|
+
echo "$(date): Found port $KUBE_PORT in kubeconfig for context $CONTEXT" >> "$LOG_FILE"
|
|
21
|
+
fi
|
|
22
|
+
|
|
23
|
+
# Check if PID file exists
|
|
24
|
+
if [[ -f "$PID_FILE" ]]; then
|
|
25
|
+
OLD_PID=$(cat "$PID_FILE")
|
|
26
|
+
|
|
27
|
+
# Log the cleanup attempt
|
|
28
|
+
echo "$(date): Attempting to clean up tunnel for context $CONTEXT (PID: $OLD_PID, Port: $KUBE_PORT)" >> "$LOG_FILE"
|
|
29
|
+
|
|
30
|
+
# Try to kill the process
|
|
31
|
+
if kill -0 "$OLD_PID" 2>/dev/null; then
|
|
32
|
+
# Process exists, kill it
|
|
33
|
+
kill "$OLD_PID" 2>/dev/null
|
|
34
|
+
|
|
35
|
+
# Wait a moment and check if it's really gone
|
|
36
|
+
sleep 1
|
|
37
|
+
if kill -0 "$OLD_PID" 2>/dev/null; then
|
|
38
|
+
# Still running, force kill
|
|
39
|
+
kill -9 "$OLD_PID" 2>/dev/null
|
|
40
|
+
echo "$(date): Forcefully terminated tunnel process $OLD_PID" >> "$LOG_FILE"
|
|
41
|
+
else
|
|
42
|
+
echo "$(date): Successfully terminated tunnel process $OLD_PID" >> "$LOG_FILE"
|
|
43
|
+
fi
|
|
44
|
+
else
|
|
45
|
+
echo "$(date): No running process found with PID $OLD_PID" >> "$LOG_FILE"
|
|
46
|
+
fi
|
|
47
|
+
|
|
48
|
+
# Remove PID file
|
|
49
|
+
rm -f "$PID_FILE"
|
|
50
|
+
else
|
|
51
|
+
echo "$(date): No PID file found for context $CONTEXT. Nothing to clean up." >> "$LOG_FILE"
|
|
52
|
+
fi
|
|
53
|
+
|
|
54
|
+
# Clean up lock file if it exists
|
|
55
|
+
rm -f "$LOCK_FILE"
|
|
56
|
+
|
|
57
|
+
# Check if port is still in use
|
|
58
|
+
if nc -z localhost "$KUBE_PORT" 2>/dev/null; then
|
|
59
|
+
echo "$(date): Warning: Port $KUBE_PORT is still in use after cleanup. Another process might be using it." >> "$LOG_FILE"
|
|
60
|
+
fi
|
|
61
|
+
|
|
62
|
+
echo "$(date): Cleanup complete for context $CONTEXT" >> "$LOG_FILE"
|