skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +1 -61
- sky/adaptors/slurm.py +565 -0
- sky/backends/backend_utils.py +95 -12
- sky/backends/cloud_vm_ray_backend.py +224 -65
- sky/backends/task_codegen.py +380 -4
- sky/catalog/__init__.py +0 -3
- sky/catalog/data_fetchers/fetch_gcp.py +9 -1
- sky/catalog/data_fetchers/fetch_nebius.py +1 -1
- sky/catalog/data_fetchers/fetch_vast.py +4 -2
- sky/catalog/kubernetes_catalog.py +12 -4
- sky/catalog/seeweb_catalog.py +30 -15
- sky/catalog/shadeform_catalog.py +5 -2
- sky/catalog/slurm_catalog.py +236 -0
- sky/catalog/vast_catalog.py +30 -6
- sky/check.py +25 -11
- sky/client/cli/command.py +391 -32
- sky/client/interactive_utils.py +190 -0
- sky/client/sdk.py +64 -2
- sky/client/sdk_async.py +9 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +60 -2
- sky/clouds/azure.py +2 -0
- sky/clouds/cloud.py +7 -0
- sky/clouds/kubernetes.py +2 -0
- sky/clouds/runpod.py +38 -7
- sky/clouds/slurm.py +610 -0
- sky/clouds/ssh.py +3 -2
- sky/clouds/vast.py +39 -16
- sky/core.py +197 -37
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
- sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
- sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
- sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
- sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
- sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
- sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
- sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
- sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
- sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
- sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
- sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
- sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
- sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
- sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
- sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
- sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
- sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +26 -12
- sky/data/mounting_utils.py +44 -5
- sky/global_user_state.py +111 -19
- sky/jobs/client/sdk.py +8 -3
- sky/jobs/controller.py +191 -31
- sky/jobs/recovery_strategy.py +109 -11
- sky/jobs/server/core.py +81 -4
- sky/jobs/server/server.py +14 -0
- sky/jobs/state.py +417 -19
- sky/jobs/utils.py +73 -80
- sky/models.py +11 -0
- sky/optimizer.py +8 -6
- sky/provision/__init__.py +12 -9
- sky/provision/common.py +20 -0
- sky/provision/docker_utils.py +15 -2
- sky/provision/kubernetes/utils.py +163 -20
- sky/provision/kubernetes/volume.py +52 -17
- sky/provision/provisioner.py +17 -7
- sky/provision/runpod/instance.py +3 -1
- sky/provision/runpod/utils.py +13 -1
- sky/provision/runpod/volume.py +25 -9
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +618 -0
- sky/provision/slurm/utils.py +689 -0
- sky/provision/vast/instance.py +4 -1
- sky/provision/vast/utils.py +11 -6
- sky/resources.py +135 -13
- sky/schemas/api/responses.py +4 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
- sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
- sky/schemas/db/spot_jobs/009_job_events.py +32 -0
- sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
- sky/schemas/db/spot_jobs/011_add_links.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +9 -5
- sky/schemas/generated/jobsv1_pb2.pyi +12 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
- sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
- sky/serve/serve_utils.py +232 -40
- sky/serve/server/impl.py +1 -1
- sky/server/common.py +17 -0
- sky/server/constants.py +1 -1
- sky/server/metrics.py +6 -3
- sky/server/plugins.py +238 -0
- sky/server/requests/executor.py +5 -2
- sky/server/requests/payloads.py +30 -1
- sky/server/requests/request_names.py +4 -0
- sky/server/requests/requests.py +33 -11
- sky/server/requests/serializers/encoders.py +22 -0
- sky/server/requests/serializers/return_value_serializers.py +70 -0
- sky/server/server.py +506 -109
- sky/server/server_utils.py +30 -0
- sky/server/uvicorn.py +5 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +22 -9
- sky/sky_logging.py +2 -1
- sky/skylet/attempt_skylet.py +13 -3
- sky/skylet/constants.py +55 -13
- sky/skylet/events.py +10 -4
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +187 -0
- sky/skylet/job_lib.py +91 -5
- sky/skylet/log_lib.py +22 -6
- sky/skylet/log_lib.pyi +8 -6
- sky/skylet/services.py +18 -3
- sky/skylet/skylet.py +5 -1
- sky/skylet/subprocess_daemon.py +2 -1
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
- sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +11 -13
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/templates/kubernetes-ray.yml.j2 +12 -6
- sky/templates/slurm-ray.yml.j2 +115 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +18 -41
- sky/users/model.conf +1 -1
- sky/users/permission.py +85 -52
- sky/users/rbac.py +31 -3
- sky/utils/annotations.py +108 -8
- sky/utils/auth_utils.py +42 -0
- sky/utils/cli_utils/status_utils.py +19 -5
- sky/utils/cluster_utils.py +10 -3
- sky/utils/command_runner.py +389 -35
- sky/utils/command_runner.pyi +43 -4
- sky/utils/common_utils.py +47 -31
- sky/utils/context.py +32 -0
- sky/utils/db/db_utils.py +36 -6
- sky/utils/db/migration_utils.py +41 -21
- sky/utils/infra_utils.py +5 -1
- sky/utils/instance_links.py +139 -0
- sky/utils/interactive_utils.py +49 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
- sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
- sky/utils/kubernetes/rsync_helper.sh +5 -1
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/plugin_extensions/__init__.py +14 -0
- sky/utils/plugin_extensions/external_failure_source.py +176 -0
- sky/utils/resources_utils.py +10 -8
- sky/utils/rich_utils.py +9 -11
- sky/utils/schemas.py +93 -19
- sky/utils/status_lib.py +7 -0
- sky/utils/subprocess_utils.py +17 -0
- sky/volumes/client/sdk.py +6 -3
- sky/volumes/server/core.py +65 -27
- sky_templates/ray/start_cluster +8 -4
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
- /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
- /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
sky/backends/backend_utils.py
CHANGED
@@ -69,6 +69,7 @@ from sky.utils import timeline
 from sky.utils import ux_utils
 from sky.utils import volume as volume_utils
 from sky.utils import yaml_utils
+from sky.utils.plugin_extensions import ExternalFailureSource
 from sky.workspaces import core as workspaces_core
 
 if typing.TYPE_CHECKING:
@@ -147,6 +148,19 @@ CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10.0
 # Remote dir that holds our runtime files.
 _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
 
+# The maximum size of a command line arguments is 128 KB, i.e. the command
+# executed with /bin/sh should be less than 128KB.
+# https://github.com/torvalds/linux/blob/master/include/uapi/linux/binfmts.h
+#
+# If a user have very long run or setup commands, the generated command may
+# exceed the limit, as we directly include scripts in job submission commands.
+# If the command is too long, we instead write it to a file, rsync and execute
+# it.
+#
+# We use 100KB as a threshold to be safe for other arguments that
+# might be added during ssh.
+_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
+
 _ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, '
                             'please retry after a while.')
 
@@ -225,6 +239,18 @@ _ACK_MESSAGE = 'ack'
 _FORWARDING_FROM_MESSAGE = 'Forwarding from'
 
 
+def is_command_length_over_limit(command: str) -> bool:
+    """Check if the length of the command exceeds the limit.
+
+    We calculate the length of the command after quoting the command twice as
+    when it is executed by the CommandRunner, the command will be quoted twice
+    to ensure the correctness, which will add significant length to the command.
+    """
+
+    quoted_length = len(shlex.quote(shlex.quote(command)))
+    return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
+
+
 def is_ip(s: str) -> bool:
     """Returns whether this string matches IP_ADDR_REGEX."""
     return len(re.findall(IP_ADDR_REGEX, s)) == 1
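The 100 KB threshold leaves headroom under Linux's 128 KB per-argument limit, and the helper measures the command after two rounds of shell quoting because CommandRunner wraps commands in two shell layers. A self-contained sketch mirroring the new helper (the sample script and printed values are illustrative, not from the package):

import shlex

_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024  # 100 KB of headroom under 128 KB


def is_command_length_over_limit(command: str) -> bool:
    # Each round of shlex.quote() wraps the string in single quotes and
    # escapes embedded quotes, so a quote-heavy script grows sharply.
    quoted_length = len(shlex.quote(shlex.quote(command)))
    return quoted_length > _MAX_INLINE_SCRIPT_LENGTH


# An ~80 KB script that is under the raw limit but far over it once
# double-quoted; per the comment above, SkyPilot would then write it to
# a file, rsync it over, and execute it instead of inlining it.
script = "echo 'step'\n" * 7000
print(len(script))                             # 84000: under 128 KB raw
print(len(shlex.quote(shlex.quote(script))))   # well above 102400
print(is_command_length_over_limit(script))    # True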
@@ -738,7 +764,20 @@ def write_cluster_config(
             keys=('allowed_contexts',),
             default_value=None)
         if allowed_contexts is None:
-
+            # Exclude both Kubernetes and SSH explicitly since:
+            # 1. isinstance(cloud, clouds.Kubernetes) matches both (SSH
+            #    inherits from Kubernetes)
+            # 2. Both share the same get_credential_file_mounts() which
+            #    returns the kubeconfig. So if we don't exclude both, the
+            #    unexcluded one will upload the kubeconfig.
+            # TODO(romilb): This is a workaround. The right long-term fix
+            # is to have SSH Node Pools use its own kubeconfig instead of
+            # sharing the global kubeconfig at ~/.kube/config. In the
+            # interim, SSH Node Pools' get_credential_file_mounts can filter
+            # contexts starting with ssh- and create a temp kubeconfig
+            # to upload.
+            excluded_clouds.add(clouds.Kubernetes())
+            excluded_clouds.add(clouds.SSH())
         else:
             excluded_clouds.add(cloud)
 
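The inheritance pitfall the comment describes can be seen with stand-in classes (these are not the real sky.clouds definitions, just a sketch of the relationship): any isinstance check against Kubernetes also matches SSH, and both return the same kubeconfig mount.

class Kubernetes:  # stand-in for sky.clouds.Kubernetes
    def get_credential_file_mounts(self):
        return {'~/.kube/config': '~/.kube/config'}


class SSH(Kubernetes):  # stand-in: SSH Node Pools reuse the K8s plumbing
    pass


print(isinstance(SSH(), Kubernetes))  # True: SSH *is a* Kubernetes
# Both clouds would upload the same global kubeconfig, which is why the
# hunk above adds each of them to excluded_clouds explicitly rather than
# relying on excluding only one.
print(SSH().get_credential_file_mounts())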
@@ -946,6 +985,9 @@
                 '{conda_auto_activate}',
                 conda_auto_activate).replace('{is_custom_docker}',
                                              is_custom_docker),
+            # Currently only used by Slurm. For other clouds, it is
+            # already part of ray_skypilot_installation_commands
+            'setup_sky_dirs_commands': constants.SETUP_SKY_DIRS_COMMANDS,
             'ray_skypilot_installation_commands':
                 (constants.RAY_SKYPILOT_INSTALLATION_COMMANDS.replace(
                     '{sky_wheel_hash}',
@@ -1058,7 +1100,11 @@
     with open(tmp_yaml_path, 'w', encoding='utf-8') as f:
         f.write(restored_yaml_content)
 
-
+    # Read the cluster_name_on_cloud from the restored yaml. This is a hack to
+    # make sure that launching on the same cluster across multiple users works
+    # correctly. See #8232.
+    yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
+    config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
 
     # Make sure to do this before we optimize file mounts. Optimization is
     # non-deterministic, but everything else before this point should be
@@ -1105,17 +1151,21 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
     """
     config = yaml_utils.read_yaml(tmp_yaml_path)
     # Check the availability of the cloud type.
-    if isinstance(
+    if isinstance(
+            cloud,
+            (
                 clouds.AWS,
                 clouds.OCI,
                 clouds.SCP,
+                # TODO(jwj): Handle Slurm-specific auth logic
+                clouds.Slurm,
                 clouds.Vsphere,
                 clouds.Cudo,
                 clouds.Paperspace,
                 clouds.Azure,
                 clouds.DO,
                 clouds.Nebius,
-
+            )):
         config = auth.configure_ssh_info(config)
     elif isinstance(cloud, clouds.GCP):
         config = auth.setup_gcp_authentication(config)
@@ -2226,6 +2276,12 @@ def _update_cluster_status(
                        for status in node_statuses) and
                    len(node_statuses) == handle.launched_nodes)
 
+    external_cluster_failures = ExternalFailureSource.get(
+        cluster_hash=record['cluster_hash'])
+    logger.debug(f'Cluster {cluster_name} with cluster_hash '
+                 f'{record["cluster_hash"]} has external cluster failures: '
+                 f'{external_cluster_failures}')
+
     def get_node_counts_from_ray_status(
             runner: command_runner.CommandRunner) -> Tuple[int, int, str, str]:
         rc, output, stderr = runner.run(
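ExternalFailureSource is the new plugin hook added under sky/utils/plugin_extensions/ in this release; only the get(cluster_hash=...) call and the truthiness of its result are visible in this hunk. A toy stand-in showing how the hunk consumes it (the real interface may differ, and report() is hypothetical):

from typing import Dict, List


class ExternalFailureSource:  # toy stand-in, not the real plugin API
    _reports: Dict[str, List[str]] = {}

    @classmethod
    def report(cls, cluster_hash: str, reason: str) -> None:
        # Hypothetical: some external system flags a cluster as failed.
        cls._reports.setdefault(cluster_hash, []).append(reason)

    @classmethod
    def get(cls, cluster_hash: str) -> List[str]:
        # Falsy (empty) when nothing has been reported, matching how
        # _update_cluster_status() tests the return value.
        return cls._reports.get(cluster_hash, [])


ExternalFailureSource.report('ab12cd', 'nodes drained by cluster admin')
failures = ExternalFailureSource.get(cluster_hash='ab12cd')
if failures:
    # Per the later hunks, the cluster stays INIT with the reported
    # reason; reconciliation leaves the record untouched unless the
    # cloud says the cluster was terminated.
    print(f'external failures: {failures}')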
@@ -2361,7 +2417,13 @@
     # remain healthy for a while before the cloud completely preempts the VMs.
     # We have mitigated this by again first querying the VM state from the cloud
     # provider.
-
+    cloud = handle.launched_resources.cloud
+
+    # For Slurm, skip Ray health check since it doesn't use Ray.
+    should_check_ray = cloud is not None and cloud.uses_ray()
+    if (all_nodes_up and (not should_check_ray or
+                          run_ray_status_to_check_ray_cluster_healthy()) and
+            not external_cluster_failures):
         # NOTE: all_nodes_up calculation is fast due to calling cloud CLI;
         # run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
         # head-ip/worker-ips`.
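The new gating reduces to a pure function of four booleans; uses_ray below stands in for the new Cloud.uses_ray() hook (see sky/clouds/cloud.py in the file list above). A quick truth-table sketch:

def cluster_looks_healthy(all_nodes_up: bool, uses_ray: bool,
                          ray_healthy: bool,
                          external_failures: bool) -> bool:
    # Mirrors the new condition: Ray health only matters for clouds that
    # run Ray, and any external failure report vetoes the early return.
    should_check_ray = uses_ray
    return (all_nodes_up and (not should_check_ray or ray_healthy) and
            not external_failures)


# Slurm-style cloud: no Ray, so Ray state is irrelevant.
print(cluster_looks_healthy(True, uses_ray=False, ray_healthy=False,
                            external_failures=False))  # True
# Ray-based cloud: an unhealthy Ray cluster blocks the early return.
print(cluster_looks_healthy(True, uses_ray=True, ray_healthy=False,
                            external_failures=False))  # False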
@@ -2464,15 +2526,15 @@
     # (2) Otherwise, we will reset the autostop setting, unless the cluster is
     # autostopping/autodowning.
     some_nodes_terminated = 0 < len(node_statuses) < handle.launched_nodes
-    # If all nodes are up and ray cluster is health, we would have returned
-    # earlier. So if all_nodes_up is True and we are here, it means the ray
-    # cluster must have been unhealthy.
-    ray_cluster_unhealthy = all_nodes_up
     some_nodes_not_stopped = any(status[0] != status_lib.ClusterStatus.STOPPED
                                  for status in node_statuses)
     is_abnormal = (some_nodes_terminated or some_nodes_not_stopped)
 
-    if is_abnormal:
+    if is_abnormal and not external_cluster_failures:
+        # If all nodes are up and ray cluster is healthy, we would have returned
+        # earlier. So if all_nodes_up is True and we are here, it means the ray
+        # cluster must have been unhealthy.
+        ray_cluster_unhealthy = all_nodes_up
         status_reason = ', '.join(
             [status[1] for status in node_statuses if status[1] is not None])
 
@@ -2600,8 +2662,25 @@
             cluster_name,
             include_user_info=include_user_info,
             summary_response=summary_response)
-    # Now
-    #
+    # Now either:
+    # (1) is_abnormal is False: either node_statuses is empty or all nodes are
+    #     STOPPED
+    # or
+    # (2) there are external cluster failures reported by a plugin.
+
+    # If there are external cluster failures and the cluster has not been
+    # terminated on cloud (to_terminate), we can return the cluster record as is.
+    # This is because when an external failure is detected, the cluster will be
+    # marked as INIT with a reason indicating the details of the failure. So, we
+    # do not want to modify the cluster status in this function except for in the
+    # case where the cluster has been terminated on cloud, in which case we should
+    # clean up the cluster from SkyPilot's global state.
+    if external_cluster_failures and not to_terminate:
+        return global_user_state.get_cluster_from_name(
+            cluster_name,
+            include_user_info=include_user_info,
+            summary_response=summary_response)
+
     verb = 'terminated' if to_terminate else 'stopped'
     backend = backends.CloudVmRayBackend()
     global_user_state.add_cluster_event(
@@ -3327,6 +3406,8 @@ def get_clusters(
         handle = record['handle']
         record['nodes'] = handle.launched_nodes
         if handle.launched_resources is None:
+            # Set default values when launched_resources is None
+            record['labels'] = {}
             continue
         record['cloud'] = (f'{handle.launched_resources.cloud}'
                            if handle.launched_resources.cloud else None)
@@ -3339,6 +3420,8 @@ def get_clusters(
         record['accelerators'] = (
             f'{handle.launched_resources.accelerators}'
             if handle.launched_resources.accelerators else None)
+        record['labels'] = (handle.launched_resources.labels
+                            if handle.launched_resources.labels else {})
         if not include_handle:
             record.pop('handle', None)
 