skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/core.py
CHANGED
@@ -1,25 +1,38 @@
 """SDK functions for cluster/job management."""
-import
+import os
+import shlex
 import typing
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import colorama
 
 from sky import backends
+from sky import check as sky_check
 from sky import clouds
 from sky import dag
 from sky import data
 from sky import exceptions
 from sky import global_user_state
+from sky import models
 from sky import sky_logging
-from sky import status_lib
 from sky import task
 from sky.backends import backend_utils
+from sky.clouds import service_catalog
+from sky.jobs.server import core as managed_jobs_core
+from sky.provision.kubernetes import constants as kubernetes_constants
+from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.skylet import constants
 from sky.skylet import job_lib
+from sky.skylet import log_lib
 from sky.usage import usage_lib
+from sky.utils import common
+from sky.utils import common_utils
 from sky.utils import controller_utils
+from sky.utils import rich_utils
+from sky.utils import status_lib
 from sky.utils import subprocess_utils
+from sky.utils import ux_utils
+from sky.utils.kubernetes import kubernetes_deploy_utils
 
 if typing.TYPE_CHECKING:
     from sky import resources as resources_lib
@@ -30,14 +43,15 @@ logger = sky_logging.init_logger(__name__)
 # = Cluster Management =
 # ======================
 
-# pylint: disable=redefined-builtin
-
 
 @usage_lib.entrypoint
-def status(cluster_names: Optional[Union[str, List[str]]] = None,
-
+def status(
+    cluster_names: Optional[Union[str, List[str]]] = None,
+    refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
+    all_users: bool = False,
+) -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Gets cluster statuses.
 
     If cluster_names is given, return those clusters. Otherwise, return all
     clusters.
@@ -56,6 +70,10 @@ def status(cluster_names: Optional[Union[str, List[str]]] = None,
             'autostop': (int) idle time before autostop,
             'to_down': (bool) whether autodown is used instead of autostop,
             'metadata': (dict) metadata of the cluster,
+            'user_hash': (str) user hash of the cluster owner,
+            'user_name': (str) user name of the cluster owner,
+            'resources_str': (str) the resource string representation of the
+                cluster,
         }
 
     Each cluster can have one of the following statuses:
@@ -104,9 +122,91 @@ def status(cluster_names: Optional[Union[str, List[str]]] = None,
         cluster. If a cluster is found to be terminated or not found, it will
         be omitted from the returned list.
     """
-
-
-
+    clusters = backend_utils.get_clusters(refresh=refresh,
+                                          cluster_names=cluster_names,
+                                          all_users=all_users)
+    return clusters
+
+
+def status_kubernetes(
+) -> Tuple[List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
+           List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
+           List[Dict[str, Any]], Optional[str]]:
+    """Gets all SkyPilot clusters and jobs in the Kubernetes cluster.
+
+    Managed jobs and services are also included in the clusters returned.
+    The caller must parse the controllers to identify which clusters are run
+    as managed jobs or services.
+    all_clusters, unmanaged_clusters, all_jobs, context
+    Returns:
+        A tuple containing:
+        - all_clusters: List of KubernetesSkyPilotClusterInfoPayload with info
+            for all clusters, including managed jobs, services and controllers.
+        - unmanaged_clusters: List of KubernetesSkyPilotClusterInfoPayload with
+            info for all clusters excluding managed jobs and services.
+            Controllers are included.
+        - all_jobs: List of managed jobs from all controllers. Each entry is a
+            dictionary job info, see jobs.queue_from_kubernetes_pod for details.
+        - context: Kubernetes context used to fetch the cluster information.
+    """
+    context = kubernetes_utils.get_current_kube_config_context_name()
+    try:
+        pods = kubernetes_utils.get_skypilot_pods(context)
+    except exceptions.ResourcesUnavailableError as e:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Failed to get SkyPilot pods from '
+                             f'Kubernetes: {str(e)}') from e
+    all_clusters, jobs_controllers, _ = (kubernetes_utils.process_skypilot_pods(
+        pods, context))
+    all_jobs = []
+    with rich_utils.safe_status(
+            ux_utils.spinner_message(
+                '[bold cyan]Checking in-progress managed jobs[/]')) as spinner:
+        for i, job_controller_info in enumerate(jobs_controllers):
+            user = job_controller_info.user
+            pod = job_controller_info.pods[0]
+            status_message = '[bold cyan]Checking managed jobs controller'
+            if len(jobs_controllers) > 1:
+                status_message += f's ({i + 1}/{len(jobs_controllers)})'
+            spinner.update(f'{status_message}[/]')
+            try:
+                job_list = managed_jobs_core.queue_from_kubernetes_pod(
+                    pod.metadata.name)
+            except RuntimeError as e:
+                logger.warning('Failed to get managed jobs from controller '
+                               f'{pod.metadata.name}: {str(e)}')
+                job_list = []
+            # Add user field to jobs
+            for job in job_list:
+                job['user'] = user
+            all_jobs.extend(job_list)
+    # Reconcile cluster state between managed jobs and clusters:
+    # To maintain a clear separation between regular SkyPilot clusters
+    # and those from managed jobs, we need to exclude the latter from
+    # the main cluster list.
+    # We do this by reconstructing managed job cluster names from each
+    # job's name and ID. We then use this set to filter out managed
+    # clusters from the main cluster list. This is necessary because there
+    # are no identifiers distinguishing clusters from managed jobs from
+    # regular clusters.
+    managed_job_cluster_names = set()
+    for job in all_jobs:
+        # Managed job cluster name is <job_name>-<job_id>
+        managed_cluster_name = f'{job["job_name"]}-{job["job_id"]}'
+        managed_job_cluster_names.add(managed_cluster_name)
+    unmanaged_clusters = [
+        c for c in all_clusters
+        if c.cluster_name not in managed_job_cluster_names
+    ]
+    all_clusters = [
+        kubernetes_utils.KubernetesSkyPilotClusterInfoPayload.from_cluster(c)
+        for c in all_clusters
+    ]
+    unmanaged_clusters = [
+        kubernetes_utils.KubernetesSkyPilotClusterInfoPayload.from_cluster(c)
+        for c in unmanaged_clusters
+    ]
+    return all_clusters, unmanaged_clusters, all_jobs, context
 
 
 def endpoints(cluster: str,
@@ -126,7 +226,10 @@ def endpoints(cluster: str,
         RuntimeError: if the cluster has no ports to be exposed or no endpoints
             are exposed yet.
     """
-
+    with rich_utils.safe_status(
+            ux_utils.spinner_message(
+                f'Fetching endpoints for cluster {cluster}')):
+        return backend_utils.get_endpoints(cluster=cluster, port=port)
 
 
 @usage_lib.entrypoint
@@ -173,6 +276,9 @@ def cost_report() -> List[Dict[str, Any]]:
 
     for cluster_report in cluster_reports:
         cluster_report['total_cost'] = get_total_cost(cluster_report)
+        cluster_report['cloud'] = str(cluster_report['resources'].cloud)
+        cluster_report['accelerators'] = cluster_report[
+            'resources'].accelerators
 
     return cluster_reports
 
@@ -188,7 +294,8 @@ def _start(
     cluster_status, handle = backend_utils.refresh_cluster_status_handle(
         cluster_name)
     if handle is None:
-        raise
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')
     if not force and cluster_status == status_lib.ClusterStatus.UP:
         sky_logging.print(f'Cluster {cluster_name!r} is already up.')
         return handle
@@ -266,8 +373,7 @@ def start(
             many minute of idleness, i.e., no running or pending jobs in the
             cluster's job queue. Idleness gets reset whenever setting-up/
             running/pending jobs are found in the job queue. Setting this
-            flag is equivalent to running
-            ``sky.launch(..., detach_run=True, ...)`` and then
+            flag is equivalent to running ``sky.launch()`` and then
             ``sky.autostop(idle_minutes=<minutes>)``. If not set, the
             cluster will not be autostopped.
         retry_until_up: whether to retry launching the cluster until it is
@@ -279,12 +385,13 @@ def start(
             Useful for upgrading SkyPilot runtime.
 
     Raises:
-        ValueError: argument values are invalid: (1)
-
-
-
-
-
+        ValueError: argument values are invalid: (1) if ``down`` is set to True
+            but ``idle_minutes_to_autostop`` is None; (2) if the specified
+            cluster is the managed jobs controller, and either
+            ``idle_minutes_to_autostop`` is not None or ``down`` is True (omit
+            them to use the default autostop settings).
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
         sky.exceptions.NotSupportedError: if the cluster to restart was
             launched using a non-default backend that does not support this
             operation.
@@ -310,10 +417,45 @@ def _stop_not_supported_message(resources: 'resources_lib.Resources') -> str:
     return message
 
 
+@usage_lib.entrypoint
+def down(cluster_name: str, purge: bool = False) -> None:
+    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
+    """Tears down a cluster.
+
+    Tearing down a cluster will delete all associated resources (all billing
+    stops), and any data on the attached disks will be lost. Accelerators
+    (e.g., TPUs) that are part of the cluster will be deleted too.
+
+    Args:
+        cluster_name: name of the cluster to down.
+        purge: (Advanced) Forcefully remove the cluster from SkyPilot's cluster
+            table, even if the actual cluster termination failed on the cloud.
+            WARNING: This flag should only be set sparingly in certain manual
+            troubleshooting scenarios; with it set, it is the user's
+            responsibility to ensure there are no leaked instances and related
+            resources.
+
+    Raises:
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
+        RuntimeError: failed to tear down the cluster.
+        sky.exceptions.NotSupportedError: the specified cluster is the managed
+            jobs controller.
+    """
+    handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+    if handle is None:
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')
+
+    usage_lib.record_cluster_name_for_current_operation(cluster_name)
+    backend = backend_utils.get_backend_from_handle(handle)
+    backend.teardown(handle, terminate=True, purge=purge)
+
+
 @usage_lib.entrypoint
 def stop(cluster_name: str, purge: bool = False) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Stops a cluster.
 
     Data on attached disks is not lost when a cluster is stopped. Billing for
     the instances will stop, while the disks will still be charged. Those
@@ -332,7 +474,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
             related resources.
 
     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
         RuntimeError: failed to stop the cluster.
         sky.exceptions.NotSupportedError: if the specified cluster is a spot
             cluster, or a TPU VM Pod cluster, or the managed jobs controller.
@@ -343,7 +486,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
                 f'is not supported.')
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
     if handle is None:
-        raise
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')
 
     backend = backend_utils.get_backend_from_handle(handle)
 
@@ -368,39 +512,6 @@ def stop(cluster_name: str, purge: bool = False) -> None:
     backend.teardown(handle, terminate=False, purge=purge)
 
 
-@usage_lib.entrypoint
-def down(cluster_name: str, purge: bool = False) -> None:
-    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Tear down a cluster.
-
-    Tearing down a cluster will delete all associated resources (all billing
-    stops), and any data on the attached disks will be lost. Accelerators
-    (e.g., TPUs) that are part of the cluster will be deleted too.
-
-    Args:
-        cluster_name: name of the cluster to down.
-        purge: (Advanced) Forcefully remove the cluster from SkyPilot's cluster
-            table, even if the actual cluster termination failed on the cloud.
-            WARNING: This flag should only be set sparingly in certain manual
-            troubleshooting scenarios; with it set, it is the user's
-            responsibility to ensure there are no leaked instances and related
-            resources.
-
-    Raises:
-        ValueError: the specified cluster does not exist.
-        RuntimeError: failed to tear down the cluster.
-        sky.exceptions.NotSupportedError: the specified cluster is the managed
-            jobs controller.
-    """
-    handle = global_user_state.get_handle_from_cluster_name(cluster_name)
-    if handle is None:
-        raise ValueError(f'Cluster {cluster_name!r} does not exist.')
-
-    usage_lib.record_cluster_name_for_current_operation(cluster_name)
-    backend = backend_utils.get_backend_from_handle(handle)
-    backend.teardown(handle, terminate=True, purge=purge)
-
-
 @usage_lib.entrypoint
 def autostop(
         cluster_name: str,
@@ -408,7 +519,7 @@ def autostop(
         down: bool = False,  # pylint: disable=redefined-outer-name
 ) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Schedules an autostop/autodown for a cluster.
 
     Autostop/autodown will automatically stop or teardown a cluster when it
     becomes idle for a specified duration. Idleness means there are no
@@ -441,7 +552,7 @@ def autostop(
             rather than autostop (restartable).
 
     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend or the cluster is TPU VM Pod.
@@ -515,7 +626,7 @@ def queue(cluster_name: str,
           skip_finished: bool = False,
           all_users: bool = False) -> List[dict]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Gets the job queue of a cluster.
 
     Please refer to the sky.cli.queue for the document.
 
@@ -526,6 +637,7 @@ def queue(cluster_name: str,
                 'job_id': (int) job id,
                 'job_name': (str) job name,
                 'username': (str) username,
+                'user_hash': (str) user hash,
                 'submitted_at': (int) timestamp of submitted,
                 'start_at': (int) timestamp of started,
                 'end_at': (int) timestamp of ended,
@@ -535,7 +647,7 @@ def queue(cluster_name: str,
             }
         ]
     raises:
-
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -546,10 +658,10 @@ def queue(cluster_name: str,
         exceptions.CommandError: if failed to get the job queue with ssh.
     """
    all_jobs = not skip_finished
-
+    user_hash: Optional[str] = common_utils.get_user_hash()
     if all_users:
-
-    code = job_lib.JobLibCodeGen.get_job_queue(
+        user_hash = None
+    code = job_lib.JobLibCodeGen.get_job_queue(user_hash, all_jobs)
 
     handle = backend_utils.check_cluster_available(
         cluster_name,
@@ -576,25 +688,29 @@ def queue(cluster_name: str,
 def cancel(
     cluster_name: str,
     all: bool = False,
+    all_users: bool = False,
     job_ids: Optional[List[int]] = None,
     # pylint: disable=invalid-name
+    # Internal only:
     _try_cancel_if_cluster_is_init: bool = False,
 ) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Cancels jobs on a cluster.
 
     Please refer to the sky.cli.cancel for the document.
 
-    When `all`
+    When none of `job_ids`, `all` and `all_users` is set, cancel the latest
+    running job.
 
     Additional arguments:
-
+        try_cancel_if_cluster_is_init: (bool) whether to try cancelling the job
             even if the cluster is not UP, but the head node is still alive.
             This is used by the jobs controller to cancel the job when the
             worker node is preempted in the spot cluster.
 
     Raises:
-        ValueError: if arguments are invalid
+        ValueError: if arguments are invalid.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the specified cluster is a
             controller that does not support this operation.
@@ -606,9 +722,9 @@ def cancel(
     controller_utils.check_cluster_name_not_controller(
         cluster_name, operation_str='Cancelling jobs')
 
-    if all and job_ids:
-        raise
-
+    if all and job_ids is not None:
+        raise exceptions.NotSupportedError(
+            'Cannot specify both --all and job IDs.')
 
     # Check the status of the cluster.
     handle = None
@@ -635,42 +751,47 @@ def cancel(
 
     backend = backend_utils.get_backend_from_handle(handle)
 
-    if
-        sky_logging.print(f'{colorama.Fore.YELLOW}'
-                          f'Cancelling all jobs on cluster {cluster_name!r}...'
-                          f'{colorama.Style.RESET_ALL}')
-    elif job_ids is None:
-        # all = False, job_ids is None => cancel the latest running job.
+    if all_users:
         sky_logging.print(
             f'{colorama.Fore.YELLOW}'
-            f'Cancelling
+            f'Cancelling all users\' jobs on cluster {cluster_name!r}...'
             f'{colorama.Style.RESET_ALL}')
-    elif
-
+    elif all:
+        sky_logging.print(
+            f'{colorama.Fore.YELLOW}'
+            f'Cancelling all your jobs on cluster {cluster_name!r}...'
+            f'{colorama.Style.RESET_ALL}')
+    elif job_ids is not None:
         jobs_str = ', '.join(map(str, job_ids))
         sky_logging.print(
             f'{colorama.Fore.YELLOW}'
            f'Cancelling jobs ({jobs_str}) on cluster {cluster_name!r}...'
             f'{colorama.Style.RESET_ALL}')
     else:
-
-
+        sky_logging.print(
+            f'{colorama.Fore.YELLOW}'
+            f'Cancelling latest running job on cluster {cluster_name!r}...'
+            f'{colorama.Style.RESET_ALL}')
 
-    backend.cancel_jobs(handle,
+    backend.cancel_jobs(handle,
+                        job_ids,
+                        cancel_all=all or all_users,
+                        user_hash=common_utils.get_user_hash())
 
 
 @usage_lib.entrypoint
 def tail_logs(cluster_name: str,
               job_id: Optional[int],
-              follow: bool = True
+              follow: bool = True,
+              tail: int = 0) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Tails the logs of a job.
 
     Please refer to the sky.cli.tail_logs for the document.
 
     Raises:
-        ValueError: arguments are invalid or the cluster is not supported
-
+        ValueError: if arguments are invalid or the cluster is not supported.
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -686,16 +807,8 @@ def tail_logs(cluster_name: str,
     )
     backend = backend_utils.get_backend_from_handle(handle)
 
-    job_str = f'job {job_id}'
-    if job_id is None:
-        job_str = 'the last job'
-    sky_logging.print(
-        f'{colorama.Fore.YELLOW}'
-        f'Tailing logs of {job_str} on cluster {cluster_name!r}...'
-        f'{colorama.Style.RESET_ALL}')
-
     usage_lib.record_cluster_name_for_current_operation(cluster_name)
-    backend.tail_logs(handle, job_id, follow=follow)
+    backend.tail_logs(handle, job_id, follow=follow, tail=tail)
 
 
 @usage_lib.entrypoint
@@ -704,7 +817,7 @@ def download_logs(
         job_ids: Optional[List[str]],
         local_dir: str = constants.SKY_LOGS_DIRECTORY) -> Dict[str, str]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Downloads the logs of jobs.
 
     Args:
         cluster_name: (str) name of the cluster.
@@ -712,7 +825,7 @@ def download_logs(
     Returns:
         Dict[str, str]: a mapping of job_id to local log path.
     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -729,7 +842,7 @@ def download_logs(
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend), backend
 
-    if job_ids is not None and
+    if job_ids is not None and not job_ids:
         return {}
 
     usage_lib.record_cluster_name_for_current_operation(cluster_name)
@@ -757,7 +870,7 @@ def job_status(cluster_name: str,
     If job_ids is None and there is no job on the cluster, it will return
     {None: None}.
     Raises:
-
+        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
         sky.exceptions.ClusterNotUpError: if the cluster is not UP.
         sky.exceptions.NotSupportedError: if the cluster is not based on
             CloudVmRayBackend.
@@ -778,7 +891,7 @@ def job_status(cluster_name: str,
                 f'of type {backend.__class__.__name__!r}.')
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
 
-    if job_ids is not None and
+    if job_ids is not None and not job_ids:
         return {}
 
     sky_logging.print(f'{colorama.Fore.YELLOW}'
@@ -796,7 +909,7 @@ def job_status(cluster_name: str,
 @usage_lib.entrypoint
 def storage_ls() -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Gets the storages.
 
     Returns:
         [
@@ -818,7 +931,7 @@ def storage_ls() -> List[Dict[str, Any]]:
 @usage_lib.entrypoint
 def storage_delete(name: str) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Deletes a storage.
 
     Raises:
         ValueError: If the storage does not exist.
@@ -828,7 +941,148 @@ def storage_delete(name: str) -> None:
     if handle is None:
         raise ValueError(f'Storage name {name!r} not found.')
     else:
-
-
-
-
+        storage_object = data.Storage(name=handle.storage_name,
+                                      source=handle.source,
+                                      sync_on_reconstruction=False)
+        storage_object.delete()
+
+
+# ===================
+# = Catalog Observe =
+# ===================
+@usage_lib.entrypoint
+def enabled_clouds() -> List[clouds.Cloud]:
+    return global_user_state.get_cached_enabled_clouds()
+
+
+@usage_lib.entrypoint
+def realtime_kubernetes_gpu_availability(
+        context: Optional[str] = None,
+        name_filter: Optional[str] = None,
+        quantity_filter: Optional[int] = None
+) -> List[models.RealtimeGpuAvailability]:
+
+    counts, capacity, available = service_catalog.list_accelerator_realtime(
+        gpus_only=True,
+        clouds='kubernetes',
+        name_filter=name_filter,
+        region_filter=context,
+        quantity_filter=quantity_filter,
+        case_sensitive=False)
+    assert (set(counts.keys()) == set(capacity.keys()) == set(
+        available.keys())), (f'Keys of counts ({list(counts.keys())}), '
+                             f'capacity ({list(capacity.keys())}), '
+                             f'and available ({list(available.keys())}) '
+                             'must be same.')
+    if len(counts) == 0:
+        err_msg = 'No GPUs found in Kubernetes cluster. '
+        debug_msg = 'To further debug, run: sky check '
+        if name_filter is not None:
+            gpu_info_msg = f' {name_filter!r}'
+            if quantity_filter is not None:
+                gpu_info_msg += (' with requested quantity'
+                                 f' {quantity_filter}')
+            err_msg = (f'Resources{gpu_info_msg} not found '
+                       'in Kubernetes cluster. ')
+            debug_msg = ('To show available accelerators on kubernetes,'
+                         ' run: sky show-gpus --cloud kubernetes ')
+        full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
+                        debug_msg)
+        raise ValueError(full_err_msg)
+
+    realtime_gpu_availability_list: List[models.RealtimeGpuAvailability] = []
+
+    for gpu, _ in sorted(counts.items()):
+        realtime_gpu_availability_list.append(
+            models.RealtimeGpuAvailability(
+                gpu,
+                counts.pop(gpu),
+                capacity[gpu],
+                available[gpu],
+            ))
+    return realtime_gpu_availability_list
+
+
+# =================
+# = Local Cluster =
+# =================
+@usage_lib.entrypoint
+def local_up(gpus: bool, ips: Optional[List[str]], ssh_user: Optional[str],
+             ssh_key: Optional[str], cleanup: bool) -> None:
+    """Creates a local or remote cluster."""
+
+    def _validate_args(ips, ssh_user, ssh_key, cleanup):
+        # If any of --ips, --ssh-user, or --ssh-key-path is specified,
+        # all must be specified
+        if bool(ips) or bool(ssh_user) or bool(ssh_key):
+            if not (ips and ssh_user and ssh_key):
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        'All ips, ssh_user, and ssh_key must be specified '
+                        'together.')
+
+        # --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
+        # are all provided
+        if cleanup and not (ips and ssh_user and ssh_key):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'cleanup can only be used with ips, ssh_user and ssh_key.')
+
+    _validate_args(ips, ssh_user, ssh_key, cleanup)
+
+    # If remote deployment arguments are specified, run remote up script
+    if ips:
+        assert ssh_user is not None and ssh_key is not None
+        kubernetes_deploy_utils.deploy_remote_cluster(ips, ssh_user, ssh_key,
+                                                      cleanup)
+    else:
+        # Run local deployment (kind) if no remote args are specified
+        kubernetes_deploy_utils.deploy_local_cluster(gpus)
+
+
+def local_down() -> None:
+    """Tears down the Kubernetes cluster started by local_up."""
+    cluster_removed = False
+
+    path_to_package = os.path.dirname(__file__)
+    down_script_path = os.path.join(path_to_package, 'utils/kubernetes',
+                                    'delete_cluster.sh')
+
+    cwd = os.path.dirname(os.path.abspath(down_script_path))
+    run_command = shlex.split(down_script_path)
+
+    # Setup logging paths
+    run_timestamp = sky_logging.get_run_timestamp()
+    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
+                            'local_down.log')
+
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Removing local cluster',
+                                     log_path=log_path,
+                                     is_local=True)):
+
+        returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
+                                                          log_path=log_path,
+                                                          require_outputs=True,
+                                                          stream_logs=False,
+                                                          cwd=cwd)
+        stderr = stderr.replace('No kind clusters found.\n', '')
+
+        if returncode == 0:
+            cluster_removed = True
+        elif returncode == 100:
+            logger.info(ux_utils.error_message('Local cluster does not exist.'))
+        else:
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError('Failed to create local cluster. '
+                                   f'Stdout: {stdout}'
+                                   f'\nError: {stderr}')
+    if cluster_removed:
+        # Run sky check
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Running sky check...')):
+            sky_check.check(clouds=['kubernetes'], quiet=True)
+        logger.info(
+            ux_utils.finishing_message('Local cluster removed.',
+                                       log_path=log_path,
+                                       is_local=True))