skypilot-nightly 1.0.0.dev20250215__py3-none-any.whl → 1.0.0.dev20250217__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +48 -22
- sky/adaptors/aws.py +2 -1
- sky/adaptors/azure.py +4 -4
- sky/adaptors/cloudflare.py +4 -4
- sky/adaptors/kubernetes.py +8 -8
- sky/authentication.py +42 -45
- sky/backends/backend.py +2 -2
- sky/backends/backend_utils.py +108 -221
- sky/backends/cloud_vm_ray_backend.py +283 -282
- sky/benchmark/benchmark_utils.py +6 -2
- sky/check.py +40 -28
- sky/cli.py +1213 -1116
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5644 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1757 -0
- sky/cloud_stores.py +12 -6
- sky/clouds/__init__.py +0 -2
- sky/clouds/aws.py +20 -13
- sky/clouds/azure.py +5 -3
- sky/clouds/cloud.py +1 -1
- sky/clouds/cudo.py +2 -1
- sky/clouds/do.py +2 -1
- sky/clouds/fluidstack.py +3 -2
- sky/clouds/gcp.py +10 -8
- sky/clouds/ibm.py +8 -7
- sky/clouds/kubernetes.py +7 -6
- sky/clouds/lambda_cloud.py +8 -7
- sky/clouds/oci.py +4 -3
- sky/clouds/paperspace.py +2 -1
- sky/clouds/runpod.py +2 -1
- sky/clouds/scp.py +8 -7
- sky/clouds/service_catalog/__init__.py +3 -3
- sky/clouds/service_catalog/aws_catalog.py +7 -1
- sky/clouds/service_catalog/common.py +4 -2
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +2 -2
- sky/clouds/utils/oci_utils.py +1 -1
- sky/clouds/vast.py +2 -1
- sky/clouds/vsphere.py +2 -1
- sky/core.py +263 -99
- sky/dag.py +4 -0
- sky/data/mounting_utils.py +2 -1
- sky/data/storage.py +97 -35
- sky/data/storage_utils.py +69 -9
- sky/exceptions.py +138 -5
- sky/execution.py +47 -50
- sky/global_user_state.py +105 -22
- sky/jobs/__init__.py +12 -14
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +296 -0
- sky/jobs/constants.py +30 -1
- sky/jobs/controller.py +12 -6
- sky/jobs/dashboard/dashboard.py +2 -6
- sky/jobs/recovery_strategy.py +22 -29
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/{core.py → server/core.py} +101 -34
- sky/jobs/server/dashboard_utils.py +64 -0
- sky/jobs/server/server.py +182 -0
- sky/jobs/utils.py +32 -23
- sky/models.py +27 -0
- sky/optimizer.py +9 -11
- sky/provision/__init__.py +6 -3
- sky/provision/aws/config.py +2 -2
- sky/provision/aws/instance.py +1 -1
- sky/provision/azure/instance.py +1 -1
- sky/provision/cudo/instance.py +1 -1
- sky/provision/do/instance.py +1 -1
- sky/provision/do/utils.py +0 -5
- sky/provision/fluidstack/fluidstack_utils.py +4 -3
- sky/provision/fluidstack/instance.py +4 -2
- sky/provision/gcp/instance.py +1 -1
- sky/provision/instance_setup.py +2 -2
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +1 -1
- sky/provision/kubernetes/utils.py +67 -76
- sky/provision/lambda_cloud/instance.py +3 -15
- sky/provision/logging.py +1 -1
- sky/provision/oci/instance.py +7 -4
- sky/provision/paperspace/instance.py +1 -1
- sky/provision/provisioner.py +3 -2
- sky/provision/runpod/instance.py +1 -1
- sky/provision/vast/instance.py +1 -1
- sky/provision/vast/utils.py +2 -1
- sky/provision/vsphere/instance.py +2 -11
- sky/resources.py +55 -40
- sky/serve/__init__.py +6 -10
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +3 -0
- sky/serve/replica_managers.py +10 -10
- sky/serve/serve_utils.py +56 -36
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +37 -17
- sky/serve/server/server.py +117 -0
- sky/serve/service.py +8 -1
- sky/server/__init__.py +1 -0
- sky/server/common.py +441 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +462 -0
- sky/server/requests/payloads.py +481 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1095 -0
- sky/server/stream_utils.py +144 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +12 -4
- sky/setup_files/setup.py +1 -1
- sky/sky_logging.py +9 -13
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +46 -12
- sky/skylet/events.py +5 -6
- sky/skylet/job_lib.py +78 -66
- sky/skylet/log_lib.py +17 -11
- sky/skypilot_config.py +79 -94
- sky/task.py +119 -73
- sky/templates/aws-ray.yml.j2 +4 -4
- sky/templates/azure-ray.yml.j2 +3 -2
- sky/templates/cudo-ray.yml.j2 +3 -2
- sky/templates/fluidstack-ray.yml.j2 +3 -2
- sky/templates/gcp-ray.yml.j2 +3 -2
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +1 -12
- sky/templates/kubernetes-ray.yml.j2 +3 -2
- sky/templates/lambda-ray.yml.j2 +3 -2
- sky/templates/oci-ray.yml.j2 +3 -2
- sky/templates/paperspace-ray.yml.j2 +3 -2
- sky/templates/runpod-ray.yml.j2 +3 -2
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vsphere-ray.yml.j2 +4 -2
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +8 -0
- sky/usage/usage_lib.py +45 -11
- sky/utils/accelerator_registry.py +33 -53
- sky/utils/admin_policy_utils.py +2 -1
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +33 -3
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +69 -14
- sky/utils/common.py +74 -0
- sky/utils/common_utils.py +133 -93
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +2 -3
- sky/utils/controller_utils.py +133 -147
- sky/utils/dag_utils.py +72 -24
- sky/utils/kubernetes/deploy_remote_cluster.sh +2 -2
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/log_utils.py +83 -23
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +2 -2
- sky/utils/rich_utils.py +213 -34
- sky/utils/schemas.py +19 -2
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +51 -35
- sky/utils/timeline.py +7 -2
- sky/utils/ux_utils.py +95 -25
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/METADATA +8 -3
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/RECORD +170 -132
- sky/clouds/cloud_registry.py +0 -76
- sky/utils/cluster_yaml_utils.py +0 -24
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/top_level.txt +0 -0
sky/clouds/service_catalog/common.py
CHANGED
@@ -13,9 +13,9 @@ import requests
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.clouds import cloud as cloud_lib
-from sky.clouds import cloud_registry
 from sky.clouds.service_catalog import constants
 from sky.utils import common_utils
+from sky.utils import registry
 from sky.utils import rich_utils
 from sky.utils import ux_utils
 
@@ -171,7 +171,9 @@ def read_catalog(filename: str,
     assert (pull_frequency_hours is None or
             pull_frequency_hours >= 0), pull_frequency_hours
     catalog_path = get_catalog_path(filename)
-    cloud =
+    cloud = os.path.dirname(filename)
+    if cloud != 'common':
+        cloud = str(registry.CLOUD_REGISTRY.from_str(cloud))
 
     meta_path = os.path.join(_ABSOLUTE_VERSIONED_CATALOG_DIR, '.meta', filename)
     os.makedirs(os.path.dirname(meta_path), exist_ok=True)
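The rewritten lookup derives the cloud from the catalog path's directory component and canonicalizes it through the new registry; the 'common' directory (cloud-agnostic catalog files) passes through unchanged. A standalone sketch of that derivation, with a stub dict standing in for registry.CLOUD_REGISTRY (the stub and its contents are illustrative, not the package's code):

    import os

    # Illustrative stand-in for sky.utils.registry.CLOUD_REGISTRY.from_str().
    _CANONICAL_CLOUDS = {'aws': 'AWS', 'gcp': 'GCP', 'azure': 'Azure'}

    def cloud_from_catalog_filename(filename: str) -> str:
        """Map a catalog path like 'aws/vms.csv' to its cloud name."""
        cloud = os.path.dirname(filename)
        if cloud != 'common':
            cloud = _CANONICAL_CLOUDS[cloud.lower()]
        return cloud

    assert cloud_from_catalog_filename('aws/vms.csv') == 'AWS'
    assert cloud_from_catalog_filename('common/images.csv') == 'common'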
sky/clouds/service_catalog/data_fetchers/fetch_gcp.py
CHANGED
@@ -5,7 +5,6 @@ VMs, GPUs, and TPUs. The script takes about 1-2 minutes to run.
 """
 
 import argparse
-import functools
 import io
 import multiprocessing
 import os
@@ -20,6 +19,7 @@ import numpy as np
 
 from sky.adaptors import common as adaptors_common
 from sky.adaptors import gcp
+from sky.utils import annotations
 from sky.utils import common_utils
 
 if typing.TYPE_CHECKING:
@@ -281,7 +281,7 @@ def filter_zones(func: Callable[[], List[str]]) -> Callable[[], List[str]]:
 
 
 @filter_zones
-@
+@annotations.lru_cache(scope='global', maxsize=None)
 def _get_all_zones() -> List[str]:
     zones_request = gcp_client.zones().list(project=project_id)
     zones = []
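functools.lru_cache is swapped here for annotations.lru_cache, which adds a scope argument (the new sky/utils/annotations.py appears in the file list above). A plausible sketch of such a scope-aware decorator, assuming scope='global' simply means a process-lifetime cache; the actual implementation may additionally track per-request caches for the new API server:

    import functools

    def lru_cache(scope: str, maxsize: int = 128):
        """Hypothetical sketch: an lru_cache that records its scope."""
        assert scope in ('global', 'request'), scope

        def decorator(func):
            # 'global' behaves like functools.lru_cache; a real 'request'
            # scope would need per-request invalidation hooks.
            return functools.lru_cache(maxsize=maxsize)(func)

        return decorator

    @lru_cache(scope='global', maxsize=None)
    def expensive_zone_lookup(project_id: str) -> list:
        return [f'{project_id}-zone-a']  # stand-in for a slow API call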
sky/clouds/utils/oci_utils.py
CHANGED
sky/clouds/vast.py
CHANGED
@@ -5,13 +5,14 @@ from typing import Dict, Iterator, List, Optional, Tuple, Union
 
 from sky import clouds
 from sky.clouds import service_catalog
+from sky.utils import registry
 from sky.utils import resources_utils
 
 if typing.TYPE_CHECKING:
     from sky import resources as resources_lib
 
 
-@
+@registry.CLOUD_REGISTRY.register
 class Vast(clouds.Cloud):
     """ Vast GPU Cloud
 
sky/clouds/vsphere.py
CHANGED
@@ -11,6 +11,7 @@ from sky.provision.vsphere import vsphere_utils
 from sky.provision.vsphere.vsphere_utils import get_vsphere_credentials
 from sky.provision.vsphere.vsphere_utils import initialize_vsphere_data
 from sky.utils import common_utils
+from sky.utils import registry
 from sky.utils import resources_utils
 
 if typing.TYPE_CHECKING:
@@ -24,7 +25,7 @@ _CREDENTIAL_FILES = [
 ]
 
 
-@
+@registry.CLOUD_REGISTRY.register
 class Vsphere(clouds.Cloud):
     """Vsphere cloud"""
 
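The same two-line change (import registry, decorate with @registry.CLOUD_REGISTRY.register) is applied across the cloud modules, replacing the deleted sky/clouds/cloud_registry.py with the new sky/utils/registry.py. A minimal sketch of the decorator-registry pattern involved (illustrative, not the package's exact code):

    from typing import Dict, Optional, Type

    class Registry:
        """Maps lower-cased class names to classes via a class decorator."""

        def __init__(self) -> None:
            self._registry: Dict[str, Type] = {}

        def register(self, cls: Type) -> Type:
            self._registry[cls.__name__.lower()] = cls
            return cls  # decorator returns the class unchanged

        def from_str(self, name: Optional[str]) -> Optional[Type]:
            return self._registry.get(name.lower()) if name else None

    CLOUD_REGISTRY = Registry()

    @CLOUD_REGISTRY.register
    class Vsphere:  # stand-in for a clouds.Cloud subclass
        pass

    assert CLOUD_REGISTRY.from_str('vsphere') is Vsphere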
sky/core.py
CHANGED
@@ -1,29 +1,38 @@
 """SDK functions for cluster/job management."""
-import
+import os
+import shlex
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
 
 import colorama
 
 from sky import backends
+from sky import check as sky_check
 from sky import clouds
 from sky import dag
 from sky import data
 from sky import exceptions
 from sky import global_user_state
-from sky import
+from sky import models
 from sky import sky_logging
-from sky import status_lib
 from sky import task
 from sky.backends import backend_utils
+from sky.clouds import service_catalog
+from sky.jobs.server import core as managed_jobs_core
+from sky.provision.kubernetes import constants as kubernetes_constants
 from sky.provision.kubernetes import utils as kubernetes_utils
 from sky.skylet import constants
 from sky.skylet import job_lib
+from sky.skylet import log_lib
 from sky.usage import usage_lib
+from sky.utils import common
+from sky.utils import common_utils
 from sky.utils import controller_utils
 from sky.utils import rich_utils
+from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
+from sky.utils.kubernetes import kubernetes_deploy_utils
 
 if typing.TYPE_CHECKING:
     from sky import resources as resources_lib
@@ -34,14 +43,15 @@ logger = sky_logging.init_logger(__name__)
 # = Cluster Management =
 # ======================
 
-# pylint: disable=redefined-builtin
-
 
 @usage_lib.entrypoint
-def status(
-
+def status(
+    cluster_names: Optional[Union[str, List[str]]] = None,
+    refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
+    all_users: bool = False,
+) -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Gets cluster statuses.
 
     If cluster_names is given, return those clusters. Otherwise, return all
     clusters.
@@ -60,6 +70,10 @@ def status(cluster_names: Optional[Union[str, List[str]]] = None,
             'autostop': (int) idle time before autostop,
             'to_down': (bool) whether autodown is used instead of autostop,
             'metadata': (dict) metadata of the cluster,
+            'user_hash': (str) user hash of the cluster owner,
+            'user_name': (str) user name of the cluster owner,
+            'resources_str': (str) the resource string representation of the
+              cluster,
         }
 
     Each cluster can have one of the following statuses:
@@ -108,16 +122,17 @@ def status(cluster_names: Optional[Union[str, List[str]]] = None,
     cluster. If a cluster is found to be terminated or not found, it will
     be omitted from the returned list.
     """
-
-
-
+    clusters = backend_utils.get_clusters(refresh=refresh,
+                                          cluster_names=cluster_names,
+                                          all_users=all_users)
+    return clusters
 
 
 def status_kubernetes(
-) -> Tuple[List['kubernetes_utils.
-    List['kubernetes_utils.
-
-    """
+) -> Tuple[List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
+           List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
+           List[Dict[str, Any]], Optional[str]]:
+    """Gets all SkyPilot clusters and jobs in the Kubernetes cluster.
 
     Managed jobs and services are also included in the clusters returned.
     The caller must parse the controllers to identify which clusters are run
@@ -125,11 +140,11 @@ def status_kubernetes(
     all_clusters, unmanaged_clusters, all_jobs, context
     Returns:
         A tuple containing:
-        - all_clusters: List of
-          all clusters, including managed jobs, services and controllers.
-        - unmanaged_clusters: List of
-          for all clusters excluding managed jobs and services.
-          are included.
+        - all_clusters: List of KubernetesSkyPilotClusterInfoPayload with info
+          for all clusters, including managed jobs, services and controllers.
+        - unmanaged_clusters: List of KubernetesSkyPilotClusterInfoPayload with
+          info for all clusters excluding managed jobs and services.
+          Controllers are included.
         - all_jobs: List of managed jobs from all controllers. Each entry is a
           dictionary job info, see jobs.queue_from_kubernetes_pod for details.
         - context: Kubernetes context used to fetch the cluster information.
@@ -155,7 +170,7 @@ all_clusters, unmanaged_clusters, all_jobs, context
                 status_message += f's ({i + 1}/{len(jobs_controllers)})'
                 spinner.update(f'{status_message}[/]')
                 try:
-                    job_list =
+                    job_list = managed_jobs_core.queue_from_kubernetes_pod(
                         pod.metadata.name)
                 except RuntimeError as e:
                     logger.warning('Failed to get managed jobs from controller '
@@ -183,6 +198,14 @@ all_clusters, unmanaged_clusters, all_jobs, context
         c for c in all_clusters
         if c.cluster_name not in managed_job_cluster_names
     ]
+    all_clusters = [
+        kubernetes_utils.KubernetesSkyPilotClusterInfoPayload.from_cluster(c)
+        for c in all_clusters
+    ]
+    unmanaged_clusters = [
+        kubernetes_utils.KubernetesSkyPilotClusterInfoPayload.from_cluster(c)
+        for c in unmanaged_clusters
+    ]
     return all_clusters, unmanaged_clusters, all_jobs, context
 
 
@@ -253,6 +276,9 @@ def cost_report() -> List[Dict[str, Any]]:
 
     for cluster_report in cluster_reports:
         cluster_report['total_cost'] = get_total_cost(cluster_report)
+        cluster_report['cloud'] = str(cluster_report['resources'].cloud)
+        cluster_report['accelerators'] = cluster_report[
+            'resources'].accelerators
 
     return cluster_reports
 
@@ -392,10 +418,45 @@ def _stop_not_supported_message(resources: 'resources_lib.Resources') -> str:
     return message
 
 
+@usage_lib.entrypoint
+def down(cluster_name: str, purge: bool = False) -> None:
+    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
+    """Tears down a cluster.
+
+    Tearing down a cluster will delete all associated resources (all billing
+    stops), and any data on the attached disks will be lost. Accelerators
+    (e.g., TPUs) that are part of the cluster will be deleted too.
+
+    Args:
+        cluster_name: name of the cluster to down.
+        purge: (Advanced) Forcefully remove the cluster from SkyPilot's cluster
+            table, even if the actual cluster termination failed on the cloud.
+            WARNING: This flag should only be set sparingly in certain manual
+            troubleshooting scenarios; with it set, it is the user's
+            responsibility to ensure there are no leaked instances and related
+            resources.
+
+    Raises:
+        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
+            exist.
+        RuntimeError: failed to tear down the cluster.
+        sky.exceptions.NotSupportedError: the specified cluster is the managed
+            jobs controller.
+    """
+    handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+    if handle is None:
+        raise exceptions.ClusterDoesNotExist(
+            f'Cluster {cluster_name!r} does not exist.')
+
+    usage_lib.record_cluster_name_for_current_operation(cluster_name)
+    backend = backend_utils.get_backend_from_handle(handle)
+    backend.teardown(handle, terminate=True, purge=purge)
+
+
 @usage_lib.entrypoint
 def stop(cluster_name: str, purge: bool = False) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Stops a cluster.
 
     Data on attached disks is not lost when a cluster is stopped. Billing for
     the instances will stop, while the disks will still be charged. Those
@@ -452,41 +513,6 @@ def stop(cluster_name: str, purge: bool = False) -> None:
     backend.teardown(handle, terminate=False, purge=purge)
 
 
-@usage_lib.entrypoint
-def down(cluster_name: str, purge: bool = False) -> None:
-    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """Tear down a cluster.
-
-    Tearing down a cluster will delete all associated resources (all billing
-    stops), and any data on the attached disks will be lost. Accelerators
-    (e.g., TPUs) that are part of the cluster will be deleted too.
-
-    Args:
-        cluster_name: name of the cluster to down.
-        purge: (Advanced) Forcefully remove the cluster from SkyPilot's cluster
-            table, even if the actual cluster termination failed on the cloud.
-            WARNING: This flag should only be set sparingly in certain manual
-            troubleshooting scenarios; with it set, it is the user's
-            responsibility to ensure there are no leaked instances and related
-            resources.
-
-    Raises:
-        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
-            exist.
-        RuntimeError: failed to tear down the cluster.
-        sky.exceptions.NotSupportedError: the specified cluster is the managed
-            jobs controller.
-    """
-    handle = global_user_state.get_handle_from_cluster_name(cluster_name)
-    if handle is None:
-        raise exceptions.ClusterDoesNotExist(
-            f'Cluster {cluster_name!r} does not exist.')
-
-    usage_lib.record_cluster_name_for_current_operation(cluster_name)
-    backend = backend_utils.get_backend_from_handle(handle)
-    backend.teardown(handle, terminate=True, purge=purge)
-
-
 @usage_lib.entrypoint
 def autostop(
     cluster_name: str,
@@ -494,7 +520,7 @@ def autostop(
     down: bool = False,  # pylint: disable=redefined-outer-name
 ) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Schedules an autostop/autodown for a cluster.
 
     Autostop/autodown will automatically stop or teardown a cluster when it
     becomes idle for a specified duration. Idleness means there are no
@@ -601,7 +627,7 @@ def queue(cluster_name: str,
           skip_finished: bool = False,
           all_users: bool = False) -> List[dict]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Gets the job queue of a cluster.
 
     Please refer to the sky.cli.queue for the document.
 
@@ -612,6 +638,7 @@ def queue(cluster_name: str,
         'job_id': (int) job id,
         'job_name': (str) job name,
         'username': (str) username,
+        'user_hash': (str) user hash,
         'submitted_at': (int) timestamp of submitted,
         'start_at': (int) timestamp of started,
         'end_at': (int) timestamp of ended,
@@ -632,10 +659,10 @@ def queue(cluster_name: str,
         exceptions.CommandError: if failed to get the job queue with ssh.
     """
     all_jobs = not skip_finished
-
+    user_hash: Optional[str] = common_utils.get_user_hash()
     if all_users:
-
-    code = job_lib.JobLibCodeGen.get_job_queue(
+        user_hash = None
+    code = job_lib.JobLibCodeGen.get_job_queue(user_hash, all_jobs)
 
     handle = backend_utils.check_cluster_available(
         cluster_name,
@@ -662,19 +689,22 @@ def queue(cluster_name: str,
 def cancel(
     cluster_name: str,
     all: bool = False,
+    all_users: bool = False,
     job_ids: Optional[List[int]] = None,
     # pylint: disable=invalid-name
+    # Internal only:
     _try_cancel_if_cluster_is_init: bool = False,
 ) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Cancels jobs on a cluster.
 
     Please refer to the sky.cli.cancel for the document.
 
-    When `all`
+    When none of `job_ids`, `all` and `all_users` is set, cancel the latest
+    running job.
 
     Additional arguments:
-
+        try_cancel_if_cluster_is_init: (bool) whether to try cancelling the job
             even if the cluster is not UP, but the head node is still alive.
             This is used by the jobs controller to cancel the job when the
             worker node is preempted in the spot cluster.
@@ -693,9 +723,9 @@ def cancel(
     controller_utils.check_cluster_name_not_controller(
         cluster_name, operation_str='Cancelling jobs')
 
-    if all and job_ids:
-        raise
-
+    if all and job_ids is not None:
+        raise exceptions.NotSupportedError(
+            'Cannot specify both --all and job IDs.')
 
     # Check the status of the cluster.
     handle = None
@@ -722,28 +752,32 @@ def cancel(
 
     backend = backend_utils.get_backend_from_handle(handle)
 
-    if
-        sky_logging.print(f'{colorama.Fore.YELLOW}'
-                          f'Cancelling all jobs on cluster {cluster_name!r}...'
-                          f'{colorama.Style.RESET_ALL}')
-    elif job_ids is None:
-        # all = False, job_ids is None => cancel the latest running job.
+    if all_users:
         sky_logging.print(
             f'{colorama.Fore.YELLOW}'
-            f'Cancelling
+            f'Cancelling all users\' jobs on cluster {cluster_name!r}...'
             f'{colorama.Style.RESET_ALL}')
-    elif
-
+    elif all:
+        sky_logging.print(
+            f'{colorama.Fore.YELLOW}'
+            f'Cancelling all your jobs on cluster {cluster_name!r}...'
+            f'{colorama.Style.RESET_ALL}')
+    elif job_ids is not None:
         jobs_str = ', '.join(map(str, job_ids))
         sky_logging.print(
             f'{colorama.Fore.YELLOW}'
             f'Cancelling jobs ({jobs_str}) on cluster {cluster_name!r}...'
             f'{colorama.Style.RESET_ALL}')
     else:
-
-
+        sky_logging.print(
+            f'{colorama.Fore.YELLOW}'
+            f'Cancelling latest running job on cluster {cluster_name!r}...'
+            f'{colorama.Style.RESET_ALL}')
 
-    backend.cancel_jobs(handle,
+    backend.cancel_jobs(handle,
+                        job_ids,
+                        cancel_all=all or all_users,
+                        user_hash=common_utils.get_user_hash())
 
 
 @usage_lib.entrypoint
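The new branch order gives cancel() the precedence all_users > all > explicit job_ids > latest running job, and the backend call now always carries the caller's user hash. A call-site sketch (cluster and job ids hypothetical):

    from sky import core

    core.cancel('my-cluster', job_ids=[3, 4])  # specific jobs
    core.cancel('my-cluster', all=True)        # all of your jobs
    core.cancel('my-cluster', all_users=True)  # every user's jobs
    core.cancel('my-cluster')                  # latest running job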
@@ -752,7 +786,7 @@ def tail_logs(cluster_name: str,
               follow: bool = True,
               tail: int = 0) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Tails the logs of a job.
 
     Please refer to the sky.cli.tail_logs for the document.
 
@@ -774,14 +808,6 @@ def tail_logs(cluster_name: str,
     )
     backend = backend_utils.get_backend_from_handle(handle)
 
-    job_str = f'job {job_id}'
-    if job_id is None:
-        job_str = 'the last job'
-    sky_logging.print(
-        f'{colorama.Fore.YELLOW}'
-        f'Tailing logs of {job_str} on cluster {cluster_name!r}...'
-        f'{colorama.Style.RESET_ALL}')
-
     usage_lib.record_cluster_name_for_current_operation(cluster_name)
     backend.tail_logs(handle, job_id, follow=follow, tail=tail)
 
@@ -792,7 +818,7 @@ def download_logs(
         job_ids: Optional[List[str]],
         local_dir: str = constants.SKY_LOGS_DIRECTORY) -> Dict[str, str]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Downloads the logs of jobs.
 
     Args:
         cluster_name: (str) name of the cluster.
@@ -884,7 +910,7 @@ def job_status(cluster_name: str,
 @usage_lib.entrypoint
 def storage_ls() -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Gets the storages.
 
     Returns:
         [
@@ -906,7 +932,7 @@ def storage_ls() -> List[Dict[str, Any]]:
 @usage_lib.entrypoint
 def storage_delete(name: str) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
-    """
+    """Deletes a storage.
 
     Raises:
         ValueError: If the storage does not exist.
@@ -915,11 +941,149 @@ def storage_delete(name: str) -> None:
     handle = global_user_state.get_handle_from_storage_name(name)
     if handle is None:
         raise ValueError(f'Storage name {name!r} not found.')
+    else:
+        storage_object = data.Storage(name=handle.storage_name,
+                                      source=handle.source,
+                                      sync_on_reconstruction=False)
+        storage_object.delete()
+
 
-
-
-
-
-
-
-
+# ===================
+# = Catalog Observe =
+# ===================
+@usage_lib.entrypoint
+def enabled_clouds() -> List[clouds.Cloud]:
+    return global_user_state.get_cached_enabled_clouds()
+
+
+@usage_lib.entrypoint
+def realtime_kubernetes_gpu_availability(
+        context: Optional[str] = None,
+        name_filter: Optional[str] = None,
+        quantity_filter: Optional[int] = None
+) -> List[models.RealtimeGpuAvailability]:
+
+    counts, capacity, available = service_catalog.list_accelerator_realtime(
+        gpus_only=True,
+        clouds='kubernetes',
+        name_filter=name_filter,
+        region_filter=context,
+        quantity_filter=quantity_filter,
+        case_sensitive=False)
+    assert (set(counts.keys()) == set(capacity.keys()) == set(
+        available.keys())), (f'Keys of counts ({list(counts.keys())}), '
+                             f'capacity ({list(capacity.keys())}), '
+                             f'and available ({list(available.keys())}) '
+                             'must be same.')
+    if len(counts) == 0:
+        err_msg = 'No GPUs found in Kubernetes cluster. '
+        debug_msg = 'To further debug, run: sky check '
+        if name_filter is not None:
+            gpu_info_msg = f' {name_filter!r}'
+            if quantity_filter is not None:
+                gpu_info_msg += (' with requested quantity'
+                                 f' {quantity_filter}')
+            err_msg = (f'Resources{gpu_info_msg} not found '
+                       'in Kubernetes cluster. ')
+            debug_msg = ('To show available accelerators on kubernetes,'
+                         ' run: sky show-gpus --cloud kubernetes ')
+        full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
+                        debug_msg)
+        raise ValueError(full_err_msg)
+
+    realtime_gpu_availability_list: List[models.RealtimeGpuAvailability] = []
+
+    for gpu, _ in sorted(counts.items()):
+        realtime_gpu_availability_list.append(
+            models.RealtimeGpuAvailability(
+                gpu,
+                counts.pop(gpu),
+                capacity[gpu],
+                available[gpu],
+            ))
+    return realtime_gpu_availability_list
+
+
+# =================
+# = Local Cluster =
+# =================
+@usage_lib.entrypoint
+def local_up(gpus: bool, ips: Optional[List[str]], ssh_user: Optional[str],
+             ssh_key: Optional[str], cleanup: bool) -> None:
+    """Creates a local or remote cluster."""
+
+    def _validate_args(ips, ssh_user, ssh_key, cleanup):
+        # If any of --ips, --ssh-user, or --ssh-key-path is specified,
+        # all must be specified
+        if bool(ips) or bool(ssh_user) or bool(ssh_key):
+            if not (ips and ssh_user and ssh_key):
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        'All ips, ssh_user, and ssh_key must be specified '
+                        'together.')
+
+        # --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
+        # are all provided
+        if cleanup and not (ips and ssh_user and ssh_key):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'cleanup can only be used with ips, ssh_user and ssh_key.')
+
+    _validate_args(ips, ssh_user, ssh_key, cleanup)
+
+    # If remote deployment arguments are specified, run remote up script
+    if ips:
+        assert ssh_user is not None and ssh_key is not None
+        kubernetes_deploy_utils.deploy_remote_cluster(ips, ssh_user, ssh_key,
+                                                      cleanup)
+    else:
+        # Run local deployment (kind) if no remote args are specified
+        kubernetes_deploy_utils.deploy_local_cluster(gpus)
+
+
+def local_down() -> None:
+    """Tears down the Kubernetes cluster started by local_up."""
+    cluster_removed = False
+
+    path_to_package = os.path.dirname(__file__)
+    down_script_path = os.path.join(path_to_package, 'utils/kubernetes',
+                                    'delete_cluster.sh')
+
+    cwd = os.path.dirname(os.path.abspath(down_script_path))
+    run_command = shlex.split(down_script_path)
+
+    # Setup logging paths
+    run_timestamp = sky_logging.get_run_timestamp()
+    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
+                            'local_down.log')
+
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Removing local cluster',
+                                     log_path=log_path,
+                                     is_local=True)):
+
+        returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
+                                                          log_path=log_path,
+                                                          require_outputs=True,
+                                                          stream_logs=False,
+                                                          cwd=cwd)
+        stderr = stderr.replace('No kind clusters found.\n', '')
+
+        if returncode == 0:
+            cluster_removed = True
+        elif returncode == 100:
+            logger.info(ux_utils.error_message('Local cluster does not exist.'))
+        else:
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError('Failed to create local cluster. '
+                                   f'Stdout: {stdout}'
+                                   f'\nError: {stderr}')
+    if cluster_removed:
+        # Run sky check
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Running sky check...')):
+            sky_check.check(clouds=['kubernetes'], quiet=True)
+        logger.info(
+            ux_utils.finishing_message('Local cluster removed.',
+                                       log_path=log_path,
+                                       is_local=True))
sky/dag.py
CHANGED
@@ -76,6 +76,10 @@ class Dag:
 
         return out_degree_condition and in_degree_condition
 
+    def validate(self, workdir_only: bool = False):
+        for task in self.tasks:
+            task.validate(workdir_only=workdir_only)
+
 
 class _DagContext(threading.local):
     """A thread-local stack of Dags."""
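Dag.validate() simply fans out to each task's validate() (extended in sky/task.py above). A usage sketch, assuming the usual context-manager style of building dags:

    import sky

    with sky.Dag() as dag:
        sky.Task(run='echo hello')  # tasks created in the context join the dag

    # Validate every task up front; workdir_only=True restricts the checks.
    dag.validate(workdir_only=True)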
sky/data/mounting_utils.py
CHANGED
@@ -117,7 +117,8 @@ def get_az_mount_cmd(container_name: str,
     if storage_account_key is None:
         key_env_var = f'AZURE_STORAGE_SAS_TOKEN={shlex.quote(" ")}'
    else:
-        key_env_var =
+        key_env_var = ('AZURE_STORAGE_ACCESS_KEY='
+                       f'{shlex.quote(storage_account_key)}')
 
     cache_path = _BLOBFUSE_CACHE_DIR.format(
         storage_account_name=storage_account_name,