skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/clouds/cloud.py
CHANGED
@@ -9,19 +9,21 @@ reused across cloud object creation.
|
|
9
9
|
"""
|
10
10
|
import collections
|
11
11
|
import enum
|
12
|
+
import math
|
12
13
|
import typing
|
13
|
-
from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple
|
14
|
+
from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple, Union
|
14
15
|
|
15
16
|
from sky import exceptions
|
16
17
|
from sky import skypilot_config
|
17
18
|
from sky.clouds import service_catalog
|
18
19
|
from sky.utils import log_utils
|
19
20
|
from sky.utils import resources_utils
|
21
|
+
from sky.utils import timeline
|
20
22
|
from sky.utils import ux_utils
|
21
23
|
|
22
24
|
if typing.TYPE_CHECKING:
|
23
25
|
from sky import resources as resources_lib
|
24
|
-
from sky import status_lib
|
26
|
+
from sky.utils import status_lib
|
25
27
|
|
26
28
|
|
27
29
|
class CloudImplementationFeatures(enum.Enum):
|
@@ -93,12 +95,31 @@ class StatusVersion(enum.Enum):
|
|
93
95
|
return self.value >= other.value
|
94
96
|
|
95
97
|
|
98
|
+
class OpenPortsVersion(enum.Enum):
|
99
|
+
"""The version of the open ports implementation.
|
100
|
+
|
101
|
+
1: Open ports on launching of the cluster only, cannot be modified after
|
102
|
+
provisioning of the cluster. This is for clouds like RunPod which only
|
103
|
+
accepts port argument on VM creation API, and requires Web GUI and an VM
|
104
|
+
restart to update ports. We currently do not support this.
|
105
|
+
2: Open ports after provisioning of the cluster, updatable. This is for most
|
106
|
+
of the cloud providers which allow opening ports using an programmable API
|
107
|
+
and won't affect the running VMs.
|
108
|
+
"""
|
109
|
+
LAUNCH_ONLY = 'LAUNCH ONLY'
|
110
|
+
UPDATABLE = 'UPDATABLE'
|
111
|
+
|
112
|
+
def __le__(self, other):
|
113
|
+
versions = list(OpenPortsVersion)
|
114
|
+
return versions.index(self) <= versions.index(other)
|
115
|
+
|
116
|
+
|
96
117
|
class Cloud:
|
97
118
|
"""A cloud provider."""
|
98
119
|
|
99
120
|
_REPR = '<Cloud>'
|
100
121
|
_DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM
|
101
|
-
_BEST_DISK_TIER = resources_utils.DiskTier.
|
122
|
+
_BEST_DISK_TIER = resources_utils.DiskTier.ULTRA
|
102
123
|
_SUPPORTED_DISK_TIERS = {resources_utils.DiskTier.BEST}
|
103
124
|
_SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = False
|
104
125
|
|
@@ -107,6 +128,7 @@ class Cloud:
|
|
107
128
|
# NOTE: new clouds being added should use the latest version, i.e. SKYPILOT.
|
108
129
|
PROVISIONER_VERSION = ProvisionerVersion.RAY_AUTOSCALER
|
109
130
|
STATUS_VERSION = StatusVersion.CLOUD_CLI
|
131
|
+
OPEN_PORTS_VERSION = OpenPortsVersion.UPDATABLE
|
110
132
|
|
111
133
|
@classmethod
|
112
134
|
def max_cluster_name_length(cls) -> Optional[int]:
|
@@ -157,6 +179,11 @@ class Cloud:
|
|
157
179
|
"""
|
158
180
|
raise NotImplementedError
|
159
181
|
|
182
|
+
@classmethod
|
183
|
+
def optimize_by_zone(cls) -> bool:
|
184
|
+
"""Returns whether to optimize this cloud by zone (default: region)."""
|
185
|
+
return False
|
186
|
+
|
160
187
|
@classmethod
|
161
188
|
def zones_provision_loop(
|
162
189
|
cls,
|
@@ -253,9 +280,10 @@ class Cloud:
|
|
253
280
|
def make_deploy_resources_variables(
|
254
281
|
self,
|
255
282
|
resources: 'resources_lib.Resources',
|
256
|
-
|
283
|
+
cluster_name: resources_utils.ClusterName,
|
257
284
|
region: 'Region',
|
258
285
|
zones: Optional[List['Zone']],
|
286
|
+
num_nodes: int,
|
259
287
|
dryrun: bool = False,
|
260
288
|
) -> Dict[str, Optional[str]]:
|
261
289
|
"""Converts planned sky.Resources to cloud-specific resource variables.
|
@@ -281,7 +309,7 @@ class Cloud:
|
|
281
309
|
def get_accelerators_from_instance_type(
|
282
310
|
cls,
|
283
311
|
instance_type: str,
|
284
|
-
) -> Optional[Dict[str, int]]:
|
312
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
285
313
|
"""Returns {acc: acc_count} held by 'instance_type', if any."""
|
286
314
|
raise NotImplementedError
|
287
315
|
|
@@ -340,12 +368,12 @@ class Cloud:
|
|
340
368
|
del label_key, label_value
|
341
369
|
return True, None
|
342
370
|
|
371
|
+
@timeline.event
|
343
372
|
def get_feasible_launchable_resources(
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
"""Returns ([feasible and launchable resources], [fuzzy candidates]).
|
373
|
+
self,
|
374
|
+
resources: 'resources_lib.Resources',
|
375
|
+
num_nodes: int = 1) -> 'resources_utils.FeasibleResources':
|
376
|
+
"""Returns FeasibleResources for the given resources.
|
349
377
|
|
350
378
|
Feasible resources refer to an offering respecting the resource
|
351
379
|
requirements. Currently, this function implements "filtering" the
|
@@ -353,10 +381,15 @@ class Cloud:
|
|
353
381
|
|
354
382
|
Launchable resources require a cloud and an instance type be assigned.
|
355
383
|
|
356
|
-
|
357
|
-
|
358
|
-
|
359
|
-
|
384
|
+
The returned dataclass object FeasibleResources contains three fields:
|
385
|
+
|
386
|
+
- resources_list: a list of resources that are feasible to launch
|
387
|
+
- fuzzy_candidate_list: a list of resources that loosely match requested
|
388
|
+
resources. E.g., when A100:1 GPU is requested but is not available
|
389
|
+
in a cloud/region, the fuzzy candidates are results of a fuzzy
|
390
|
+
search in the catalog that are offered in the location. E.g.,
|
391
|
+
['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8']
|
392
|
+
- hint: an optional string hint if no feasible resources are found.
|
360
393
|
"""
|
361
394
|
if resources.is_launchable():
|
362
395
|
self._check_instance_type_accelerators_combination(resources)
|
@@ -372,13 +405,18 @@ class Cloud:
|
|
372
405
|
# TODO(zhwu): The resources are now silently filtered out. We
|
373
406
|
# should have some logging telling the user why the resources
|
374
407
|
# are not considered.
|
375
|
-
return ([],
|
408
|
+
return resources_utils.FeasibleResources(resources_list=[],
|
409
|
+
fuzzy_candidate_list=[],
|
410
|
+
hint=None)
|
376
411
|
return self._get_feasible_launchable_resources(resources)
|
377
412
|
|
378
413
|
def _get_feasible_launchable_resources(
|
379
414
|
self, resources: 'resources_lib.Resources'
|
380
|
-
) ->
|
415
|
+
) -> 'resources_utils.FeasibleResources':
|
381
416
|
"""See get_feasible_launchable_resources()."""
|
417
|
+
# TODO: Currently only the Kubernetes implementation of this method
|
418
|
+
# returns hints when no feasible resources are found. This should be
|
419
|
+
# implemented for all clouds.
|
382
420
|
raise NotImplementedError
|
383
421
|
|
384
422
|
def get_reservations_available_resources(
|
@@ -407,11 +445,11 @@ class Cloud:
|
|
407
445
|
|
408
446
|
# TODO(zhwu): Make the return type immutable.
|
409
447
|
@classmethod
|
410
|
-
def
|
411
|
-
"""(Advanced) Returns
|
448
|
+
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
449
|
+
"""(Advanced) Returns all available user identities of this cloud.
|
412
450
|
|
413
451
|
The user "identity" is associated with each SkyPilot cluster they
|
414
|
-
|
452
|
+
create. This is used in protecting cluster operations, such as
|
415
453
|
provision, teardown and status refreshing, in a multi-identity
|
416
454
|
scenario, where the same user/device can switch between different
|
417
455
|
cloud identities. We check that the user identity matches before:
|
@@ -419,10 +457,16 @@ class Cloud:
|
|
419
457
|
- Stopping/tearing down a cluster
|
420
458
|
- Refreshing the status of a cluster
|
421
459
|
|
422
|
-
Design
|
423
|
-
|
424
|
-
|
425
|
-
|
460
|
+
Design choices:
|
461
|
+
1. We allow the operations that can correctly work with a different
|
462
|
+
user identity, as a user should have full control over all their
|
463
|
+
clusters (no matter which identity it belongs to), e.g.,
|
464
|
+
submitting jobs, viewing logs, auto-stopping, etc.
|
465
|
+
2. A cloud implementation can optionally switch between different
|
466
|
+
identities if required for cluster operations. In this case,
|
467
|
+
the cloud implementation should return multiple identities
|
468
|
+
as a list. E.g., our Kubernetes implementation can use multiple
|
469
|
+
kubeconfig contexts to switch between different identities.
|
426
470
|
|
427
471
|
The choice of what constitutes an identity is up to each cloud's
|
428
472
|
implementation. In general, to suffice for the above purposes,
|
@@ -430,24 +474,34 @@ class Cloud:
|
|
430
474
|
resources are used when the user invoked each cloud's default
|
431
475
|
CLI/API.
|
432
476
|
|
433
|
-
|
477
|
+
An identity is a list of strings. The list is in the order of
|
434
478
|
strictness, i.e., the first element is the most strict identity, and
|
435
479
|
the last element is the least strict identity.
|
436
480
|
When performing an identity check between the current active identity
|
437
481
|
and the owner identity associated with a cluster, we compare the two
|
438
482
|
lists in order: if a position does not match, we go to the next. To
|
439
|
-
see an example, see the docstring of the AWS.
|
440
|
-
|
483
|
+
see an example, see the docstring of the AWS.get_user_identities.
|
441
484
|
|
442
485
|
Example identities (see cloud implementations):
|
443
486
|
- AWS: [UserId, AccountId]
|
444
487
|
- GCP: [email address + project ID]
|
445
488
|
- Azure: [email address + subscription ID]
|
489
|
+
- Kubernetes: [context name]
|
490
|
+
|
491
|
+
Example return values:
|
492
|
+
- AWS: [[UserId, AccountId]]
|
493
|
+
- GCP: [[email address + project ID]]
|
494
|
+
- Azure: [[email address + subscription ID]]
|
495
|
+
- Kubernetes: [[current active context], [context 2], ...]
|
446
496
|
|
447
497
|
Returns:
|
448
498
|
None if the cloud does not have a concept of user identity
|
449
499
|
(access protection will be disabled for these clusters);
|
450
|
-
otherwise the
|
500
|
+
otherwise a list of available identities with the current active
|
501
|
+
identity being the first element. Most clouds have only one identity
|
502
|
+
available, so the returned list will only have one element: the
|
503
|
+
current active identity.
|
504
|
+
|
451
505
|
Raises:
|
452
506
|
exceptions.CloudUserIdentityError: If the user identity cannot be
|
453
507
|
retrieved.
|
@@ -455,13 +509,26 @@ class Cloud:
|
|
455
509
|
return None
|
456
510
|
|
457
511
|
@classmethod
|
458
|
-
def
|
459
|
-
"""Returns a user friendly representation of the
|
460
|
-
user_identity = cls.
|
512
|
+
def get_active_user_identity_str(cls) -> Optional[str]:
|
513
|
+
"""Returns a user friendly representation of the active identity."""
|
514
|
+
user_identity = cls.get_active_user_identity()
|
461
515
|
if user_identity is None:
|
462
516
|
return None
|
463
517
|
return ', '.join(user_identity)
|
464
518
|
|
519
|
+
@classmethod
|
520
|
+
def get_active_user_identity(cls) -> Optional[List[str]]:
|
521
|
+
"""Returns currently active user identity of this cloud
|
522
|
+
|
523
|
+
See get_user_identities for definition of user identity.
|
524
|
+
|
525
|
+
Returns:
|
526
|
+
None if the cloud does not have a concept of user identity;
|
527
|
+
otherwise the current active identity.
|
528
|
+
"""
|
529
|
+
identities = cls.get_user_identities()
|
530
|
+
return identities[0] if identities is not None else None
|
531
|
+
|
465
532
|
def get_credential_file_mounts(self) -> Dict[str, str]:
|
466
533
|
"""Returns the files necessary to access this cloud.
|
467
534
|
|
@@ -469,6 +536,10 @@ class Cloud:
|
|
469
536
|
"""
|
470
537
|
raise NotImplementedError
|
471
538
|
|
539
|
+
def can_credential_expire(self) -> bool:
|
540
|
+
"""Returns whether the cloud credential can expire."""
|
541
|
+
return False
|
542
|
+
|
472
543
|
@classmethod
|
473
544
|
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
|
474
545
|
"""Check the image size from the cloud.
|
@@ -610,8 +681,9 @@ class Cloud:
|
|
610
681
|
assert resources.is_launchable(), resources
|
611
682
|
|
612
683
|
def _equal_accelerators(
|
613
|
-
|
614
|
-
|
684
|
+
acc_requested: Optional[Dict[str, Union[int, float]]],
|
685
|
+
acc_from_instance_type: Optional[Dict[str, Union[int,
|
686
|
+
float]]]) -> bool:
|
615
687
|
"""Check the requested accelerators equals to the instance type
|
616
688
|
|
617
689
|
Check the requested accelerators equals to the accelerators
|
@@ -626,12 +698,14 @@ class Cloud:
|
|
626
698
|
for acc in acc_requested:
|
627
699
|
if acc not in acc_from_instance_type:
|
628
700
|
return False
|
629
|
-
|
701
|
+
# Avoid float point precision issue.
|
702
|
+
if not math.isclose(acc_requested[acc],
|
703
|
+
acc_from_instance_type[acc]):
|
630
704
|
return False
|
631
705
|
return True
|
632
706
|
|
633
|
-
acc_from_instance_type =
|
634
|
-
resources.instance_type)
|
707
|
+
acc_from_instance_type = cls.get_accelerators_from_instance_type(
|
708
|
+
resources.instance_type)
|
635
709
|
if not _equal_accelerators(resources.accelerators,
|
636
710
|
acc_from_instance_type):
|
637
711
|
with ux_utils.print_exception_no_traceback():
|
@@ -726,8 +800,8 @@ class Cloud:
|
|
726
800
|
# cloud._cloud_unsupported_features().
|
727
801
|
|
728
802
|
@classmethod
|
729
|
-
def create_image_from_cluster(cls,
|
730
|
-
|
803
|
+
def create_image_from_cluster(cls,
|
804
|
+
cluster_name: resources_utils.ClusterName,
|
731
805
|
region: Optional[str],
|
732
806
|
zone: Optional[str]) -> str:
|
733
807
|
"""Creates an image from the cluster.
|
@@ -756,6 +830,10 @@ class Cloud:
|
|
756
830
|
|
757
831
|
# === End of image related methods ===
|
758
832
|
|
833
|
+
@classmethod
|
834
|
+
def canonical_name(cls) -> str:
|
835
|
+
return cls.__name__.lower()
|
836
|
+
|
759
837
|
def __repr__(self):
|
760
838
|
return self._REPR
|
761
839
|
|
sky/clouds/cudo.py
CHANGED
@@ -1,12 +1,12 @@
|
|
1
1
|
"""Cudo Compute"""
|
2
|
-
import json
|
3
2
|
import subprocess
|
4
3
|
import typing
|
5
|
-
from typing import Dict, Iterator, List, Optional, Tuple
|
4
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
6
5
|
|
7
6
|
from sky import clouds
|
8
7
|
from sky.clouds import service_catalog
|
9
8
|
from sky.utils import common_utils
|
9
|
+
from sky.utils import registry
|
10
10
|
from sky.utils import resources_utils
|
11
11
|
|
12
12
|
if typing.TYPE_CHECKING:
|
@@ -28,7 +28,7 @@ def _run_output(cmd):
|
|
28
28
|
return proc.stdout.decode('ascii')
|
29
29
|
|
30
30
|
|
31
|
-
@
|
31
|
+
@registry.CLOUD_REGISTRY.register
|
32
32
|
class Cudo(clouds.Cloud):
|
33
33
|
"""Cudo Compute"""
|
34
34
|
_REPR = 'Cudo'
|
@@ -43,8 +43,7 @@ class Cudo(clouds.Cloud):
|
|
43
43
|
f'{_INDENT_PREFIX} $ cudoctl init\n'
|
44
44
|
f'{_INDENT_PREFIX}For more info: '
|
45
45
|
# pylint: disable=line-too-long
|
46
|
-
'https://skypilot.
|
47
|
-
)
|
46
|
+
'https://docs.skypilot.co/en/latest/getting-started/installation.html')
|
48
47
|
|
49
48
|
_PROJECT_HINT = (
|
50
49
|
'Create a project and then set it as the default project,:\n'
|
@@ -52,8 +51,7 @@ class Cudo(clouds.Cloud):
|
|
52
51
|
f'{_INDENT_PREFIX} $ cudoctl init\n'
|
53
52
|
f'{_INDENT_PREFIX}For more info: '
|
54
53
|
# pylint: disable=line-too-long
|
55
|
-
'https://skypilot.
|
56
|
-
)
|
54
|
+
'https://docs.skypilot.co/en/latest/getting-started/installation.html')
|
57
55
|
|
58
56
|
_CLOUD_UNSUPPORTED_FEATURES = {
|
59
57
|
clouds.CloudImplementationFeatures.STOP: 'Stopping not supported.',
|
@@ -66,6 +64,10 @@ class Cudo(clouds.Cloud):
|
|
66
64
|
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
|
67
65
|
('Docker image is currently not supported on Cudo. You can try '
|
68
66
|
'running docker command inside the `run` section in task.yaml.'),
|
67
|
+
clouds.CloudImplementationFeatures.HOST_CONTROLLERS: (
|
68
|
+
'Cudo Compute cannot host a controller as it does not '
|
69
|
+
'autostopping, which will leave the controller to run indefinitely.'
|
70
|
+
),
|
69
71
|
}
|
70
72
|
_MAX_CLUSTER_NAME_LEN_LIMIT = 60
|
71
73
|
|
@@ -179,7 +181,7 @@ class Cudo(clouds.Cloud):
|
|
179
181
|
def get_accelerators_from_instance_type(
|
180
182
|
cls,
|
181
183
|
instance_type: str,
|
182
|
-
) -> Optional[Dict[str, int]]:
|
184
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
183
185
|
return service_catalog.get_accelerators_from_instance_type(
|
184
186
|
instance_type, clouds='cudo')
|
185
187
|
|
@@ -190,18 +192,17 @@ class Cudo(clouds.Cloud):
|
|
190
192
|
def make_deploy_resources_variables(
|
191
193
|
self,
|
192
194
|
resources: 'resources_lib.Resources',
|
193
|
-
|
195
|
+
cluster_name: resources_utils.ClusterName,
|
194
196
|
region: 'clouds.Region',
|
195
197
|
zones: Optional[List['clouds.Zone']],
|
198
|
+
num_nodes: int,
|
196
199
|
dryrun: bool = False,
|
197
200
|
) -> Dict[str, Optional[str]]:
|
198
|
-
del zones
|
201
|
+
del zones, cluster_name # unused
|
199
202
|
r = resources
|
200
203
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
201
|
-
|
202
|
-
|
203
|
-
else:
|
204
|
-
custom_resources = None
|
204
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
205
|
+
acc_dict)
|
205
206
|
|
206
207
|
return {
|
207
208
|
'instance_type': resources.instance_type,
|
@@ -210,13 +211,16 @@ class Cudo(clouds.Cloud):
|
|
210
211
|
}
|
211
212
|
|
212
213
|
def _get_feasible_launchable_resources(
|
213
|
-
|
214
|
+
self, resources: 'resources_lib.Resources'
|
215
|
+
) -> 'resources_utils.FeasibleResources':
|
214
216
|
if resources.use_spot:
|
215
|
-
return
|
217
|
+
# TODO: Add hints to all return values in this method to help
|
218
|
+
# users understand why the resources are not launchable.
|
219
|
+
return resources_utils.FeasibleResources([], [], None)
|
216
220
|
if resources.instance_type is not None:
|
217
221
|
assert resources.is_launchable(), resources
|
218
222
|
resources = resources.copy(accelerators=None)
|
219
|
-
return ([resources], [])
|
223
|
+
return resources_utils.FeasibleResources([resources], [], None)
|
220
224
|
|
221
225
|
def _make(instance_list):
|
222
226
|
resource_list = []
|
@@ -239,9 +243,10 @@ class Cudo(clouds.Cloud):
|
|
239
243
|
memory=resources.memory,
|
240
244
|
disk_tier=resources.disk_tier)
|
241
245
|
if default_instance_type is None:
|
242
|
-
return ([], [])
|
246
|
+
return resources_utils.FeasibleResources([], [], None)
|
243
247
|
else:
|
244
|
-
return (
|
248
|
+
return resources_utils.FeasibleResources(
|
249
|
+
_make([default_instance_type]), [], None)
|
245
250
|
|
246
251
|
assert len(accelerators) == 1, resources
|
247
252
|
acc, acc_count = list(accelerators.items())[0]
|
@@ -256,8 +261,10 @@ class Cudo(clouds.Cloud):
|
|
256
261
|
zone=resources.zone,
|
257
262
|
clouds='cudo')
|
258
263
|
if instance_list is None:
|
259
|
-
return ([], fuzzy_candidate_list
|
260
|
-
|
264
|
+
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
265
|
+
None)
|
266
|
+
return resources_utils.FeasibleResources(_make(instance_list),
|
267
|
+
fuzzy_candidate_list, None)
|
261
268
|
|
262
269
|
@classmethod
|
263
270
|
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
|
@@ -318,7 +325,7 @@ class Cudo(clouds.Cloud):
|
|
318
325
|
}
|
319
326
|
|
320
327
|
@classmethod
|
321
|
-
def
|
328
|
+
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
322
329
|
# NOTE: used for very advanced SkyPilot functionality
|
323
330
|
# Can implement later if desired
|
324
331
|
return None
|