skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/clouds/utils/scp_utils.py
CHANGED
@@ -65,7 +65,7 @@ class Metadata:
|
|
65
65
|
if value is None:
|
66
66
|
if instance_id in metadata:
|
67
67
|
metadata.pop(instance_id) # del entry
|
68
|
-
if
|
68
|
+
if not metadata:
|
69
69
|
if os.path.exists(self.path):
|
70
70
|
os.remove(self.path)
|
71
71
|
return
|
@@ -84,7 +84,7 @@ class Metadata:
|
|
84
84
|
for instance_id in list(metadata.keys()):
|
85
85
|
if instance_id not in instance_ids:
|
86
86
|
del metadata[instance_id]
|
87
|
-
if
|
87
|
+
if not metadata:
|
88
88
|
os.remove(self.path)
|
89
89
|
return
|
90
90
|
with open(self.path, 'w', encoding='utf-8') as f:
|
@@ -223,6 +223,7 @@ class SCPClient:
|
|
223
223
|
def create_security_group(self, zone_id, vpc, sg_name):
|
224
224
|
url = f'{API_ENDPOINT}/security-group/v3/security-groups'
|
225
225
|
request_body = {
|
226
|
+
'loggable': False,
|
226
227
|
'securityGroupName': sg_name,
|
227
228
|
'serviceZoneId': zone_id,
|
228
229
|
'vpcId': vpc,
|
@@ -409,7 +410,7 @@ class SCPClient:
|
|
409
410
|
parameter.append('vpcId=' + vpc_id)
|
410
411
|
if sg_name is not None:
|
411
412
|
parameter.append('securityGroupName=' + sg_name)
|
412
|
-
if
|
413
|
+
if parameter:
|
413
414
|
url = url + '?' + '&'.join(parameter)
|
414
415
|
return self._get(url)
|
415
416
|
|
sky/clouds/vast.py
ADDED
@@ -0,0 +1,280 @@
|
|
1
|
+
""" Vast Cloud. """
|
2
|
+
|
3
|
+
import typing
|
4
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
5
|
+
|
6
|
+
from sky import clouds
|
7
|
+
from sky.clouds import service_catalog
|
8
|
+
from sky.utils import registry
|
9
|
+
from sky.utils import resources_utils
|
10
|
+
|
11
|
+
if typing.TYPE_CHECKING:
|
12
|
+
from sky import resources as resources_lib
|
13
|
+
|
14
|
+
|
15
|
+
@registry.CLOUD_REGISTRY.register
|
16
|
+
class Vast(clouds.Cloud):
|
17
|
+
""" Vast GPU Cloud
|
18
|
+
|
19
|
+
_REPR | The string representation for the Vast GPU cloud object.
|
20
|
+
"""
|
21
|
+
_REPR = 'Vast'
|
22
|
+
_CLOUD_UNSUPPORTED_FEATURES = {
|
23
|
+
clouds.CloudImplementationFeatures.MULTI_NODE:
|
24
|
+
('Multi-node not supported yet, as the interconnection among nodes '
|
25
|
+
'are non-trivial on Vast.'),
|
26
|
+
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
27
|
+
('Customizing disk tier is not supported yet on Vast.'),
|
28
|
+
clouds.CloudImplementationFeatures.OPEN_PORTS:
|
29
|
+
('Opening ports is currently not supported on Vast.'),
|
30
|
+
clouds.CloudImplementationFeatures.STORAGE_MOUNTING:
|
31
|
+
('Mounting object stores is not supported on Vast.'),
|
32
|
+
}
|
33
|
+
#
|
34
|
+
# Vast doesn't have a max cluster name limit. This number
|
35
|
+
# is reasonably large and exists to play nicely with the
|
36
|
+
# other providers
|
37
|
+
#
|
38
|
+
_MAX_CLUSTER_NAME_LEN_LIMIT = 120
|
39
|
+
_regions: List[clouds.Region] = []
|
40
|
+
|
41
|
+
PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
|
42
|
+
STATUS_VERSION = clouds.StatusVersion.SKYPILOT
|
43
|
+
|
44
|
+
@classmethod
|
45
|
+
def _unsupported_features_for_resources(
|
46
|
+
cls, resources: 'resources_lib.Resources'
|
47
|
+
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
48
|
+
"""The features not supported based on the resources provided.
|
49
|
+
|
50
|
+
This method is used by check_features_are_supported() to check if the
|
51
|
+
cloud implementation supports all the requested features.
|
52
|
+
|
53
|
+
Returns:
|
54
|
+
A dict of {feature: reason} for the features not supported by the
|
55
|
+
cloud implementation.
|
56
|
+
"""
|
57
|
+
del resources # unused
|
58
|
+
return cls._CLOUD_UNSUPPORTED_FEATURES
|
59
|
+
|
60
|
+
@classmethod
|
61
|
+
def _max_cluster_name_length(cls) -> Optional[int]:
|
62
|
+
return cls._MAX_CLUSTER_NAME_LEN_LIMIT
|
63
|
+
|
64
|
+
@classmethod
|
65
|
+
def regions_with_offering(cls, instance_type: str,
|
66
|
+
accelerators: Optional[Dict[str, int]],
|
67
|
+
use_spot: bool, region: Optional[str],
|
68
|
+
zone: Optional[str]) -> List[clouds.Region]:
|
69
|
+
assert zone is None, 'Vast does not support zones.'
|
70
|
+
del accelerators, zone # unused
|
71
|
+
regions = service_catalog.get_region_zones_for_instance_type(
|
72
|
+
instance_type, use_spot, 'vast')
|
73
|
+
|
74
|
+
if region is not None:
|
75
|
+
regions = [r for r in regions if r.name == region]
|
76
|
+
return regions
|
77
|
+
|
78
|
+
@classmethod
|
79
|
+
def get_vcpus_mem_from_instance_type(
|
80
|
+
cls,
|
81
|
+
instance_type: str,
|
82
|
+
) -> Tuple[Optional[float], Optional[float]]:
|
83
|
+
return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
|
84
|
+
clouds='vast')
|
85
|
+
|
86
|
+
@classmethod
|
87
|
+
def zones_provision_loop(
|
88
|
+
cls,
|
89
|
+
*,
|
90
|
+
region: str,
|
91
|
+
num_nodes: int,
|
92
|
+
instance_type: str,
|
93
|
+
accelerators: Optional[Dict[str, int]] = None,
|
94
|
+
use_spot: bool = False,
|
95
|
+
) -> Iterator[None]:
|
96
|
+
del num_nodes # unused
|
97
|
+
regions = cls.regions_with_offering(instance_type,
|
98
|
+
accelerators,
|
99
|
+
use_spot,
|
100
|
+
region=region,
|
101
|
+
zone=None)
|
102
|
+
for r in regions:
|
103
|
+
assert r.zones is None, r
|
104
|
+
yield r.zones
|
105
|
+
|
106
|
+
def instance_type_to_hourly_cost(self,
|
107
|
+
instance_type: str,
|
108
|
+
use_spot: bool,
|
109
|
+
region: Optional[str] = None,
|
110
|
+
zone: Optional[str] = None) -> float:
|
111
|
+
return service_catalog.get_hourly_cost(instance_type,
|
112
|
+
use_spot=use_spot,
|
113
|
+
region=region,
|
114
|
+
zone=zone,
|
115
|
+
clouds='vast')
|
116
|
+
|
117
|
+
def accelerators_to_hourly_cost(self,
|
118
|
+
accelerators: Dict[str, int],
|
119
|
+
use_spot: bool,
|
120
|
+
region: Optional[str] = None,
|
121
|
+
zone: Optional[str] = None) -> float:
|
122
|
+
"""Returns the hourly cost of the accelerators, in dollars/hour."""
|
123
|
+
del accelerators, use_spot, region, zone # unused
|
124
|
+
return 0.0 # Vast includes accelerators in the hourly cost.
|
125
|
+
|
126
|
+
def get_egress_cost(self, num_gigabytes: float) -> float:
|
127
|
+
return 0.0
|
128
|
+
|
129
|
+
@classmethod
|
130
|
+
def get_default_instance_type(
|
131
|
+
cls,
|
132
|
+
cpus: Optional[str] = None,
|
133
|
+
memory: Optional[str] = None,
|
134
|
+
disk_tier: Optional[resources_utils.DiskTier] = None
|
135
|
+
) -> Optional[str]:
|
136
|
+
"""Returns the default instance type for Vast."""
|
137
|
+
return service_catalog.get_default_instance_type(cpus=cpus,
|
138
|
+
memory=memory,
|
139
|
+
disk_tier=disk_tier,
|
140
|
+
clouds='vast')
|
141
|
+
|
142
|
+
@classmethod
|
143
|
+
def get_accelerators_from_instance_type(
|
144
|
+
cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
145
|
+
return service_catalog.get_accelerators_from_instance_type(
|
146
|
+
instance_type, clouds='vast')
|
147
|
+
|
148
|
+
@classmethod
|
149
|
+
def get_zone_shell_cmd(cls) -> Optional[str]:
|
150
|
+
return None
|
151
|
+
|
152
|
+
def make_deploy_resources_variables(
|
153
|
+
self,
|
154
|
+
resources: 'resources_lib.Resources',
|
155
|
+
cluster_name: resources_utils.ClusterName,
|
156
|
+
region: 'clouds.Region',
|
157
|
+
zones: Optional[List['clouds.Zone']],
|
158
|
+
num_nodes: int,
|
159
|
+
dryrun: bool = False) -> Dict[str, Optional[str]]:
|
160
|
+
del zones, dryrun, cluster_name, num_nodes # unused
|
161
|
+
|
162
|
+
r = resources
|
163
|
+
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
164
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
165
|
+
acc_dict)
|
166
|
+
|
167
|
+
if r.image_id is None:
|
168
|
+
image_id = 'vastai/base:0.0.2'
|
169
|
+
elif r.extract_docker_image() is not None:
|
170
|
+
image_id = r.extract_docker_image()
|
171
|
+
else:
|
172
|
+
image_id = r.image_id[r.region]
|
173
|
+
|
174
|
+
return {
|
175
|
+
'instance_type': resources.instance_type,
|
176
|
+
'custom_resources': custom_resources,
|
177
|
+
'region': region.name,
|
178
|
+
'image_id': image_id,
|
179
|
+
}
|
180
|
+
|
181
|
+
def _get_feasible_launchable_resources(
|
182
|
+
self, resources: 'resources_lib.Resources'
|
183
|
+
) -> 'resources_utils.FeasibleResources':
|
184
|
+
"""Returns a list of feasible resources for the given resources."""
|
185
|
+
if resources.instance_type is not None:
|
186
|
+
assert resources.is_launchable(), resources
|
187
|
+
resources = resources.copy(accelerators=None)
|
188
|
+
return resources_utils.FeasibleResources([resources], [], None)
|
189
|
+
|
190
|
+
def _make(instance_list):
|
191
|
+
resource_list = []
|
192
|
+
for instance_type in instance_list:
|
193
|
+
r = resources.copy(
|
194
|
+
cloud=Vast(),
|
195
|
+
instance_type=instance_type,
|
196
|
+
accelerators=None,
|
197
|
+
cpus=None,
|
198
|
+
)
|
199
|
+
resource_list.append(r)
|
200
|
+
return resource_list
|
201
|
+
|
202
|
+
# Currently, handle a filter on accelerators only.
|
203
|
+
accelerators = resources.accelerators
|
204
|
+
if accelerators is None:
|
205
|
+
# Return a default instance type
|
206
|
+
default_instance_type = Vast.get_default_instance_type(
|
207
|
+
cpus=resources.cpus,
|
208
|
+
memory=resources.memory,
|
209
|
+
disk_tier=resources.disk_tier)
|
210
|
+
if default_instance_type is None:
|
211
|
+
# TODO: Add hints to all return values in this method to help
|
212
|
+
# users understand why the resources are not launchable.
|
213
|
+
return resources_utils.FeasibleResources([], [], None)
|
214
|
+
else:
|
215
|
+
return resources_utils.FeasibleResources(
|
216
|
+
_make([default_instance_type]), [], None)
|
217
|
+
|
218
|
+
assert len(accelerators) == 1, resources
|
219
|
+
acc, acc_count = list(accelerators.items())[0]
|
220
|
+
(instance_list, fuzzy_candidate_list
|
221
|
+
) = service_catalog.get_instance_type_for_accelerator(
|
222
|
+
acc,
|
223
|
+
acc_count,
|
224
|
+
use_spot=resources.use_spot,
|
225
|
+
cpus=resources.cpus,
|
226
|
+
region=resources.region,
|
227
|
+
zone=resources.zone,
|
228
|
+
memory=resources.memory,
|
229
|
+
clouds='vast')
|
230
|
+
if instance_list is None:
|
231
|
+
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
232
|
+
None)
|
233
|
+
return resources_utils.FeasibleResources(_make(instance_list),
|
234
|
+
fuzzy_candidate_list, None)
|
235
|
+
|
236
|
+
@classmethod
|
237
|
+
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
|
238
|
+
""" Verify that the user has valid credentials for Vast. """
|
239
|
+
try:
|
240
|
+
import vastai_sdk as _vast # pylint: disable=import-outside-toplevel
|
241
|
+
vast = _vast.VastAI()
|
242
|
+
|
243
|
+
# We only support file-based credential passing
|
244
|
+
if vast.creds_source != 'FILE':
|
245
|
+
return False, (
|
246
|
+
'error \n' # First line is indented by 4 spaces
|
247
|
+
' Credentials can be set up by running: \n'
|
248
|
+
' $ pip install vastai\n'
|
249
|
+
' $ echo [key] > ~/.vast_api_key\n'
|
250
|
+
' For more information, see https://skypilot.readthedocs.io/en/latest/getting-started/installation.html#vast' # pylint: disable=line-too-long
|
251
|
+
)
|
252
|
+
|
253
|
+
return True, None
|
254
|
+
|
255
|
+
except ImportError:
|
256
|
+
return False, ('Failed to import vast. '
|
257
|
+
'To install, run: pip install skypilot[vast]')
|
258
|
+
|
259
|
+
def get_credential_file_mounts(self) -> Dict[str, str]:
|
260
|
+
return {
|
261
|
+
'~/.config/vastai/vast_api_key': '~/.config/vastai/vast_api_key'
|
262
|
+
}
|
263
|
+
|
264
|
+
@classmethod
|
265
|
+
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
266
|
+
# NOTE: used for very advanced SkyPilot functionality
|
267
|
+
# Can implement later if desired
|
268
|
+
return None
|
269
|
+
|
270
|
+
def instance_type_exists(self, instance_type: str) -> bool:
|
271
|
+
return service_catalog.instance_type_exists(instance_type, 'vast')
|
272
|
+
|
273
|
+
def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
|
274
|
+
return service_catalog.validate_region_zone(region, zone, clouds='vast')
|
275
|
+
|
276
|
+
@classmethod
|
277
|
+
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
|
278
|
+
# TODO: use 0.0 for now to allow all images. We should change this to
|
279
|
+
# return the docker image size.
|
280
|
+
return 0.0
|
sky/clouds/vsphere.py
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
"""Vsphere cloud implementation."""
|
2
|
-
import json
|
3
2
|
import subprocess
|
4
3
|
import typing
|
5
|
-
from typing import Dict, Iterator, List, Optional, Tuple
|
4
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
6
5
|
|
7
6
|
import requests
|
8
7
|
|
@@ -12,6 +11,7 @@ from sky.provision.vsphere import vsphere_utils
|
|
12
11
|
from sky.provision.vsphere.vsphere_utils import get_vsphere_credentials
|
13
12
|
from sky.provision.vsphere.vsphere_utils import initialize_vsphere_data
|
14
13
|
from sky.utils import common_utils
|
14
|
+
from sky.utils import registry
|
15
15
|
from sky.utils import resources_utils
|
16
16
|
|
17
17
|
if typing.TYPE_CHECKING:
|
@@ -25,7 +25,7 @@ _CREDENTIAL_FILES = [
|
|
25
25
|
]
|
26
26
|
|
27
27
|
|
28
|
-
@
|
28
|
+
@registry.CLOUD_REGISTRY.register
|
29
29
|
class Vsphere(clouds.Cloud):
|
30
30
|
"""Vsphere cloud"""
|
31
31
|
|
@@ -152,7 +152,7 @@ class Vsphere(clouds.Cloud):
|
|
152
152
|
def get_accelerators_from_instance_type(
|
153
153
|
cls,
|
154
154
|
instance_type: str,
|
155
|
-
) -> Optional[Dict[str, int]]:
|
155
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
156
156
|
return service_catalog.get_accelerators_from_instance_type(
|
157
157
|
instance_type, clouds=_CLOUD_VSPHERE)
|
158
158
|
|
@@ -171,21 +171,20 @@ class Vsphere(clouds.Cloud):
|
|
171
171
|
def make_deploy_resources_variables(
|
172
172
|
self,
|
173
173
|
resources: 'resources_lib.Resources',
|
174
|
-
|
174
|
+
cluster_name: resources_utils.ClusterName,
|
175
175
|
region: 'clouds.Region',
|
176
176
|
zones: Optional[List['clouds.Zone']],
|
177
|
+
num_nodes: int,
|
177
178
|
dryrun: bool = False,
|
178
179
|
) -> Dict[str, Optional[str]]:
|
179
180
|
# TODO get image id here.
|
180
|
-
del
|
181
|
+
del cluster_name, dryrun # unused
|
181
182
|
assert zones is not None, (region, zones)
|
182
183
|
zone_names = [zone.name for zone in zones]
|
183
184
|
r = resources
|
184
185
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
185
|
-
|
186
|
-
|
187
|
-
else:
|
188
|
-
custom_resources = None
|
186
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
187
|
+
acc_dict)
|
189
188
|
|
190
189
|
return {
|
191
190
|
'instance_type': resources.instance_type,
|
@@ -197,11 +196,13 @@ class Vsphere(clouds.Cloud):
|
|
197
196
|
def _get_feasible_launchable_resources(
|
198
197
|
self, resources: 'resources_lib.Resources'):
|
199
198
|
if resources.use_spot:
|
200
|
-
return
|
199
|
+
# TODO: Add hints to all return values in this method to help
|
200
|
+
# users understand why the resources are not launchable.
|
201
|
+
return resources_utils.FeasibleResources([], [], None)
|
201
202
|
if resources.instance_type is not None:
|
202
203
|
assert resources.is_launchable(), resources
|
203
204
|
resources = resources.copy(accelerators=None)
|
204
|
-
return ([resources], [])
|
205
|
+
return resources_utils.FeasibleResources([resources], [], None)
|
205
206
|
|
206
207
|
def _make(instance_list):
|
207
208
|
resource_list = []
|
@@ -226,9 +227,10 @@ class Vsphere(clouds.Cloud):
|
|
226
227
|
disk_tier=resources.disk_tier,
|
227
228
|
)
|
228
229
|
if default_instance_type is None:
|
229
|
-
return ([], [])
|
230
|
+
return resources_utils.FeasibleResources([], [], None)
|
230
231
|
else:
|
231
|
-
return (
|
232
|
+
return resources_utils.FeasibleResources(
|
233
|
+
_make([default_instance_type]), [], None)
|
232
234
|
|
233
235
|
assert len(accelerators) == 1, resources
|
234
236
|
acc, acc_count = list(accelerators.items())[0]
|
@@ -246,8 +248,10 @@ class Vsphere(clouds.Cloud):
|
|
246
248
|
clouds=_CLOUD_VSPHERE,
|
247
249
|
)
|
248
250
|
if instance_list is None:
|
249
|
-
return ([], fuzzy_candidate_list
|
250
|
-
|
251
|
+
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
252
|
+
None)
|
253
|
+
return resources_utils.FeasibleResources(_make(instance_list),
|
254
|
+
fuzzy_candidate_list, None)
|
251
255
|
|
252
256
|
@classmethod
|
253
257
|
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
|
@@ -263,7 +267,7 @@ class Vsphere(clouds.Cloud):
|
|
263
267
|
'Run the following commands:'
|
264
268
|
f'\n{cls._INDENT_PREFIX} $ pip install skypilot[vSphere]'
|
265
269
|
f'\n{cls._INDENT_PREFIX}Credentials may also need to be set. '
|
266
|
-
'For more details. See https://skypilot.
|
270
|
+
'For more details. See https://docs.skypilot.co/en/latest/getting-started/installation.html#vmware-vsphere' # pylint: disable=line-too-long
|
267
271
|
f'{common_utils.format_exception(e, use_bracket=True)}')
|
268
272
|
|
269
273
|
required_keys = ['name', 'username', 'password', 'clusters']
|
@@ -303,7 +307,7 @@ class Vsphere(clouds.Cloud):
|
|
303
307
|
}
|
304
308
|
|
305
309
|
@classmethod
|
306
|
-
def
|
310
|
+
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
307
311
|
# NOTE: used for very advanced SkyPilot functionality
|
308
312
|
# Can implement later if desired
|
309
313
|
return None
|