skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/clouds/do.py
ADDED
@@ -0,0 +1,313 @@
|
|
1
|
+
""" Digital Ocean Cloud. """
|
2
|
+
|
3
|
+
import json
|
4
|
+
import typing
|
5
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
6
|
+
|
7
|
+
from sky import clouds
|
8
|
+
from sky.adaptors import do
|
9
|
+
from sky.clouds import service_catalog
|
10
|
+
from sky.provision.do import utils as do_utils
|
11
|
+
from sky.utils import registry
|
12
|
+
from sky.utils import resources_utils
|
13
|
+
|
14
|
+
if typing.TYPE_CHECKING:
|
15
|
+
from sky import resources as resources_lib
|
16
|
+
|
17
|
+
_CREDENTIAL_FILE = 'config.yaml'
|
18
|
+
|
19
|
+
|
20
|
+
@registry.CLOUD_REGISTRY.register(aliases=['digitalocean'])
|
21
|
+
class DO(clouds.Cloud):
|
22
|
+
"""Digital Ocean Cloud"""
|
23
|
+
|
24
|
+
_REPR = 'DO'
|
25
|
+
_CLOUD_UNSUPPORTED_FEATURES = {
|
26
|
+
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
|
27
|
+
'Migrating '
|
28
|
+
f'disk is not supported in {_REPR}.',
|
29
|
+
clouds.CloudImplementationFeatures.SPOT_INSTANCE:
|
30
|
+
'Spot instances are '
|
31
|
+
f'not supported in {_REPR}.',
|
32
|
+
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
33
|
+
'Custom disk tiers'
|
34
|
+
f' is not supported in {_REPR}.',
|
35
|
+
}
|
36
|
+
# DO maximum node name length defined as <= 255
|
37
|
+
# https://docs.digitalocean.com/reference/api/api-reference/#operation/droplets_create
|
38
|
+
# 255 - 8 = 247 characters since
|
39
|
+
# our provisioner adds additional `-worker`.
|
40
|
+
_MAX_CLUSTER_NAME_LEN_LIMIT = 247
|
41
|
+
_regions: List[clouds.Region] = []
|
42
|
+
|
43
|
+
# Using the latest SkyPilot provisioner API to provision and check status.
|
44
|
+
PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
|
45
|
+
STATUS_VERSION = clouds.StatusVersion.SKYPILOT
|
46
|
+
|
47
|
+
@classmethod
|
48
|
+
def _unsupported_features_for_resources(
|
49
|
+
cls, resources: 'resources_lib.Resources'
|
50
|
+
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
51
|
+
"""The features not supported based on the resources provided.
|
52
|
+
|
53
|
+
This method is used by check_features_are_supported() to check if the
|
54
|
+
cloud implementation supports all the requested features.
|
55
|
+
|
56
|
+
Returns:
|
57
|
+
A dict of {feature: reason} for the features not supported by the
|
58
|
+
cloud implementation.
|
59
|
+
"""
|
60
|
+
del resources # unused
|
61
|
+
return cls._CLOUD_UNSUPPORTED_FEATURES
|
62
|
+
|
63
|
+
@classmethod
|
64
|
+
def _max_cluster_name_length(cls) -> Optional[int]:
|
65
|
+
return cls._MAX_CLUSTER_NAME_LEN_LIMIT
|
66
|
+
|
67
|
+
@classmethod
|
68
|
+
def regions_with_offering(
|
69
|
+
cls,
|
70
|
+
instance_type: str,
|
71
|
+
accelerators: Optional[Dict[str, int]],
|
72
|
+
use_spot: bool,
|
73
|
+
region: Optional[str],
|
74
|
+
zone: Optional[str],
|
75
|
+
) -> List[clouds.Region]:
|
76
|
+
assert zone is None, 'DO does not support zones.'
|
77
|
+
del accelerators, zone # unused
|
78
|
+
if use_spot:
|
79
|
+
return []
|
80
|
+
regions = service_catalog.get_region_zones_for_instance_type(
|
81
|
+
instance_type, use_spot, 'DO')
|
82
|
+
if region is not None:
|
83
|
+
regions = [r for r in regions if r.name == region]
|
84
|
+
return regions
|
85
|
+
|
86
|
+
@classmethod
|
87
|
+
def get_vcpus_mem_from_instance_type(
|
88
|
+
cls,
|
89
|
+
instance_type: str,
|
90
|
+
) -> Tuple[Optional[float], Optional[float]]:
|
91
|
+
return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
|
92
|
+
clouds='DO')
|
93
|
+
|
94
|
+
@classmethod
|
95
|
+
def zones_provision_loop(
|
96
|
+
cls,
|
97
|
+
*,
|
98
|
+
region: str,
|
99
|
+
num_nodes: int,
|
100
|
+
instance_type: str,
|
101
|
+
accelerators: Optional[Dict[str, int]] = None,
|
102
|
+
use_spot: bool = False,
|
103
|
+
) -> Iterator[None]:
|
104
|
+
del num_nodes # unused
|
105
|
+
regions = cls.regions_with_offering(instance_type,
|
106
|
+
accelerators,
|
107
|
+
use_spot,
|
108
|
+
region=region,
|
109
|
+
zone=None)
|
110
|
+
for r in regions:
|
111
|
+
assert r.zones is None, r
|
112
|
+
yield r.zones
|
113
|
+
|
114
|
+
def instance_type_to_hourly_cost(
|
115
|
+
self,
|
116
|
+
instance_type: str,
|
117
|
+
use_spot: bool,
|
118
|
+
region: Optional[str] = None,
|
119
|
+
zone: Optional[str] = None,
|
120
|
+
) -> float:
|
121
|
+
return service_catalog.get_hourly_cost(
|
122
|
+
instance_type,
|
123
|
+
use_spot=use_spot,
|
124
|
+
region=region,
|
125
|
+
zone=zone,
|
126
|
+
clouds='DO',
|
127
|
+
)
|
128
|
+
|
129
|
+
def accelerators_to_hourly_cost(
|
130
|
+
self,
|
131
|
+
accelerators: Dict[str, int],
|
132
|
+
use_spot: bool,
|
133
|
+
region: Optional[str] = None,
|
134
|
+
zone: Optional[str] = None,
|
135
|
+
) -> float:
|
136
|
+
"""Returns the hourly cost of the accelerators, in dollars/hour."""
|
137
|
+
# the acc price is include in the instance price.
|
138
|
+
del accelerators, use_spot, region, zone # unused
|
139
|
+
return 0.0
|
140
|
+
|
141
|
+
def get_egress_cost(self, num_gigabytes: float) -> float:
|
142
|
+
return 0.0
|
143
|
+
|
144
|
+
def __repr__(self):
|
145
|
+
return self._REPR
|
146
|
+
|
147
|
+
@classmethod
|
148
|
+
def get_default_instance_type(
|
149
|
+
cls,
|
150
|
+
cpus: Optional[str] = None,
|
151
|
+
memory: Optional[str] = None,
|
152
|
+
disk_tier: Optional[resources_utils.DiskTier] = None,
|
153
|
+
) -> Optional[str]:
|
154
|
+
"""Returns the default instance type for DO."""
|
155
|
+
return service_catalog.get_default_instance_type(cpus=cpus,
|
156
|
+
memory=memory,
|
157
|
+
disk_tier=disk_tier,
|
158
|
+
clouds='DO')
|
159
|
+
|
160
|
+
@classmethod
|
161
|
+
def get_accelerators_from_instance_type(
|
162
|
+
cls, instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
|
163
|
+
return service_catalog.get_accelerators_from_instance_type(
|
164
|
+
instance_type, clouds='DO')
|
165
|
+
|
166
|
+
@classmethod
|
167
|
+
def get_zone_shell_cmd(cls) -> Optional[str]:
|
168
|
+
return None
|
169
|
+
|
170
|
+
def make_deploy_resources_variables(
|
171
|
+
self,
|
172
|
+
resources: 'resources_lib.Resources',
|
173
|
+
cluster_name: resources_utils.ClusterName,
|
174
|
+
region: 'clouds.Region',
|
175
|
+
zones: Optional[List['clouds.Zone']],
|
176
|
+
num_nodes: int,
|
177
|
+
dryrun: bool = False) -> Dict[str, Optional[str]]:
|
178
|
+
del zones, dryrun, cluster_name
|
179
|
+
|
180
|
+
r = resources
|
181
|
+
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
182
|
+
if acc_dict is not None:
|
183
|
+
custom_resources = json.dumps(acc_dict, separators=(',', ':'))
|
184
|
+
else:
|
185
|
+
custom_resources = None
|
186
|
+
image_id = None
|
187
|
+
if (resources.image_id is not None and
|
188
|
+
resources.extract_docker_image() is None):
|
189
|
+
if None in resources.image_id:
|
190
|
+
image_id = resources.image_id[None]
|
191
|
+
else:
|
192
|
+
assert region.name in resources.image_id
|
193
|
+
image_id = resources.image_id[region.name]
|
194
|
+
return {
|
195
|
+
'instance_type': resources.instance_type,
|
196
|
+
'custom_resources': custom_resources,
|
197
|
+
'region': region.name,
|
198
|
+
**({
|
199
|
+
'image_id': image_id
|
200
|
+
} if image_id else {})
|
201
|
+
}
|
202
|
+
|
203
|
+
def _get_feasible_launchable_resources(
|
204
|
+
self, resources: 'resources_lib.Resources'
|
205
|
+
) -> resources_utils.FeasibleResources:
|
206
|
+
"""Returns a list of feasible resources for the given resources."""
|
207
|
+
if resources.use_spot:
|
208
|
+
# TODO: Add hints to all return values in this method to help
|
209
|
+
# users understand why the resources are not launchable.
|
210
|
+
return resources_utils.FeasibleResources([], [], None)
|
211
|
+
if resources.instance_type is not None:
|
212
|
+
assert resources.is_launchable(), resources
|
213
|
+
resources = resources.copy(accelerators=None)
|
214
|
+
return resources_utils.FeasibleResources([resources], [], None)
|
215
|
+
|
216
|
+
def _make(instance_list):
|
217
|
+
resource_list = []
|
218
|
+
for instance_type in instance_list:
|
219
|
+
r = resources.copy(
|
220
|
+
cloud=DO(),
|
221
|
+
instance_type=instance_type,
|
222
|
+
accelerators=None,
|
223
|
+
cpus=None,
|
224
|
+
)
|
225
|
+
resource_list.append(r)
|
226
|
+
return resource_list
|
227
|
+
|
228
|
+
# Currently, handle a filter on accelerators only.
|
229
|
+
accelerators = resources.accelerators
|
230
|
+
if accelerators is None:
|
231
|
+
# Return a default instance type
|
232
|
+
default_instance_type = DO.get_default_instance_type(
|
233
|
+
cpus=resources.cpus,
|
234
|
+
memory=resources.memory,
|
235
|
+
disk_tier=resources.disk_tier)
|
236
|
+
if default_instance_type is None:
|
237
|
+
return resources_utils.FeasibleResources([], [], None)
|
238
|
+
else:
|
239
|
+
return resources_utils.FeasibleResources(
|
240
|
+
_make([default_instance_type]), [], None)
|
241
|
+
|
242
|
+
assert len(accelerators) == 1, resources
|
243
|
+
acc, acc_count = list(accelerators.items())[0]
|
244
|
+
(instance_list, fuzzy_candidate_list) = (
|
245
|
+
service_catalog.get_instance_type_for_accelerator(
|
246
|
+
acc,
|
247
|
+
acc_count,
|
248
|
+
use_spot=resources.use_spot,
|
249
|
+
cpus=resources.cpus,
|
250
|
+
memory=resources.memory,
|
251
|
+
region=resources.region,
|
252
|
+
zone=resources.zone,
|
253
|
+
clouds='DO',
|
254
|
+
))
|
255
|
+
if instance_list is None:
|
256
|
+
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
257
|
+
None)
|
258
|
+
return resources_utils.FeasibleResources(_make(instance_list),
|
259
|
+
fuzzy_candidate_list, None)
|
260
|
+
|
261
|
+
@classmethod
|
262
|
+
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
|
263
|
+
"""Verify that the user has valid credentials for DO."""
|
264
|
+
|
265
|
+
try:
|
266
|
+
do.exceptions()
|
267
|
+
except ImportError as err:
|
268
|
+
return False, str(err)
|
269
|
+
|
270
|
+
try:
|
271
|
+
# attempt to make a CURL request for listing instances
|
272
|
+
do_utils.client().droplets.list()
|
273
|
+
except do.exceptions().HttpResponseError as err:
|
274
|
+
return False, str(err)
|
275
|
+
except do_utils.DigitalOceanError as err:
|
276
|
+
return False, str(err)
|
277
|
+
|
278
|
+
return True, None
|
279
|
+
|
280
|
+
def get_credential_file_mounts(self) -> Dict[str, str]:
|
281
|
+
try:
|
282
|
+
do_utils.client()
|
283
|
+
return {
|
284
|
+
f'~/.config/doctl/{_CREDENTIAL_FILE}': do_utils.CREDENTIALS_PATH
|
285
|
+
}
|
286
|
+
except do_utils.DigitalOceanError:
|
287
|
+
return {}
|
288
|
+
|
289
|
+
@classmethod
|
290
|
+
def get_current_user_identity(cls) -> Optional[List[str]]:
|
291
|
+
# NOTE: used for very advanced SkyPilot functionality
|
292
|
+
# Can implement later if desired
|
293
|
+
return None
|
294
|
+
|
295
|
+
@classmethod
|
296
|
+
def get_image_size(cls, image_id: str, region: Optional[str]) -> float:
|
297
|
+
del region
|
298
|
+
try:
|
299
|
+
response = do_utils.client().images.get(image_id=image_id)
|
300
|
+
return response['image']['size_gigabytes']
|
301
|
+
except do.exceptions().HttpResponseError as err:
|
302
|
+
raise do_utils.DigitalOceanError(
|
303
|
+
'HTTP error while retrieving size of '
|
304
|
+
f'image_id {response}: {err.error.message}') from err
|
305
|
+
except KeyError as err:
|
306
|
+
raise do_utils.DigitalOceanError(
|
307
|
+
f'No image_id `{image_id}` found') from err
|
308
|
+
|
309
|
+
def instance_type_exists(self, instance_type: str) -> bool:
|
310
|
+
return service_catalog.instance_type_exists(instance_type, 'DO')
|
311
|
+
|
312
|
+
def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
|
313
|
+
return service_catalog.validate_region_zone(region, zone, clouds='DO')
|
sky/clouds/fluidstack.py
CHANGED
@@ -1,28 +1,28 @@
|
|
1
1
|
"""Fluidstack Cloud."""
|
2
|
-
import json
|
3
2
|
import os
|
4
3
|
import typing
|
5
|
-
from typing import Dict, Iterator, List, Optional, Tuple
|
4
|
+
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
6
5
|
|
7
6
|
import requests
|
8
7
|
|
9
8
|
from sky import clouds
|
10
|
-
from sky import status_lib
|
11
9
|
from sky.clouds import service_catalog
|
12
10
|
from sky.provision.fluidstack import fluidstack_utils
|
11
|
+
from sky.utils import registry
|
12
|
+
from sky.utils import resources_utils
|
13
|
+
from sky.utils import status_lib
|
13
14
|
from sky.utils.resources_utils import DiskTier
|
14
15
|
|
15
16
|
_CREDENTIAL_FILES = [
|
16
17
|
# credential files for FluidStack,
|
17
|
-
fluidstack_utils.FLUIDSTACK_API_KEY_PATH
|
18
|
-
fluidstack_utils.FLUIDSTACK_API_TOKEN_PATH,
|
18
|
+
fluidstack_utils.FLUIDSTACK_API_KEY_PATH
|
19
19
|
]
|
20
20
|
if typing.TYPE_CHECKING:
|
21
21
|
# Renaming to avoid shadowing variables.
|
22
22
|
from sky import resources as resources_lib
|
23
23
|
|
24
24
|
|
25
|
-
@
|
25
|
+
@registry.CLOUD_REGISTRY.register
|
26
26
|
class Fluidstack(clouds.Cloud):
|
27
27
|
"""FluidStack GPU Cloud."""
|
28
28
|
|
@@ -155,7 +155,7 @@ class Fluidstack(clouds.Cloud):
|
|
155
155
|
def get_accelerators_from_instance_type(
|
156
156
|
cls,
|
157
157
|
instance_type: str,
|
158
|
-
) -> Optional[Dict[str, int]]:
|
158
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
159
159
|
return service_catalog.get_accelerators_from_instance_type(
|
160
160
|
instance_type, clouds='fluidstack')
|
161
161
|
|
@@ -174,9 +174,10 @@ class Fluidstack(clouds.Cloud):
|
|
174
174
|
def make_deploy_resources_variables(
|
175
175
|
self,
|
176
176
|
resources: 'resources_lib.Resources',
|
177
|
-
|
177
|
+
cluster_name: resources_utils.ClusterName,
|
178
178
|
region: clouds.Region,
|
179
179
|
zones: Optional[List[clouds.Zone]],
|
180
|
+
num_nodes: int,
|
180
181
|
dryrun: bool = False,
|
181
182
|
) -> Dict[str, Optional[str]]:
|
182
183
|
|
@@ -184,24 +185,14 @@ class Fluidstack(clouds.Cloud):
|
|
184
185
|
|
185
186
|
r = resources
|
186
187
|
acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
custom_resources = None
|
191
|
-
cuda_installation_commands = """
|
192
|
-
sudo wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb -O /usr/local/cuda-keyring_1.1-1_all.deb;
|
193
|
-
sudo dpkg -i /usr/local/cuda-keyring_1.1-1_all.deb;
|
194
|
-
sudo apt-get update;
|
195
|
-
sudo apt-get -y install cuda-toolkit-12-3;
|
196
|
-
sudo apt-get install -y cuda-drivers;
|
197
|
-
sudo apt-get install -y python3-pip;
|
198
|
-
nvidia-smi || sudo reboot;"""
|
188
|
+
custom_resources = resources_utils.make_ray_custom_resources_str(
|
189
|
+
acc_dict)
|
190
|
+
|
199
191
|
return {
|
200
192
|
'instance_type': resources.instance_type,
|
201
193
|
'custom_resources': custom_resources,
|
202
194
|
'region': region.name,
|
203
|
-
'fluidstack_username':
|
204
|
-
'cuda_installation_commands': cuda_installation_commands,
|
195
|
+
'fluidstack_username': 'ubuntu',
|
205
196
|
}
|
206
197
|
|
207
198
|
def _get_feasible_launchable_resources(
|
@@ -210,7 +201,9 @@ class Fluidstack(clouds.Cloud):
|
|
210
201
|
assert resources.is_launchable(), resources
|
211
202
|
# Accelerators are part of the instance type in Fluidstack Cloud
|
212
203
|
resources = resources.copy(accelerators=None)
|
213
|
-
return
|
204
|
+
# TODO: Add hints to all return values in this method to help
|
205
|
+
# users understand why the resources are not launchable.
|
206
|
+
return resources_utils.FeasibleResources([resources], [], None)
|
214
207
|
|
215
208
|
def _make(instance_list):
|
216
209
|
resource_list = []
|
@@ -238,9 +231,10 @@ class Fluidstack(clouds.Cloud):
|
|
238
231
|
memory=resources.memory,
|
239
232
|
disk_tier=resources.disk_tier)
|
240
233
|
if default_instance_type is None:
|
241
|
-
return ([], [])
|
234
|
+
return resources_utils.FeasibleResources([], [], None)
|
242
235
|
else:
|
243
|
-
return (
|
236
|
+
return resources_utils.FeasibleResources(
|
237
|
+
_make([default_instance_type]), [], None)
|
244
238
|
|
245
239
|
assert len(accelerators) == 1, resources
|
246
240
|
acc, acc_count = list(accelerators.items())[0]
|
@@ -255,8 +249,10 @@ class Fluidstack(clouds.Cloud):
|
|
255
249
|
zone=resources.zone,
|
256
250
|
clouds='fluidstack')
|
257
251
|
if instance_list is None:
|
258
|
-
return ([], fuzzy_candidate_list
|
259
|
-
|
252
|
+
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
253
|
+
None)
|
254
|
+
return resources_utils.FeasibleResources(_make(instance_list),
|
255
|
+
fuzzy_candidate_list, None)
|
260
256
|
|
261
257
|
@classmethod
|
262
258
|
def check_credentials(cls) -> Tuple[bool, Optional[str]]:
|
@@ -264,17 +260,26 @@ class Fluidstack(clouds.Cloud):
|
|
264
260
|
try:
|
265
261
|
assert os.path.exists(
|
266
262
|
os.path.expanduser(fluidstack_utils.FLUIDSTACK_API_KEY_PATH))
|
267
|
-
|
268
|
-
|
263
|
+
|
264
|
+
with open(os.path.expanduser(
|
265
|
+
fluidstack_utils.FLUIDSTACK_API_KEY_PATH),
|
266
|
+
encoding='UTF-8') as f:
|
267
|
+
api_key = f.read().strip()
|
268
|
+
if not api_key.startswith('api_key'):
|
269
|
+
return False, ('Invalid FluidStack API key format. '
|
270
|
+
'To configure credentials, go to:\n '
|
271
|
+
' https://dashboard.fluidstack.io \n '
|
272
|
+
'to obtain an API key, '
|
273
|
+
'then add save the contents '
|
274
|
+
'to ~/.fluidstack/api_key \n')
|
269
275
|
except AssertionError:
|
270
|
-
return False, (
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
'to ~/.fluidstack/api_key and ~/.fluidstack/api_token \n')
|
276
|
+
return False, ('Failed to access FluidStack Cloud'
|
277
|
+
' with credentials. '
|
278
|
+
'To configure credentials, go to:\n '
|
279
|
+
' https://dashboard.fluidstack.io \n '
|
280
|
+
'to obtain an API key, '
|
281
|
+
'then add save the contents '
|
282
|
+
'to ~/.fluidstack/api_key \n')
|
278
283
|
except requests.exceptions.ConnectionError:
|
279
284
|
return False, ('Failed to verify FluidStack Cloud credentials. '
|
280
285
|
'Check your network connection '
|
@@ -285,8 +290,8 @@ class Fluidstack(clouds.Cloud):
|
|
285
290
|
return {filename: filename for filename in _CREDENTIAL_FILES}
|
286
291
|
|
287
292
|
@classmethod
|
288
|
-
def
|
289
|
-
# TODO(mjibril): Implement
|
293
|
+
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
294
|
+
# TODO(mjibril): Implement get_active_user_identity for Fluidstack
|
290
295
|
return None
|
291
296
|
|
292
297
|
def instance_type_exists(self, instance_type: str) -> bool:
|
@@ -297,21 +302,6 @@ class Fluidstack(clouds.Cloud):
|
|
297
302
|
zone,
|
298
303
|
clouds='fluidstack')
|
299
304
|
|
300
|
-
@classmethod
|
301
|
-
def default_username(cls, region: str) -> str:
|
302
|
-
return {
|
303
|
-
'norway_2_eu': 'ubuntu',
|
304
|
-
'calgary_1_canada': 'ubuntu',
|
305
|
-
'norway_3_eu': 'ubuntu',
|
306
|
-
'norway_4_eu': 'ubuntu',
|
307
|
-
'india_2': 'root',
|
308
|
-
'nevada_1_usa': 'fsuser',
|
309
|
-
'generic_1_canada': 'ubuntu',
|
310
|
-
'iceland_1_eu': 'ubuntu',
|
311
|
-
'new_york_1_usa': 'fsuser',
|
312
|
-
'illinois_1_usa': 'fsuser'
|
313
|
-
}.get(region, 'ubuntu')
|
314
|
-
|
315
305
|
@classmethod
|
316
306
|
def query_status(
|
317
307
|
cls,
|