skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/clouds/gcp.py
CHANGED
@@ -1,30 +1,32 @@
|
|
1
1
|
"""Google Cloud Platform."""
|
2
2
|
import enum
|
3
|
-
import functools
|
4
3
|
import json
|
5
4
|
import os
|
6
5
|
import re
|
7
6
|
import subprocess
|
8
7
|
import time
|
9
8
|
import typing
|
10
|
-
from typing import Dict, Iterator, List, Optional, Set, Tuple
|
9
|
+
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple, Union
|
11
10
|
|
12
11
|
import colorama
|
13
12
|
|
14
13
|
from sky import clouds
|
15
14
|
from sky import exceptions
|
16
15
|
from sky import sky_logging
|
16
|
+
from sky import skypilot_config
|
17
17
|
from sky.adaptors import gcp
|
18
18
|
from sky.clouds import service_catalog
|
19
19
|
from sky.clouds.utils import gcp_utils
|
20
|
+
from sky.utils import annotations
|
20
21
|
from sky.utils import common_utils
|
22
|
+
from sky.utils import registry
|
21
23
|
from sky.utils import resources_utils
|
22
24
|
from sky.utils import subprocess_utils
|
23
25
|
from sky.utils import ux_utils
|
24
26
|
|
25
27
|
if typing.TYPE_CHECKING:
|
26
28
|
from sky import resources
|
27
|
-
from sky import status_lib
|
29
|
+
from sky.utils import status_lib
|
28
30
|
|
29
31
|
logger = sky_logging.init_logger(__name__)
|
30
32
|
|
@@ -93,6 +95,12 @@ _IMAGE_NOT_FOUND_UX_MESSAGE = (
|
|
93
95
|
f'\nTo query common AI images: {colorama.Style.BRIGHT}gcloud compute images list --project deeplearning-platform-release | less{colorama.Style.RESET_ALL}'
|
94
96
|
)
|
95
97
|
|
98
|
+
# Image ID tags
|
99
|
+
_DEFAULT_CPU_IMAGE_ID = 'skypilot:custom-cpu-ubuntu-2204'
|
100
|
+
# For GPU-related package version, see sky/clouds/service_catalog/images/provisioners/cuda.sh
|
101
|
+
_DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-2204'
|
102
|
+
_DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-debian-10'
|
103
|
+
|
96
104
|
|
97
105
|
def _run_output(cmd):
|
98
106
|
proc = subprocess.run(cmd,
|
@@ -125,8 +133,11 @@ class GCPIdentityType(enum.Enum):
|
|
125
133
|
|
126
134
|
SHARED_CREDENTIALS_FILE = ''
|
127
135
|
|
136
|
+
def can_credential_expire(self) -> bool:
|
137
|
+
return self == GCPIdentityType.SHARED_CREDENTIALS_FILE
|
138
|
+
|
128
139
|
|
129
|
-
@
|
140
|
+
@registry.CLOUD_REGISTRY.register
|
130
141
|
class GCP(clouds.Cloud):
|
131
142
|
"""Google Cloud Platform."""
|
132
143
|
|
@@ -160,7 +171,7 @@ class GCP(clouds.Cloud):
|
|
160
171
|
# ~/.config/gcloud/application_default_credentials.json.
|
161
172
|
f'{_INDENT_PREFIX} $ gcloud auth application-default login\n'
|
162
173
|
f'{_INDENT_PREFIX}For more info: '
|
163
|
-
'https://skypilot.
|
174
|
+
'https://docs.skypilot.co/en/latest/getting-started/installation.html#google-cloud-platform-gcp' # pylint: disable=line-too-long
|
164
175
|
)
|
165
176
|
_APPLICATION_CREDENTIAL_HINT = (
|
166
177
|
'Run the following commands:\n'
|
@@ -168,7 +179,7 @@ class GCP(clouds.Cloud):
|
|
168
179
|
f'{_INDENT_PREFIX}Or set the environment variable GOOGLE_APPLICATION_CREDENTIALS '
|
169
180
|
'to the path of your service account key file.\n'
|
170
181
|
f'{_INDENT_PREFIX}For more info: '
|
171
|
-
'https://skypilot.
|
182
|
+
'https://docs.skypilot.co/en/latest/getting-started/installation.html#google-cloud-platform-gcp' # pylint: disable=line-too-long
|
172
183
|
)
|
173
184
|
|
174
185
|
_SUPPORTED_DISK_TIERS = set(resources_utils.DiskTier)
|
@@ -179,20 +190,33 @@ class GCP(clouds.Cloud):
|
|
179
190
|
def _unsupported_features_for_resources(
|
180
191
|
cls, resources: 'resources.Resources'
|
181
192
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
193
|
+
unsupported = {}
|
182
194
|
if gcp_utils.is_tpu_vm_pod(resources):
|
183
|
-
|
195
|
+
unsupported = {
|
184
196
|
clouds.CloudImplementationFeatures.STOP: (
|
185
|
-
'TPU VM pods cannot be stopped. Please refer to:
|
197
|
+
'TPU VM pods cannot be stopped. Please refer to: '
|
198
|
+
'https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm#stopping_your_resources'
|
186
199
|
)
|
187
200
|
}
|
188
201
|
if gcp_utils.is_tpu(resources) and not gcp_utils.is_tpu_vm(resources):
|
189
202
|
# TPU node does not support multi-node.
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
203
|
+
unsupported[clouds.CloudImplementationFeatures.MULTI_NODE] = (
|
204
|
+
'TPU node does not support multi-node. Please set '
|
205
|
+
'num_nodes to 1.')
|
206
|
+
# TODO(zhwu): We probably need to store the MIG requirement in resources
|
207
|
+
# because `skypilot_config` may change for an existing cluster.
|
208
|
+
# Clusters created with MIG (only GPU clusters) cannot be stopped.
|
209
|
+
if (skypilot_config.get_nested(
|
210
|
+
('gcp', 'managed_instance_group'),
|
211
|
+
None,
|
212
|
+
override_configs=resources.cluster_config_overrides) is not None
|
213
|
+
and resources.accelerators):
|
214
|
+
unsupported[clouds.CloudImplementationFeatures.STOP] = (
|
215
|
+
'Managed Instance Group (MIG) does not support stopping yet.')
|
216
|
+
unsupported[clouds.CloudImplementationFeatures.SPOT_INSTANCE] = (
|
217
|
+
'Managed Instance Group with DWS does not support '
|
218
|
+
'spot instances.')
|
219
|
+
return unsupported
|
196
220
|
|
197
221
|
@classmethod
|
198
222
|
def max_cluster_name_length(cls) -> Optional[int]:
|
@@ -246,6 +270,10 @@ class GCP(clouds.Cloud):
|
|
246
270
|
regions = [r for r in regions if r.zones]
|
247
271
|
return regions
|
248
272
|
|
273
|
+
@classmethod
|
274
|
+
def optimize_by_zone(cls) -> bool:
|
275
|
+
return True
|
276
|
+
|
249
277
|
@classmethod
|
250
278
|
def zones_provision_loop(
|
251
279
|
cls,
|
@@ -321,7 +349,7 @@ class GCP(clouds.Cloud):
|
|
321
349
|
return find_machine is not None
|
322
350
|
|
323
351
|
@classmethod
|
324
|
-
@
|
352
|
+
@annotations.lru_cache(scope='global', maxsize=1)
|
325
353
|
def _get_image_size(cls, image_id: str) -> float:
|
326
354
|
if image_id.startswith('skypilot:'):
|
327
355
|
return DEFAULT_GCP_IMAGE_GB
|
@@ -390,9 +418,10 @@ class GCP(clouds.Cloud):
|
|
390
418
|
def make_deploy_resources_variables(
|
391
419
|
self,
|
392
420
|
resources: 'resources.Resources',
|
393
|
-
|
421
|
+
cluster_name: resources_utils.ClusterName,
|
394
422
|
region: 'clouds.Region',
|
395
423
|
zones: Optional[List['clouds.Zone']],
|
424
|
+
num_nodes: int,
|
396
425
|
dryrun: bool = False) -> Dict[str, Optional[str]]:
|
397
426
|
assert zones is not None, (region, zones)
|
398
427
|
|
@@ -404,7 +433,22 @@ class GCP(clouds.Cloud):
|
|
404
433
|
# --no-standard-images
|
405
434
|
# We use the debian image, as the ubuntu image has some connectivity
|
406
435
|
# issue when first booted.
|
407
|
-
image_id =
|
436
|
+
image_id = _DEFAULT_CPU_IMAGE_ID
|
437
|
+
|
438
|
+
def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
|
439
|
+
if (r.disk_tier is not None and
|
440
|
+
r.disk_tier != resources_utils.DiskTier.BEST):
|
441
|
+
return r.disk_tier
|
442
|
+
# Failover disk tier from ultra to low.
|
443
|
+
all_tiers = list(reversed(resources_utils.DiskTier))
|
444
|
+
start_index = all_tiers.index(GCP._translate_disk_tier(r.disk_tier))
|
445
|
+
while start_index < len(all_tiers):
|
446
|
+
disk_tier = all_tiers[start_index]
|
447
|
+
ok, _ = GCP.check_disk_tier(r.instance_type, disk_tier)
|
448
|
+
if ok:
|
449
|
+
return disk_tier
|
450
|
+
start_index += 1
|
451
|
+
assert False, 'Low disk tier should always be supported on GCP.'
|
408
452
|
|
409
453
|
r = resources
|
410
454
|
# Find GPU spec, if any.
|
@@ -419,6 +463,7 @@ class GCP(clouds.Cloud):
|
|
419
463
|
'custom_resources': None,
|
420
464
|
'use_spot': r.use_spot,
|
421
465
|
'gcp_project_id': self.get_project_id(dryrun),
|
466
|
+
**GCP._get_disk_specs(_failover_disk_tier()),
|
422
467
|
}
|
423
468
|
accelerators = r.accelerators
|
424
469
|
if accelerators is not None:
|
@@ -437,13 +482,16 @@ class GCP(clouds.Cloud):
|
|
437
482
|
'runtime_version']
|
438
483
|
resources_vars['tpu_node_name'] = r.accelerator_args.get(
|
439
484
|
'tpu_name')
|
485
|
+
# TPU VMs require privileged mode for docker containers to
|
486
|
+
# access TPU devices.
|
487
|
+
resources_vars['docker_run_options'] = ['--privileged']
|
440
488
|
else:
|
441
489
|
# Convert to GCP names:
|
442
490
|
# https://cloud.google.com/compute/docs/gpus
|
443
491
|
if acc in ('A100-80GB', 'L4'):
|
444
492
|
# A100-80GB and L4 have a different name pattern.
|
445
493
|
resources_vars['gpu'] = f'nvidia-{acc.lower()}'
|
446
|
-
elif acc
|
494
|
+
elif acc in ('H100', 'H100-MEGA'):
|
447
495
|
resources_vars['gpu'] = f'nvidia-{acc.lower()}-80gb'
|
448
496
|
else:
|
449
497
|
resources_vars['gpu'] = 'nvidia-tesla-{}'.format(
|
@@ -453,10 +501,10 @@ class GCP(clouds.Cloud):
|
|
453
501
|
# Though the image is called cu113, it actually has later
|
454
502
|
# versions of CUDA as noted below.
|
455
503
|
# CUDA driver version 470.57.02, CUDA Library 11.4
|
456
|
-
image_id =
|
504
|
+
image_id = _DEFAULT_GPU_K80_IMAGE_ID
|
457
505
|
else:
|
458
506
|
# CUDA driver version 535.86.10, CUDA Library 12.2
|
459
|
-
image_id =
|
507
|
+
image_id = _DEFAULT_GPU_IMAGE_ID
|
460
508
|
|
461
509
|
if (resources.image_id is not None and
|
462
510
|
resources.extract_docker_image() is None):
|
@@ -477,30 +525,52 @@ class GCP(clouds.Cloud):
|
|
477
525
|
resources_vars['machine_image'] = image_id
|
478
526
|
resources_vars['image_id'] = None
|
479
527
|
|
480
|
-
resources_vars['disk_tier'] = GCP._get_disk_type(r.disk_tier)
|
481
|
-
|
482
528
|
firewall_rule = None
|
483
529
|
if resources.ports is not None:
|
484
|
-
firewall_rule = (
|
485
|
-
|
530
|
+
firewall_rule = (USER_PORTS_FIREWALL_RULE_NAME.format(
|
531
|
+
cluster_name.name_on_cloud))
|
486
532
|
resources_vars['firewall_rule'] = firewall_rule
|
487
533
|
|
488
534
|
# For TPU nodes. TPU VMs do not need TPU_NAME.
|
489
535
|
tpu_node_name = resources_vars.get('tpu_node_name')
|
490
536
|
if gcp_utils.is_tpu(resources) and not gcp_utils.is_tpu_vm(resources):
|
491
537
|
if tpu_node_name is None:
|
492
|
-
tpu_node_name =
|
538
|
+
tpu_node_name = cluster_name.name_on_cloud
|
493
539
|
|
494
540
|
resources_vars['tpu_node_name'] = tpu_node_name
|
495
541
|
|
542
|
+
managed_instance_group_config = skypilot_config.get_nested(
|
543
|
+
('gcp', 'managed_instance_group'),
|
544
|
+
None,
|
545
|
+
override_configs=resources.cluster_config_overrides)
|
546
|
+
use_mig = managed_instance_group_config is not None
|
547
|
+
resources_vars['gcp_use_managed_instance_group'] = use_mig
|
548
|
+
# Convert boolean to 0 or 1 in string, as GCP does not support boolean
|
549
|
+
# value in labels for TPU VM APIs.
|
550
|
+
resources_vars['gcp_use_managed_instance_group_value'] = str(
|
551
|
+
int(use_mig))
|
552
|
+
if use_mig:
|
553
|
+
resources_vars.update(managed_instance_group_config)
|
554
|
+
resources_vars[
|
555
|
+
'force_enable_external_ips'] = skypilot_config.get_nested(
|
556
|
+
('gcp', 'force_enable_external_ips'), False)
|
557
|
+
|
558
|
+
# Add gVNIC from config
|
559
|
+
resources_vars['enable_gvnic'] = skypilot_config.get_nested(
|
560
|
+
('gcp', 'enable_gvnic'), False)
|
561
|
+
|
496
562
|
return resources_vars
|
497
563
|
|
498
564
|
def _get_feasible_launchable_resources(
|
499
565
|
self, resources: 'resources.Resources'
|
500
|
-
) ->
|
566
|
+
) -> 'resources_utils.FeasibleResources':
|
501
567
|
if resources.instance_type is not None:
|
502
568
|
assert resources.is_launchable(), resources
|
503
|
-
|
569
|
+
ok, _ = GCP.check_disk_tier(resources.instance_type,
|
570
|
+
resources.disk_tier)
|
571
|
+
if not ok:
|
572
|
+
return resources_utils.FeasibleResources([], [], None)
|
573
|
+
return resources_utils.FeasibleResources([resources], [], None)
|
504
574
|
|
505
575
|
if resources.accelerators is None:
|
506
576
|
# Return a default instance type with the given number of vCPUs.
|
@@ -509,16 +579,20 @@ class GCP(clouds.Cloud):
|
|
509
579
|
memory=resources.memory,
|
510
580
|
disk_tier=resources.disk_tier)
|
511
581
|
if host_vm_type is None:
|
512
|
-
return
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
582
|
+
# TODO: Add hints to all return values in this method to help
|
583
|
+
# users understand why the resources are not launchable.
|
584
|
+
return resources_utils.FeasibleResources([], [], None)
|
585
|
+
ok, _ = GCP.check_disk_tier(host_vm_type, resources.disk_tier)
|
586
|
+
if not ok:
|
587
|
+
return resources_utils.FeasibleResources([], [], None)
|
588
|
+
r = resources.copy(
|
589
|
+
cloud=GCP(),
|
590
|
+
instance_type=host_vm_type,
|
591
|
+
accelerators=None,
|
592
|
+
cpus=None,
|
593
|
+
memory=None,
|
594
|
+
)
|
595
|
+
return resources_utils.FeasibleResources([r], [], None)
|
522
596
|
|
523
597
|
# Find instance candidates to meet user's requirements
|
524
598
|
assert len(resources.accelerators.items()
|
@@ -540,7 +614,8 @@ class GCP(clouds.Cloud):
|
|
540
614
|
clouds='gcp')
|
541
615
|
|
542
616
|
if instance_list is None:
|
543
|
-
return ([], fuzzy_candidate_list
|
617
|
+
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
618
|
+
None)
|
544
619
|
assert len(
|
545
620
|
instance_list
|
546
621
|
) == 1, f'More than one instance type matched, {instance_list}'
|
@@ -555,11 +630,13 @@ class GCP(clouds.Cloud):
|
|
555
630
|
if resources.cpus.endswith('+'):
|
556
631
|
cpus = float(resources.cpus[:-1])
|
557
632
|
if cpus > num_cpus_in_tpu_vm:
|
558
|
-
return (
|
633
|
+
return resources_utils.FeasibleResources(
|
634
|
+
[], fuzzy_candidate_list, None)
|
559
635
|
else:
|
560
636
|
cpus = float(resources.cpus)
|
561
637
|
if cpus != num_cpus_in_tpu_vm:
|
562
|
-
return (
|
638
|
+
return resources_utils.FeasibleResources(
|
639
|
+
[], fuzzy_candidate_list, None)
|
563
640
|
# FIXME(woosuk, wei-lin): This leverages the fact that TPU VMs
|
564
641
|
# have 334 GB RAM, and 400 GB RAM for tpu-v4. We need to move
|
565
642
|
# this to service catalog, instead.
|
@@ -568,14 +645,20 @@ class GCP(clouds.Cloud):
|
|
568
645
|
if resources.memory.endswith('+'):
|
569
646
|
memory = float(resources.memory[:-1])
|
570
647
|
if memory > memory_in_tpu_vm:
|
571
|
-
return (
|
648
|
+
return resources_utils.FeasibleResources(
|
649
|
+
[], fuzzy_candidate_list, None)
|
572
650
|
else:
|
573
651
|
memory = float(resources.memory)
|
574
652
|
if memory != memory_in_tpu_vm:
|
575
|
-
return (
|
653
|
+
return resources_utils.FeasibleResources(
|
654
|
+
[], fuzzy_candidate_list, None)
|
576
655
|
else:
|
577
656
|
host_vm_type = instance_list[0]
|
578
657
|
|
658
|
+
ok, _ = GCP.check_disk_tier(host_vm_type, resources.disk_tier)
|
659
|
+
if not ok:
|
660
|
+
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
661
|
+
None)
|
579
662
|
acc_dict = {acc: acc_count}
|
580
663
|
r = resources.copy(
|
581
664
|
cloud=GCP(),
|
@@ -584,13 +667,14 @@ class GCP(clouds.Cloud):
|
|
584
667
|
cpus=None,
|
585
668
|
memory=None,
|
586
669
|
)
|
587
|
-
return ([r], fuzzy_candidate_list
|
670
|
+
return resources_utils.FeasibleResources([r], fuzzy_candidate_list,
|
671
|
+
None)
|
588
672
|
|
589
673
|
@classmethod
|
590
674
|
def get_accelerators_from_instance_type(
|
591
675
|
cls,
|
592
676
|
instance_type: str,
|
593
|
-
) -> Optional[Dict[str, int]]:
|
677
|
+
) -> Optional[Dict[str, Union[int, float]]]:
|
594
678
|
# GCP handles accelerators separately from regular instance types,
|
595
679
|
# hence return none here.
|
596
680
|
return None
|
@@ -675,7 +759,7 @@ class GCP(clouds.Cloud):
|
|
675
759
|
project_id = cls.get_project_id()
|
676
760
|
|
677
761
|
# Check if the user is activated.
|
678
|
-
identity = cls.
|
762
|
+
identity = cls.get_active_user_identity()
|
679
763
|
except (auth.exceptions.DefaultCredentialsError,
|
680
764
|
exceptions.CloudUserIdentityError) as e:
|
681
765
|
# See also: https://stackoverflow.com/a/53307505/1165051
|
@@ -736,13 +820,13 @@ class GCP(clouds.Cloud):
|
|
736
820
|
|
737
821
|
# pylint: disable=import-outside-toplevel,unused-import
|
738
822
|
import google.auth
|
739
|
-
import googleapiclient.discovery
|
740
823
|
|
741
824
|
# This takes user's credential info from "~/.config/gcloud/application_default_credentials.json". # pylint: disable=line-too-long
|
742
825
|
credentials, project = google.auth.default()
|
743
|
-
crm =
|
744
|
-
|
745
|
-
|
826
|
+
crm = gcp.build('cloudresourcemanager',
|
827
|
+
'v1',
|
828
|
+
credentials=credentials,
|
829
|
+
cache_discovery=False)
|
746
830
|
gcp_minimal_permissions = gcp_utils.get_minimal_permissions()
|
747
831
|
permissions = {'permissions': gcp_minimal_permissions}
|
748
832
|
request = crm.projects().testIamPermissions(resource=project,
|
@@ -750,13 +834,13 @@ class GCP(clouds.Cloud):
|
|
750
834
|
ret_permissions = request.execute().get('permissions', [])
|
751
835
|
|
752
836
|
diffs = set(gcp_minimal_permissions).difference(set(ret_permissions))
|
753
|
-
if
|
837
|
+
if diffs:
|
754
838
|
identity_str = identity[0] if identity else None
|
755
839
|
return False, (
|
756
840
|
'The following permissions are not enabled for the current '
|
757
841
|
f'GCP identity ({identity_str}):\n '
|
758
842
|
f'{diffs}\n '
|
759
|
-
'For more details, visit: https://skypilot.
|
843
|
+
'For more details, visit: https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html') # pylint: disable=line-too-long
|
760
844
|
return True, None
|
761
845
|
|
762
846
|
def get_credential_file_mounts(self) -> Dict[str, str]:
|
@@ -783,19 +867,29 @@ class GCP(clouds.Cloud):
|
|
783
867
|
pass
|
784
868
|
return credentials
|
785
869
|
|
870
|
+
@annotations.lru_cache(scope='global', maxsize=1)
|
871
|
+
def can_credential_expire(self) -> bool:
|
872
|
+
identity_type = self._get_identity_type()
|
873
|
+
return (identity_type is not None and
|
874
|
+
identity_type.can_credential_expire())
|
875
|
+
|
786
876
|
@classmethod
|
787
877
|
def _get_identity_type(cls) -> Optional[GCPIdentityType]:
|
788
878
|
try:
|
789
|
-
account = cls.
|
879
|
+
account = cls.get_active_user_identity()
|
790
880
|
except exceptions.CloudUserIdentityError:
|
791
881
|
return None
|
792
|
-
if
|
882
|
+
if account is None:
|
883
|
+
return None
|
884
|
+
assert account is not None
|
885
|
+
if GCPIdentityType.SERVICE_ACCOUNT.value in account[0]:
|
793
886
|
return GCPIdentityType.SERVICE_ACCOUNT
|
794
887
|
return GCPIdentityType.SHARED_CREDENTIALS_FILE
|
795
888
|
|
796
889
|
@classmethod
|
797
|
-
@
|
798
|
-
|
890
|
+
@annotations.lru_cache(scope='request',
|
891
|
+
maxsize=1) # Cache since getting identity is slow.
|
892
|
+
def get_user_identities(cls) -> List[List[str]]:
|
799
893
|
"""Returns the email address + project id of the active user."""
|
800
894
|
try:
|
801
895
|
account = _run_output('gcloud auth list --filter=status:ACTIVE '
|
@@ -826,11 +920,13 @@ class GCP(clouds.Cloud):
|
|
826
920
|
' Reason: '
|
827
921
|
f'{common_utils.format_exception(e, use_bracket=True)}'
|
828
922
|
) from e
|
829
|
-
|
923
|
+
# TODO: Return a list of identities in the profile when we support
|
924
|
+
# automatic switching for GCP. Currently we only support one identity.
|
925
|
+
return [[f'{account} [project_id={project_id}]']]
|
830
926
|
|
831
927
|
@classmethod
|
832
|
-
def
|
833
|
-
user_identity = cls.
|
928
|
+
def get_active_user_identity_str(cls) -> Optional[str]:
|
929
|
+
user_identity = cls.get_active_user_identity()
|
834
930
|
if user_identity is None:
|
835
931
|
return None
|
836
932
|
return user_identity[0].replace('\n', '')
|
@@ -871,17 +967,59 @@ class GCP(clouds.Cloud):
|
|
871
967
|
resources.instance_type, resources.accelerators, resources.zone,
|
872
968
|
'gcp')
|
873
969
|
|
970
|
+
@classmethod
|
971
|
+
def check_disk_tier(
|
972
|
+
cls, instance_type: Optional[str],
|
973
|
+
disk_tier: Optional[resources_utils.DiskTier]) -> Tuple[bool, str]:
|
974
|
+
if disk_tier != resources_utils.DiskTier.ULTRA or instance_type is None:
|
975
|
+
return True, ''
|
976
|
+
# Ultra disk tier (pd-extreme) only support m2, m3 and part of n2
|
977
|
+
# instance types, so we failover to lower tiers for other instance
|
978
|
+
# types. Reference:
|
979
|
+
# https://cloud.google.com/compute/docs/disks/extreme-persistent-disk#machine_shape_support # pylint: disable=line-too-long
|
980
|
+
series = instance_type.split('-')[0]
|
981
|
+
if series in ['m2', 'm3', 'n2']:
|
982
|
+
if series == 'n2':
|
983
|
+
num_cpus = int(instance_type.split('-')[2])
|
984
|
+
if num_cpus < 64:
|
985
|
+
return False, ('n2 series with less than 64 vCPUs are '
|
986
|
+
'not supported with pd-extreme.')
|
987
|
+
return True, ''
|
988
|
+
return False, (f'{series} series is not supported with pd-extreme. '
|
989
|
+
'Only m2, m3 series and n2 series with 64 or more vCPUs '
|
990
|
+
'are supported.')
|
991
|
+
|
992
|
+
@classmethod
|
993
|
+
def check_disk_tier_enabled(cls, instance_type: Optional[str],
|
994
|
+
disk_tier: resources_utils.DiskTier) -> None:
|
995
|
+
ok, msg = cls.check_disk_tier(instance_type, disk_tier)
|
996
|
+
if not ok:
|
997
|
+
with ux_utils.print_exception_no_traceback():
|
998
|
+
raise exceptions.NotSupportedError(msg)
|
999
|
+
|
874
1000
|
@classmethod
|
875
1001
|
def _get_disk_type(cls,
|
876
1002
|
disk_tier: Optional[resources_utils.DiskTier]) -> str:
|
877
1003
|
tier = cls._translate_disk_tier(disk_tier)
|
878
1004
|
tier2name = {
|
1005
|
+
resources_utils.DiskTier.ULTRA: 'pd-extreme',
|
879
1006
|
resources_utils.DiskTier.HIGH: 'pd-ssd',
|
880
1007
|
resources_utils.DiskTier.MEDIUM: 'pd-balanced',
|
881
1008
|
resources_utils.DiskTier.LOW: 'pd-standard',
|
882
1009
|
}
|
883
1010
|
return tier2name[tier]
|
884
1011
|
|
1012
|
+
@classmethod
|
1013
|
+
def _get_disk_specs(
|
1014
|
+
cls,
|
1015
|
+
disk_tier: Optional[resources_utils.DiskTier]) -> Dict[str, Any]:
|
1016
|
+
specs: Dict[str, Any] = {'disk_tier': cls._get_disk_type(disk_tier)}
|
1017
|
+
if disk_tier == resources_utils.DiskTier.ULTRA:
|
1018
|
+
# Only pd-extreme supports custom iops.
|
1019
|
+
# see https://cloud.google.com/compute/docs/disks#disk-types
|
1020
|
+
specs['disk_iops'] = 20000
|
1021
|
+
return specs
|
1022
|
+
|
885
1023
|
@classmethod
|
886
1024
|
def _label_filter_str(cls, tag_filters: Dict[str, str]) -> str:
|
887
1025
|
return ' '.join(f'labels.{k}={v}' for k, v in tag_filters.items())
|
@@ -976,8 +1114,8 @@ class GCP(clouds.Cloud):
|
|
976
1114
|
assert False, 'This code path should not be used.'
|
977
1115
|
|
978
1116
|
@classmethod
|
979
|
-
def create_image_from_cluster(cls,
|
980
|
-
|
1117
|
+
def create_image_from_cluster(cls,
|
1118
|
+
cluster_name: resources_utils.ClusterName,
|
981
1119
|
region: Optional[str],
|
982
1120
|
zone: Optional[str]) -> str:
|
983
1121
|
del region # unused
|
@@ -986,7 +1124,7 @@ class GCP(clouds.Cloud):
|
|
986
1124
|
# `ray-cluster-name` tag, which is guaranteed by the current `ray`
|
987
1125
|
# backend. Once the `provision.query_instances` is implemented for GCP,
|
988
1126
|
# we should be able to get rid of this assumption.
|
989
|
-
tag_filters = {'ray-cluster-name':
|
1127
|
+
tag_filters = {'ray-cluster-name': cluster_name.name_on_cloud}
|
990
1128
|
label_filter_str = cls._label_filter_str(tag_filters)
|
991
1129
|
instance_name_cmd = ('gcloud compute instances list '
|
992
1130
|
f'--filter="({label_filter_str})" '
|
@@ -998,7 +1136,8 @@ class GCP(clouds.Cloud):
|
|
998
1136
|
subprocess_utils.handle_returncode(
|
999
1137
|
returncode,
|
1000
1138
|
instance_name_cmd,
|
1001
|
-
error_msg=
|
1139
|
+
error_msg=
|
1140
|
+
f'Failed to get instance name for {cluster_name.display_name!r}',
|
1002
1141
|
stderr=stderr,
|
1003
1142
|
stream_logs=True)
|
1004
1143
|
instance_names = json.loads(stdout)
|
@@ -1009,7 +1148,7 @@ class GCP(clouds.Cloud):
|
|
1009
1148
|
f'instance, but got: {instance_names}')
|
1010
1149
|
instance_name = instance_names[0]['name']
|
1011
1150
|
|
1012
|
-
image_name = f'skypilot-{cluster_name}-{int(time.time())}'
|
1151
|
+
image_name = f'skypilot-{cluster_name.display_name}-{int(time.time())}'
|
1013
1152
|
create_image_cmd = (f'gcloud compute images create {image_name} '
|
1014
1153
|
f'--source-disk {instance_name} '
|
1015
1154
|
f'--source-disk-zone {zone}')
|
@@ -1021,7 +1160,8 @@ class GCP(clouds.Cloud):
|
|
1021
1160
|
subprocess_utils.handle_returncode(
|
1022
1161
|
returncode,
|
1023
1162
|
create_image_cmd,
|
1024
|
-
error_msg=
|
1163
|
+
error_msg=
|
1164
|
+
f'Failed to create image for {cluster_name.display_name!r}',
|
1025
1165
|
stderr=stderr,
|
1026
1166
|
stream_logs=True)
|
1027
1167
|
|
@@ -1035,7 +1175,8 @@ class GCP(clouds.Cloud):
|
|
1035
1175
|
subprocess_utils.handle_returncode(
|
1036
1176
|
returncode,
|
1037
1177
|
image_uri_cmd,
|
1038
|
-
error_msg=
|
1178
|
+
error_msg=
|
1179
|
+
f'Failed to get image uri for {cluster_name.display_name!r}',
|
1039
1180
|
stderr=stderr,
|
1040
1181
|
stream_logs=True)
|
1041
1182
|
|