skypilot-nightly 1.0.0.dev20250521__py3-none-any.whl → 1.0.0.dev20250523__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +46 -16
- sky/backends/cloud_vm_ray_backend.py +16 -4
- sky/check.py +109 -44
- sky/cli.py +261 -90
- sky/client/cli.py +261 -90
- sky/client/sdk.py +122 -3
- sky/clouds/__init__.py +5 -0
- sky/clouds/aws.py +4 -2
- sky/clouds/azure.py +4 -2
- sky/clouds/cloud.py +30 -6
- sky/clouds/cudo.py +2 -1
- sky/clouds/do.py +2 -1
- sky/clouds/fluidstack.py +2 -1
- sky/clouds/gcp.py +160 -23
- sky/clouds/ibm.py +4 -2
- sky/clouds/kubernetes.py +66 -22
- sky/clouds/lambda_cloud.py +2 -1
- sky/clouds/nebius.py +18 -2
- sky/clouds/oci.py +4 -2
- sky/clouds/paperspace.py +2 -1
- sky/clouds/runpod.py +2 -1
- sky/clouds/scp.py +2 -1
- sky/clouds/service_catalog/__init__.py +3 -0
- sky/clouds/service_catalog/common.py +9 -2
- sky/clouds/service_catalog/constants.py +2 -1
- sky/clouds/service_catalog/ssh_catalog.py +167 -0
- sky/clouds/ssh.py +203 -0
- sky/clouds/vast.py +2 -1
- sky/clouds/vsphere.py +2 -1
- sky/core.py +59 -17
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/data/storage.py +1 -0
- sky/execution.py +56 -7
- sky/jobs/server/core.py +4 -2
- sky/optimizer.py +29 -15
- sky/provision/__init__.py +1 -0
- sky/provision/aws/instance.py +17 -1
- sky/provision/gcp/constants.py +147 -4
- sky/provision/gcp/instance_utils.py +10 -0
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/kubernetes/instance.py +16 -5
- sky/provision/kubernetes/utils.py +37 -19
- sky/provision/nebius/instance.py +3 -1
- sky/provision/nebius/utils.py +14 -2
- sky/provision/ssh/__init__.py +18 -0
- sky/resources.py +177 -4
- sky/serve/server/core.py +2 -4
- sky/server/common.py +46 -9
- sky/server/constants.py +2 -0
- sky/server/html/token_page.html +154 -0
- sky/server/requests/executor.py +3 -6
- sky/server/requests/payloads.py +7 -0
- sky/server/server.py +80 -8
- sky/setup_files/dependencies.py +1 -0
- sky/skypilot_config.py +117 -31
- sky/task.py +24 -1
- sky/templates/gcp-ray.yml.j2 +44 -1
- sky/templates/nebius-ray.yml.j2 +12 -2
- sky/utils/admin_policy_utils.py +26 -22
- sky/utils/context.py +36 -6
- sky/utils/context_utils.py +15 -0
- sky/utils/infra_utils.py +21 -1
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/create_cluster.sh +1 -0
- sky/utils/kubernetes/deploy_remote_cluster.py +1437 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
- sky/utils/kubernetes/ssh-tunnel.sh +387 -0
- sky/utils/log_utils.py +214 -1
- sky/utils/resources_utils.py +14 -0
- sky/utils/schemas.py +67 -0
- sky/utils/ux_utils.py +2 -1
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/METADATA +6 -1
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/RECORD +88 -81
- sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- /sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/top_level.txt +0 -0
sky/optimizer.py
CHANGED
@@ -21,6 +21,7 @@ from sky.usage import usage_lib
 from sky.utils import common
 from sky.utils import env_options
 from sky.utils import log_utils
+from sky.utils import registry
 from sky.utils import resources_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
@@ -73,8 +74,8 @@ class Optimizer:
     def _egress_cost(src_cloud: clouds.Cloud, dst_cloud: clouds.Cloud,
                      gigabytes: float) -> float:
         """Returns estimated egress cost."""
-        if isinstance(src_cloud, DummyCloud) or isinstance(
-                dst_cloud, DummyCloud):
+        if isinstance(src_cloud, clouds.DummyCloud) or isinstance(
+                dst_cloud, clouds.DummyCloud):
             return 0.0

         if not src_cloud.is_same_cloud(dst_cloud):
@@ -88,8 +89,8 @@ class Optimizer:
                      gigabytes: float) -> float:
         """Returns estimated egress time in seconds."""
         # FIXME: estimate bandwidth between each cloud-region pair.
-        if isinstance(src_cloud, DummyCloud) or isinstance(
-                dst_cloud, DummyCloud):
+        if isinstance(src_cloud, clouds.DummyCloud) or isinstance(
+                dst_cloud, clouds.DummyCloud):
             return 0.0
         if not src_cloud.is_same_cloud(dst_cloud):
             # 10Gbps is close to the average of observed b/w from S3
@@ -167,7 +168,7 @@ class Optimizer:

         def make_dummy(name):
             dummy = task_lib.Task(name)
-            dummy.set_resources({DummyResources(cloud=DummyCloud())})
+            dummy.set_resources({DummyResources(cloud=clouds.DummyCloud())})
             dummy.set_time_estimator(lambda _: 0)
             return dummy

@@ -197,7 +198,7 @@ class Optimizer:
            node: task_lib.Task,
            resources: resources_lib.Resources,
        ) -> Tuple[Optional[clouds.Cloud], Optional[clouds.Cloud], Optional[float]]:
-           if isinstance(parent_resources.cloud, DummyCloud):
+           if isinstance(parent_resources.cloud, clouds.DummyCloud):
                # Special case. The current 'node' is a real
                # source node, and its input may be on a different
                # cloud from 'resources'.
@@ -376,6 +377,10 @@ class Optimizer:
                 if any(orig_resources.cloud is None
                        for orig_resources in node.resources):
                     source_hint = 'catalog and kubernetes cluster'
+                elif all(
+                        isinstance(orig_resources.cloud, clouds.SSH)
+                        for orig_resources in node.resources):
+                    source_hint = 'node pool'
                 elif all(
                         isinstance(orig_resources.cloud, clouds.Kubernetes)
                         for orig_resources in node.resources):
@@ -858,11 +863,19 @@ class Optimizer:
                 'accelerators': f'{resources.accelerators}',
                 'use_spot': resources.use_spot
             }
+
+            # Handle special case for Kubernetes and SSH clouds
             if isinstance(resources.cloud, clouds.Kubernetes):
-                # Region for Kubernetes
-                # Kubernetes clusters. We add
-                #
+                # Region for Kubernetes-like clouds (SSH, Kubernetes) is the
+                # context name, i.e. different Kubernetes clusters. We add
+                # region to the key to show all the Kubernetes clusters in the
+                # optimizer table for better UX.
+
+                if resources.cloud.__class__.__name__ == 'SSH':
+                    resource_key_dict[
+                        'cloud'] = 'SSH'  # Force the cloud name to be SSH
                 resource_key_dict['region'] = resources.region
+
             return json.dumps(resource_key_dict, sort_keys=True)

         # Print the list of resouces that the optimizer considered.
@@ -1158,11 +1171,6 @@ class DummyResources(resources_lib.Resources):
         return 0


-class DummyCloud(clouds.Cloud):
-    """A dummy Cloud that has zero egress cost from/to."""
-    pass
-
-
 def _filter_out_blocked_launchable_resources(
         launchable_resources: Iterable[resources_lib.Resources],
         blocked_resources: Iterable[resources_lib.Resources]):
@@ -1221,7 +1229,13 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
        if disabled_clouds:
            is_or_are = 'is' if len(disabled_clouds) == 1 else 'are'
            task_name = f' {task.name!r}' if task.name is not None else ''
-           msg = (f'Task{task_name} requires {", ".join(disabled_clouds)} '
+           disabled_display_names = []
+           for c in disabled_clouds:
+               cloud_obj_one = registry.CLOUD_REGISTRY.from_str(c)
+               if cloud_obj_one is not None:
+                   disabled_display_names.append(cloud_obj_one.display_name())
+           cloud_names = ', '.join(disabled_display_names)
+           msg = (f'Task{task_name} requires {cloud_names} '
                   f'which {is_or_are} not enabled. To enable access, change '
                   f'the task cloud requirement or run: {colorama.Style.BRIGHT}'
                   f'sky check {" ".join(c.lower() for c in disabled_clouds)}'
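Worth noting on the optimizer diff above: the module-local DummyCloud class is deleted and every call site now uses clouds.DummyCloud, which lines up with the +5 lines on sky/clouds/__init__.py in the file list. A minimal sketch of the relocated class, reconstructed from the removed definition; the placement is hypothetical and the real code may differ, e.g. by registering the class with the cloud registry:

    # Sketch of sky/clouds/__init__.py after the move: the same dummy cloud,
    # now shared instead of being private to the optimizer.
    class DummyCloud(Cloud):
        """A dummy Cloud that has zero egress cost from/to."""
        pass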
sky/provision/__init__.py
CHANGED
@@ -23,6 +23,7 @@ from sky.provision import lambda_cloud
 from sky.provision import nebius
 from sky.provision import oci
 from sky.provision import runpod
+from sky.provision import ssh
 from sky.provision import vast
 from sky.provision import vsphere
 from sky.utils import command_runner
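The single new import registers the SSH provisioner (sky/provision/ssh/__init__.py in the file list) as a submodule of sky.provision. A hedged sketch of why the import matters; the actual dispatcher in sky.provision may resolve provider modules differently:

    # Importing sky.provision.ssh in the package __init__ makes it
    # resolvable by name, e.g. when routing a request for provider 'ssh'.
    import sky.provision as provision

    ssh_impl = getattr(provision, 'ssh')  # succeeds only because of the import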
sky/provision/aws/instance.py
CHANGED
@@ -836,7 +836,23 @@ def open_ports(

    # For the case when every new ports is already opened.
    if ip_permissions:
-       sg.authorize_ingress(IpPermissions=ip_permissions)
+       # Filter out any permissions that already exist in the security group
+       existing_permissions = set()
+       for rule in sg.ip_permissions:
+           if rule['IpProtocol'] == 'tcp':
+               for ip_range in rule.get('IpRanges', []):
+                   if ip_range.get('CidrIp') == '0.0.0.0/0':
+                       existing_permissions.add(
+                           (rule['FromPort'], rule['ToPort']))
+
+       # Remove any permissions that already exist
+       filtered_permissions = []
+       for perm in ip_permissions:
+           if (perm['FromPort'], perm['ToPort']) not in existing_permissions:
+               filtered_permissions.append(perm)
+
+       if filtered_permissions:
+           sg.authorize_ingress(IpPermissions=filtered_permissions)


def cleanup_ports(
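Context for the change above: EC2 rejects an authorize-ingress request that duplicates an existing rule (error code InvalidPermission.Duplicate), so blindly re-submitting already-open port ranges would make open_ports fail rather than no-op. The dedup loop assumes the standard boto3 IpPermissions shape; a hypothetical entry for illustration (port values made up):

    # One element of `ip_permissions` as consumed by the loop above.
    perm = {
        'IpProtocol': 'tcp',
        'FromPort': 8080,
        'ToPort': 8081,
        'IpRanges': [{'CidrIp': '0.0.0.0/0'}],
    }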
sky/provision/gcp/constants.py
CHANGED
@@ -1,4 +1,5 @@
 """Constants used by the GCP provisioner."""
+import textwrap

 VERSION = 'v1'
 # Using v2 according to
@@ -53,9 +54,7 @@ CLUSTER_PREFIX_LENGTH = 10

 COMPACT_GROUP_PLACEMENT_POLICY = 'compact'
 COLLOCATED_COLLOCATION = 'COLLOCATED'
-GPU_DIRECT_TCPX_USER_DATA = """
-set -e
-set -x
+GPU_DIRECT_TCPX_USER_DATA = textwrap.dedent("""
 # Install GPU Direct TCPX
 cos-extensions install gpu -- --version=latest;
 sudo mount --bind /var/lib/nvidia /var/lib/nvidia;
@@ -83,7 +82,7 @@ GPU_DIRECT_TCPX_USER_DATA = """#!/bin/bash
 sudo mount --bind /var/lib/tcpx /var/lib/tcpx;
 sudo mount -o remount,exec /var/lib/tcpx;
 echo "GPU Direct TCPX installed"
-"""
+""")

 GPU_DIRECT_TCPX_SPECIFIC_OPTIONS = [
     '--cap-add=IPC_LOCK',
@@ -106,6 +105,150 @@ GPU_DIRECT_TCPX_SPECIFIC_OPTIONS = [
     '--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/tcpx/lib64',
 ]

+PD_EXTREME_IOPS = 20000
+DEFAULT_DISK_SIZE = 100
+NETWORK_STORAGE_TYPE = 'PERSISTENT'
+INSTANCE_STORAGE_TYPE = 'SCRATCH'
+INSTANCE_STORAGE_DISK_TYPE = 'local-ssd'
+INSTANCE_STORAGE_INTERFACE_TYPE = 'NVME'
+INSTANCE_STORAGE_DEVICE_NAME_PREFIX = '/dev/disk/by-id/google-local-nvme-ssd-'
+DEVICE_NAME_PREFIX = '/dev/disk/by-id/google-'
+
+BASH_SCRIPT_START = textwrap.dedent("""#!/bin/bash
+set -e
+set -x
+""")
+DISK_MOUNT_USER_DATA_TEMPLATE = textwrap.dedent("""
+# Define arrays for devices and mount points
+declare -A device_mounts=(
+{device_mounts}
+)
+
+# Function to format and mount a single device
+format_and_mount() {{
+    local device_name="$1"
+    local mount_point="$2"
+
+    if [ ! -e "$device_name" ]; then
+        echo "Error: Device $device_name does not exist."
+        return 1
+    fi
+
+    # Check if filesystem is already formatted (ext4)
+    if ! sudo blkid "$device_name" | grep -q 'TYPE="ext4"'; then
+        if [[ "$device_name" == "/dev/disk/by-id/google-local-nvme-ssd"* ]]; then
+            echo "Formatting local SSD $device_name..."
+            if ! sudo mkfs.ext4 -F "$device_name"; then
+                echo "Error: Failed to format $device_name"
+                return 1
+            fi
+        else
+            echo "Formatting persistent disk $device_name..."
+            if ! sudo mkfs.ext4 -m 0 -E lazy_itable_init=0,lazy_journal_init=0,discard "$device_name"; then
+                echo "Error: Failed to format $device_name"
+                return 1
+            fi
+        fi
+    else
+        echo "$device_name is already formatted."
+    fi
+
+    # Check if already mounted
+    if ! grep -q "$mount_point" /proc/mounts; then
+        echo "Mounting $device_name to $mount_point..."
+        if ! sudo mkdir -p "$mount_point"; then
+            echo "Error: Failed to create mount point $mount_point"
+            return 1
+        fi
+
+        if ! sudo mount "$device_name" "$mount_point"; then
+            echo "Error: Failed to mount $device_name to $mount_point"
+            return 1
+        fi
+
+        # Add to fstab if not already present
+        if ! grep -q " $mount_point " /etc/fstab; then
+            echo "Adding mount entry to /etc/fstab..."
+            echo "UUID=`sudo blkid -s UUID -o value $device_name` $mount_point ext4 defaults,nofail 0 2" | sudo tee -a /etc/fstab
+        else
+            echo "Mount entry already exists in /etc/fstab"
+        fi
+    else
+        echo "$device_name is already mounted at $mount_point"
+    fi
+}}
+
+# Main execution
+echo "Starting device mounting process..."
+
+# Process each device-mount pair
+for device in "${{!device_mounts[@]}}"; do
+    mount_point="${{device_mounts[$device]}}"
+    echo "Processing device: $device -> $mount_point"
+    if ! format_and_mount "$device" "$mount_point"; then
+        echo "Failed to process device $device"
+        # Continue with other devices even if one fails
+        continue
+    fi
+done
+
+echo "Device mounting process completed."
+""")
+
+# The local SSDs will be attached automatically to the following
+# machine types with the following number of disks.
+# Refer to https://cloud.google.com/compute/docs/disks/local-ssd#lssd_disks_fixed
+SSD_AUTO_ATTACH_MACHINE_TYPES = {
+    'c4a-standard-4-lssd': 1,
+    'c4a-highmem-4-lssd': 1,
+    'c4a-standard-8-lssd': 2,
+    'c4a-highmem-8-lssd': 2,
+    'c4a-standard-16-lssd': 4,
+    'c4a-highmem-16-lssd': 4,
+    'c4a-standard-32-lssd': 6,
+    'c4a-highmem-32-lssd': 6,
+    'c4a-standard-48-lssd': 10,
+    'c4a-highmem-48-lssd': 10,
+    'c4a-standard-64-lssd': 14,
+    'c4a-highmem-64-lssd': 14,
+    'c4a-standard-72-lssd': 16,
+    'c4a-highmem-72-lssd': 16,
+    'c3-standard-4-lssd': 1,
+    'c3-standard-8-lssd': 2,
+    'c3-standard-22-lssd': 4,
+    'c3-standard-44-lssd': 8,
+    'c3-standard-88-lssd': 16,
+    'c3-standard-176-lssd': 32,
+    'c3d-standard-8-lssd': 1,
+    'c3d-highmem-8-lssd': 1,
+    'c3d-standard-16-lssd': 1,
+    'c3d-highmem-16-lssd': 1,
+    'c3d-standard-30-lssd': 2,
+    'c3d-highmem-30-lssd': 2,
+    'c3d-standard-60-lssd': 4,
+    'c3d-highmem-60-lssd': 4,
+    'c3d-standard-90-lssd': 8,
+    'c3d-highmem-90-lssd': 8,
+    'c3d-standard-180-lssd': 16,
+    'c3d-highmem-180-lssd': 16,
+    'c3d-standard-360-lssd': 32,
+    'c3d-highmem-360-lssd': 32,
+    'a4-highgpu-8g': 32,
+    'a3-ultragpu-8g': 32,
+    'a3-megagpu-8g': 16,
+    'a3-highgpu-1g': 2,
+    'a3-highgpu-2g': 4,
+    'a3-highgpu-4g': 8,
+    'a3-highgpu-8g': 16,
+    'a3-edgegpu-8g': 16,
+    'a2-ultragpu-1g': 1,
+    'a2-ultragpu-2g': 2,
+    'a2-ultragpu-4g': 4,
+    'a2-ultragpu-8g': 8,
+    'z3-highmem-88': 12,
+    'z3-highmem-176': 12,
+}
+
 # Below parameters are from the default VPC on GCP.
 # https://cloud.google.com/vpc/docs/firewalls#more_rules_default_vpc
 VPC_TEMPLATE: dict = {
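Note that DISK_MOUNT_USER_DATA_TEMPLATE doubles every literal bash brace ({{ and }}) so that str.format() substitutes only {device_mounts}, which the `declare -A device_mounts=( ... )` block expects to expand into bash associative-array entries. A hedged rendering sketch; the disk name, mount point, and exact entry format SkyPilot generates are assumptions:

    from sky.provision.gcp import constants

    # Hypothetical device/mount pair; real values come from the task's volumes.
    mounts = {'/dev/disk/by-id/google-my-disk': '/mnt/data'}
    entries = '\n'.join(f'    ["{dev}"]="{mnt}"' for dev, mnt in mounts.items())
    user_data = (constants.BASH_SCRIPT_START +
                 constants.DISK_MOUNT_USER_DATA_TEMPLATE.format(
                     device_mounts=entries))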
sky/provision/gcp/instance_utils.py
CHANGED
@@ -826,6 +826,16 @@ class GCPComputeInstance(GCPInstance):
        # https://cloud.google.com/compute/docs/reference/rest/v1/instances/bulkInsert # pylint: disable=line-too-long
        if config.get('sourceMachineImage') is not None:
            return False
+       # bulkInsert does not support attaching existing
+       # disks to the instances with READ_WRITE mode.
+       if config.get('disks') is not None:
+           for disk in config['disks']:
+               if disk.get('source') is not None and disk.get(
+                       'mode', 'READ_WRITE') == 'READ_WRITE':
+                   return False
+               if disk.get('initializeParams') is not None and disk.get(
+                       'initializeParams', {}).get('diskName') is not None:
+                   return False
        return True

    @classmethod
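The new guard above disables bulkInsert whenever a disk entry attaches an existing volume in READ_WRITE mode or pins a disk name via initializeParams. A hypothetical config fragment that would now return False (all field values made up):

    # This disks entry references an existing source disk in READ_WRITE
    # mode, so the bulkInsert path is skipped.
    config = {
        'disks': [{
            'source': 'projects/my-proj/zones/us-central1-a/disks/my-vol',
            'mode': 'READ_WRITE',
        }]
    }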
sky/provision/gcp/volume_utils.py
ADDED
@@ -0,0 +1,247 @@
+"""Utilities for GCP volumes."""
+from typing import Any, Dict, List, Optional
+
+from sky import clouds
+from sky import exceptions
+from sky import sky_logging
+from sky.adaptors import gcp
+from sky.provision.gcp import constants
+from sky.utils import resources_utils
+from sky.utils import ux_utils
+
+logger = sky_logging.init_logger(__name__)
+
+
+def get_data_disk_tier_mapping(
+        instance_type: Optional[str],) -> Dict[resources_utils.DiskTier, str]:
+    # Define the default mapping from disk tiers to disk types.
+    # Refer to https://cloud.google.com/compute/docs/disks/hyperdisks
+    # and https://cloud.google.com/compute/docs/disks/persistent-disks
+    tier2name = {
+        resources_utils.DiskTier.ULTRA: 'pd-extreme',
+        resources_utils.DiskTier.HIGH: 'pd-ssd',
+        resources_utils.DiskTier.MEDIUM: 'pd-balanced',
+        resources_utils.DiskTier.LOW: 'pd-standard',
+    }
+
+    if instance_type is None:
+        return tier2name
+
+    # Remap series-specific disk types.
+    series = instance_type.split('-')[0]
+
+    if series in ['a4', 'x4']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
+    elif series in ['m4']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 112:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['c4', 'c4a', 'c4d']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 64:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['a3']:
+        if (instance_type.startswith('a3-ultragpu') or
+                instance_type.startswith('a3-megagpu') or
+                instance_type.startswith('a3-edgegpu')):
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+            tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+            tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
+            tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
+        elif instance_type.startswith('a3-highgpu'):
+            tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+            if instance_type.startswith('a3-highgpu-8g'):
+                tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+                tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+                tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
+            elif instance_type.startswith('a3-highgpu-4g'):
+                tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+            else:
+                tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
+    elif series in ['c3d']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 60:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['c3']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 88:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['n4']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
+    elif series in ['n2d', 'n1', 't2d', 't2a', 'e2', 'c2', 'c2d', 'a2']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
+    elif series in ['z3']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+    elif series in ['h3']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+    elif series in ['m3']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 64:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['m2']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+    elif series in ['m1']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 80:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['g2']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+    elif series in ['n2']:
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 64:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
+        elif num_cpus >= 80:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+
+    return tier2name
+
+
+def validate_instance_volumes(
+    instance_type: Optional[str],
+    volumes: Optional[List[Dict[str, Any]]],
+) -> None:
+    if not volumes:
+        return
+    if instance_type is None:
+        logger.warning('Instance type is not specified,'
+                       ' skipping instance volume validation')
+        return
+    instance_volume_count = 0
+    for volume in volumes:
+        if volume['storage_type'] == resources_utils.StorageType.INSTANCE:
+            instance_volume_count += 1
+    if (instance_type in constants.SSD_AUTO_ATTACH_MACHINE_TYPES and
+            instance_volume_count >
+            constants.SSD_AUTO_ATTACH_MACHINE_TYPES[instance_type]):
+        raise exceptions.ResourcesUnavailableError(
+            f'The instance type {instance_type} supports'
+            f' {constants.SSD_AUTO_ATTACH_MACHINE_TYPES[instance_type]}'
+            f' instance storage, but {instance_volume_count} are specified')
+    # TODO(hailong):
+    # check the instance storage count for the other instance types,
+    # refer to https://cloud.google.com/compute/docs/disks/local-ssd
+
+
+def translate_attach_mode(attach_mode: resources_utils.DiskAttachMode) -> str:
+    if attach_mode == resources_utils.DiskAttachMode.READ_ONLY:
+        return 'READ_ONLY'
+    return 'READ_WRITE'
+
+
+def check_volume_name_exist_in_region(
+        project_id: str, region: clouds.Region, use_mig: bool,
+        volume_name: str) -> Optional[Dict[str, Any]]:
+    """Check if the volume name exists and return the volume info."""
+    logger.debug(f'Checking volume {volume_name} in region {region}')
+    try:
+        compute = gcp.build('compute',
+                            'v1',
+                            credentials=None,
+                            cache_discovery=False)
+    except gcp.credential_error_exception():
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Not able to build compute client') from None
+
+    # Get all the zones in the region
+    all_zones = compute.zones().list(project=project_id).execute()
+    region_zones = []
+    if 'items' in all_zones:
+        for zone in all_zones['items']:
+            if zone['region'].split('/')[-1] == region.name:
+                region_zones.append(zone['name'])
+    volume_info = None
+    for zone in region_zones:
+        try:
+            volume_info = compute.disks().get(project=project_id,
+                                              zone=zone,
+                                              disk=volume_name).execute()
+            if volume_info is not None:
+                if use_mig:
+                    # With MIG, instance template will be used, in this case,
+                    # the `selfLink` for zonal disk needs to be the volume name
+                    # Refer to https://cloud.google.com/compute/docs/
+                    # reference/rest/v1/instances/insert
+                    volume_info['selfLink'] = volume_name
+                volume_info['available_zones'] = [zone]
+                return volume_info
+        except gcp.http_error_exception() as e:
+            if e.resp.status == 403:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError('Not able to access the volume '
+                                     f'{volume_name!r}') from None
+            if e.resp.status == 404:
+                continue  # Try next zone
+            raise
+
+    # If not found in any zone, check region disk
+    try:
+        volume_info = compute.regionDisks().get(project=project_id,
+                                                region=region.name,
+                                                disk=volume_name).execute()
+        # 'replicaZones':
+        # ['https://xxx/compute/v1/projects/sky-dev-465/zones/us-central1-a',
+        # 'https://xxx/compute/v1/projects/sky-dev-465/zones/us-central1-c']
+        if volume_info is not None and 'replicaZones' in volume_info:
+            replica_zones = [
+                zone.split('/')[-1] for zone in volume_info['replicaZones']
+            ]
+            volume_info['available_zones'] = replica_zones
+        return volume_info
+    except gcp.http_error_exception() as e:
+        if e.resp.status == 403:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Not able to access the volume '
+                                 f'{volume_name!r}') from None
+        if e.resp.status == 404:
+            logger.warning(
+                f'Volume {volume_name} is not found in region {region}.'
+                f' It will be created.')
+            return volume_info
+        raise
+
+
+def check_volume_zone_match(volume_name: str,
+                            zones: Optional[List[clouds.Zone]],
+                            available_zones: List[str]):
+    if zones is None:
+        return None
+    for zone in zones:
+        if zone.name in available_zones:
+            return None
+    with ux_utils.print_exception_no_traceback():
+        # Return a ResourcesUnavailableError to trigger failover
+        raise exceptions.ResourcesUnavailableError(
+            f'Volume {volume_name} not available in zones {zones}') from None
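A hedged usage sketch of get_data_disk_tier_mapping, exercising the n2 branch above (n2 instances with 80 or more vCPUs remap the ULTRA tier to hyperdisk-extreme):

    from sky.provision.gcp import volume_utils
    from sky.utils import resources_utils

    mapping = volume_utils.get_data_disk_tier_mapping('n2-standard-96')
    # 96 vCPUs >= 80, so the n2 branch selects hyperdisk-extreme for ULTRA.
    assert mapping[resources_utils.DiskTier.ULTRA] == 'hyperdisk-extreme'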
sky/provision/kubernetes/instance.py
CHANGED
@@ -1265,6 +1265,8 @@ def query_instances(
    assert provider_config is not None
    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
    context = kubernetes_utils.get_context_from_config(provider_config)
+   is_ssh = context.startswith('ssh-') if context else False
+   identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'

    # Get all the pods with the label skypilot-cluster: <cluster_name>
    try:
@@ -1274,15 +1276,24 @@ def query_instances(
            _request_timeout=kubernetes.API_TIMEOUT).items
    except kubernetes.max_retry_error():
        with ux_utils.print_exception_no_traceback():
-           ctx = kubernetes_utils.get_current_kube_config_context_name()
+           if is_ssh:
+               node_pool = context.lstrip('ssh-') if context else ''
+               msg = (
+                   f'Cannot connect to SSH Node Pool {node_pool}. '
+                   'Please check if the SSH Node Pool is up and accessible. '
+                   'To debug, run `sky check ssh` to check the status of '
+                   'the SSH Node Pool.')
+           else:
+               ctx = kubernetes_utils.get_current_kube_config_context_name()
+               msg = (f'Network error - check if the {identity} in '
+                      f'context {ctx} is up and accessible.')
            raise exceptions.ClusterStatusFetchingError(
-               f'Failed to query cluster {cluster_name_on_cloud!r} status. '
-               'Network error - check if the Kubernetes cluster in '
-               f'context {ctx} is up and accessible.') from None
+               f'Failed to query cluster {cluster_name_on_cloud!r} status. ' +
+               msg) from None
    except Exception as e:  # pylint: disable=broad-except
        with ux_utils.print_exception_no_traceback():
            raise exceptions.ClusterStatusFetchingError(
-               f'Failed to query cluster {cluster_name_on_cloud!r} '
+               f'Failed to query {identity} {cluster_name_on_cloud!r} '
                f'status: {common_utils.format_exception(e)}')

    # Check if the pods are running or pending
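One caveat in the quoted code: str.lstrip takes a set of characters, not a prefix, so context.lstrip('ssh-') strips any leading run of 's', 'h', and '-' characters ('ssh-shared'.lstrip('ssh-') yields 'ared'). It is harmless for pool names that do not begin with those characters, but a prefix slice avoids the edge case:

    # Prefix removal that cannot over-strip, unlike lstrip('ssh-').
    context = 'ssh-my-pool'
    node_pool = context[len('ssh-'):] if context.startswith('ssh-') else context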