skypilot-nightly 1.0.0.dev20250520__py3-none-any.whl → 1.0.0.dev20250522__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +56 -37
- sky/check.py +3 -3
- sky/cli.py +89 -16
- sky/client/cli.py +89 -16
- sky/client/sdk.py +92 -4
- sky/clouds/__init__.py +2 -0
- sky/clouds/cloud.py +6 -0
- sky/clouds/gcp.py +156 -21
- sky/clouds/service_catalog/__init__.py +3 -0
- sky/clouds/service_catalog/common.py +9 -2
- sky/clouds/service_catalog/constants.py +1 -0
- sky/core.py +6 -8
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/CzOVV6JpRQBRt5GhZuhyK/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +6 -0
- sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +6 -0
- sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +1 -0
- sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +1 -0
- sky/dashboard/out/_next/static/chunks/{678-206dddca808e6d16.js → 582-683f4f27b81996dc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +1 -0
- sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/data/storage.py +1 -0
- sky/execution.py +57 -8
- sky/jobs/server/core.py +5 -3
- sky/jobs/utils.py +38 -7
- sky/optimizer.py +41 -39
- sky/provision/gcp/constants.py +147 -4
- sky/provision/gcp/instance_utils.py +10 -0
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/provisioner.py +16 -7
- sky/resources.py +233 -18
- sky/serve/serve_utils.py +5 -13
- sky/serve/server/core.py +2 -4
- sky/server/common.py +60 -14
- sky/server/constants.py +2 -0
- sky/server/html/token_page.html +154 -0
- sky/server/requests/executor.py +3 -6
- sky/server/requests/payloads.py +3 -3
- sky/server/server.py +40 -8
- sky/skypilot_config.py +117 -31
- sky/task.py +24 -1
- sky/templates/gcp-ray.yml.j2 +44 -1
- sky/templates/nebius-ray.yml.j2 +0 -2
- sky/utils/admin_policy_utils.py +26 -22
- sky/utils/cli_utils/status_utils.py +95 -56
- sky/utils/common_utils.py +35 -2
- sky/utils/context.py +36 -6
- sky/utils/context_utils.py +15 -0
- sky/utils/infra_utils.py +175 -0
- sky/utils/resources_utils.py +55 -21
- sky/utils/schemas.py +111 -5
- {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/RECORD +73 -68
- {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/WHEEL +1 -1
- sky/dashboard/out/_next/static/8hlc2dkbIDDBOkxtEW7X6/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/845-0ca6f2c1ba667c3b.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- /sky/dashboard/out/_next/static/{8hlc2dkbIDDBOkxtEW7X6 → CzOVV6JpRQBRt5GhZuhyK}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/top_level.txt +0 -0
sky/optimizer.py
CHANGED
@@ -73,8 +73,8 @@ class Optimizer:
|
|
73
73
|
def _egress_cost(src_cloud: clouds.Cloud, dst_cloud: clouds.Cloud,
|
74
74
|
gigabytes: float) -> float:
|
75
75
|
"""Returns estimated egress cost."""
|
76
|
-
if isinstance(src_cloud, DummyCloud) or isinstance(
|
77
|
-
dst_cloud, DummyCloud):
|
76
|
+
if isinstance(src_cloud, clouds.DummyCloud) or isinstance(
|
77
|
+
dst_cloud, clouds.DummyCloud):
|
78
78
|
return 0.0
|
79
79
|
|
80
80
|
if not src_cloud.is_same_cloud(dst_cloud):
|
@@ -88,8 +88,8 @@ class Optimizer:
|
|
88
88
|
gigabytes: float) -> float:
|
89
89
|
"""Returns estimated egress time in seconds."""
|
90
90
|
# FIXME: estimate bandwidth between each cloud-region pair.
|
91
|
-
if isinstance(src_cloud, DummyCloud) or isinstance(
|
92
|
-
dst_cloud, DummyCloud):
|
91
|
+
if isinstance(src_cloud, clouds.DummyCloud) or isinstance(
|
92
|
+
dst_cloud, clouds.DummyCloud):
|
93
93
|
return 0.0
|
94
94
|
if not src_cloud.is_same_cloud(dst_cloud):
|
95
95
|
# 10Gbps is close to the average of observed b/w from S3
|
@@ -167,7 +167,7 @@ class Optimizer:
|
|
167
167
|
|
168
168
|
def make_dummy(name):
|
169
169
|
dummy = task_lib.Task(name)
|
170
|
-
dummy.set_resources({DummyResources(DummyCloud()
|
170
|
+
dummy.set_resources({DummyResources(cloud=clouds.DummyCloud())})
|
171
171
|
dummy.set_time_estimator(lambda _: 0)
|
172
172
|
return dummy
|
173
173
|
|
@@ -197,7 +197,7 @@ class Optimizer:
|
|
197
197
|
node: task_lib.Task,
|
198
198
|
resources: resources_lib.Resources,
|
199
199
|
) -> Tuple[Optional[clouds.Cloud], Optional[clouds.Cloud], Optional[float]]:
|
200
|
-
if isinstance(parent_resources.cloud, DummyCloud):
|
200
|
+
if isinstance(parent_resources.cloud, clouds.DummyCloud):
|
201
201
|
# Special case. The current 'node' is a real
|
202
202
|
# source node, and its input may be on a different
|
203
203
|
# cloud from 'resources'.
|
@@ -321,10 +321,10 @@ class Optimizer:
|
|
321
321
|
estimated_runtime = 1 * 3600
|
322
322
|
else:
|
323
323
|
# We assume the time estimator takes in a partial resource
|
324
|
-
# Resources('V100')
|
324
|
+
# Resources(accelerators='V100')
|
325
325
|
# and treats their launchable versions
|
326
|
-
# Resources(
|
327
|
-
# Resources(
|
326
|
+
# Resources(infra='aws', instance_type='p3.2xlarge'),
|
327
|
+
# Resources(infra='gcp', accelerators='V100'),
|
328
328
|
# ...
|
329
329
|
# as having the same run time.
|
330
330
|
# FIXME(zongheng): take 'num_nodes' as an arg/into
|
@@ -772,6 +772,15 @@ class Optimizer:
|
|
772
772
|
f'{colorama.Style.BRIGHT}Estimated total cost: '
|
773
773
|
f'{colorama.Style.RESET_ALL}${total_cost:.1f}\n')
|
774
774
|
|
775
|
+
def _instance_type_str(resources: 'resources_lib.Resources') -> str:
|
776
|
+
instance_type = resources.instance_type
|
777
|
+
assert instance_type is not None, 'Instance type must be specified'
|
778
|
+
if isinstance(resources.cloud, clouds.Kubernetes):
|
779
|
+
instance_type = '-'
|
780
|
+
if resources.use_spot:
|
781
|
+
instance_type = ''
|
782
|
+
return instance_type
|
783
|
+
|
775
784
|
def _get_resources_element_list(
|
776
785
|
resources: 'resources_lib.Resources') -> List[str]:
|
777
786
|
accelerators = resources.get_accelerators_str()
|
@@ -794,22 +803,20 @@ class Optimizer:
|
|
794
803
|
vcpus = format_number(vcpus_)
|
795
804
|
mem = format_number(mem_)
|
796
805
|
|
797
|
-
|
798
|
-
|
799
|
-
|
800
|
-
region_or_zone = resources.zone
|
806
|
+
# Format infra as CLOUD (REGION/ZONE)
|
807
|
+
infra = resources.infra.formatted_str()
|
808
|
+
|
801
809
|
return [
|
802
|
-
|
803
|
-
resources
|
810
|
+
infra,
|
811
|
+
_instance_type_str(resources) + spot,
|
804
812
|
vcpus,
|
805
813
|
mem,
|
806
814
|
str(accelerators),
|
807
|
-
str(region_or_zone),
|
808
815
|
]
|
809
816
|
|
810
817
|
Row = collections.namedtuple('Row', [
|
811
|
-
'
|
812
|
-
'
|
818
|
+
'infra', 'instance', 'vcpus', 'mem', 'accelerators', 'cost_str',
|
819
|
+
'chosen_str'
|
813
820
|
])
|
814
821
|
|
815
822
|
def _get_resources_named_tuple(resources: 'resources_lib.Resources',
|
@@ -833,18 +840,15 @@ class Optimizer:
|
|
833
840
|
vcpus = format_number(vcpus_)
|
834
841
|
mem = format_number(mem_)
|
835
842
|
|
836
|
-
|
837
|
-
region_or_zone = resources.region
|
838
|
-
else:
|
839
|
-
region_or_zone = resources.zone
|
843
|
+
infra = resources.infra.formatted_str()
|
840
844
|
|
841
845
|
chosen_str = ''
|
842
846
|
if chosen:
|
843
847
|
chosen_str = (colorama.Fore.GREEN + ' ' + '\u2714' +
|
844
848
|
colorama.Style.RESET_ALL)
|
845
|
-
row = Row(
|
846
|
-
|
847
|
-
chosen_str)
|
849
|
+
row = Row(infra,
|
850
|
+
_instance_type_str(resources) + spot, vcpus, mem,
|
851
|
+
str(accelerators), cost_str, chosen_str)
|
848
852
|
|
849
853
|
return row
|
850
854
|
|
@@ -862,10 +866,7 @@ class Optimizer:
|
|
862
866
|
return json.dumps(resource_key_dict, sort_keys=True)
|
863
867
|
|
864
868
|
# Print the list of resources that the optimizer considered.
|
865
|
-
resource_fields = [
|
866
|
-
'CLOUD', 'INSTANCE', 'vCPUs', 'Mem(GB)', 'ACCELERATORS',
|
867
|
-
'REGION/ZONE'
|
868
|
-
]
|
869
|
+
resource_fields = ['INFRA', 'INSTANCE', 'vCPUs', 'Mem(GB)', 'GPUS']
|
869
870
|
if len(ordered_best_plan) > 1:
|
870
871
|
best_plan_rows = []
|
871
872
|
for t, r in ordered_best_plan.items():
|
@@ -993,13 +994,19 @@ class Optimizer:
|
|
993
994
|
if len(candidate_list) > 1:
|
994
995
|
is_multi_instances = True
|
995
996
|
instance_list = [
|
996
|
-
res.instance_type
|
997
|
+
res.instance_type
|
998
|
+
for res in candidate_list
|
999
|
+
if res.instance_type is not None
|
997
1000
|
]
|
1001
|
+
candidate_str = resources_utils.format_resource(
|
1002
|
+
candidate_list[0], simplify=True)
|
1003
|
+
|
998
1004
|
logger.info(
|
999
|
-
f'Multiple {cloud} instances
|
1000
|
-
f'{acc_name}:{int(acc_count)}. '
|
1001
|
-
f'The cheapest {
|
1002
|
-
f'among
|
1005
|
+
f'{colorama.Style.DIM}🔍 Multiple {cloud} instances '
|
1006
|
+
f'satisfy {acc_name}:{int(acc_count)}. '
|
1007
|
+
f'The cheapest {candidate_str} is considered '
|
1008
|
+
f'among: {", ".join(instance_list)}.'
|
1009
|
+
f'{colorama.Style.RESET_ALL}')
|
1003
1010
|
if is_multi_instances:
|
1004
1011
|
logger.info(
|
1005
1012
|
f'To list more details, run: sky show-gpus {acc_name}\n')
|
@@ -1151,11 +1158,6 @@ class DummyResources(resources_lib.Resources):
|
|
1151
1158
|
return 0
|
1152
1159
|
|
1153
1160
|
|
1154
|
-
class DummyCloud(clouds.Cloud):
|
1155
|
-
"""A dummy Cloud that has zero egress cost from/to."""
|
1156
|
-
pass
|
1157
|
-
|
1158
|
-
|
1159
1161
|
def _filter_out_blocked_launchable_resources(
|
1160
1162
|
launchable_resources: Iterable[resources_lib.Resources],
|
1161
1163
|
blocked_resources: Iterable[resources_lib.Resources]):
|
sky/provision/gcp/constants.py
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
"""Constants used by the GCP provisioner."""
|
2
|
+
import textwrap
|
2
3
|
|
3
4
|
VERSION = 'v1'
|
4
5
|
# Using v2 according to
|
@@ -53,9 +54,7 @@ CLUSTER_PREFIX_LENGTH = 10
|
|
53
54
|
|
54
55
|
COMPACT_GROUP_PLACEMENT_POLICY = 'compact'
|
55
56
|
COLLOCATED_COLLOCATION = 'COLLOCATED'
|
56
|
-
GPU_DIRECT_TCPX_USER_DATA = """
|
57
|
-
set -e
|
58
|
-
set -x
|
57
|
+
GPU_DIRECT_TCPX_USER_DATA = textwrap.dedent("""
|
59
58
|
# Install GPU Direct TCPX
|
60
59
|
cos-extensions install gpu -- --version=latest;
|
61
60
|
sudo mount --bind /var/lib/nvidia /var/lib/nvidia;
|
@@ -83,7 +82,7 @@ GPU_DIRECT_TCPX_USER_DATA = """#!/bin/bash
|
|
83
82
|
sudo mount --bind /var/lib/tcpx /var/lib/tcpx;
|
84
83
|
sudo mount -o remount,exec /var/lib/tcpx;
|
85
84
|
echo "GPU Direct TCPX installed"
|
86
|
-
"""
|
85
|
+
""")
|
87
86
|
|
88
87
|
GPU_DIRECT_TCPX_SPECIFIC_OPTIONS = [
|
89
88
|
'--cap-add=IPC_LOCK',
|
@@ -106,6 +105,150 @@ GPU_DIRECT_TCPX_SPECIFIC_OPTIONS = [
|
|
106
105
|
'--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/tcpx/lib64',
|
107
106
|
]
|
108
107
|
|
108
|
+
PD_EXTREME_IOPS = 20000
|
109
|
+
DEFAULT_DISK_SIZE = 100
|
110
|
+
NETWORK_STORAGE_TYPE = 'PERSISTENT'
|
111
|
+
INSTANCE_STORAGE_TYPE = 'SCRATCH'
|
112
|
+
INSTANCE_STORAGE_DISK_TYPE = 'local-ssd'
|
113
|
+
INSTANCE_STORAGE_INTERFACE_TYPE = 'NVME'
|
114
|
+
INSTANCE_STORAGE_DEVICE_NAME_PREFIX = '/dev/disk/by-id/google-local-nvme-ssd-'
|
115
|
+
DEVICE_NAME_PREFIX = '/dev/disk/by-id/google-'
|
116
|
+
|
117
|
+
BASH_SCRIPT_START = textwrap.dedent("""#!/bin/bash
|
118
|
+
set -e
|
119
|
+
set -x
|
120
|
+
""")
|
121
|
+
DISK_MOUNT_USER_DATA_TEMPLATE = textwrap.dedent("""
|
122
|
+
# Define arrays for devices and mount points
|
123
|
+
declare -A device_mounts=(
|
124
|
+
{device_mounts}
|
125
|
+
)
|
126
|
+
|
127
|
+
# Function to format and mount a single device
|
128
|
+
format_and_mount() {{
|
129
|
+
local device_name="$1"
|
130
|
+
local mount_point="$2"
|
131
|
+
|
132
|
+
if [ ! -e "$device_name" ]; then
|
133
|
+
echo "Error: Device $device_name does not exist."
|
134
|
+
return 1
|
135
|
+
fi
|
136
|
+
|
137
|
+
# Check if filesystem is already formatted (ext4)
|
138
|
+
if ! sudo blkid "$device_name" | grep -q 'TYPE="ext4"'; then
|
139
|
+
if [[ "$device_name" == "/dev/disk/by-id/google-local-nvme-ssd"* ]]; then
|
140
|
+
echo "Formatting local SSD $device_name..."
|
141
|
+
if ! sudo mkfs.ext4 -F "$device_name"; then
|
142
|
+
echo "Error: Failed to format $device_name"
|
143
|
+
return 1
|
144
|
+
fi
|
145
|
+
else
|
146
|
+
echo "Formatting persistent disk $device_name..."
|
147
|
+
if ! sudo mkfs.ext4 -m 0 -E lazy_itable_init=0,lazy_journal_init=0,discard "$device_name"; then
|
148
|
+
echo "Error: Failed to format $device_name"
|
149
|
+
return 1
|
150
|
+
fi
|
151
|
+
fi
|
152
|
+
else
|
153
|
+
echo "$device_name is already formatted."
|
154
|
+
fi
|
155
|
+
|
156
|
+
# Check if already mounted
|
157
|
+
if ! grep -q "$mount_point" /proc/mounts; then
|
158
|
+
echo "Mounting $device_name to $mount_point..."
|
159
|
+
if ! sudo mkdir -p "$mount_point"; then
|
160
|
+
echo "Error: Failed to create mount point $mount_point"
|
161
|
+
return 1
|
162
|
+
fi
|
163
|
+
|
164
|
+
if ! sudo mount "$device_name" "$mount_point"; then
|
165
|
+
echo "Error: Failed to mount $device_name to $mount_point"
|
166
|
+
return 1
|
167
|
+
fi
|
168
|
+
|
169
|
+
# Add to fstab if not already present
|
170
|
+
if ! grep -q " $mount_point " /etc/fstab; then
|
171
|
+
echo "Adding mount entry to /etc/fstab..."
|
172
|
+
echo "UUID=`sudo blkid -s UUID -o value $device_name` $mount_point ext4 defaults,nofail 0 2" | sudo tee -a /etc/fstab
|
173
|
+
else
|
174
|
+
echo "Mount entry already exists in /etc/fstab"
|
175
|
+
fi
|
176
|
+
else
|
177
|
+
echo "$device_name is already mounted at $mount_point"
|
178
|
+
fi
|
179
|
+
}}
|
180
|
+
|
181
|
+
# Main execution
|
182
|
+
echo "Starting device mounting process..."
|
183
|
+
|
184
|
+
# Process each device-mount pair
|
185
|
+
for device in "${{!device_mounts[@]}}"; do
|
186
|
+
mount_point="${{device_mounts[$device]}}"
|
187
|
+
echo "Processing device: $device -> $mount_point"
|
188
|
+
if ! format_and_mount "$device" "$mount_point"; then
|
189
|
+
echo "Failed to process device $device"
|
190
|
+
# Continue with other devices even if one fails
|
191
|
+
continue
|
192
|
+
fi
|
193
|
+
done
|
194
|
+
|
195
|
+
echo "Device mounting process completed."
|
196
|
+
""")
|
197
|
+
|
198
|
+
# The local SSDs will be attached automatically to the following
|
199
|
+
# machine types with the following number of disks.
|
200
|
+
# Refer to https://cloud.google.com/compute/docs/disks/local-ssd#lssd_disks_fixed
|
201
|
+
SSD_AUTO_ATTACH_MACHINE_TYPES = {
|
202
|
+
'c4a-standard-4-lssd': 1,
|
203
|
+
'c4a-highmem-4-lssd': 1,
|
204
|
+
'c4a-standard-8-lssd': 2,
|
205
|
+
'c4a-highmem-8-lssd': 2,
|
206
|
+
'c4a-standard-16-lssd': 4,
|
207
|
+
'c4a-highmem-16-lssd': 4,
|
208
|
+
'c4a-standard-32-lssd': 6,
|
209
|
+
'c4a-highmem-32-lssd': 6,
|
210
|
+
'c4a-standard-48-lssd': 10,
|
211
|
+
'c4a-highmem-48-lssd': 10,
|
212
|
+
'c4a-standard-64-lssd': 14,
|
213
|
+
'c4a-highmem-64-lssd': 14,
|
214
|
+
'c4a-standard-72-lssd': 16,
|
215
|
+
'c4a-highmem-72-lssd': 16,
|
216
|
+
'c3-standard-4-lssd': 1,
|
217
|
+
'c3-standard-8-lssd': 2,
|
218
|
+
'c3-standard-22-lssd': 4,
|
219
|
+
'c3-standard-44-lssd': 8,
|
220
|
+
'c3-standard-88-lssd': 16,
|
221
|
+
'c3-standard-176-lssd': 32,
|
222
|
+
'c3d-standard-8-lssd': 1,
|
223
|
+
'c3d-highmem-8-lssd': 1,
|
224
|
+
'c3d-standard-16-lssd': 1,
|
225
|
+
'c3d-highmem-16-lssd': 1,
|
226
|
+
'c3d-standard-30-lssd': 2,
|
227
|
+
'c3d-highmem-30-lssd': 2,
|
228
|
+
'c3d-standard-60-lssd': 4,
|
229
|
+
'c3d-highmem-60-lssd': 4,
|
230
|
+
'c3d-standard-90-lssd': 8,
|
231
|
+
'c3d-highmem-90-lssd': 8,
|
232
|
+
'c3d-standard-180-lssd': 16,
|
233
|
+
'c3d-highmem-180-lssd': 16,
|
234
|
+
'c3d-standard-360-lssd': 32,
|
235
|
+
'c3d-highmem-360-lssd': 32,
|
236
|
+
'a4-highgpu-8g': 32,
|
237
|
+
'a3-ultragpu-8g': 32,
|
238
|
+
'a3-megagpu-8g': 16,
|
239
|
+
'a3-highgpu-1g': 2,
|
240
|
+
'a3-highgpu-2g': 4,
|
241
|
+
'a3-highgpu-4g': 8,
|
242
|
+
'a3-highgpu-8g': 16,
|
243
|
+
'a3-edgegpu-8g': 16,
|
244
|
+
'a2-ultragpu-1g': 1,
|
245
|
+
'a2-ultragpu-2g': 2,
|
246
|
+
'a2-ultragpu-4g': 4,
|
247
|
+
'a2-ultragpu-8g': 8,
|
248
|
+
'z3-highmem-88': 12,
|
249
|
+
'z3-highmem-176': 12,
|
250
|
+
}
|
251
|
+
|
109
252
|
# Below parameters are from the default VPC on GCP.
|
110
253
|
# https://cloud.google.com/vpc/docs/firewalls#more_rules_default_vpc
|
111
254
|
VPC_TEMPLATE: dict = {
|
@@ -826,6 +826,16 @@ class GCPComputeInstance(GCPInstance):
|
|
826
826
|
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/bulkInsert # pylint: disable=line-too-long
|
827
827
|
if config.get('sourceMachineImage') is not None:
|
828
828
|
return False
|
829
|
+
# bulkInsert does not support attaching existing
|
830
|
+
# disks to the instances with READ_WRITE mode.
|
831
|
+
if config.get('disks') is not None:
|
832
|
+
for disk in config['disks']:
|
833
|
+
if disk.get('source') is not None and disk.get(
|
834
|
+
'mode', 'READ_WRITE') == 'READ_WRITE':
|
835
|
+
return False
|
836
|
+
if disk.get('initializeParams') is not None and disk.get(
|
837
|
+
'initializeParams', {}).get('diskName') is not None:
|
838
|
+
return False
|
829
839
|
return True
|
830
840
|
|
831
841
|
@classmethod
|
@@ -0,0 +1,247 @@
|
|
1
|
+
"""Utilities for GCP volumes."""
|
2
|
+
from typing import Any, Dict, List, Optional
|
3
|
+
|
4
|
+
from sky import clouds
|
5
|
+
from sky import exceptions
|
6
|
+
from sky import sky_logging
|
7
|
+
from sky.adaptors import gcp
|
8
|
+
from sky.provision.gcp import constants
|
9
|
+
from sky.utils import resources_utils
|
10
|
+
from sky.utils import ux_utils
|
11
|
+
|
12
|
+
logger = sky_logging.init_logger(__name__)
|
13
|
+
|
14
|
+
|
15
|
+
def get_data_disk_tier_mapping(
|
16
|
+
instance_type: Optional[str],) -> Dict[resources_utils.DiskTier, str]:
|
17
|
+
# Define the default mapping from disk tiers to disk types.
|
18
|
+
# Refer to https://cloud.google.com/compute/docs/disks/hyperdisks
|
19
|
+
# and https://cloud.google.com/compute/docs/disks/persistent-disks
|
20
|
+
tier2name = {
|
21
|
+
resources_utils.DiskTier.ULTRA: 'pd-extreme',
|
22
|
+
resources_utils.DiskTier.HIGH: 'pd-ssd',
|
23
|
+
resources_utils.DiskTier.MEDIUM: 'pd-balanced',
|
24
|
+
resources_utils.DiskTier.LOW: 'pd-standard',
|
25
|
+
}
|
26
|
+
|
27
|
+
if instance_type is None:
|
28
|
+
return tier2name
|
29
|
+
|
30
|
+
# Remap series-specific disk types.
|
31
|
+
series = instance_type.split('-')[0]
|
32
|
+
|
33
|
+
if series in ['a4', 'x4']:
|
34
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
|
35
|
+
tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
|
36
|
+
tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
|
37
|
+
tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
|
38
|
+
elif series in ['m4']:
|
39
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
|
40
|
+
tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
|
41
|
+
tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
|
42
|
+
tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
|
43
|
+
num_cpus = int(instance_type.split('-')[2]) # type: ignore
|
44
|
+
if num_cpus < 112:
|
45
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
|
46
|
+
elif series in ['c4', 'c4a', 'c4d']:
|
47
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
|
48
|
+
tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
|
49
|
+
tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
|
50
|
+
tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
|
51
|
+
num_cpus = int(instance_type.split('-')[2]) # type: ignore
|
52
|
+
if num_cpus < 64:
|
53
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
|
54
|
+
elif series in ['a3']:
|
55
|
+
if (instance_type.startswith('a3-ultragpu') or
|
56
|
+
instance_type.startswith('a3-megagpu') or
|
57
|
+
instance_type.startswith('a3-edgegpu')):
|
58
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
|
59
|
+
tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
|
60
|
+
tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
|
61
|
+
tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
|
62
|
+
elif instance_type.startswith('a3-highgpu'):
|
63
|
+
tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
|
64
|
+
if instance_type.startswith('a3-highgpu-8g'):
|
65
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
|
66
|
+
tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
|
67
|
+
tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
|
68
|
+
elif instance_type.startswith('a3-highgpu-4g'):
|
69
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
|
70
|
+
else:
|
71
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
|
72
|
+
elif series in ['c3d']:
|
73
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
|
74
|
+
tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
|
75
|
+
tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
|
76
|
+
tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
|
77
|
+
num_cpus = int(instance_type.split('-')[2]) # type: ignore
|
78
|
+
if num_cpus < 60:
|
79
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
|
80
|
+
elif series in ['c3']:
|
81
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
|
82
|
+
tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
|
83
|
+
tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
|
84
|
+
tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
|
85
|
+
num_cpus = int(instance_type.split('-')[2]) # type: ignore
|
86
|
+
if num_cpus < 88:
|
87
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
|
88
|
+
elif series in ['n4']:
|
89
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
|
90
|
+
tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
|
91
|
+
tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
|
92
|
+
tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
|
93
|
+
elif series in ['n2d', 'n1', 't2d', 't2a', 'e2', 'c2', 'c2d', 'a2']:
|
94
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
|
95
|
+
elif series in ['z3']:
|
96
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
|
97
|
+
tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
|
98
|
+
elif series in ['h3']:
|
99
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
|
100
|
+
tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
|
101
|
+
tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
|
102
|
+
elif series in ['m3']:
|
103
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
|
104
|
+
tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
|
105
|
+
tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
|
106
|
+
tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
|
107
|
+
num_cpus = int(instance_type.split('-')[2]) # type: ignore
|
108
|
+
if num_cpus < 64:
|
109
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
|
110
|
+
elif series in ['m2']:
|
111
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
|
112
|
+
tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
|
113
|
+
elif series in ['m1']:
|
114
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
|
115
|
+
tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
|
116
|
+
num_cpus = int(instance_type.split('-')[2]) # type: ignore
|
117
|
+
if num_cpus < 80:
|
118
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
|
119
|
+
elif series in ['g2']:
|
120
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
|
121
|
+
tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
|
122
|
+
elif series in ['n2']:
|
123
|
+
num_cpus = int(instance_type.split('-')[2]) # type: ignore
|
124
|
+
if num_cpus < 64:
|
125
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
|
126
|
+
elif num_cpus >= 80:
|
127
|
+
tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
|
128
|
+
|
129
|
+
return tier2name
|
130
|
+
|
131
|
+
|
132
|
+
def validate_instance_volumes(
|
133
|
+
instance_type: Optional[str],
|
134
|
+
volumes: Optional[List[Dict[str, Any]]],
|
135
|
+
) -> None:
|
136
|
+
if not volumes:
|
137
|
+
return
|
138
|
+
if instance_type is None:
|
139
|
+
logger.warning('Instance type is not specified,'
|
140
|
+
' skipping instance volume validation')
|
141
|
+
return
|
142
|
+
instance_volume_count = 0
|
143
|
+
for volume in volumes:
|
144
|
+
if volume['storage_type'] == resources_utils.StorageType.INSTANCE:
|
145
|
+
instance_volume_count += 1
|
146
|
+
if (instance_type in constants.SSD_AUTO_ATTACH_MACHINE_TYPES and
|
147
|
+
instance_volume_count >
|
148
|
+
constants.SSD_AUTO_ATTACH_MACHINE_TYPES[instance_type]):
|
149
|
+
raise exceptions.ResourcesUnavailableError(
|
150
|
+
f'The instance type {instance_type} supports'
|
151
|
+
f' {constants.SSD_AUTO_ATTACH_MACHINE_TYPES[instance_type]}'
|
152
|
+
f' instance storage, but {instance_volume_count} are specified')
|
153
|
+
# TODO(hailong):
|
154
|
+
# check the instance storage count for the other instance types,
|
155
|
+
# refer to https://cloud.google.com/compute/docs/disks/local-ssd
|
156
|
+
|
157
|
+
|
158
|
+
def translate_attach_mode(attach_mode: resources_utils.DiskAttachMode) -> str:
|
159
|
+
if attach_mode == resources_utils.DiskAttachMode.READ_ONLY:
|
160
|
+
return 'READ_ONLY'
|
161
|
+
return 'READ_WRITE'
|
162
|
+
|
163
|
+
|
164
|
+
def check_volume_name_exist_in_region(
|
165
|
+
project_id: str, region: clouds.Region, use_mig: bool,
|
166
|
+
volume_name: str) -> Optional[Dict[str, Any]]:
|
167
|
+
"""Check if the volume name exists and return the volume info."""
|
168
|
+
logger.debug(f'Checking volume {volume_name} in region {region}')
|
169
|
+
try:
|
170
|
+
compute = gcp.build('compute',
|
171
|
+
'v1',
|
172
|
+
credentials=None,
|
173
|
+
cache_discovery=False)
|
174
|
+
except gcp.credential_error_exception():
|
175
|
+
with ux_utils.print_exception_no_traceback():
|
176
|
+
raise ValueError('Not able to build compute client') from None
|
177
|
+
|
178
|
+
# Get all the zones in the region
|
179
|
+
all_zones = compute.zones().list(project=project_id).execute()
|
180
|
+
region_zones = []
|
181
|
+
if 'items' in all_zones:
|
182
|
+
for zone in all_zones['items']:
|
183
|
+
if zone['region'].split('/')[-1] == region.name:
|
184
|
+
region_zones.append(zone['name'])
|
185
|
+
volume_info = None
|
186
|
+
for zone in region_zones:
|
187
|
+
try:
|
188
|
+
volume_info = compute.disks().get(project=project_id,
|
189
|
+
zone=zone,
|
190
|
+
disk=volume_name).execute()
|
191
|
+
if volume_info is not None:
|
192
|
+
if use_mig:
|
193
|
+
# With MIG, instance template will be used, in this case,
|
194
|
+
# the `selfLink` for zonal disk needs to be the volume name
|
195
|
+
# Refer to https://cloud.google.com/compute/docs/
|
196
|
+
# reference/rest/v1/instances/insert
|
197
|
+
volume_info['selfLink'] = volume_name
|
198
|
+
volume_info['available_zones'] = [zone]
|
199
|
+
return volume_info
|
200
|
+
except gcp.http_error_exception() as e:
|
201
|
+
if e.resp.status == 403:
|
202
|
+
with ux_utils.print_exception_no_traceback():
|
203
|
+
raise ValueError('Not able to access the volume '
|
204
|
+
f'{volume_name!r}') from None
|
205
|
+
if e.resp.status == 404:
|
206
|
+
continue # Try next zone
|
207
|
+
raise
|
208
|
+
|
209
|
+
# If not found in any zone, check region disk
|
210
|
+
try:
|
211
|
+
volume_info = compute.regionDisks().get(project=project_id,
|
212
|
+
region=region.name,
|
213
|
+
disk=volume_name).execute()
|
214
|
+
# 'replicaZones':
|
215
|
+
# ['https://xxx/compute/v1/projects/sky-dev-465/zones/us-central1-a',
|
216
|
+
# 'https://xxx/compute/v1/projects/sky-dev-465/zones/us-central1-c']
|
217
|
+
if volume_info is not None and 'replicaZones' in volume_info:
|
218
|
+
replica_zones = [
|
219
|
+
zone.split('/')[-1] for zone in volume_info['replicaZones']
|
220
|
+
]
|
221
|
+
volume_info['available_zones'] = replica_zones
|
222
|
+
return volume_info
|
223
|
+
except gcp.http_error_exception() as e:
|
224
|
+
if e.resp.status == 403:
|
225
|
+
with ux_utils.print_exception_no_traceback():
|
226
|
+
raise ValueError('Not able to access the volume '
|
227
|
+
f'{volume_name!r}') from None
|
228
|
+
if e.resp.status == 404:
|
229
|
+
logger.warning(
|
230
|
+
f'Volume {volume_name} is not found in region {region}.'
|
231
|
+
f' It will be created.')
|
232
|
+
return volume_info
|
233
|
+
raise
|
234
|
+
|
235
|
+
|
236
|
+
def check_volume_zone_match(volume_name: str,
|
237
|
+
zones: Optional[List[clouds.Zone]],
|
238
|
+
available_zones: List[str]):
|
239
|
+
if zones is None:
|
240
|
+
return None
|
241
|
+
for zone in zones:
|
242
|
+
if zone.name in available_zones:
|
243
|
+
return None
|
244
|
+
with ux_utils.print_exception_no_traceback():
|
245
|
+
# Return a ResourcesUnavailableError to trigger failover
|
246
|
+
raise exceptions.ResourcesUnavailableError(
|
247
|
+
f'Volume {volume_name} not available in zones {zones}') from None
|