skypilot-nightly 1.0.0.dev20250520__py3-none-any.whl → 1.0.0.dev20250522__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +56 -37
  4. sky/check.py +3 -3
  5. sky/cli.py +89 -16
  6. sky/client/cli.py +89 -16
  7. sky/client/sdk.py +92 -4
  8. sky/clouds/__init__.py +2 -0
  9. sky/clouds/cloud.py +6 -0
  10. sky/clouds/gcp.py +156 -21
  11. sky/clouds/service_catalog/__init__.py +3 -0
  12. sky/clouds/service_catalog/common.py +9 -2
  13. sky/clouds/service_catalog/constants.py +1 -0
  14. sky/core.py +6 -8
  15. sky/dashboard/out/404.html +1 -1
  16. sky/dashboard/out/_next/static/CzOVV6JpRQBRt5GhZuhyK/_buildManifest.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +6 -0
  18. sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/{678-206dddca808e6d16.js → 582-683f4f27b81996dc.js} +2 -2
  22. sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +1 -0
  29. sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +3 -0
  30. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  31. sky/dashboard/out/clusters/[cluster].html +1 -1
  32. sky/dashboard/out/clusters.html +1 -1
  33. sky/dashboard/out/index.html +1 -1
  34. sky/dashboard/out/infra.html +1 -0
  35. sky/dashboard/out/jobs/[job].html +1 -1
  36. sky/dashboard/out/jobs.html +1 -1
  37. sky/data/storage.py +1 -0
  38. sky/execution.py +57 -8
  39. sky/jobs/server/core.py +5 -3
  40. sky/jobs/utils.py +38 -7
  41. sky/optimizer.py +41 -39
  42. sky/provision/gcp/constants.py +147 -4
  43. sky/provision/gcp/instance_utils.py +10 -0
  44. sky/provision/gcp/volume_utils.py +247 -0
  45. sky/provision/provisioner.py +16 -7
  46. sky/resources.py +233 -18
  47. sky/serve/serve_utils.py +5 -13
  48. sky/serve/server/core.py +2 -4
  49. sky/server/common.py +60 -14
  50. sky/server/constants.py +2 -0
  51. sky/server/html/token_page.html +154 -0
  52. sky/server/requests/executor.py +3 -6
  53. sky/server/requests/payloads.py +3 -3
  54. sky/server/server.py +40 -8
  55. sky/skypilot_config.py +117 -31
  56. sky/task.py +24 -1
  57. sky/templates/gcp-ray.yml.j2 +44 -1
  58. sky/templates/nebius-ray.yml.j2 +0 -2
  59. sky/utils/admin_policy_utils.py +26 -22
  60. sky/utils/cli_utils/status_utils.py +95 -56
  61. sky/utils/common_utils.py +35 -2
  62. sky/utils/context.py +36 -6
  63. sky/utils/context_utils.py +15 -0
  64. sky/utils/infra_utils.py +175 -0
  65. sky/utils/resources_utils.py +55 -21
  66. sky/utils/schemas.py +111 -5
  67. {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/METADATA +1 -1
  68. {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/RECORD +73 -68
  69. {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/WHEEL +1 -1
  70. sky/dashboard/out/_next/static/8hlc2dkbIDDBOkxtEW7X6/_buildManifest.js +0 -1
  71. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  72. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  73. sky/dashboard/out/_next/static/chunks/845-0ca6f2c1ba667c3b.js +0 -1
  74. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  75. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  76. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  77. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  78. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  79. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  80. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  81. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  82. /sky/dashboard/out/_next/static/{8hlc2dkbIDDBOkxtEW7X6 → CzOVV6JpRQBRt5GhZuhyK}/_ssgManifest.js +0 -0
  83. {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/entry_points.txt +0 -0
  84. {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/licenses/LICENSE +0 -0
  85. {skypilot_nightly-1.0.0.dev20250520.dist-info → skypilot_nightly-1.0.0.dev20250522.dist-info}/top_level.txt +0 -0
sky/optimizer.py CHANGED
@@ -73,8 +73,8 @@ class Optimizer:
73
73
  def _egress_cost(src_cloud: clouds.Cloud, dst_cloud: clouds.Cloud,
74
74
  gigabytes: float) -> float:
75
75
  """Returns estimated egress cost."""
76
- if isinstance(src_cloud, DummyCloud) or isinstance(
77
- dst_cloud, DummyCloud):
76
+ if isinstance(src_cloud, clouds.DummyCloud) or isinstance(
77
+ dst_cloud, clouds.DummyCloud):
78
78
  return 0.0
79
79
 
80
80
  if not src_cloud.is_same_cloud(dst_cloud):
@@ -88,8 +88,8 @@ class Optimizer:
88
88
  gigabytes: float) -> float:
89
89
  """Returns estimated egress time in seconds."""
90
90
  # FIXME: estimate bandwidth between each cloud-region pair.
91
- if isinstance(src_cloud, DummyCloud) or isinstance(
92
- dst_cloud, DummyCloud):
91
+ if isinstance(src_cloud, clouds.DummyCloud) or isinstance(
92
+ dst_cloud, clouds.DummyCloud):
93
93
  return 0.0
94
94
  if not src_cloud.is_same_cloud(dst_cloud):
95
95
  # 10Gbps is close to the average of observed b/w from S3
@@ -167,7 +167,7 @@ class Optimizer:
167
167
 
168
168
  def make_dummy(name):
169
169
  dummy = task_lib.Task(name)
170
- dummy.set_resources({DummyResources(DummyCloud(), None)})
170
+ dummy.set_resources({DummyResources(cloud=clouds.DummyCloud())})
171
171
  dummy.set_time_estimator(lambda _: 0)
172
172
  return dummy
173
173
 
@@ -197,7 +197,7 @@ class Optimizer:
197
197
  node: task_lib.Task,
198
198
  resources: resources_lib.Resources,
199
199
  ) -> Tuple[Optional[clouds.Cloud], Optional[clouds.Cloud], Optional[float]]:
200
- if isinstance(parent_resources.cloud, DummyCloud):
200
+ if isinstance(parent_resources.cloud, clouds.DummyCloud):
201
201
  # Special case. The current 'node' is a real
202
202
  # source node, and its input may be on a different
203
203
  # cloud from 'resources'.
@@ -321,10 +321,10 @@ class Optimizer:
321
321
  estimated_runtime = 1 * 3600
322
322
  else:
323
323
  # We assume the time estimator takes in a partial resource
324
- # Resources('V100')
324
+ # Resources(accelerators='V100')
325
325
  # and treats their launchable versions
326
- # Resources(AWS, 'p3.2xlarge'),
327
- # Resources(GCP, '...', 'V100'),
326
+ # Resources(infra='aws', instance_type='p3.2xlarge'),
327
+ # Resources(infra='gcp', accelerators='V100'),
328
328
  # ...
329
329
  # as having the same run time.
330
330
  # FIXME(zongheng): take 'num_nodes' as an arg/into
@@ -772,6 +772,15 @@ class Optimizer:
772
772
  f'{colorama.Style.BRIGHT}Estimated total cost: '
773
773
  f'{colorama.Style.RESET_ALL}${total_cost:.1f}\n')
774
774
 
775
+ def _instance_type_str(resources: 'resources_lib.Resources') -> str:
776
+ instance_type = resources.instance_type
777
+ assert instance_type is not None, 'Instance type must be specified'
778
+ if isinstance(resources.cloud, clouds.Kubernetes):
779
+ instance_type = '-'
780
+ if resources.use_spot:
781
+ instance_type = ''
782
+ return instance_type
783
+
775
784
  def _get_resources_element_list(
776
785
  resources: 'resources_lib.Resources') -> List[str]:
777
786
  accelerators = resources.get_accelerators_str()
@@ -794,22 +803,20 @@ class Optimizer:
794
803
  vcpus = format_number(vcpus_)
795
804
  mem = format_number(mem_)
796
805
 
797
- if resources.zone is None:
798
- region_or_zone = resources.region
799
- else:
800
- region_or_zone = resources.zone
806
+ # Format infra as CLOUD (REGION/ZONE)
807
+ infra = resources.infra.formatted_str()
808
+
801
809
  return [
802
- str(cloud),
803
- resources.instance_type + spot,
810
+ infra,
811
+ _instance_type_str(resources) + spot,
804
812
  vcpus,
805
813
  mem,
806
814
  str(accelerators),
807
- str(region_or_zone),
808
815
  ]
809
816
 
810
817
  Row = collections.namedtuple('Row', [
811
- 'cloud', 'instance', 'vcpus', 'mem', 'accelerators',
812
- 'region_or_zone', 'cost_str', 'chosen_str'
818
+ 'infra', 'instance', 'vcpus', 'mem', 'accelerators', 'cost_str',
819
+ 'chosen_str'
813
820
  ])
814
821
 
815
822
  def _get_resources_named_tuple(resources: 'resources_lib.Resources',
@@ -833,18 +840,15 @@ class Optimizer:
833
840
  vcpus = format_number(vcpus_)
834
841
  mem = format_number(mem_)
835
842
 
836
- if resources.zone is None:
837
- region_or_zone = resources.region
838
- else:
839
- region_or_zone = resources.zone
843
+ infra = resources.infra.formatted_str()
840
844
 
841
845
  chosen_str = ''
842
846
  if chosen:
843
847
  chosen_str = (colorama.Fore.GREEN + ' ' + '\u2714' +
844
848
  colorama.Style.RESET_ALL)
845
- row = Row(cloud, resources.instance_type + spot, vcpus, mem,
846
- str(accelerators), str(region_or_zone), cost_str,
847
- chosen_str)
849
+ row = Row(infra,
850
+ _instance_type_str(resources) + spot, vcpus, mem,
851
+ str(accelerators), cost_str, chosen_str)
848
852
 
849
853
  return row
850
854
 
@@ -862,10 +866,7 @@ class Optimizer:
862
866
  return json.dumps(resource_key_dict, sort_keys=True)
863
867
 
864
868
  # Print the list of resouces that the optimizer considered.
865
- resource_fields = [
866
- 'CLOUD', 'INSTANCE', 'vCPUs', 'Mem(GB)', 'ACCELERATORS',
867
- 'REGION/ZONE'
868
- ]
869
+ resource_fields = ['INFRA', 'INSTANCE', 'vCPUs', 'Mem(GB)', 'GPUS']
869
870
  if len(ordered_best_plan) > 1:
870
871
  best_plan_rows = []
871
872
  for t, r in ordered_best_plan.items():
@@ -993,13 +994,19 @@ class Optimizer:
993
994
  if len(candidate_list) > 1:
994
995
  is_multi_instances = True
995
996
  instance_list = [
996
- res.instance_type for res in candidate_list
997
+ res.instance_type
998
+ for res in candidate_list
999
+ if res.instance_type is not None
997
1000
  ]
1001
+ candidate_str = resources_utils.format_resource(
1002
+ candidate_list[0], simplify=True)
1003
+
998
1004
  logger.info(
999
- f'Multiple {cloud} instances satisfy '
1000
- f'{acc_name}:{int(acc_count)}. '
1001
- f'The cheapest {candidate_list[0]!r} is considered '
1002
- f'among:\n{instance_list}.')
1005
+ f'{colorama.Style.DIM}🔍 Multiple {cloud} instances '
1006
+ f'satisfy {acc_name}:{int(acc_count)}. '
1007
+ f'The cheapest {candidate_str} is considered '
1008
+ f'among: {", ".join(instance_list)}.'
1009
+ f'{colorama.Style.RESET_ALL}')
1003
1010
  if is_multi_instances:
1004
1011
  logger.info(
1005
1012
  f'To list more details, run: sky show-gpus {acc_name}\n')
@@ -1151,11 +1158,6 @@ class DummyResources(resources_lib.Resources):
1151
1158
  return 0
1152
1159
 
1153
1160
 
1154
- class DummyCloud(clouds.Cloud):
1155
- """A dummy Cloud that has zero egress cost from/to."""
1156
- pass
1157
-
1158
-
1159
1161
  def _filter_out_blocked_launchable_resources(
1160
1162
  launchable_resources: Iterable[resources_lib.Resources],
1161
1163
  blocked_resources: Iterable[resources_lib.Resources]):
@@ -1,4 +1,5 @@
1
1
  """Constants used by the GCP provisioner."""
2
+ import textwrap
2
3
 
3
4
  VERSION = 'v1'
4
5
  # Using v2 according to
@@ -53,9 +54,7 @@ CLUSTER_PREFIX_LENGTH = 10
53
54
 
54
55
  COMPACT_GROUP_PLACEMENT_POLICY = 'compact'
55
56
  COLLOCATED_COLLOCATION = 'COLLOCATED'
56
- GPU_DIRECT_TCPX_USER_DATA = """#!/bin/bash
57
- set -e
58
- set -x
57
+ GPU_DIRECT_TCPX_USER_DATA = textwrap.dedent("""
59
58
  # Install GPU Direct TCPX
60
59
  cos-extensions install gpu -- --version=latest;
61
60
  sudo mount --bind /var/lib/nvidia /var/lib/nvidia;
@@ -83,7 +82,7 @@ GPU_DIRECT_TCPX_USER_DATA = """#!/bin/bash
83
82
  sudo mount --bind /var/lib/tcpx /var/lib/tcpx;
84
83
  sudo mount -o remount,exec /var/lib/tcpx;
85
84
  echo "GPU Direct TCPX installed"
86
- """
85
+ """)
87
86
 
88
87
  GPU_DIRECT_TCPX_SPECIFIC_OPTIONS = [
89
88
  '--cap-add=IPC_LOCK',
@@ -106,6 +105,150 @@ GPU_DIRECT_TCPX_SPECIFIC_OPTIONS = [
106
105
  '--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/tcpx/lib64',
107
106
  ]
108
107
 
108
+ PD_EXTREME_IOPS = 20000
109
+ DEFAULT_DISK_SIZE = 100
110
+ NETWORK_STORAGE_TYPE = 'PERSISTENT'
111
+ INSTANCE_STORAGE_TYPE = 'SCRATCH'
112
+ INSTANCE_STORAGE_DISK_TYPE = 'local-ssd'
113
+ INSTANCE_STORAGE_INTERFACE_TYPE = 'NVME'
114
+ INSTANCE_STORAGE_DEVICE_NAME_PREFIX = '/dev/disk/by-id/google-local-nvme-ssd-'
115
+ DEVICE_NAME_PREFIX = '/dev/disk/by-id/google-'
116
+
117
+ BASH_SCRIPT_START = textwrap.dedent("""#!/bin/bash
118
+ set -e
119
+ set -x
120
+ """)
121
+ DISK_MOUNT_USER_DATA_TEMPLATE = textwrap.dedent("""
122
+ # Define arrays for devices and mount points
123
+ declare -A device_mounts=(
124
+ {device_mounts}
125
+ )
126
+
127
+ # Function to format and mount a single device
128
+ format_and_mount() {{
129
+ local device_name="$1"
130
+ local mount_point="$2"
131
+
132
+ if [ ! -e "$device_name" ]; then
133
+ echo "Error: Device $device_name does not exist."
134
+ return 1
135
+ fi
136
+
137
+ # Check if filesystem is already formatted (ext4)
138
+ if ! sudo blkid "$device_name" | grep -q 'TYPE="ext4"'; then
139
+ if [[ "$device_name" == "/dev/disk/by-id/google-local-nvme-ssd"* ]]; then
140
+ echo "Formatting local SSD $device_name..."
141
+ if ! sudo mkfs.ext4 -F "$device_name"; then
142
+ echo "Error: Failed to format $device_name"
143
+ return 1
144
+ fi
145
+ else
146
+ echo "Formatting persistent disk $device_name..."
147
+ if ! sudo mkfs.ext4 -m 0 -E lazy_itable_init=0,lazy_journal_init=0,discard "$device_name"; then
148
+ echo "Error: Failed to format $device_name"
149
+ return 1
150
+ fi
151
+ fi
152
+ else
153
+ echo "$device_name is already formatted."
154
+ fi
155
+
156
+ # Check if already mounted
157
+ if ! grep -q "$mount_point" /proc/mounts; then
158
+ echo "Mounting $device_name to $mount_point..."
159
+ if ! sudo mkdir -p "$mount_point"; then
160
+ echo "Error: Failed to create mount point $mount_point"
161
+ return 1
162
+ fi
163
+
164
+ if ! sudo mount "$device_name" "$mount_point"; then
165
+ echo "Error: Failed to mount $device_name to $mount_point"
166
+ return 1
167
+ fi
168
+
169
+ # Add to fstab if not already present
170
+ if ! grep -q " $mount_point " /etc/fstab; then
171
+ echo "Adding mount entry to /etc/fstab..."
172
+ echo "UUID=`sudo blkid -s UUID -o value $device_name` $mount_point ext4 defaults,nofail 0 2" | sudo tee -a /etc/fstab
173
+ else
174
+ echo "Mount entry already exists in /etc/fstab"
175
+ fi
176
+ else
177
+ echo "$device_name is already mounted at $mount_point"
178
+ fi
179
+ }}
180
+
181
+ # Main execution
182
+ echo "Starting device mounting process..."
183
+
184
+ # Process each device-mount pair
185
+ for device in "${{!device_mounts[@]}}"; do
186
+ mount_point="${{device_mounts[$device]}}"
187
+ echo "Processing device: $device -> $mount_point"
188
+ if ! format_and_mount "$device" "$mount_point"; then
189
+ echo "Failed to process device $device"
190
+ # Continue with other devices even if one fails
191
+ continue
192
+ fi
193
+ done
194
+
195
+ echo "Device mounting process completed."
196
+ """)
197
+
198
+ # The local SSDs will be attached automatically to the following
199
+ # machine types with the following number of disks.
200
+ # Refer to https://cloud.google.com/compute/docs/disks/local-ssd#lssd_disks_fixed
201
+ SSD_AUTO_ATTACH_MACHINE_TYPES = {
202
+ 'c4a-standard-4-lssd': 1,
203
+ 'c4a-highmem-4-lssd': 1,
204
+ 'c4a-standard-8-lssd': 2,
205
+ 'c4a-highmem-8-lssd': 2,
206
+ 'c4a-standard-16-lssd': 4,
207
+ 'c4a-highmem-16-lssd': 4,
208
+ 'c4a-standard-32-lssd': 6,
209
+ 'c4a-highmem-32-lssd': 6,
210
+ 'c4a-standard-48-lssd': 10,
211
+ 'c4a-highmem-48-lssd': 10,
212
+ 'c4a-standard-64-lssd': 14,
213
+ 'c4a-highmem-64-lssd': 14,
214
+ 'c4a-standard-72-lssd': 16,
215
+ 'c4a-highmem-72-lssd': 16,
216
+ 'c3-standard-4-lssd': 1,
217
+ 'c3-standard-8-lssd': 2,
218
+ 'c3-standard-22-lssd': 4,
219
+ 'c3-standard-44-lssd': 8,
220
+ 'c3-standard-88-lssd': 16,
221
+ 'c3-standard-176-lssd': 32,
222
+ 'c3d-standard-8-lssd': 1,
223
+ 'c3d-highmem-8-lssd': 1,
224
+ 'c3d-standard-16-lssd': 1,
225
+ 'c3d-highmem-16-lssd': 1,
226
+ 'c3d-standard-30-lssd': 2,
227
+ 'c3d-highmem-30-lssd': 2,
228
+ 'c3d-standard-60-lssd': 4,
229
+ 'c3d-highmem-60-lssd': 4,
230
+ 'c3d-standard-90-lssd': 8,
231
+ 'c3d-highmem-90-lssd': 8,
232
+ 'c3d-standard-180-lssd': 16,
233
+ 'c3d-highmem-180-lssd': 16,
234
+ 'c3d-standard-360-lssd': 32,
235
+ 'c3d-highmem-360-lssd': 32,
236
+ 'a4-highgpu-8g': 32,
237
+ 'a3-ultragpu-8g': 32,
238
+ 'a3-megagpu-8g': 16,
239
+ 'a3-highgpu-1g': 2,
240
+ 'a3-highgpu-2g': 4,
241
+ 'a3-highgpu-4g': 8,
242
+ 'a3-highgpu-8g': 16,
243
+ 'a3-edgegpu-8g': 16,
244
+ 'a2-ultragpu-1g': 1,
245
+ 'a2-ultragpu-2g': 2,
246
+ 'a2-ultragpu-4g': 4,
247
+ 'a2-ultragpu-8g': 8,
248
+ 'z3-highmem-88': 12,
249
+ 'z3-highmem-176': 12,
250
+ }
251
+
109
252
  # Below parameters are from the default VPC on GCP.
110
253
  # https://cloud.google.com/vpc/docs/firewalls#more_rules_default_vpc
111
254
  VPC_TEMPLATE: dict = {
@@ -826,6 +826,16 @@ class GCPComputeInstance(GCPInstance):
826
826
  # https://cloud.google.com/compute/docs/reference/rest/v1/instances/bulkInsert # pylint: disable=line-too-long
827
827
  if config.get('sourceMachineImage') is not None:
828
828
  return False
829
+ # bulkInsert does not support attaching existing
830
+ # disks to the instances with READ_WRITE mode.
831
+ if config.get('disks') is not None:
832
+ for disk in config['disks']:
833
+ if disk.get('source') is not None and disk.get(
834
+ 'mode', 'READ_WRITE') == 'READ_WRITE':
835
+ return False
836
+ if disk.get('initializeParams') is not None and disk.get(
837
+ 'initializeParams', {}).get('diskName') is not None:
838
+ return False
829
839
  return True
830
840
 
831
841
  @classmethod
@@ -0,0 +1,247 @@
1
+ """Utilities for GCP volumes."""
2
+ from typing import Any, Dict, List, Optional
3
+
4
+ from sky import clouds
5
+ from sky import exceptions
6
+ from sky import sky_logging
7
+ from sky.adaptors import gcp
8
+ from sky.provision.gcp import constants
9
+ from sky.utils import resources_utils
10
+ from sky.utils import ux_utils
11
+
12
+ logger = sky_logging.init_logger(__name__)
13
+
14
+
15
+ def get_data_disk_tier_mapping(
16
+ instance_type: Optional[str],) -> Dict[resources_utils.DiskTier, str]:
17
+ # Define the default mapping from disk tiers to disk types.
18
+ # Refer to https://cloud.google.com/compute/docs/disks/hyperdisks
19
+ # and https://cloud.google.com/compute/docs/disks/persistent-disks
20
+ tier2name = {
21
+ resources_utils.DiskTier.ULTRA: 'pd-extreme',
22
+ resources_utils.DiskTier.HIGH: 'pd-ssd',
23
+ resources_utils.DiskTier.MEDIUM: 'pd-balanced',
24
+ resources_utils.DiskTier.LOW: 'pd-standard',
25
+ }
26
+
27
+ if instance_type is None:
28
+ return tier2name
29
+
30
+ # Remap series-specific disk types.
31
+ series = instance_type.split('-')[0]
32
+
33
+ if series in ['a4', 'x4']:
34
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
35
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
36
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
37
+ tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
38
+ elif series in ['m4']:
39
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
40
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
41
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
42
+ tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
43
+ num_cpus = int(instance_type.split('-')[2]) # type: ignore
44
+ if num_cpus < 112:
45
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
46
+ elif series in ['c4', 'c4a', 'c4d']:
47
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
48
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
49
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
50
+ tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
51
+ num_cpus = int(instance_type.split('-')[2]) # type: ignore
52
+ if num_cpus < 64:
53
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
54
+ elif series in ['a3']:
55
+ if (instance_type.startswith('a3-ultragpu') or
56
+ instance_type.startswith('a3-megagpu') or
57
+ instance_type.startswith('a3-edgegpu')):
58
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
59
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
60
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
61
+ tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
62
+ elif instance_type.startswith('a3-highgpu'):
63
+ tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
64
+ if instance_type.startswith('a3-highgpu-8g'):
65
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
66
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
67
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
68
+ elif instance_type.startswith('a3-highgpu-4g'):
69
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
70
+ else:
71
+ tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
72
+ elif series in ['c3d']:
73
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
74
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
75
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
76
+ tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
77
+ num_cpus = int(instance_type.split('-')[2]) # type: ignore
78
+ if num_cpus < 60:
79
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
80
+ elif series in ['c3']:
81
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
82
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
83
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
84
+ tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
85
+ num_cpus = int(instance_type.split('-')[2]) # type: ignore
86
+ if num_cpus < 88:
87
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
88
+ elif series in ['n4']:
89
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
90
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
91
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
92
+ tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
93
+ elif series in ['n2d', 'n1', 't2d', 't2a', 'e2', 'c2', 'c2d', 'a2']:
94
+ tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
95
+ elif series in ['z3']:
96
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
97
+ tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
98
+ elif series in ['h3']:
99
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
100
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
101
+ tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
102
+ elif series in ['m3']:
103
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
104
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
105
+ tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
106
+ tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
107
+ num_cpus = int(instance_type.split('-')[2]) # type: ignore
108
+ if num_cpus < 64:
109
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
110
+ elif series in ['m2']:
111
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
112
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
113
+ elif series in ['m1']:
114
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
115
+ tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
116
+ num_cpus = int(instance_type.split('-')[2]) # type: ignore
117
+ if num_cpus < 80:
118
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
119
+ elif series in ['g2']:
120
+ tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
121
+ tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
122
+ elif series in ['n2']:
123
+ num_cpus = int(instance_type.split('-')[2]) # type: ignore
124
+ if num_cpus < 64:
125
+ tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
126
+ elif num_cpus >= 80:
127
+ tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
128
+
129
+ return tier2name
130
+
131
+
132
+ def validate_instance_volumes(
133
+ instance_type: Optional[str],
134
+ volumes: Optional[List[Dict[str, Any]]],
135
+ ) -> None:
136
+ if not volumes:
137
+ return
138
+ if instance_type is None:
139
+ logger.warning('Instance type is not specified,'
140
+ ' skipping instance volume validation')
141
+ return
142
+ instance_volume_count = 0
143
+ for volume in volumes:
144
+ if volume['storage_type'] == resources_utils.StorageType.INSTANCE:
145
+ instance_volume_count += 1
146
+ if (instance_type in constants.SSD_AUTO_ATTACH_MACHINE_TYPES and
147
+ instance_volume_count >
148
+ constants.SSD_AUTO_ATTACH_MACHINE_TYPES[instance_type]):
149
+ raise exceptions.ResourcesUnavailableError(
150
+ f'The instance type {instance_type} supports'
151
+ f' {constants.SSD_AUTO_ATTACH_MACHINE_TYPES[instance_type]}'
152
+ f' instance storage, but {instance_volume_count} are specified')
153
+ # TODO(hailong):
154
+ # check the instance storage count for the other instance types,
155
+ # refer to https://cloud.google.com/compute/docs/disks/local-ssd
156
+
157
+
158
+ def translate_attach_mode(attach_mode: resources_utils.DiskAttachMode) -> str:
159
+ if attach_mode == resources_utils.DiskAttachMode.READ_ONLY:
160
+ return 'READ_ONLY'
161
+ return 'READ_WRITE'
162
+
163
+
164
+ def check_volume_name_exist_in_region(
165
+ project_id: str, region: clouds.Region, use_mig: bool,
166
+ volume_name: str) -> Optional[Dict[str, Any]]:
167
+ """Check if the volume name exists and return the volume info."""
168
+ logger.debug(f'Checking volume {volume_name} in region {region}')
169
+ try:
170
+ compute = gcp.build('compute',
171
+ 'v1',
172
+ credentials=None,
173
+ cache_discovery=False)
174
+ except gcp.credential_error_exception():
175
+ with ux_utils.print_exception_no_traceback():
176
+ raise ValueError('Not able to build compute client') from None
177
+
178
+ # Get all the zones in the region
179
+ all_zones = compute.zones().list(project=project_id).execute()
180
+ region_zones = []
181
+ if 'items' in all_zones:
182
+ for zone in all_zones['items']:
183
+ if zone['region'].split('/')[-1] == region.name:
184
+ region_zones.append(zone['name'])
185
+ volume_info = None
186
+ for zone in region_zones:
187
+ try:
188
+ volume_info = compute.disks().get(project=project_id,
189
+ zone=zone,
190
+ disk=volume_name).execute()
191
+ if volume_info is not None:
192
+ if use_mig:
193
+ # With MIG, instance template will be used, in this case,
194
+ # the `selfLink` for zonal disk needs to be the volume name
195
+ # Refer to https://cloud.google.com/compute/docs/
196
+ # reference/rest/v1/instances/insert
197
+ volume_info['selfLink'] = volume_name
198
+ volume_info['available_zones'] = [zone]
199
+ return volume_info
200
+ except gcp.http_error_exception() as e:
201
+ if e.resp.status == 403:
202
+ with ux_utils.print_exception_no_traceback():
203
+ raise ValueError('Not able to access the volume '
204
+ f'{volume_name!r}') from None
205
+ if e.resp.status == 404:
206
+ continue # Try next zone
207
+ raise
208
+
209
+ # If not found in any zone, check region disk
210
+ try:
211
+ volume_info = compute.regionDisks().get(project=project_id,
212
+ region=region.name,
213
+ disk=volume_name).execute()
214
+ # 'replicaZones':
215
+ # ['https://xxx/compute/v1/projects/sky-dev-465/zones/us-central1-a',
216
+ # 'https://xxx/compute/v1/projects/sky-dev-465/zones/us-central1-c']
217
+ if volume_info is not None and 'replicaZones' in volume_info:
218
+ replica_zones = [
219
+ zone.split('/')[-1] for zone in volume_info['replicaZones']
220
+ ]
221
+ volume_info['available_zones'] = replica_zones
222
+ return volume_info
223
+ except gcp.http_error_exception() as e:
224
+ if e.resp.status == 403:
225
+ with ux_utils.print_exception_no_traceback():
226
+ raise ValueError('Not able to access the volume '
227
+ f'{volume_name!r}') from None
228
+ if e.resp.status == 404:
229
+ logger.warning(
230
+ f'Volume {volume_name} is not found in region {region}.'
231
+ f' It will be created.')
232
+ return volume_info
233
+ raise
234
+
235
+
236
+ def check_volume_zone_match(volume_name: str,
237
+ zones: Optional[List[clouds.Zone]],
238
+ available_zones: List[str]):
239
+ if zones is None:
240
+ return None
241
+ for zone in zones:
242
+ if zone.name in available_zones:
243
+ return None
244
+ with ux_utils.print_exception_no_traceback():
245
+ # Return a ResourcesUnavailableError to trigger failover
246
+ raise exceptions.ResourcesUnavailableError(
247
+ f'Volume {volume_name} not available in zones {zones}') from None