skypilot-nightly 1.0.0.dev20250521__py3-none-any.whl → 1.0.0.dev20250523__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +46 -16
  3. sky/backends/cloud_vm_ray_backend.py +16 -4
  4. sky/check.py +109 -44
  5. sky/cli.py +261 -90
  6. sky/client/cli.py +261 -90
  7. sky/client/sdk.py +122 -3
  8. sky/clouds/__init__.py +5 -0
  9. sky/clouds/aws.py +4 -2
  10. sky/clouds/azure.py +4 -2
  11. sky/clouds/cloud.py +30 -6
  12. sky/clouds/cudo.py +2 -1
  13. sky/clouds/do.py +2 -1
  14. sky/clouds/fluidstack.py +2 -1
  15. sky/clouds/gcp.py +160 -23
  16. sky/clouds/ibm.py +4 -2
  17. sky/clouds/kubernetes.py +66 -22
  18. sky/clouds/lambda_cloud.py +2 -1
  19. sky/clouds/nebius.py +18 -2
  20. sky/clouds/oci.py +4 -2
  21. sky/clouds/paperspace.py +2 -1
  22. sky/clouds/runpod.py +2 -1
  23. sky/clouds/scp.py +2 -1
  24. sky/clouds/service_catalog/__init__.py +3 -0
  25. sky/clouds/service_catalog/common.py +9 -2
  26. sky/clouds/service_catalog/constants.py +2 -1
  27. sky/clouds/service_catalog/ssh_catalog.py +167 -0
  28. sky/clouds/ssh.py +203 -0
  29. sky/clouds/vast.py +2 -1
  30. sky/clouds/vsphere.py +2 -1
  31. sky/core.py +59 -17
  32. sky/dashboard/out/404.html +1 -1
  33. sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_buildManifest.js +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +1 -0
  35. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  36. sky/dashboard/out/clusters/[cluster].html +1 -1
  37. sky/dashboard/out/clusters.html +1 -1
  38. sky/dashboard/out/index.html +1 -1
  39. sky/dashboard/out/infra.html +1 -1
  40. sky/dashboard/out/jobs/[job].html +1 -1
  41. sky/dashboard/out/jobs.html +1 -1
  42. sky/data/storage.py +1 -0
  43. sky/execution.py +56 -7
  44. sky/jobs/server/core.py +4 -2
  45. sky/optimizer.py +29 -15
  46. sky/provision/__init__.py +1 -0
  47. sky/provision/aws/instance.py +17 -1
  48. sky/provision/gcp/constants.py +147 -4
  49. sky/provision/gcp/instance_utils.py +10 -0
  50. sky/provision/gcp/volume_utils.py +247 -0
  51. sky/provision/kubernetes/instance.py +16 -5
  52. sky/provision/kubernetes/utils.py +37 -19
  53. sky/provision/nebius/instance.py +3 -1
  54. sky/provision/nebius/utils.py +14 -2
  55. sky/provision/ssh/__init__.py +18 -0
  56. sky/resources.py +177 -4
  57. sky/serve/server/core.py +2 -4
  58. sky/server/common.py +46 -9
  59. sky/server/constants.py +2 -0
  60. sky/server/html/token_page.html +154 -0
  61. sky/server/requests/executor.py +3 -6
  62. sky/server/requests/payloads.py +7 -0
  63. sky/server/server.py +80 -8
  64. sky/setup_files/dependencies.py +1 -0
  65. sky/skypilot_config.py +117 -31
  66. sky/task.py +24 -1
  67. sky/templates/gcp-ray.yml.j2 +44 -1
  68. sky/templates/nebius-ray.yml.j2 +12 -2
  69. sky/utils/admin_policy_utils.py +26 -22
  70. sky/utils/context.py +36 -6
  71. sky/utils/context_utils.py +15 -0
  72. sky/utils/infra_utils.py +21 -1
  73. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  74. sky/utils/kubernetes/create_cluster.sh +1 -0
  75. sky/utils/kubernetes/deploy_remote_cluster.py +1437 -0
  76. sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
  77. sky/utils/kubernetes/ssh-tunnel.sh +387 -0
  78. sky/utils/log_utils.py +214 -1
  79. sky/utils/resources_utils.py +14 -0
  80. sky/utils/schemas.py +67 -0
  81. sky/utils/ux_utils.py +2 -1
  82. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/METADATA +6 -1
  83. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/RECORD +88 -81
  84. sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
  85. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  86. /sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_ssgManifest.js +0 -0
  87. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/WHEEL +0 -0
  88. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/entry_points.txt +0 -0
  89. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/licenses/LICENSE +0 -0
  90. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/top_level.txt +0 -0
sky/optimizer.py CHANGED
@@ -21,6 +21,7 @@ from sky.usage import usage_lib
 from sky.utils import common
 from sky.utils import env_options
 from sky.utils import log_utils
+from sky.utils import registry
 from sky.utils import resources_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
@@ -73,8 +74,8 @@ class Optimizer:
     def _egress_cost(src_cloud: clouds.Cloud, dst_cloud: clouds.Cloud,
                      gigabytes: float) -> float:
         """Returns estimated egress cost."""
-        if isinstance(src_cloud, DummyCloud) or isinstance(
-                dst_cloud, DummyCloud):
+        if isinstance(src_cloud, clouds.DummyCloud) or isinstance(
+                dst_cloud, clouds.DummyCloud):
             return 0.0
 
         if not src_cloud.is_same_cloud(dst_cloud):
@@ -88,8 +89,8 @@ class Optimizer:
                      gigabytes: float) -> float:
         """Returns estimated egress time in seconds."""
         # FIXME: estimate bandwidth between each cloud-region pair.
-        if isinstance(src_cloud, DummyCloud) or isinstance(
-                dst_cloud, DummyCloud):
+        if isinstance(src_cloud, clouds.DummyCloud) or isinstance(
+                dst_cloud, clouds.DummyCloud):
             return 0.0
         if not src_cloud.is_same_cloud(dst_cloud):
             # 10Gbps is close to the average of observed b/w from S3
@@ -167,7 +168,7 @@ class Optimizer:
 
         def make_dummy(name):
             dummy = task_lib.Task(name)
-            dummy.set_resources({DummyResources(cloud=DummyCloud())})
+            dummy.set_resources({DummyResources(cloud=clouds.DummyCloud())})
             dummy.set_time_estimator(lambda _: 0)
             return dummy
 
@@ -197,7 +198,7 @@ class Optimizer:
         node: task_lib.Task,
         resources: resources_lib.Resources,
     ) -> Tuple[Optional[clouds.Cloud], Optional[clouds.Cloud], Optional[float]]:
-        if isinstance(parent_resources.cloud, DummyCloud):
+        if isinstance(parent_resources.cloud, clouds.DummyCloud):
             # Special case. The current 'node' is a real
             # source node, and its input may be on a different
             # cloud from 'resources'.
@@ -376,6 +377,10 @@ class Optimizer:
             if any(orig_resources.cloud is None
                    for orig_resources in node.resources):
                 source_hint = 'catalog and kubernetes cluster'
+            elif all(
+                    isinstance(orig_resources.cloud, clouds.SSH)
+                    for orig_resources in node.resources):
+                source_hint = 'node pool'
             elif all(
                     isinstance(orig_resources.cloud, clouds.Kubernetes)
                     for orig_resources in node.resources):
@@ -858,11 +863,19 @@ class Optimizer:
                 'accelerators': f'{resources.accelerators}',
                 'use_spot': resources.use_spot
             }
+
+            # Handle special case for Kubernetes and SSH clouds
             if isinstance(resources.cloud, clouds.Kubernetes):
-                # Region for Kubernetes is the context name, i.e. different
-                # Kubernetes clusters. We add region to the key to show all the
-                # Kubernetes clusters in the optimizer table for better UX.
+                # Region for Kubernetes-like clouds (SSH, Kubernetes) is the
+                # context name, i.e. different Kubernetes clusters. We add
+                # region to the key to show all the Kubernetes clusters in the
+                # optimizer table for better UX.
+
+                if resources.cloud.__class__.__name__ == 'SSH':
+                    resource_key_dict[
+                        'cloud'] = 'SSH'  # Force the cloud name to be SSH
                 resource_key_dict['region'] = resources.region
+
             return json.dumps(resource_key_dict, sort_keys=True)
 
         # Print the list of resouces that the optimizer considered.
@@ -1158,11 +1171,6 @@ class DummyResources(resources_lib.Resources):
         return 0
 
 
-class DummyCloud(clouds.Cloud):
-    """A dummy Cloud that has zero egress cost from/to."""
-    pass
-
-
 def _filter_out_blocked_launchable_resources(
         launchable_resources: Iterable[resources_lib.Resources],
         blocked_resources: Iterable[resources_lib.Resources]):
@@ -1221,7 +1229,13 @@ def _check_specified_clouds(dag: 'dag_lib.Dag') -> None:
         if disabled_clouds:
             is_or_are = 'is' if len(disabled_clouds) == 1 else 'are'
             task_name = f' {task.name!r}' if task.name is not None else ''
-            msg = (f'Task{task_name} requires {", ".join(disabled_clouds)} '
+            disabled_display_names = []
+            for c in disabled_clouds:
+                cloud_obj_one = registry.CLOUD_REGISTRY.from_str(c)
+                if cloud_obj_one is not None:
+                    disabled_display_names.append(cloud_obj_one.display_name())
+            cloud_names = ', '.join(disabled_display_names)
+            msg = (f'Task{task_name} requires {cloud_names} '
                    f'which {is_or_are} not enabled. To enable access, change '
                    f'the task cloud requirement or run: {colorama.Style.BRIGHT}'
                    f'sky check {" ".join(c.lower() for c in disabled_clouds)}'
sky/provision/__init__.py CHANGED
@@ -23,6 +23,7 @@ from sky.provision import lambda_cloud
 from sky.provision import nebius
 from sky.provision import oci
 from sky.provision import runpod
+from sky.provision import ssh
 from sky.provision import vast
 from sky.provision import vsphere
 from sky.utils import command_runner
sky/provision/aws/instance.py CHANGED
@@ -836,7 +836,23 @@ def open_ports(
 
     # For the case when every new ports is already opened.
     if ip_permissions:
-        sg.authorize_ingress(IpPermissions=ip_permissions)
+        # Filter out any permissions that already exist in the security group
+        existing_permissions = set()
+        for rule in sg.ip_permissions:
+            if rule['IpProtocol'] == 'tcp':
+                for ip_range in rule.get('IpRanges', []):
+                    if ip_range.get('CidrIp') == '0.0.0.0/0':
+                        existing_permissions.add(
+                            (rule['FromPort'], rule['ToPort']))
+
+        # Remove any permissions that already exist
+        filtered_permissions = []
+        for perm in ip_permissions:
+            if (perm['FromPort'], perm['ToPort']) not in existing_permissions:
+                filtered_permissions.append(perm)
+
+        if filtered_permissions:
+            sg.authorize_ingress(IpPermissions=filtered_permissions)
 
 
 def cleanup_ports(
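The change above makes re-authorization idempotent: TCP port ranges already open to 0.0.0.0/0 are skipped instead of being re-submitted. A standalone sketch of the same dedup logic, assuming rule dicts shaped like boto3's SecurityGroup.ip_permissions entries (the function name and sample data are illustrative only):

from typing import Any, Dict, List, Set, Tuple


def filter_new_permissions(
        existing_rules: List[Dict[str, Any]],
        requested: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    # Collect TCP port ranges already open to the world.
    opened: Set[Tuple[int, int]] = set()
    for rule in existing_rules:
        if rule.get('IpProtocol') == 'tcp':
            for ip_range in rule.get('IpRanges', []):
                if ip_range.get('CidrIp') == '0.0.0.0/0':
                    opened.add((rule['FromPort'], rule['ToPort']))
    # Keep only ranges that are not already authorized, so a second
    # authorize_ingress call does not fail on duplicate permissions.
    return [p for p in requested
            if (p['FromPort'], p['ToPort']) not in opened]


existing = [{'IpProtocol': 'tcp', 'FromPort': 22, 'ToPort': 22,
             'IpRanges': [{'CidrIp': '0.0.0.0/0'}]}]
wanted = [{'IpProtocol': 'tcp', 'FromPort': 22, 'ToPort': 22,
           'IpRanges': [{'CidrIp': '0.0.0.0/0'}]},
          {'IpProtocol': 'tcp', 'FromPort': 8080, 'ToPort': 8080,
           'IpRanges': [{'CidrIp': '0.0.0.0/0'}]}]
assert [p['FromPort']
        for p in filter_new_permissions(existing, wanted)] == [8080]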
sky/provision/gcp/constants.py CHANGED
@@ -1,4 +1,5 @@
 """Constants used by the GCP provisioner."""
+import textwrap
 
 VERSION = 'v1'
 # Using v2 according to
@@ -53,9 +54,7 @@ CLUSTER_PREFIX_LENGTH = 10
 
 COMPACT_GROUP_PLACEMENT_POLICY = 'compact'
 COLLOCATED_COLLOCATION = 'COLLOCATED'
-GPU_DIRECT_TCPX_USER_DATA = """#!/bin/bash
-set -e
-set -x
+GPU_DIRECT_TCPX_USER_DATA = textwrap.dedent("""
 # Install GPU Direct TCPX
 cos-extensions install gpu -- --version=latest;
 sudo mount --bind /var/lib/nvidia /var/lib/nvidia;
@@ -83,7 +82,7 @@ GPU_DIRECT_TCPX_USER_DATA = """#!/bin/bash
 sudo mount --bind /var/lib/tcpx /var/lib/tcpx;
 sudo mount -o remount,exec /var/lib/tcpx;
 echo "GPU Direct TCPX installed"
-"""
+""")
 
 GPU_DIRECT_TCPX_SPECIFIC_OPTIONS = [
     '--cap-add=IPC_LOCK',
@@ -106,6 +105,150 @@ GPU_DIRECT_TCPX_SPECIFIC_OPTIONS = [
     '--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64:/usr/local/tcpx/lib64',
 ]
 
+PD_EXTREME_IOPS = 20000
+DEFAULT_DISK_SIZE = 100
+NETWORK_STORAGE_TYPE = 'PERSISTENT'
+INSTANCE_STORAGE_TYPE = 'SCRATCH'
+INSTANCE_STORAGE_DISK_TYPE = 'local-ssd'
+INSTANCE_STORAGE_INTERFACE_TYPE = 'NVME'
+INSTANCE_STORAGE_DEVICE_NAME_PREFIX = '/dev/disk/by-id/google-local-nvme-ssd-'
+DEVICE_NAME_PREFIX = '/dev/disk/by-id/google-'
+
+BASH_SCRIPT_START = textwrap.dedent("""#!/bin/bash
+set -e
+set -x
+""")
+DISK_MOUNT_USER_DATA_TEMPLATE = textwrap.dedent("""
+# Define arrays for devices and mount points
+declare -A device_mounts=(
+    {device_mounts}
+)
+
+# Function to format and mount a single device
+format_and_mount() {{
+    local device_name="$1"
+    local mount_point="$2"
+
+    if [ ! -e "$device_name" ]; then
+        echo "Error: Device $device_name does not exist."
+        return 1
+    fi
+
+    # Check if filesystem is already formatted (ext4)
+    if ! sudo blkid "$device_name" | grep -q 'TYPE="ext4"'; then
+        if [[ "$device_name" == "/dev/disk/by-id/google-local-nvme-ssd"* ]]; then
+            echo "Formatting local SSD $device_name..."
+            if ! sudo mkfs.ext4 -F "$device_name"; then
+                echo "Error: Failed to format $device_name"
+                return 1
+            fi
+        else
+            echo "Formatting persistent disk $device_name..."
+            if ! sudo mkfs.ext4 -m 0 -E lazy_itable_init=0,lazy_journal_init=0,discard "$device_name"; then
+                echo "Error: Failed to format $device_name"
+                return 1
+            fi
+        fi
+    else
+        echo "$device_name is already formatted."
+    fi
+
+    # Check if already mounted
+    if ! grep -q "$mount_point" /proc/mounts; then
+        echo "Mounting $device_name to $mount_point..."
+        if ! sudo mkdir -p "$mount_point"; then
+            echo "Error: Failed to create mount point $mount_point"
+            return 1
+        fi
+
+        if ! sudo mount "$device_name" "$mount_point"; then
+            echo "Error: Failed to mount $device_name to $mount_point"
+            return 1
+        fi
+
+        # Add to fstab if not already present
+        if ! grep -q " $mount_point " /etc/fstab; then
+            echo "Adding mount entry to /etc/fstab..."
+            echo "UUID=`sudo blkid -s UUID -o value $device_name` $mount_point ext4 defaults,nofail 0 2" | sudo tee -a /etc/fstab
+        else
+            echo "Mount entry already exists in /etc/fstab"
+        fi
+    else
+        echo "$device_name is already mounted at $mount_point"
+    fi
+}}
+
+# Main execution
+echo "Starting device mounting process..."
+
+# Process each device-mount pair
+for device in "${{!device_mounts[@]}}"; do
+    mount_point="${{device_mounts[$device]}}"
+    echo "Processing device: $device -> $mount_point"
+    if ! format_and_mount "$device" "$mount_point"; then
+        echo "Failed to process device $device"
+        # Continue with other devices even if one fails
+        continue
+    fi
+done
+
+echo "Device mounting process completed."
+""")
+
+# The local SSDs will be attached automatically to the following
+# machine types with the following number of disks.
+# Refer to https://cloud.google.com/compute/docs/disks/local-ssd#lssd_disks_fixed
+SSD_AUTO_ATTACH_MACHINE_TYPES = {
+    'c4a-standard-4-lssd': 1,
+    'c4a-highmem-4-lssd': 1,
+    'c4a-standard-8-lssd': 2,
+    'c4a-highmem-8-lssd': 2,
+    'c4a-standard-16-lssd': 4,
+    'c4a-highmem-16-lssd': 4,
+    'c4a-standard-32-lssd': 6,
+    'c4a-highmem-32-lssd': 6,
+    'c4a-standard-48-lssd': 10,
+    'c4a-highmem-48-lssd': 10,
+    'c4a-standard-64-lssd': 14,
+    'c4a-highmem-64-lssd': 14,
+    'c4a-standard-72-lssd': 16,
+    'c4a-highmem-72-lssd': 16,
+    'c3-standard-4-lssd': 1,
+    'c3-standard-8-lssd': 2,
+    'c3-standard-22-lssd': 4,
+    'c3-standard-44-lssd': 8,
+    'c3-standard-88-lssd': 16,
+    'c3-standard-176-lssd': 32,
+    'c3d-standard-8-lssd': 1,
+    'c3d-highmem-8-lssd': 1,
+    'c3d-standard-16-lssd': 1,
+    'c3d-highmem-16-lssd': 1,
+    'c3d-standard-30-lssd': 2,
+    'c3d-highmem-30-lssd': 2,
+    'c3d-standard-60-lssd': 4,
+    'c3d-highmem-60-lssd': 4,
+    'c3d-standard-90-lssd': 8,
+    'c3d-highmem-90-lssd': 8,
+    'c3d-standard-180-lssd': 16,
+    'c3d-highmem-180-lssd': 16,
+    'c3d-standard-360-lssd': 32,
+    'c3d-highmem-360-lssd': 32,
+    'a4-highgpu-8g': 32,
+    'a3-ultragpu-8g': 32,
+    'a3-megagpu-8g': 16,
+    'a3-highgpu-1g': 2,
+    'a3-highgpu-2g': 4,
+    'a3-highgpu-4g': 8,
+    'a3-highgpu-8g': 16,
+    'a3-edgegpu-8g': 16,
+    'a2-ultragpu-1g': 1,
+    'a2-ultragpu-2g': 2,
+    'a2-ultragpu-4g': 4,
+    'a2-ultragpu-8g': 8,
+    'z3-highmem-88': 12,
+    'z3-highmem-176': 12,
+}
+
 # Below parameters are from the default VPC on GCP.
 # https://cloud.google.com/vpc/docs/firewalls#more_rules_default_vpc
 VPC_TEMPLATE: dict = {
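The doubled braces ({{ and }}) inside DISK_MOUNT_USER_DATA_TEMPLATE suggest the template is rendered with str.format(), with {device_mounts} as the only placeholder. A small runnable sketch of that escaping behavior; the template body and device path here are illustrative, not the full constant:

import textwrap

template = textwrap.dedent("""\
    declare -A device_mounts=(
        {device_mounts}
    )
    for device in "${{!device_mounts[@]}}"; do
        echo "$device -> ${{device_mounts[$device]}}"
    done
    """)

# Doubled braces survive as single braces in the rendered bash script,
# while {device_mounts} is substituted.
print(template.format(
    device_mounts='["/dev/disk/by-id/google-data-disk-1"]="/mnt/data1"'))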
sky/provision/gcp/instance_utils.py CHANGED
@@ -826,6 +826,16 @@ class GCPComputeInstance(GCPInstance):
         # https://cloud.google.com/compute/docs/reference/rest/v1/instances/bulkInsert # pylint: disable=line-too-long
         if config.get('sourceMachineImage') is not None:
             return False
+        # bulkInsert does not support attaching existing
+        # disks to the instances with READ_WRITE mode.
+        if config.get('disks') is not None:
+            for disk in config['disks']:
+                if disk.get('source') is not None and disk.get(
+                        'mode', 'READ_WRITE') == 'READ_WRITE':
+                    return False
+                if disk.get('initializeParams') is not None and disk.get(
+                        'initializeParams', {}).get('diskName') is not None:
+                    return False
         return True
 
     @classmethod
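For illustration, a self-contained sketch mirroring the eligibility check above, plus a hypothetical config that it would reject for bulkInsert (project and disk names are made up):

from typing import Any, Dict


def supports_bulk_insert(config: Dict[str, Any]) -> bool:
    # Mirrors the added check: a source machine image, an existing disk
    # attached in (default) READ_WRITE mode, or a named disk created via
    # initializeParams all disable bulkInsert.
    if config.get('sourceMachineImage') is not None:
        return False
    for disk in config.get('disks') or []:
        if disk.get('source') is not None and disk.get(
                'mode', 'READ_WRITE') == 'READ_WRITE':
            return False
        if (disk.get('initializeParams') or {}).get('diskName') is not None:
            return False
    return True


config = {
    'disks': [{
        # Existing disk, mode defaults to READ_WRITE.
        'source': 'projects/my-project/zones/us-central1-a/disks/data-disk',
    }],
}
assert not supports_bulk_insert(config)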
sky/provision/gcp/volume_utils.py ADDED
@@ -0,0 +1,247 @@
+"""Utilities for GCP volumes."""
+from typing import Any, Dict, List, Optional
+
+from sky import clouds
+from sky import exceptions
+from sky import sky_logging
+from sky.adaptors import gcp
+from sky.provision.gcp import constants
+from sky.utils import resources_utils
+from sky.utils import ux_utils
+
+logger = sky_logging.init_logger(__name__)
+
+
+def get_data_disk_tier_mapping(
+        instance_type: Optional[str],) -> Dict[resources_utils.DiskTier, str]:
+    # Define the default mapping from disk tiers to disk types.
+    # Refer to https://cloud.google.com/compute/docs/disks/hyperdisks
+    # and https://cloud.google.com/compute/docs/disks/persistent-disks
+    tier2name = {
+        resources_utils.DiskTier.ULTRA: 'pd-extreme',
+        resources_utils.DiskTier.HIGH: 'pd-ssd',
+        resources_utils.DiskTier.MEDIUM: 'pd-balanced',
+        resources_utils.DiskTier.LOW: 'pd-standard',
+    }
+
+    if instance_type is None:
+        return tier2name
+
+    # Remap series-specific disk types.
+    series = instance_type.split('-')[0]
+
+    if series in ['a4', 'x4']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
+    elif series in ['m4']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 112:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['c4', 'c4a', 'c4d']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 64:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['a3']:
+        if (instance_type.startswith('a3-ultragpu') or
+                instance_type.startswith('a3-megagpu') or
+                instance_type.startswith('a3-edgegpu')):
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+            tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+            tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
+            tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
+        elif instance_type.startswith('a3-highgpu'):
+            tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+            if instance_type.startswith('a3-highgpu-8g'):
+                tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+                tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+                tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
+            elif instance_type.startswith('a3-highgpu-4g'):
+                tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+            else:
+                tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
+    elif series in ['c3d']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 60:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['c3']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 88:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['n4']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.LOW] = 'hyperdisk-balanced'
+    elif series in ['n2d', 'n1', 't2d', 't2a', 'e2', 'c2', 'c2d', 'a2']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
+    elif series in ['z3']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+    elif series in ['h3']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+    elif series in ['m3']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        tier2name[resources_utils.DiskTier.MEDIUM] = 'pd-ssd'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 64:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['m2']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+    elif series in ['m1']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+        tier2name[resources_utils.DiskTier.HIGH] = 'hyperdisk-balanced'
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 80:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-balanced'
+    elif series in ['g2']:
+        tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
+        tier2name[resources_utils.DiskTier.LOW] = 'pd-balanced'
+    elif series in ['n2']:
+        num_cpus = int(instance_type.split('-')[2])  # type: ignore
+        if num_cpus < 64:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'pd-ssd'
+        elif num_cpus >= 80:
+            tier2name[resources_utils.DiskTier.ULTRA] = 'hyperdisk-extreme'
+
+    return tier2name
+
+
+def validate_instance_volumes(
+    instance_type: Optional[str],
+    volumes: Optional[List[Dict[str, Any]]],
+) -> None:
+    if not volumes:
+        return
+    if instance_type is None:
+        logger.warning('Instance type is not specified,'
+                       ' skipping instance volume validation')
+        return
+    instance_volume_count = 0
+    for volume in volumes:
+        if volume['storage_type'] == resources_utils.StorageType.INSTANCE:
+            instance_volume_count += 1
+    if (instance_type in constants.SSD_AUTO_ATTACH_MACHINE_TYPES and
+            instance_volume_count >
+            constants.SSD_AUTO_ATTACH_MACHINE_TYPES[instance_type]):
+        raise exceptions.ResourcesUnavailableError(
+            f'The instance type {instance_type} supports'
+            f' {constants.SSD_AUTO_ATTACH_MACHINE_TYPES[instance_type]}'
+            f' instance storage, but {instance_volume_count} are specified')
+    # TODO(hailong):
+    # check the instance storage count for the other instance types,
+    # refer to https://cloud.google.com/compute/docs/disks/local-ssd
+
+
+def translate_attach_mode(attach_mode: resources_utils.DiskAttachMode) -> str:
+    if attach_mode == resources_utils.DiskAttachMode.READ_ONLY:
+        return 'READ_ONLY'
+    return 'READ_WRITE'
+
+
+def check_volume_name_exist_in_region(
+        project_id: str, region: clouds.Region, use_mig: bool,
+        volume_name: str) -> Optional[Dict[str, Any]]:
+    """Check if the volume name exists and return the volume info."""
+    logger.debug(f'Checking volume {volume_name} in region {region}')
+    try:
+        compute = gcp.build('compute',
+                            'v1',
+                            credentials=None,
+                            cache_discovery=False)
+    except gcp.credential_error_exception():
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError('Not able to build compute client') from None
+
+    # Get all the zones in the region
+    all_zones = compute.zones().list(project=project_id).execute()
+    region_zones = []
+    if 'items' in all_zones:
+        for zone in all_zones['items']:
+            if zone['region'].split('/')[-1] == region.name:
+                region_zones.append(zone['name'])
+    volume_info = None
+    for zone in region_zones:
+        try:
+            volume_info = compute.disks().get(project=project_id,
+                                              zone=zone,
+                                              disk=volume_name).execute()
+            if volume_info is not None:
+                if use_mig:
+                    # With MIG, instance template will be used, in this case,
+                    # the `selfLink` for zonal disk needs to be the volume name
+                    # Refer to https://cloud.google.com/compute/docs/
+                    # reference/rest/v1/instances/insert
+                    volume_info['selfLink'] = volume_name
+                volume_info['available_zones'] = [zone]
+                return volume_info
+        except gcp.http_error_exception() as e:
+            if e.resp.status == 403:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError('Not able to access the volume '
+                                     f'{volume_name!r}') from None
+            if e.resp.status == 404:
+                continue  # Try next zone
+            raise
+
+    # If not found in any zone, check region disk
+    try:
+        volume_info = compute.regionDisks().get(project=project_id,
+                                                region=region.name,
+                                                disk=volume_name).execute()
+        # 'replicaZones':
+        # ['https://xxx/compute/v1/projects/sky-dev-465/zones/us-central1-a',
+        #  'https://xxx/compute/v1/projects/sky-dev-465/zones/us-central1-c']
+        if volume_info is not None and 'replicaZones' in volume_info:
+            replica_zones = [
+                zone.split('/')[-1] for zone in volume_info['replicaZones']
+            ]
+            volume_info['available_zones'] = replica_zones
+        return volume_info
+    except gcp.http_error_exception() as e:
+        if e.resp.status == 403:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Not able to access the volume '
+                                 f'{volume_name!r}') from None
+        if e.resp.status == 404:
+            logger.warning(
+                f'Volume {volume_name} is not found in region {region}.'
+                f' It will be created.')
+            return volume_info
+        raise
+
+
+def check_volume_zone_match(volume_name: str,
+                            zones: Optional[List[clouds.Zone]],
+                            available_zones: List[str]):
+    if zones is None:
+        return None
+    for zone in zones:
+        if zone.name in available_zones:
+            return None
+    with ux_utils.print_exception_no_traceback():
+        # Return a ResourcesUnavailableError to trigger failover
+        raise exceptions.ResourcesUnavailableError(
+            f'Volume {volume_name} not available in zones {zones}') from None
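A possible call pattern for the tier mapping above, assuming the new module is importable from an installed wheel; the instance type is only an example. Per the c3 branch, fewer than 88 vCPUs downgrades the ULTRA tier to hyperdisk-balanced:

from sky.provision.gcp import volume_utils
from sky.utils import resources_utils

tier2name = volume_utils.get_data_disk_tier_mapping('c3-standard-22')
assert tier2name[resources_utils.DiskTier.ULTRA] == 'hyperdisk-balanced'
assert tier2name[resources_utils.DiskTier.MEDIUM] == 'pd-ssd'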
sky/provision/kubernetes/instance.py CHANGED
@@ -1265,6 +1265,8 @@ def query_instances(
     assert provider_config is not None
     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     context = kubernetes_utils.get_context_from_config(provider_config)
+    is_ssh = context.startswith('ssh-') if context else False
+    identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'
 
     # Get all the pods with the label skypilot-cluster: <cluster_name>
     try:
@@ -1274,15 +1276,24 @@
             _request_timeout=kubernetes.API_TIMEOUT).items
     except kubernetes.max_retry_error():
         with ux_utils.print_exception_no_traceback():
-            ctx = kubernetes_utils.get_current_kube_config_context_name()
+            if is_ssh:
+                node_pool = context.lstrip('ssh-') if context else ''
+                msg = (
+                    f'Cannot connect to SSH Node Pool {node_pool}. '
+                    'Please check if the SSH Node Pool is up and accessible. '
+                    'To debug, run `sky check ssh` to check the status of '
+                    'the SSH Node Pool.')
+            else:
+                ctx = kubernetes_utils.get_current_kube_config_context_name()
+                msg = (f'Network error - check if the {identity} in '
+                       f'context {ctx} is up and accessible.')
             raise exceptions.ClusterStatusFetchingError(
-                f'Failed to query cluster {cluster_name_on_cloud!r} status. '
-                'Network error - check if the Kubernetes cluster in '
-                f'context {ctx} is up and accessible.') from None
+                f'Failed to query cluster {cluster_name_on_cloud!r} status. ' +
+                msg) from None
     except Exception as e:  # pylint: disable=broad-except
         with ux_utils.print_exception_no_traceback():
             raise exceptions.ClusterStatusFetchingError(
-                f'Failed to query Kubernetes cluster {cluster_name_on_cloud!r} '
+                f'Failed to query {identity} {cluster_name_on_cloud!r} '
                 f'status: {common_utils.format_exception(e)}')
 
     # Check if the pods are running or pending
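A minimal sketch of the context-to-identity convention introduced above: SSH Node Pools are distinguished from plain Kubernetes clusters by an 'ssh-' prefix on the kubeconfig context name (the context names below are illustrative):

def identity_for(context):
    is_ssh = context.startswith('ssh-') if context else False
    return 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'


assert identity_for('ssh-my-pool') == 'SSH Node Pool'
assert identity_for('gke_my-project_us-central1_c1') == 'Kubernetes cluster'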