skypilot-nightly 1.0.0.dev20250521__py3-none-any.whl → 1.0.0.dev20250523__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +46 -16
  3. sky/backends/cloud_vm_ray_backend.py +16 -4
  4. sky/check.py +109 -44
  5. sky/cli.py +261 -90
  6. sky/client/cli.py +261 -90
  7. sky/client/sdk.py +122 -3
  8. sky/clouds/__init__.py +5 -0
  9. sky/clouds/aws.py +4 -2
  10. sky/clouds/azure.py +4 -2
  11. sky/clouds/cloud.py +30 -6
  12. sky/clouds/cudo.py +2 -1
  13. sky/clouds/do.py +2 -1
  14. sky/clouds/fluidstack.py +2 -1
  15. sky/clouds/gcp.py +160 -23
  16. sky/clouds/ibm.py +4 -2
  17. sky/clouds/kubernetes.py +66 -22
  18. sky/clouds/lambda_cloud.py +2 -1
  19. sky/clouds/nebius.py +18 -2
  20. sky/clouds/oci.py +4 -2
  21. sky/clouds/paperspace.py +2 -1
  22. sky/clouds/runpod.py +2 -1
  23. sky/clouds/scp.py +2 -1
  24. sky/clouds/service_catalog/__init__.py +3 -0
  25. sky/clouds/service_catalog/common.py +9 -2
  26. sky/clouds/service_catalog/constants.py +2 -1
  27. sky/clouds/service_catalog/ssh_catalog.py +167 -0
  28. sky/clouds/ssh.py +203 -0
  29. sky/clouds/vast.py +2 -1
  30. sky/clouds/vsphere.py +2 -1
  31. sky/core.py +59 -17
  32. sky/dashboard/out/404.html +1 -1
  33. sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_buildManifest.js +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +1 -0
  35. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  36. sky/dashboard/out/clusters/[cluster].html +1 -1
  37. sky/dashboard/out/clusters.html +1 -1
  38. sky/dashboard/out/index.html +1 -1
  39. sky/dashboard/out/infra.html +1 -1
  40. sky/dashboard/out/jobs/[job].html +1 -1
  41. sky/dashboard/out/jobs.html +1 -1
  42. sky/data/storage.py +1 -0
  43. sky/execution.py +56 -7
  44. sky/jobs/server/core.py +4 -2
  45. sky/optimizer.py +29 -15
  46. sky/provision/__init__.py +1 -0
  47. sky/provision/aws/instance.py +17 -1
  48. sky/provision/gcp/constants.py +147 -4
  49. sky/provision/gcp/instance_utils.py +10 -0
  50. sky/provision/gcp/volume_utils.py +247 -0
  51. sky/provision/kubernetes/instance.py +16 -5
  52. sky/provision/kubernetes/utils.py +37 -19
  53. sky/provision/nebius/instance.py +3 -1
  54. sky/provision/nebius/utils.py +14 -2
  55. sky/provision/ssh/__init__.py +18 -0
  56. sky/resources.py +177 -4
  57. sky/serve/server/core.py +2 -4
  58. sky/server/common.py +46 -9
  59. sky/server/constants.py +2 -0
  60. sky/server/html/token_page.html +154 -0
  61. sky/server/requests/executor.py +3 -6
  62. sky/server/requests/payloads.py +7 -0
  63. sky/server/server.py +80 -8
  64. sky/setup_files/dependencies.py +1 -0
  65. sky/skypilot_config.py +117 -31
  66. sky/task.py +24 -1
  67. sky/templates/gcp-ray.yml.j2 +44 -1
  68. sky/templates/nebius-ray.yml.j2 +12 -2
  69. sky/utils/admin_policy_utils.py +26 -22
  70. sky/utils/context.py +36 -6
  71. sky/utils/context_utils.py +15 -0
  72. sky/utils/infra_utils.py +21 -1
  73. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  74. sky/utils/kubernetes/create_cluster.sh +1 -0
  75. sky/utils/kubernetes/deploy_remote_cluster.py +1437 -0
  76. sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
  77. sky/utils/kubernetes/ssh-tunnel.sh +387 -0
  78. sky/utils/log_utils.py +214 -1
  79. sky/utils/resources_utils.py +14 -0
  80. sky/utils/schemas.py +67 -0
  81. sky/utils/ux_utils.py +2 -1
  82. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/METADATA +6 -1
  83. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/RECORD +88 -81
  84. sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
  85. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  86. sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_ssgManifest.js +0 -0
  87. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/WHEEL +0 -0
  88. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/entry_points.txt +0 -0
  89. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/licenses/LICENSE +0 -0
  90. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/top_level.txt +0 -0
sky/clouds/__init__.py CHANGED
@@ -2,7 +2,9 @@
2
2
 
3
3
  from sky.clouds.cloud import Cloud
4
4
  from sky.clouds.cloud import cloud_in_iterable
5
+ from sky.clouds.cloud import CloudCapability
5
6
  from sky.clouds.cloud import CloudImplementationFeatures
7
+ from sky.clouds.cloud import DummyCloud
6
8
  from sky.clouds.cloud import OpenPortsVersion
7
9
  from sky.clouds.cloud import ProvisionerVersion
8
10
  from sky.clouds.cloud import Region
@@ -25,6 +27,7 @@ from sky.clouds.oci import OCI
25
27
  from sky.clouds.paperspace import Paperspace
26
28
  from sky.clouds.runpod import RunPod
27
29
  from sky.clouds.scp import SCP
30
+ from sky.clouds.ssh import SSH
28
31
  from sky.clouds.vast import Vast
29
32
  from sky.clouds.vsphere import Vsphere
30
33
 
@@ -34,6 +37,7 @@ __all__ = [
34
37
  'Azure',
35
38
  'Cloud',
36
39
  'Cudo',
40
+ 'DummyCloud',
37
41
  'GCP',
38
42
  'Lambda',
39
43
  'DO',
@@ -44,6 +48,7 @@ __all__ = [
44
48
  'OCI',
45
49
  'Vsphere',
46
50
  'Kubernetes',
51
+ 'SSH',
47
52
  'CloudImplementationFeatures',
48
53
  'Region',
49
54
  'Zone',
sky/clouds/aws.py CHANGED
@@ -565,12 +565,14 @@ class AWS(clouds.Cloud):
565
565
  fuzzy_candidate_list, None)
566
566
 
567
567
  @classmethod
568
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
568
+ def _check_compute_credentials(
569
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
569
570
  """Checks if the user has access credentials to this AWS's compute service."""
570
571
  return cls._check_credentials()
571
572
 
572
573
  @classmethod
573
- def _check_storage_credentials(cls) -> Tuple[bool, Optional[str]]:
574
+ def _check_storage_credentials(
575
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
574
576
  """Checks if the user has access credentials to this AWS's storage service."""
575
577
  # TODO(seungjin): Implement separate check for
576
578
  # if the user has access to S3.
sky/clouds/azure.py CHANGED
@@ -518,12 +518,14 @@ class Azure(clouds.Cloud):
518
518
  fuzzy_candidate_list, None)
519
519
 
520
520
  @classmethod
521
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
521
+ def _check_compute_credentials(
522
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
522
523
  """Checks if the user has access credentials to this cloud's compute service."""
523
524
  return cls._check_credentials()
524
525
 
525
526
  @classmethod
526
- def _check_storage_credentials(cls) -> Tuple[bool, Optional[str]]:
527
+ def _check_storage_credentials(
528
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
527
529
  """Checks if the user has access credentials to this cloud's storage service."""
528
530
  # TODO(seungjin): Implement separate check for
529
531
  # if the user has access to Azure Blob Storage.
sky/clouds/cloud.py CHANGED
@@ -457,12 +457,14 @@ class Cloud:
457
457
 
458
458
  @classmethod
459
459
  def check_credentials(
460
- cls,
461
- cloud_capability: CloudCapability) -> Tuple[bool, Optional[str]]:
460
+ cls, cloud_capability: CloudCapability
461
+ ) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
462
462
  """Checks if the user has access credentials to this cloud.
463
463
 
464
- Returns a boolean of whether the user can access this cloud, and a
465
- string describing the reason if the user cannot access.
464
+ Returns a boolean of whether the user can access this cloud, and:
465
+ - For SSH and Kubernetes, a dictionary that maps context names to
466
+ the status of the context.
467
+ - For others, a string describing the reason if cannot access.
466
468
 
467
469
  Raises NotSupportedError if the capability is
468
470
  not supported by this cloud.
@@ -474,19 +476,30 @@ class Cloud:
474
476
  assert_never(cloud_capability)
475
477
 
476
478
  @classmethod
477
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
479
+ def _check_compute_credentials(
480
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
478
481
  """Checks if the user has access credentials to
479
482
  this cloud's compute service."""
480
483
  raise exceptions.NotSupportedError(
481
484
  f'{cls._REPR} does not support {CloudCapability.COMPUTE.value}.')
482
485
 
483
486
  @classmethod
484
- def _check_storage_credentials(cls) -> Tuple[bool, Optional[str]]:
487
+ def _check_storage_credentials(
488
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
485
489
  """Checks if the user has access credentials to
486
490
  this cloud's storage service."""
487
491
  raise exceptions.NotSupportedError(
488
492
  f'{cls._REPR} does not support {CloudCapability.STORAGE.value}.')
489
493
 
494
+ @classmethod
495
+ def get_infras(cls) -> List[str]:
496
+ """Returns a list of enabled infrastructures for this cloud.
497
+
498
+ For Kubernetes and SSH, return a list of resource pools.
499
+ For all other clouds, return self.
500
+ """
501
+ return [cls._REPR.lower()]
502
+
490
503
  # TODO(zhwu): Make the return type immutable.
491
504
  @classmethod
492
505
  def get_user_identities(cls) -> Optional[List[List[str]]]:
@@ -878,6 +891,11 @@ class Cloud:
878
891
  def canonical_name(cls) -> str:
879
892
  return cls.__name__.lower()
880
893
 
894
+ @classmethod
895
+ def display_name(cls) -> str:
896
+ """Name of the cloud used in messages displayed to the user."""
897
+ return cls.canonical_name()
898
+
881
899
  def __repr__(self):
882
900
  return self._REPR
883
901
 
@@ -888,6 +906,12 @@ class Cloud:
888
906
  return state
889
907
 
890
908
 
909
+ class DummyCloud(Cloud):
910
+ """A dummy Cloud that has zero egress cost from/to for optimization
911
+ purpose."""
912
+ pass
913
+
914
+
891
915
  # === Helper functions ===
892
916
  def cloud_in_iterable(cloud: Cloud, cloud_list: Iterable[Cloud]) -> bool:
893
917
  """Returns whether the cloud is in the given cloud list."""
sky/clouds/cudo.py CHANGED
@@ -270,7 +270,8 @@ class Cudo(clouds.Cloud):
270
270
  fuzzy_candidate_list, None)
271
271
 
272
272
  @classmethod
273
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
273
+ def _check_compute_credentials(
274
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
274
275
  """Checks if the user has access credentials to
275
276
  Cudo's compute service."""
276
277
  try:
sky/clouds/do.py CHANGED
@@ -264,7 +264,8 @@ class DO(clouds.Cloud):
264
264
  fuzzy_candidate_list, None)
265
265
 
266
266
  @classmethod
267
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
267
+ def _check_compute_credentials(
268
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
268
269
  """Verify that the user has valid credentials for
269
270
  DO's compute service."""
270
271
 
sky/clouds/fluidstack.py CHANGED
@@ -261,7 +261,8 @@ class Fluidstack(clouds.Cloud):
261
261
  fuzzy_candidate_list, None)
262
262
 
263
263
  @classmethod
264
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
264
+ def _check_compute_credentials(
265
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
265
266
  """Checks if the user has access credentials to
266
267
  FluidStack's compute service."""
267
268
  try:
sky/clouds/gcp.py CHANGED
@@ -18,6 +18,7 @@ from sky.adaptors import gcp
18
18
  from sky.clouds import service_catalog
19
19
  from sky.clouds.utils import gcp_utils
20
20
  from sky.provision.gcp import constants
21
+ from sky.provision.gcp import volume_utils
21
22
  from sky.utils import annotations
22
23
  from sky.utils import common_utils
23
24
  from sky.utils import registry
@@ -443,6 +444,25 @@ class GCP(clouds.Cloud):
443
444
  disk_tier=disk_tier,
444
445
  clouds='gcp')
445
446
 
447
+ @classmethod
448
+ def failover_disk_tier(
449
+ cls, instance_type: Optional[str],
450
+ disk_tier: Optional[resources_utils.DiskTier]
451
+ ) -> Optional[resources_utils.DiskTier]:
452
+ if (disk_tier is not None and
453
+ disk_tier != resources_utils.DiskTier.BEST):
454
+ return disk_tier
455
+ # Failover disk tier from ultra to low.
456
+ all_tiers = list(reversed(resources_utils.DiskTier))
457
+ start_index = all_tiers.index(GCP._translate_disk_tier(disk_tier))
458
+ while start_index < len(all_tiers):
459
+ disk_tier = all_tiers[start_index]
460
+ ok, _ = GCP.check_disk_tier(instance_type, disk_tier)
461
+ if ok:
462
+ return disk_tier
463
+ start_index += 1
464
+ assert False, 'Low disk tier should always be supported on GCP.'
465
+
446
466
  def make_deploy_resources_variables(
447
467
  self,
448
468
  resources: 'resources.Resources',
@@ -463,21 +483,6 @@ class GCP(clouds.Cloud):
463
483
  # issue when first booted.
464
484
  image_id = _DEFAULT_CPU_IMAGE_ID
465
485
 
466
- def _failover_disk_tier() -> Optional[resources_utils.DiskTier]:
467
- if (r.disk_tier is not None and
468
- r.disk_tier != resources_utils.DiskTier.BEST):
469
- return r.disk_tier
470
- # Failover disk tier from ultra to low.
471
- all_tiers = list(reversed(resources_utils.DiskTier))
472
- start_index = all_tiers.index(GCP._translate_disk_tier(r.disk_tier))
473
- while start_index < len(all_tiers):
474
- disk_tier = all_tiers[start_index]
475
- ok, _ = GCP.check_disk_tier(r.instance_type, disk_tier)
476
- if ok:
477
- return disk_tier
478
- start_index += 1
479
- assert False, 'Low disk tier should always be supported on GCP.'
480
-
481
486
  r = resources
482
487
  # Find GPU spec, if any.
483
488
  resources_vars = {
@@ -491,7 +496,9 @@ class GCP(clouds.Cloud):
491
496
  'custom_resources': None,
492
497
  'use_spot': r.use_spot,
493
498
  'gcp_project_id': self.get_project_id(dryrun),
494
- **GCP._get_disk_specs(r.instance_type, _failover_disk_tier()),
499
+ **GCP._get_disk_specs(
500
+ r.instance_type,
501
+ GCP.failover_disk_tier(r.instance_type, r.disk_tier)),
495
502
  }
496
503
  enable_gpu_direct = skypilot_config.get_nested(
497
504
  ('gcp', 'enable_gpu_direct'),
@@ -593,6 +600,27 @@ class GCP(clouds.Cloud):
593
600
  'force_enable_external_ips'] = skypilot_config.get_nested(
594
601
  ('gcp', 'force_enable_external_ips'), False)
595
602
 
603
+ volumes, device_mount_points = GCP._get_volumes_specs(
604
+ region, zones, r.instance_type, r.volumes, use_mig,
605
+ resources_vars['tpu_vm'])
606
+ resources_vars['volumes'] = volumes
607
+
608
+ resources_vars['user_data'] = None
609
+ user_data = ''
610
+ docker_run_options = []
611
+ if device_mount_points:
612
+ # Build the device_mounts array
613
+ device_mounts_array = []
614
+ for device_name, mount_point in device_mount_points.items():
615
+ device_mounts_array.append(f'["{device_name}"]="{mount_point}"')
616
+ docker_run_options.append(
617
+ f'--volume={mount_point}:{mount_point}')
618
+ device_mounts_str = '\n '.join(device_mounts_array)
619
+
620
+ # Format the template with the device_mounts array
621
+ user_data += constants.DISK_MOUNT_USER_DATA_TEMPLATE.format(
622
+ device_mounts=device_mounts_str)
623
+
596
624
  # Add gVNIC from config
597
625
  resources_vars['enable_gvnic'] = skypilot_config.get_nested(
598
626
  ('gcp', 'enable_gvnic'),
@@ -602,13 +630,16 @@ class GCP(clouds.Cloud):
602
630
  ('gcp', 'placement_policy'),
603
631
  None,
604
632
  override_configs=resources.cluster_config_overrides)
605
- resources_vars['user_data'] = None
606
633
  if enable_gpu_direct:
607
- resources_vars['user_data'] = constants.GPU_DIRECT_TCPX_USER_DATA
608
- resources_vars[
609
- 'docker_run_options'] = constants.GPU_DIRECT_TCPX_SPECIFIC_OPTIONS
634
+ user_data += constants.GPU_DIRECT_TCPX_USER_DATA
635
+ docker_run_options += constants.GPU_DIRECT_TCPX_SPECIFIC_OPTIONS
610
636
  if placement_policy is None:
611
637
  placement_policy = constants.COMPACT_GROUP_PLACEMENT_POLICY
638
+ if user_data:
639
+ resources_vars[
640
+ 'user_data'] = constants.BASH_SCRIPT_START + user_data
641
+ if docker_run_options:
642
+ resources_vars['docker_run_options'] = docker_run_options
612
643
  resources_vars['placement_policy'] = placement_policy
613
644
 
614
645
  return resources_vars
@@ -760,7 +791,8 @@ class GCP(clouds.Cloud):
760
791
  return DEFAULT_GCP_APPLICATION_CREDENTIAL_PATH
761
792
 
762
793
  @classmethod
763
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
794
+ def _check_compute_credentials(
795
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
764
796
  """Checks if the user has access credentials to this cloud's compute service."""
765
797
  return cls._check_credentials(
766
798
  [
@@ -772,7 +804,8 @@ class GCP(clouds.Cloud):
772
804
  gcp_utils.get_minimal_compute_permissions())
773
805
 
774
806
  @classmethod
775
- def _check_storage_credentials(cls) -> Tuple[bool, Optional[str]]:
807
+ def _check_storage_credentials(
808
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
776
809
  """Checks if the user has access credentials to this cloud's storage service."""
777
810
  return cls._check_credentials(
778
811
  [('storage', 'Cloud Storage')],
@@ -1122,6 +1155,17 @@ class GCP(clouds.Cloud):
1122
1155
 
1123
1156
  return tier2name[tier]
1124
1157
 
1158
+ @classmethod
1159
+ def _get_data_disk_type(
1160
+ cls,
1161
+ instance_type: Optional[str],
1162
+ disk_tier: Optional[resources_utils.DiskTier],
1163
+ ) -> str:
1164
+
1165
+ tier = cls._translate_disk_tier(disk_tier)
1166
+ tier2name = volume_utils.get_data_disk_tier_mapping(instance_type)
1167
+ return tier2name[tier]
1168
+
1125
1169
  @classmethod
1126
1170
  def _get_disk_specs(
1127
1171
  cls, instance_type: Optional[str],
@@ -1133,9 +1177,102 @@ class GCP(clouds.Cloud):
1133
1177
  specs['disk_tier'] == 'pd-extreme'):
1134
1178
  # Only pd-extreme supports custom iops.
1135
1179
  # see https://cloud.google.com/compute/docs/disks#disk-types
1136
- specs['disk_iops'] = 20000
1180
+ specs['disk_iops'] = constants.PD_EXTREME_IOPS
1137
1181
  return specs
1138
1182
 
1183
+ @classmethod
1184
+ def _get_volumes_specs(
1185
+ cls,
1186
+ region: 'clouds.Region',
1187
+ zones: Optional[List['clouds.Zone']],
1188
+ instance_type: Optional[str],
1189
+ volumes: Optional[List[Dict[str, Any]]],
1190
+ use_mig: bool,
1191
+ tpu_vm: bool,
1192
+ ) -> Tuple[List[Dict[str, Any]], Dict[str, str]]:
1193
+ if volumes is None:
1194
+ return [], {}
1195
+
1196
+ project_id = cls.get_project_id()
1197
+
1198
+ volume_utils.validate_instance_volumes(instance_type, volumes)
1199
+
1200
+ volumes_specs: List[Dict[str, Any]] = []
1201
+ device_mount_points: Dict[str, str] = {}
1202
+ ssd_index = 0
1203
+ # TPU data disk index starts from 1, 0 is the boot disk
1204
+ tpu_disk_index = 1
1205
+ for i, volume in enumerate(volumes):
1206
+ volume_spec = {
1207
+ 'device_name': f'sky-disk-{i}',
1208
+ 'auto_delete': volume['auto_delete'],
1209
+ }
1210
+ if ('name' in volume and volume['storage_type']
1211
+ == resources_utils.StorageType.NETWORK):
1212
+ volume_info = volume_utils.check_volume_name_exist_in_region(
1213
+ project_id, region, use_mig, volume['name'])
1214
+ if volume_info is not None:
1215
+ volume_utils.check_volume_zone_match(
1216
+ volume['name'], zones, volume_info['available_zones'])
1217
+ volume_spec['source'] = volume_info['selfLink']
1218
+ volume_spec[
1219
+ 'attach_mode'] = volume_utils.translate_attach_mode(
1220
+ volume['attach_mode'])
1221
+ volume_spec['storage_type'] = constants.NETWORK_STORAGE_TYPE
1222
+ volumes_specs.append(volume_spec)
1223
+ device_name = f'{constants.DEVICE_NAME_PREFIX}sky-disk-{i}'
1224
+ if tpu_vm:
1225
+ # TPU VM does not support specifying the device name,
1226
+ # so we use the default device name.
1227
+ device_name = f'{constants.DEVICE_NAME_PREFIX}persistent-disk-{tpu_disk_index}'
1228
+ tpu_disk_index += 1
1229
+ device_mount_points[device_name] = volume['path']
1230
+ continue
1231
+ if tpu_vm:
1232
+ # TODO(hailong): support creating block storage for TPU VM
1233
+ continue
1234
+ if volume['storage_type'] == resources_utils.StorageType.INSTANCE:
1235
+ device_name = f'{constants.INSTANCE_STORAGE_DEVICE_NAME_PREFIX}{ssd_index}'
1236
+ ssd_index += 1
1237
+ device_mount_points[device_name] = volume['path']
1238
+
1239
+ if instance_type is not None and instance_type in constants.SSD_AUTO_ATTACH_MACHINE_TYPES:
1240
+ # The instance storage will be attached automatically,
1241
+ # so we skip the following steps.
1242
+ continue
1243
+
1244
+ volume_spec['disk_tier'] = constants.INSTANCE_STORAGE_DISK_TYPE
1245
+ volume_spec[
1246
+ 'interface_type'] = constants.INSTANCE_STORAGE_INTERFACE_TYPE
1247
+ volume_spec['storage_type'] = constants.INSTANCE_STORAGE_TYPE
1248
+ # Disk size of instance storage is fixed to 375GB
1249
+ volume_spec['disk_size'] = None
1250
+ volume_spec['auto_delete'] = True
1251
+ else:
1252
+ # TODO(hailong): this should be fixed when move the
1253
+ # disk creation out of the instance creation phase
1254
+ if not use_mig:
1255
+ volume_spec['disk_name'] = volume['name']
1256
+ device_name = f'{constants.DEVICE_NAME_PREFIX}sky-disk-{i}'
1257
+ device_mount_points[device_name] = volume['path']
1258
+
1259
+ volume_spec['storage_type'] = constants.NETWORK_STORAGE_TYPE
1260
+ if 'disk_size' in volume:
1261
+ volume_spec['disk_size'] = volume['disk_size']
1262
+ else:
1263
+ volume_spec['disk_size'] = constants.DEFAULT_DISK_SIZE
1264
+ disk_tier = cls.failover_disk_tier(instance_type,
1265
+ volume['disk_tier'])
1266
+ volume_spec['disk_tier'] = cls._get_data_disk_type(
1267
+ instance_type, disk_tier)
1268
+ if volume_spec['disk_tier'] == 'pd-extreme':
1269
+ # Only pd-extreme supports custom iops.
1270
+ # see https://cloud.google.com/compute/docs/disks#disk-types
1271
+ volume_spec['disk_iops'] = constants.PD_EXTREME_IOPS
1272
+ volumes_specs.append(volume_spec)
1273
+
1274
+ return volumes_specs, device_mount_points
1275
+
1139
1276
  @classmethod
1140
1277
  def _label_filter_str(cls, tag_filters: Dict[str, str]) -> str:
1141
1278
  return ' '.join(f'labels.{k}={v}' for k, v in tag_filters.items())
sky/clouds/ibm.py CHANGED
@@ -399,13 +399,15 @@ class IBM(clouds.Cloud):
399
399
  return image_size
400
400
 
401
401
  @classmethod
402
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
402
+ def _check_compute_credentials(
403
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
403
404
  """Checks if the user has access credentials to
404
405
  IBM's compute service."""
405
406
  return cls._check_credentials()
406
407
 
407
408
  @classmethod
408
- def _check_storage_credentials(cls) -> Tuple[bool, Optional[str]]:
409
+ def _check_storage_credentials(
410
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
409
411
  """Checks if the user has access credentials to
410
412
  IBM's storage service."""
411
413
  # TODO(seungjin): Implement separate check for
sky/clouds/kubernetes.py CHANGED
@@ -4,6 +4,8 @@ import re
4
4
  import typing
5
5
  from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
6
6
 
7
+ import colorama
8
+
7
9
  from sky import clouds
8
10
  from sky import exceptions
9
11
  from sky import sky_logging
@@ -149,7 +151,7 @@ class Kubernetes(clouds.Cloud):
149
151
  'Ignoring these contexts.')
150
152
 
151
153
  @classmethod
152
- def existing_allowed_contexts(cls) -> List[str]:
154
+ def existing_allowed_contexts(cls, silent: bool = False) -> List[str]:
153
155
  """Get existing allowed contexts.
154
156
 
155
157
  If None is returned in the list, it means that we are running in a pod
@@ -162,6 +164,12 @@ class Kubernetes(clouds.Cloud):
162
164
 
163
165
  all_contexts = set(all_contexts)
164
166
 
167
+ # Exclude contexts starting with `ssh-`
168
+ # TODO(romilb): Remove when SSH Node Pools use a separate kubeconfig.
169
+ all_contexts = [
170
+ ctx for ctx in all_contexts if not ctx.startswith('ssh-')
171
+ ]
172
+
165
173
  allowed_contexts = skypilot_config.get_nested(
166
174
  ('kubernetes', 'allowed_contexts'), None)
167
175
 
@@ -183,8 +191,12 @@ class Kubernetes(clouds.Cloud):
183
191
  if context in all_contexts:
184
192
  existing_contexts.append(context)
185
193
  else:
194
+ # Skip SSH Node Pool contexts
195
+ if context.startswith('ssh-'):
196
+ continue
186
197
  skipped_contexts.append(context)
187
- cls._log_skipped_contexts_once(tuple(skipped_contexts))
198
+ if not silent:
199
+ cls._log_skipped_contexts_once(tuple(skipped_contexts))
188
200
  return existing_contexts
189
201
 
190
202
  @classmethod
@@ -640,7 +652,7 @@ class Kubernetes(clouds.Cloud):
640
652
  resource_list = []
641
653
  for instance_type in instance_list:
642
654
  r = resources.copy(
643
- cloud=Kubernetes(),
655
+ cloud=self.__class__(),
644
656
  instance_type=instance_type,
645
657
  accelerators=None,
646
658
  )
@@ -692,7 +704,43 @@ class Kubernetes(clouds.Cloud):
692
704
  [], None)
693
705
 
694
706
  @classmethod
695
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
707
+ def _check_single_context(cls, context: str) -> Tuple[bool, str]:
708
+ """Check if the user has access credentials to a single SSH context."""
709
+
710
+ def _red_color(str_to_format: str) -> str:
711
+ return (f'{colorama.Fore.LIGHTRED_EX}'
712
+ f'{str_to_format}'
713
+ f'{colorama.Style.RESET_ALL}')
714
+
715
+ def _dim_color(str_to_format: str) -> str:
716
+ return (f'{colorama.Style.DIM}'
717
+ f'{str_to_format}'
718
+ f'{colorama.Style.RESET_ALL}')
719
+
720
+ def _bright_green_color(str_to_format: str) -> str:
721
+ return (f'{colorama.Fore.GREEN}'
722
+ f'{str_to_format}'
723
+ f'{colorama.Style.RESET_ALL}')
724
+
725
+ try:
726
+ check_result = kubernetes_utils.check_credentials(
727
+ context, run_optional_checks=True)
728
+ if check_result[0]:
729
+ if check_result[1] is not None:
730
+ return True, (_bright_green_color('enabled.') +
731
+ _dim_color(f' Note: {check_result[1]}'))
732
+ else:
733
+ return True, _bright_green_color('enabled.')
734
+ else:
735
+ assert check_result[1] is not None
736
+ return False, (_red_color('disabled.') +
737
+ _dim_color(f' Reason: {check_result[1]}'))
738
+ except Exception as e: # pylint: disable=broad-except
739
+ return False, _red_color(str(e))
740
+
741
+ @classmethod
742
+ def _check_compute_credentials(
743
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
696
744
  """Checks if the user has access credentials to
697
745
  Kubernetes."""
698
746
  # Check for port forward dependencies
@@ -719,26 +767,15 @@ class Kubernetes(clouds.Cloud):
719
767
  return (False, 'No available context found in kubeconfig. '
720
768
  'Check if you have a valid kubeconfig file' +
721
769
  check_skypilot_config_msg)
722
- reasons = []
723
- hints = []
770
+
771
+ ctx2text = {}
724
772
  success = False
725
773
  for context in existing_allowed_contexts:
726
- try:
727
- check_result = kubernetes_utils.check_credentials(
728
- context, run_optional_checks=True)
729
- if check_result[0]:
730
- success = True
731
- if check_result[1] is not None:
732
- hints.append(f'Context {context}: {check_result[1]}')
733
- else:
734
- reasons.append(f'Context {context}: {check_result[1]}')
735
- except Exception as e: # pylint: disable=broad-except
736
- return (False, f'Credential check failed for {context}: '
737
- f'{common_utils.format_exception(e)}')
738
- if success:
739
- return (True, cls._format_credential_check_results(hints, reasons))
740
- return (False, 'Failed to find available context with working '
741
- 'credentials. Details:\n' + '\n'.join(reasons))
774
+ suc, text = cls._check_single_context(context)
775
+ success = success or suc
776
+ ctx2text[context] = text
777
+
778
+ return success, ctx2text
742
779
 
743
780
  @classmethod
744
781
  def _format_credential_check_results(cls, hints: List[str],
@@ -855,3 +892,10 @@ class Kubernetes(clouds.Cloud):
855
892
  if not key_valid or not value_valid:
856
893
  return False, error_msg
857
894
  return True, None
895
+
896
+ @classmethod
897
+ def get_infras(cls) -> List[str]:
898
+ return [
899
+ f'{cls._REPR.lower()}/{c}'
900
+ for c in cls.existing_allowed_contexts(silent=True)
901
+ ]
sky/clouds/lambda_cloud.py CHANGED
@@ -244,7 +244,8 @@ class Lambda(clouds.Cloud):
244
244
  fuzzy_candidate_list, None)
245
245
 
246
246
  @classmethod
247
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
247
+ def _check_compute_credentials(
248
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
248
249
  """Checks if the user has access credentials to
249
250
  Lambda's compute service."""
250
251
  try:
sky/clouds/nebius.py CHANGED
@@ -4,6 +4,7 @@ import typing
4
4
  from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
5
5
 
6
6
  from sky import clouds
7
+ from sky import skypilot_config
7
8
  from sky.adaptors import nebius
8
9
  from sky.clouds import service_catalog
9
10
  from sky.utils import annotations
@@ -210,6 +211,18 @@ class Nebius(clouds.Cloud):
210
211
  raise RuntimeError('Unsupported instance type for Nebius cloud:'
211
212
  f' {resources.instance_type}')
212
213
 
214
+ config_fs = skypilot_config.get_nested(
215
+ ('nebius', region.name, 'filesystems'), [])
216
+ resources_vars_fs = []
217
+ for i, fs in enumerate(config_fs):
218
+ resources_vars_fs.append({
219
+ 'filesystem_id': fs['filesystem_id'],
220
+ 'filesystem_attach_mode': fs.get('attach_mode', 'READ_WRITE'),
221
+ 'filesystem_mount_path': fs.get(
222
+ 'mount_path', f'/mnt/filesystem-skypilot-{i+1}'),
223
+ 'filesystem_mount_tag': f'filesystem-skypilot-{i+1}'
224
+ })
225
+
213
226
  resources_vars: Dict[str, Any] = {
214
227
  'instance_type': resources.instance_type,
215
228
  'custom_resources': custom_resources,
@@ -217,6 +230,7 @@ class Nebius(clouds.Cloud):
217
230
  'image_id': image_family,
218
231
  # Nebius does not support specific zones.
219
232
  'zones': None,
233
+ 'filesystems': resources_vars_fs
220
234
  }
221
235
 
222
236
  if acc_dict is not None:
@@ -283,7 +297,8 @@ class Nebius(clouds.Cloud):
283
297
 
284
298
  @classmethod
285
299
  @annotations.lru_cache(scope='request')
286
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
300
+ def _check_compute_credentials(
301
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
287
302
  """Checks if the user has access credentials to
288
303
  Nebius's compute service."""
289
304
  token_cred_msg = (
@@ -314,7 +329,8 @@ class Nebius(clouds.Cloud):
314
329
 
315
330
  @classmethod
316
331
  @annotations.lru_cache(scope='request')
317
- def _check_storage_credentials(cls) -> Tuple[bool, Optional[str]]:
332
+ def _check_storage_credentials(
333
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
318
334
  """Checks if the user has access credentials to Nebius Object Storage.
319
335
 
320
336
  Returns: