skypilot-nightly 1.0.0.dev20251027__py3-none-any.whl → 1.0.0.dev20251029__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
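For readers who want to reproduce this kind of comparison locally, the sketch below (illustrative only, not part of either release) downloads both wheels from PyPI and prints a unified diff of one file. It assumes both nightly versions are still downloadable and that pip is available on PATH.

# Sketch: download the two skypilot-nightly wheels and diff one file.
import difflib
import glob
import subprocess
import zipfile

OLD, NEW = '1.0.0.dev20251027', '1.0.0.dev20251029'

def fetch(version: str) -> str:
    # Download the wheel for `version` into its own directory, return its path.
    subprocess.run(['pip', 'download', f'skypilot-nightly=={version}',
                    '--no-deps', '-d', f'wheels-{version}'], check=True)
    return glob.glob(f'wheels-{version}/*.whl')[0]

def read_lines(wheel_path: str, member: str):
    with zipfile.ZipFile(wheel_path) as wf:
        return wf.read(member).decode().splitlines(keepends=True)

old_whl, new_whl = fetch(OLD), fetch(NEW)
diff = difflib.unified_diff(read_lines(old_whl, 'sky/data/storage.py'),
                            read_lines(new_whl, 'sky/data/storage.py'),
                            fromfile=OLD, tofile=NEW)
print(''.join(diff))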

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (81)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/coreweave.py +278 -0
  3. sky/backends/backend_utils.py +9 -6
  4. sky/backends/cloud_vm_ray_backend.py +2 -3
  5. sky/check.py +25 -13
  6. sky/client/cli/command.py +5 -1
  7. sky/cloud_stores.py +73 -0
  8. sky/core.py +7 -5
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → DabuSAKsc_y0wyJxpTIdQ}/_buildManifest.js +1 -1
  11. sky/dashboard/out/_next/static/chunks/{1141-d5204f35a3388bf4.js → 1141-c3c10e2c6ed71a8f.js} +1 -1
  12. sky/dashboard/out/_next/static/chunks/2755.a239c652bf8684dd.js +26 -0
  13. sky/dashboard/out/_next/static/chunks/3294.87a13fba0058865b.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/{3785.538eb23a098fc304.js → 3785.170be320e0060eaf.js} +1 -1
  15. sky/dashboard/out/_next/static/chunks/4282-49b2065b7336e496.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/7615-80aa7b09f45a86d2.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/8969-4ed9236db997b42b.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/9360.10a3aac7aad5e3aa.js +31 -0
  19. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ac4a217f17b087cb.js +16 -0
  20. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-fbf2907ce2bb67e2.js → [cluster]-1704039ccaf997cf.js} +1 -1
  21. sky/dashboard/out/_next/static/chunks/pages/{jobs-0dc34cf9a8710a9f.js → jobs-7eee823559e5cf9f.js} +1 -1
  22. sky/dashboard/out/_next/static/chunks/pages/{users-96d6b8bb2dec055f.js → users-2b172f13f8538a7a.js} +1 -1
  23. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-fb1b4d3bfb047cad.js → [name]-bbfe5860c93470fd.js} +1 -1
  24. sky/dashboard/out/_next/static/chunks/pages/{workspaces-6fc994fa1ee6c6bf.js → workspaces-1891376c08050940.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/{webpack-585d805f693dbceb.js → webpack-485984ca04e021d0.js} +1 -1
  26. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  27. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  28. sky/dashboard/out/clusters/[cluster].html +1 -1
  29. sky/dashboard/out/clusters.html +1 -1
  30. sky/dashboard/out/config.html +1 -1
  31. sky/dashboard/out/index.html +1 -1
  32. sky/dashboard/out/infra/[context].html +1 -1
  33. sky/dashboard/out/infra.html +1 -1
  34. sky/dashboard/out/jobs/[job].html +1 -1
  35. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  36. sky/dashboard/out/jobs.html +1 -1
  37. sky/dashboard/out/users.html +1 -1
  38. sky/dashboard/out/volumes.html +1 -1
  39. sky/dashboard/out/workspace/new.html +1 -1
  40. sky/dashboard/out/workspaces/[name].html +1 -1
  41. sky/dashboard/out/workspaces.html +1 -1
  42. sky/data/data_utils.py +92 -1
  43. sky/data/mounting_utils.py +39 -0
  44. sky/data/storage.py +166 -9
  45. sky/global_user_state.py +14 -18
  46. sky/jobs/server/server.py +2 -2
  47. sky/jobs/utils.py +5 -6
  48. sky/optimizer.py +1 -1
  49. sky/provision/kubernetes/instance.py +88 -19
  50. sky/provision/kubernetes/volume.py +2 -2
  51. sky/schemas/api/responses.py +2 -5
  52. sky/serve/replica_managers.py +2 -2
  53. sky/serve/serve_utils.py +9 -2
  54. sky/server/requests/payloads.py +2 -0
  55. sky/server/requests/requests.py +137 -102
  56. sky/server/requests/serializers/decoders.py +0 -6
  57. sky/server/requests/serializers/encoders.py +33 -6
  58. sky/server/server.py +2 -1
  59. sky/server/stream_utils.py +56 -13
  60. sky/setup_files/dependencies.py +2 -0
  61. sky/task.py +10 -0
  62. sky/templates/nebius-ray.yml.j2 +1 -0
  63. sky/utils/cli_utils/status_utils.py +8 -2
  64. sky/utils/context_utils.py +13 -1
  65. sky/utils/resources_utils.py +53 -29
  66. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/METADATA +52 -36
  67. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/RECORD +73 -72
  68. sky/dashboard/out/_next/static/chunks/2755.227c84f5adf75c6b.js +0 -26
  69. sky/dashboard/out/_next/static/chunks/3015-2dcace420c8939f4.js +0 -1
  70. sky/dashboard/out/_next/static/chunks/3294.6d5054a953a818cb.js +0 -1
  71. sky/dashboard/out/_next/static/chunks/4282-d2f3ef2fbf78e347.js +0 -1
  72. sky/dashboard/out/_next/static/chunks/8969-0389e2cb52412db3.js +0 -1
  73. sky/dashboard/out/_next/static/chunks/9360.07d78b8552bc9d17.js +0 -31
  74. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c815b90e296b8075.js +0 -16
  75. sky/dashboard/out/_next/static/css/4c052b4444e52a58.css +0 -3
  76. /sky/dashboard/out/_next/static/{YP5Vc3ROcDnTGta0XAhcs → DabuSAKsc_y0wyJxpTIdQ}/_ssgManifest.js +0 -0
  77. /sky/dashboard/out/_next/static/chunks/pages/{_app-513d332313670f2a.js → _app-bde01e4a2beec258.js} +0 -0
  78. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/WHEEL +0 -0
  79. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/entry_points.txt +0 -0
  80. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/licenses/LICENSE +0 -0
  81. {skypilot_nightly-1.0.0.dev20251027.dist-info → skypilot_nightly-1.0.0.dev20251029.dist-info}/top_level.txt +0 -0
sky/data/storage.py CHANGED
@@ -23,6 +23,7 @@ from sky import skypilot_config
 from sky.adaptors import aws
 from sky.adaptors import azure
 from sky.adaptors import cloudflare
+from sky.adaptors import coreweave
 from sky.adaptors import gcp
 from sky.adaptors import ibm
 from sky.adaptors import nebius
@@ -62,6 +63,7 @@ STORE_ENABLED_CLOUDS: List[str] = [
     str(clouds.OCI()),
     str(clouds.Nebius()),
     cloudflare.NAME,
+    coreweave.NAME,
 ]

 # Maximum number of concurrent rsync upload processes
@@ -93,6 +95,12 @@ def get_cached_enabled_storage_cloud_names_or_refresh(
     r2_is_enabled, _ = cloudflare.check_storage_credentials()
     if r2_is_enabled:
         enabled_clouds.append(cloudflare.NAME)
+
+    # Similarly, handle CoreWeave storage credentials
+    coreweave_is_enabled, _ = coreweave.check_storage_credentials()
+    if coreweave_is_enabled:
+        enabled_clouds.append(coreweave.NAME)
+
     if raise_if_no_cloud_access and not enabled_clouds:
         raise exceptions.NoCloudAccessError(
             'No cloud access available for storage. '
@@ -126,6 +134,7 @@ class StoreType(enum.Enum):
     IBM = 'IBM'
     OCI = 'OCI'
     NEBIUS = 'NEBIUS'
+    COREWEAVE = 'COREWEAVE'
     VOLUME = 'VOLUME'

     @classmethod
@@ -883,7 +892,7 @@ class Storage(object):
                     f'{source} in the file_mounts section of your YAML')
             is_local_source = True
         elif split_path.scheme in [
-                's3', 'gs', 'https', 'r2', 'cos', 'oci', 'nebius'
+                's3', 'gs', 'https', 'r2', 'cos', 'oci', 'nebius', 'cw'
         ]:
             is_local_source = False
             # Storage mounting does not support mounting specific files from
@@ -908,7 +917,8 @@ class Storage(object):
             with ux_utils.print_exception_no_traceback():
                 raise exceptions.StorageSourceError(
                     f'Supported paths: local, s3://, gs://, https://, '
-                    f'r2://, cos://, oci://, nebius://. Got: {source}')
+                    f'r2://, cos://, oci://, nebius://, cw://. '
+                    f'Got: {source}')
         return source, is_local_source

     def _validate_storage_spec(self, name: Optional[str]) -> None:
@@ -923,7 +933,16 @@
         """
         prefix = name.split('://')[0]
         prefix = prefix.lower()
-        if prefix in ['s3', 'gs', 'https', 'r2', 'cos', 'oci', 'nebius']:
+        if prefix in [
+                's3',
+                'gs',
+                'https',
+                'r2',
+                'cos',
+                'oci',
+                'nebius',
+                'cw',
+        ]:
             with ux_utils.print_exception_no_traceback():
                 raise exceptions.StorageNameError(
                     'Prefix detected: `name` cannot start with '
@@ -1062,6 +1081,12 @@
                     source=self.source,
                     sync_on_reconstruction=self.sync_on_reconstruction,
                     _bucket_sub_path=self._bucket_sub_path)
+            elif s_type == StoreType.COREWEAVE:
+                store = CoreWeaveStore.from_metadata(
+                    s_metadata,
+                    source=self.source,
+                    sync_on_reconstruction=self.sync_on_reconstruction,
+                    _bucket_sub_path=self._bucket_sub_path)
             else:
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(f'Unknown store type: {s_type}')
@@ -1417,6 +1442,7 @@
     aws_profile: Optional[str] = None
     get_endpoint_url: Optional[Callable[[], str]] = None
     credentials_file: Optional[str] = None
+    config_file: Optional[str] = None
     extra_cli_args: Optional[List[str]] = None

     # Provider-specific settings
@@ -1437,8 +1463,8 @@ class S3CompatibleStore(AbstractStore):
     """Base class for S3-compatible object storage providers.

     This class provides a unified interface for all S3-compatible storage
-    providers (AWS S3, Cloudflare R2, Nebius, MinIO, etc.) by leveraging
-    a configuration-driven approach that eliminates code duplication.
+    providers (AWS S3, Cloudflare R2, Nebius, MinIO, CoreWeave, etc.) by
+    leveraging a configuration-driven approach that eliminates code duplication

     ## Adding a New S3-Compatible Store

@@ -1864,6 +1890,9 @@
         if self.config.credentials_file:
             cmd = 'AWS_SHARED_CREDENTIALS_FILE=' + \
                 f'{self.config.credentials_file} {cmd}'
+        if self.config.config_file:
+            cmd = 'AWS_CONFIG_FILE=' + \
+                f'{self.config.config_file} {cmd}'

         return cmd

@@ -1909,6 +1938,9 @@
         if self.config.credentials_file:
             cmd = 'AWS_SHARED_CREDENTIALS_FILE=' + \
                 f'{self.config.credentials_file} {cmd}'
+        if self.config.config_file:
+            cmd = 'AWS_CONFIG_FILE=' + \
+                f'{self.config.config_file} {cmd}'

         return cmd

@@ -1962,6 +1994,9 @@
         if self.config.credentials_file:
             command = (f'AWS_SHARED_CREDENTIALS_FILE='
                        f'{self.config.credentials_file} {command}')
+        if self.config.config_file:
+            command = 'AWS_CONFIG_FILE=' + \
+                f'{self.config.config_file} {command}'
         with ux_utils.print_exception_no_traceback():
             raise exceptions.StorageBucketGetError(
                 _BUCKET_FAIL_TO_CONNECT_MESSAGE.format(name=self.name) +
@@ -2034,7 +2069,9 @@
             remove_command = (f'AWS_SHARED_CREDENTIALS_FILE='
                               f'{self.config.credentials_file} '
                               f'{remove_command}')
-
+        if self.config.config_file:
+            remove_command = 'AWS_CONFIG_FILE=' + \
+                f'{self.config.config_file} {remove_command}'
         return self._execute_remove_command(
             remove_command, bucket_name,
             f'Deleting {self.config.store_type} bucket {bucket_name}',
@@ -2047,8 +2084,9 @@
         try:
             with rich_utils.safe_status(
                     ux_utils.spinner_message(hint_operating)):
-                subprocess.check_output(command.split(' '),
-                                        stderr=subprocess.STDOUT)
+                subprocess.check_output(command,
+                                        stderr=subprocess.STDOUT,
+                                        shell=True)
         except subprocess.CalledProcessError as e:
             if 'NoSuchBucket' in e.output.decode('utf-8'):
                 logger.debug(
@@ -2091,7 +2129,9 @@
             remove_command = (f'AWS_SHARED_CREDENTIALS_FILE='
                               f'{self.config.credentials_file} '
                               f'{remove_command}')
-
+        if self.config.config_file:
+            remove_command = 'AWS_CONFIG_FILE=' + \
+                f'{self.config.config_file} {remove_command}'
         return self._execute_remove_command(
             remove_command, bucket_name,
             (f'Removing objects from {self.config.store_type} bucket '
@@ -2168,6 +2208,10 @@ class GcsStore(AbstractStore):
             elif self.source.startswith('oci://'):
                 raise NotImplementedError(
                     'Moving data from OCI to GCS is currently not supported.')
+            elif self.source.startswith('cw://'):
+                raise NotImplementedError(
+                    'Moving data from CoreWeave Object Storage to GCS is'
+                    ' currently not supported.')
         # Validate name
         self.name = self.validate_name(self.name)
         # Check if the storage is enabled
@@ -2783,6 +2827,10 @@ class AzureBlobStore(AbstractStore):
             elif self.source.startswith('oci://'):
                 raise NotImplementedError(
                     'Moving data from OCI to AZureBlob is not supported.')
+            elif self.source.startswith('cw://'):
+                raise NotImplementedError(
+                    'Moving data from CoreWeave Object Storage to AzureBlob is'
+                    ' currently not supported.')
         # Validate name
         self.name = self.validate_name(self.name)

@@ -3154,6 +3202,8 @@ class AzureBlobStore(AbstractStore):
                 raise NotImplementedError(error_message.format('OCI'))
             elif self.source.startswith('nebius://'):
                 raise NotImplementedError(error_message.format('NEBIUS'))
+            elif self.source.startswith('cw://'):
+                raise NotImplementedError(error_message.format('CoreWeave'))
             else:
                 self.batch_az_blob_sync([self.source])
         except exceptions.StorageUploadError:
@@ -3572,6 +3622,10 @@ class IBMCosStore(AbstractStore):
             assert self.name == data_utils.split_cos_path(self.source)[0], (
                 'COS Bucket is specified as path, the name should be '
                 'the same as COS bucket.')
+        elif self.source.startswith('cw://'):
+            raise NotImplementedError(
+                'Moving data from CoreWeave Object Storage to COS is '
+                'currently not supported.')
         # Validate name
         self.name = IBMCosStore.validate_name(self.name)

@@ -3670,6 +3724,9 @@ class IBMCosStore(AbstractStore):
             elif self.source.startswith('r2://'):
                 raise Exception('IBM COS currently not supporting'
                                 'data transfers between COS and r2')
+            elif self.source.startswith('cw://'):
+                raise Exception('IBM COS currently not supporting'
+                                'data transfers between COS and CoreWeave')
             else:
                 self.batch_ibm_rsync([self.source])

@@ -4595,3 +4652,103 @@ class NebiusStore(S3CompatibleStore):
             rclone_config, rclone_profile_name, self.bucket.name, mount_path)
         return mounting_utils.get_mounting_command(mount_path, install_cmd,
                                                    mount_cached_cmd)
+
+
+@register_s3_compatible_store
+class CoreWeaveStore(S3CompatibleStore):
+    """CoreWeaveStore inherits from S3CompatibleStore and represents the backend
+    for CoreWeave Object Storage buckets.
+    """
+
+    @classmethod
+    def get_config(cls) -> S3CompatibleConfig:
+        """Return the configuration for CoreWeave Object Storage."""
+        return S3CompatibleConfig(
+            store_type='COREWEAVE',
+            url_prefix='cw://',
+            client_factory=lambda region: data_utils.create_coreweave_client(),
+            resource_factory=lambda name: coreweave.resource('s3').Bucket(name),
+            split_path=data_utils.split_coreweave_path,
+            verify_bucket=data_utils.verify_coreweave_bucket,
+            aws_profile=coreweave.COREWEAVE_PROFILE_NAME,
+            get_endpoint_url=coreweave.get_endpoint,
+            credentials_file=coreweave.COREWEAVE_CREDENTIALS_PATH,
+            config_file=coreweave.COREWEAVE_CONFIG_PATH,
+            cloud_name=coreweave.NAME,
+            default_region=coreweave.DEFAULT_REGION,
+            mount_cmd_factory=cls._get_coreweave_mount_cmd,
+        )
+
+    def _get_bucket(self) -> Tuple[StorageHandle, bool]:
+        """Get or create bucket using CoreWeave's S3 API"""
+        bucket = self.config.resource_factory(self.name)
+
+        # Use our custom bucket verification instead of head_bucket
+        if data_utils.verify_coreweave_bucket(self.name):
+            self._validate_existing_bucket()
+            return bucket, False
+
+        # TODO(hailong): Enable the bucket creation for CoreWeave
+        # Disable this to avoid waiting too long until the following
+        # issue is resolved:
+        # https://github.com/skypilot-org/skypilot/issues/7736
+        raise exceptions.StorageBucketGetError(
+            f'Bucket {self.name!r} does not exist. CoreWeave buckets can take'
+            ' a long time to become accessible after creation, so SkyPilot'
+            ' does not create them automatically. Please create the bucket'
+            ' manually in CoreWeave and wait for it to be accessible before'
+            ' using it.')
+
+        # # Check if this is a source with URL prefix (existing bucket case)
+        # if isinstance(self.source, str) and self.source.startswith(
+        #         self.config.url_prefix):
+        #     with ux_utils.print_exception_no_traceback():
+        #         raise exceptions.StorageBucketGetError(
+        #             'Attempted to use a non-existent bucket as a source: '
+        #             f'{self.source}.')
+
+        # # If bucket cannot be found, create it if needed
+        # if self.sync_on_reconstruction:
+        #     bucket = self._create_bucket(self.name)
+        #     return bucket, True
+        # else:
+        #     raise exceptions.StorageExternalDeletionError(
+        #         'Attempted to fetch a non-existent bucket: '
+        #         f'{self.name}')
+
+    @classmethod
+    def _get_coreweave_mount_cmd(cls, bucket_name: str, mount_path: str,
+                                 bucket_sub_path: Optional[str]) -> str:
+        """Factory method for CoreWeave mount command."""
+        endpoint_url = coreweave.get_endpoint()
+        return mounting_utils.get_coreweave_mount_cmd(
+            coreweave.COREWEAVE_CREDENTIALS_PATH,
+            coreweave.COREWEAVE_PROFILE_NAME, bucket_name, endpoint_url,
+            mount_path, bucket_sub_path)
+
+    def mount_cached_command(self, mount_path: str) -> str:
+        """CoreWeave-specific cached mount implementation using rclone."""
+        install_cmd = mounting_utils.get_rclone_install_cmd()
+        rclone_profile_name = (
+            data_utils.Rclone.RcloneStores.COREWEAVE.get_profile_name(
+                self.name))
+        rclone_config = data_utils.Rclone.RcloneStores.COREWEAVE.get_config(
+            rclone_profile_name=rclone_profile_name)
+        mount_cached_cmd = mounting_utils.get_mount_cached_cmd(
+            rclone_config, rclone_profile_name, self.bucket.name, mount_path)
+        return mounting_utils.get_mounting_command(mount_path, install_cmd,
+                                                   mount_cached_cmd)
+
+    def _create_bucket(self, bucket_name: str) -> StorageHandle:
+        """Create bucket using S3 API with timing handling for CoreWeave."""
+        result = super()._create_bucket(bucket_name)
+        # Ensure bucket is created
+        # The newly created bucket ever takes about 18min to be accessible,
+        # here we just retry for 36 times (5s * 36 = 180s) to avoid waiting
+        # too long
+        # TODO(hailong): Update the logic here when the following
+        # issue is resolved:
+        # https://github.com/skypilot-org/skypilot/issues/7736
+        data_utils.verify_coreweave_bucket(bucket_name, retry=36)
+
+        return result
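Taken together, the changes above register CoreWeave as an S3-compatible store and accept the cw:// scheme for storage sources. As an illustration only (not taken from this release's code or documentation), a task could point a file mount at a pre-created CoreWeave bucket roughly as in the sketch below; the bucket name is hypothetical, CoreWeave credentials are assumed to be configured, and the bucket must already exist since the diff shows SkyPilot will not create it automatically.

import sky

# Hypothetical usage sketch: mount an existing CoreWeave bucket into a task.
task = sky.Task(run='ls /data')
task.set_file_mounts({'/data': 'cw://my-cw-bucket'})  # illustrative bucket name
sky.launch(task, cluster_name='cw-demo')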
sky/global_user_state.py CHANGED
@@ -1605,7 +1605,6 @@ def get_cluster_from_name(
         cluster_table.c.owner,
         cluster_table.c.metadata,
         cluster_table.c.cluster_hash,
-        cluster_table.c.storage_mounts_metadata,
         cluster_table.c.cluster_ever_up,
         cluster_table.c.status_updated_at,
         cluster_table.c.user_hash,
@@ -1642,8 +1641,6 @@
         'owner': _load_owner(row.owner),
         'metadata': json.loads(row.metadata),
         'cluster_hash': row.cluster_hash,
-        'storage_mounts_metadata': _load_storage_mounts_metadata(
-            row.storage_mounts_metadata),
         'cluster_ever_up': bool(row.cluster_ever_up),
         'status_updated_at': row.status_updated_at,
         'workspace': row.workspace,
@@ -1704,27 +1701,27 @@ def get_clusters(
         cluster_table.c.name,
         cluster_table.c.launched_at,
         cluster_table.c.handle,
-        cluster_table.c.last_use,
         cluster_table.c.status,
         cluster_table.c.autostop,
         cluster_table.c.to_down,
-        cluster_table.c.owner,
-        cluster_table.c.metadata,
         cluster_table.c.cluster_hash,
-        cluster_table.c.storage_mounts_metadata,
         cluster_table.c.cluster_ever_up,
-        cluster_table.c.status_updated_at,
         cluster_table.c.user_hash,
-        cluster_table.c.config_hash,
         cluster_table.c.workspace,
-        cluster_table.c.is_managed,
         user_table.c.name.label('user_name'),
     ]
     if not summary_response:
         query_fields.extend([
             cluster_table.c.last_creation_yaml,
             cluster_table.c.last_creation_command,
+            cluster_table.c.config_hash,
+            cluster_table.c.owner,
+            cluster_table.c.metadata,
+            cluster_table.c.last_use,
+            cluster_table.c.status_updated_at,
         ])
+    if not exclude_managed_clusters:
+        query_fields.append(cluster_table.c.is_managed)
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         query = session.query(*query_fields).outerjoin(
             user_table, cluster_table.c.user_hash == user_table.c.id)
@@ -1771,30 +1768,29 @@
             'name': row.name,
             'launched_at': row.launched_at,
             'handle': pickle.loads(row.handle),
-            'last_use': row.last_use,
             'status': status_lib.ClusterStatus[row.status],
             'autostop': row.autostop,
             'to_down': bool(row.to_down),
-            'owner': _load_owner(row.owner),
-            'metadata': json.loads(row.metadata),
             'cluster_hash': row.cluster_hash,
-            'storage_mounts_metadata': _load_storage_mounts_metadata(
-                row.storage_mounts_metadata),
             'cluster_ever_up': bool(row.cluster_ever_up),
-            'status_updated_at': row.status_updated_at,
             'user_hash': (row.user_hash
                           if row.user_hash is not None else current_user_hash),
             'user_name': (row.user_name
                           if row.user_name is not None else current_user_name),
             'workspace': row.workspace,
-            'is_managed': bool(row.is_managed),
-            'config_hash': row.config_hash,
+            'is_managed': False
+                          if exclude_managed_clusters else bool(row.is_managed),
         }
         if not summary_response:
             record['last_creation_yaml'] = row.last_creation_yaml
             record['last_creation_command'] = row.last_creation_command
             record['last_event'] = last_cluster_event_dict.get(
                 row.cluster_hash, None)
+            record['config_hash'] = row.config_hash
+            record['owner'] = _load_owner(row.owner)
+            record['metadata'] = json.loads(row.metadata)
+            record['last_use'] = row.last_use
+            record['status_updated_at'] = row.status_updated_at

         records.append(record)
     return records
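The get_clusters change above defers heavier columns (owner, metadata, last_use, status_updated_at, config_hash) to non-summary responses and only selects is_managed when managed clusters are not excluded. A small, self-contained sketch of that column-pruning pattern, using a hypothetical table rather than the real SkyPilot schema, is:

import sqlalchemy as sa

metadata = sa.MetaData()
# Stand-in table; the real column set lives in sky/global_user_state.py.
clusters = sa.Table('clusters', metadata,
                    sa.Column('name', sa.Text),
                    sa.Column('status', sa.Text),
                    sa.Column('metadata', sa.Text),
                    sa.Column('last_creation_yaml', sa.Text),
                    sa.Column('is_managed', sa.Integer))

def build_query_fields(summary: bool, exclude_managed: bool):
    # Cheap columns are always selected; heavy ones only for full responses.
    fields = [clusters.c.name, clusters.c.status]
    if not summary:
        fields.extend([clusters.c.metadata, clusters.c.last_creation_yaml])
    if not exclude_managed:
        fields.append(clusters.c.is_managed)
    return fields

print([f.name for f in build_query_fields(summary=True, exclude_managed=True)])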
sky/jobs/server/server.py CHANGED
@@ -206,8 +206,8 @@ async def pool_tail_logs(
         request_cluster_name=common.JOB_CONTROLLER_NAME,
     )

-    request_task = api_requests.get_request(request.state.request_id,
-                                            fields=['request_id'])
+    request_task = await api_requests.get_request_async(
+        request.state.request_id, fields=['request_id'])

     return stream_utils.stream_response_for_long_request(
         request_id=request_task.request_id,
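The change above replaces a blocking get_request call with an awaited get_request_async inside an async endpoint, so the request lookup no longer stalls the event loop while streaming logs. A generic sketch of that pattern, with hypothetical function names rather than the SkyPilot implementation, is:

import asyncio

def get_request_blocking(request_id: str) -> dict:
    # Stand-in for a synchronous database read.
    return {'request_id': request_id}

async def get_request_async(request_id: str) -> dict:
    # One common way to expose a blocking read to async callers: run it in a
    # worker thread so the event loop keeps serving other requests.
    return await asyncio.to_thread(get_request_blocking, request_id)

async def handler() -> None:
    request_task = await get_request_async('abc-123')
    print(request_task['request_id'])

asyncio.run(handler())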
sky/jobs/utils.py CHANGED
@@ -1522,12 +1522,11 @@ def get_managed_job_queue(
         handle = cluster_name_to_handle.get(
             cluster_name, None) if cluster_name is not None else None
         if isinstance(handle, backends.CloudVmRayResourceHandle):
-            resources_str = resources_utils.get_readable_resources_repr(
-                handle, simplify=True)
-            resources_str_full = (
-                resources_utils.get_readable_resources_repr(handle,
-                                                            simplify=False))
-            job['cluster_resources'] = resources_str
+            resources_str_simple, resources_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    handle, simplified_only=False))
+            assert resources_str_full is not None
+            job['cluster_resources'] = resources_str_simple
             job['cluster_resources_full'] = resources_str_full
             job['cloud'] = str(handle.launched_resources.cloud)
             job['region'] = handle.launched_resources.region
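Both this call site and the optimizer.py hunk below now expect resources_utils to return a (simplified, full) pair from one call instead of formatting the resources twice. The sketch below shows the contract as inferred from these call sites; the actual implementation is in sky/utils/resources_utils.py, which this diff does not reproduce in full.

from typing import Optional, Tuple

def get_readable_resources_repr_sketch(
        handle, simplified_only: bool = True) -> Tuple[str, Optional[str]]:
    # Inferred contract: always return the simplified string, and the full
    # string only when the caller asks for it (None otherwise).
    simplified = f'{handle}'  # placeholder for the real formatting logic
    full = None if simplified_only else f'{handle} (full details)'
    return simplified, full

# Callers unpack both values at once, as jobs/utils.py does above:
simple, full = get_readable_resources_repr_sketch('1x[CPU:4]',
                                                  simplified_only=False)
assert full is not None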
sky/optimizer.py CHANGED
@@ -1019,7 +1019,7 @@ class Optimizer:
                     if res.instance_type is not None
                 ])
                 candidate_str = resources_utils.format_resource(
-                    best_resources, simplify=True)
+                    best_resources, simplified_only=True)[0]

                 logger.info(
                     f'{colorama.Style.DIM}🔍 Multiple {cloud} instances '
sky/provision/kubernetes/instance.py CHANGED
@@ -33,6 +33,7 @@ from sky.utils.db import db_utils
 POLL_INTERVAL = 2
 _TIMEOUT_FOR_POD_TERMINATION = 60  # 1 minutes
 _MAX_RETRIES = 3
+_MAX_MISSING_PODS_RETRIES = 5
 _NUM_THREADS = subprocess_utils.get_parallel_threads('kubernetes')

 # Pattern to extract SSH user from command output, handling MOTD contamination
@@ -489,17 +490,17 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,


 @timeline.event
-def _wait_for_pods_to_run(namespace, context, new_nodes):
+def _wait_for_pods_to_run(namespace, context, cluster_name, new_pods):
     """Wait for pods and their containers to be ready.

     Pods may be pulling images or may be in the process of container
     creation.
     """
-    if not new_nodes:
+    if not new_pods:
         return

     # Create a set of pod names we're waiting for
-    expected_pod_names = {node.metadata.name for node in new_nodes}
+    expected_pod_names = {pod.metadata.name for pod in new_pods}

     def _check_init_containers(pod):
         # Check if any of the init containers failed
@@ -526,28 +527,62 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
                     'Failed to create init container for pod '
                     f'{pod.metadata.name}. Error details: {msg}.')

+    missing_pods_retry = 0
     while True:
         # Get all pods in a single API call
-        cluster_name = new_nodes[0].metadata.labels[
+        cluster_name_on_cloud = new_pods[0].metadata.labels[
             k8s_constants.TAG_SKYPILOT_CLUSTER_NAME]
         all_pods = kubernetes.core_api(context).list_namespaced_pod(
             namespace,
             label_selector=
-            f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
+            f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
+        ).items

         # Get the set of found pod names and check if we have all expected pods
         found_pod_names = {pod.metadata.name for pod in all_pods}
-        missing_pods = expected_pod_names - found_pod_names
-        if missing_pods:
+        missing_pod_names = expected_pod_names - found_pod_names
+        if missing_pod_names:
+            # In _wait_for_pods_to_schedule, we already wait for all pods to go
+            # from pending to scheduled. So if a pod is missing here, it means
+            # something unusual must have happened, and so should be treated as
+            # an exception.
+            # It is also only in _wait_for_pods_to_schedule that
+            # provision_timeout is used.
+            # TODO(kevin): Should we take provision_timeout into account here,
+            # instead of hardcoding the number of retries?
+            if missing_pods_retry >= _MAX_MISSING_PODS_RETRIES:
+                for pod_name in missing_pod_names:
+                    reason = _get_pod_missing_reason(context, namespace,
                                                     cluster_name, pod_name)
+                    logger.warning(f'Pod {pod_name} missing: {reason}')
+                raise config_lib.KubernetesError(
+                    f'Failed to get all pods after {missing_pods_retry} '
+                    f'retries. Some pods may have been terminated or failed '
+                    f'unexpectedly. Run `sky logs --provision {cluster_name}` '
+                    'for more details.')
             logger.info('Retrying running pods check: '
-                        f'Missing pods: {missing_pods}')
+                        f'Missing pods: {missing_pod_names}')
             time.sleep(0.5)
+            missing_pods_retry += 1
             continue

         all_pods_running = True
         for pod in all_pods:
             if pod.metadata.name not in expected_pod_names:
                 continue
+
+            # Check if pod is terminated/preempted/failed.
+            if (pod.metadata.deletion_timestamp is not None or
+                    pod.status.phase == 'Failed'):
+                # Get the reason and write to cluster events before
+                # the pod gets completely deleted from the API.
+                reason = _get_pod_termination_reason(pod, cluster_name)
+                logger.warning(f'Pod {pod.metadata.name} terminated: {reason}')
+                raise config_lib.KubernetesError(
+                    f'Pod {pod.metadata.name} has terminated or failed '
+                    f'unexpectedly. Run `sky logs --provision {cluster_name}` '
+                    'for more details.')
+
             # Continue if pod and all the containers within the
             # pod are successfully created and running.
             if pod.status.phase == 'Running' and all(
@@ -1169,7 +1204,7 @@ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
     # fail early if there is an error
     logger.debug(f'run_instances: waiting for pods to be running (pulling '
                  f'images): {[pod.metadata.name for pod in pods]}')
-    _wait_for_pods_to_run(namespace, context, pods)
+    _wait_for_pods_to_run(namespace, context, cluster_name, pods)
     logger.debug(f'run_instances: all pods are scheduled and running: '
                  f'{[pod.metadata.name for pod in pods]}')

@@ -1428,9 +1463,45 @@ def get_cluster_info(


 def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
-    """Get pod termination reason and write to cluster events."""
-    reasons = []
+    """Get pod termination reason and write to cluster events.
+
+    Checks both pod conditions (for preemption/disruption) and
+    container statuses (for exit codes/errors).
+    """
     latest_timestamp = pod.status.start_time or datetime.datetime.min
+    ready_state = 'Unknown'
+    termination_reason = 'Terminated unexpectedly'
+    container_reasons = []
+
+    # Check pod status conditions for high level overview.
+    # No need to sort, as each condition.type will only appear once.
+    for condition in pod.status.conditions:
+        reason = condition.reason or 'Unknown reason'
+        message = condition.message or ''
+
+        # Get last known readiness state.
+        if condition.type == 'Ready':
+            ready_state = f'{reason} ({message})' if message else reason
+        # Kueue preemption, as defined in:
+        # https://pkg.go.dev/sigs.k8s.io/kueue/pkg/controller/jobs/pod#pkg-constants
+        elif condition.type == 'TerminationTarget':
+            termination_reason = f'Preempted by Kueue: {reason}'
+            if message:
+                termination_reason += f' ({message})'
+        # Generic disruption.
+        elif condition.type == 'DisruptionTarget':
+            termination_reason = f'Disrupted: {reason}'
+            if message:
+                termination_reason += f' ({message})'
+
+        if condition.last_transition_time is not None:
+            latest_timestamp = max(latest_timestamp,
+                                   condition.last_transition_time)
+
+    pod_reason = (f'{termination_reason}.\n'
+                  f'Last known state: {ready_state}.')
+
+    # Check container statuses for exit codes/errors
     if pod.status and pod.status.container_statuses:
         for container_status in pod.status.container_statuses:
             terminated = container_status.state.terminated
@@ -1445,18 +1516,15 @@ def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
                 if reason is None:
                     # just in-case reason is None, have default for debugging
                     reason = f'exit({exit_code})'
-                reasons.append(reason)
-                if terminated.finished_at > latest_timestamp:
-                    latest_timestamp = terminated.finished_at
+                container_reasons.append(reason)
+                latest_timestamp = max(latest_timestamp, terminated.finished_at)

     # TODO (kyuds): later, if needed, query `last_state` too.

-    if not reasons:
-        return ''
-
     # Normally we will have a single container per pod for skypilot
     # but doing this just in-case there are multiple containers.
-    pod_reason = ' | '.join(reasons)
+    if container_reasons:
+        pod_reason += f'\nContainer errors: {" | ".join(container_reasons)}'

     global_user_state.add_cluster_event(
         cluster_name,
@@ -1658,9 +1726,10 @@ def query_instances(
                     Optional[str]]] = {}
     for pod in pods:
         phase = pod.status.phase
+        is_terminating = pod.metadata.deletion_timestamp is not None
         pod_status = status_map[phase]
         reason = None
-        if phase in ('Failed', 'Unknown'):
+        if phase in ('Failed', 'Unknown') or is_terminating:
             reason = _get_pod_termination_reason(pod, cluster_name)
             logger.debug(f'Pod Status ({phase}) Reason(s): {reason}')
         if non_terminated_only and pod_status is None:
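The new termination-reason logic keys on pod conditions such as Ready, Kueue's TerminationTarget, and DisruptionTarget. To look at those fields for a live pod with the official Kubernetes Python client (assuming a reachable cluster; pod and namespace names are illustrative), a quick sketch:

from kubernetes import client, config

config.load_kube_config()  # or config.load_incluster_config() inside a pod
v1 = client.CoreV1Api()
pod = v1.read_namespaced_pod(name='my-pod', namespace='default')
for cond in pod.status.conditions or []:
    # TerminationTarget / DisruptionTarget appear here when a pod is preempted
    # or disrupted; Ready reflects the last known readiness state.
    print(cond.type, cond.reason, cond.message)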
sky/provision/kubernetes/volume.py CHANGED
@@ -75,7 +75,6 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
     """Deletes a volume."""
     context, namespace = _get_context_namespace(config)
     pvc_name = config.name_on_cloud
-    logger.info(f'Deleting PVC {pvc_name}')
     kubernetes_utils.delete_k8s_resource_with_retry(
         delete_func=lambda pvc_name=pvc_name: kubernetes.core_api(
             context).delete_namespaced_persistent_volume_claim(
@@ -84,6 +83,7 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
             _request_timeout=config_lib.DELETION_TIMEOUT),
         resource_type='pvc',
         resource_name=pvc_name)
+    logger.info(f'Deleted PVC {pvc_name} in namespace {namespace}')
     return config


@@ -242,9 +242,9 @@ def create_persistent_volume_claim(namespace: str, context: Optional[str],
     except kubernetes.api_exception() as e:
         if e.status != 404:  # Not found
             raise
-    logger.info(f'Creating PVC {pvc_name}')
     kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
         namespace=namespace, body=pvc_spec)
+    logger.info(f'Created PVC {pvc_name} in namespace {namespace}')


 def _get_pvc_spec(namespace: str,