skypilot-nightly 1.0.0.dev20250215__py3-none-any.whl → 1.0.0.dev20250217__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172)
  1. sky/__init__.py +48 -22
  2. sky/adaptors/aws.py +2 -1
  3. sky/adaptors/azure.py +4 -4
  4. sky/adaptors/cloudflare.py +4 -4
  5. sky/adaptors/kubernetes.py +8 -8
  6. sky/authentication.py +42 -45
  7. sky/backends/backend.py +2 -2
  8. sky/backends/backend_utils.py +108 -221
  9. sky/backends/cloud_vm_ray_backend.py +283 -282
  10. sky/benchmark/benchmark_utils.py +6 -2
  11. sky/check.py +40 -28
  12. sky/cli.py +1213 -1116
  13. sky/client/__init__.py +1 -0
  14. sky/client/cli.py +5644 -0
  15. sky/client/common.py +345 -0
  16. sky/client/sdk.py +1757 -0
  17. sky/cloud_stores.py +12 -6
  18. sky/clouds/__init__.py +0 -2
  19. sky/clouds/aws.py +20 -13
  20. sky/clouds/azure.py +5 -3
  21. sky/clouds/cloud.py +1 -1
  22. sky/clouds/cudo.py +2 -1
  23. sky/clouds/do.py +2 -1
  24. sky/clouds/fluidstack.py +3 -2
  25. sky/clouds/gcp.py +10 -8
  26. sky/clouds/ibm.py +8 -7
  27. sky/clouds/kubernetes.py +7 -6
  28. sky/clouds/lambda_cloud.py +8 -7
  29. sky/clouds/oci.py +4 -3
  30. sky/clouds/paperspace.py +2 -1
  31. sky/clouds/runpod.py +2 -1
  32. sky/clouds/scp.py +8 -7
  33. sky/clouds/service_catalog/__init__.py +3 -3
  34. sky/clouds/service_catalog/aws_catalog.py +7 -1
  35. sky/clouds/service_catalog/common.py +4 -2
  36. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +2 -2
  37. sky/clouds/utils/oci_utils.py +1 -1
  38. sky/clouds/vast.py +2 -1
  39. sky/clouds/vsphere.py +2 -1
  40. sky/core.py +263 -99
  41. sky/dag.py +4 -0
  42. sky/data/mounting_utils.py +2 -1
  43. sky/data/storage.py +97 -35
  44. sky/data/storage_utils.py +69 -9
  45. sky/exceptions.py +138 -5
  46. sky/execution.py +47 -50
  47. sky/global_user_state.py +105 -22
  48. sky/jobs/__init__.py +12 -14
  49. sky/jobs/client/__init__.py +0 -0
  50. sky/jobs/client/sdk.py +296 -0
  51. sky/jobs/constants.py +30 -1
  52. sky/jobs/controller.py +12 -6
  53. sky/jobs/dashboard/dashboard.py +2 -6
  54. sky/jobs/recovery_strategy.py +22 -29
  55. sky/jobs/server/__init__.py +1 -0
  56. sky/jobs/{core.py → server/core.py} +101 -34
  57. sky/jobs/server/dashboard_utils.py +64 -0
  58. sky/jobs/server/server.py +182 -0
  59. sky/jobs/utils.py +32 -23
  60. sky/models.py +27 -0
  61. sky/optimizer.py +9 -11
  62. sky/provision/__init__.py +6 -3
  63. sky/provision/aws/config.py +2 -2
  64. sky/provision/aws/instance.py +1 -1
  65. sky/provision/azure/instance.py +1 -1
  66. sky/provision/cudo/instance.py +1 -1
  67. sky/provision/do/instance.py +1 -1
  68. sky/provision/do/utils.py +0 -5
  69. sky/provision/fluidstack/fluidstack_utils.py +4 -3
  70. sky/provision/fluidstack/instance.py +4 -2
  71. sky/provision/gcp/instance.py +1 -1
  72. sky/provision/instance_setup.py +2 -2
  73. sky/provision/kubernetes/constants.py +8 -0
  74. sky/provision/kubernetes/instance.py +1 -1
  75. sky/provision/kubernetes/utils.py +67 -76
  76. sky/provision/lambda_cloud/instance.py +3 -15
  77. sky/provision/logging.py +1 -1
  78. sky/provision/oci/instance.py +7 -4
  79. sky/provision/paperspace/instance.py +1 -1
  80. sky/provision/provisioner.py +3 -2
  81. sky/provision/runpod/instance.py +1 -1
  82. sky/provision/vast/instance.py +1 -1
  83. sky/provision/vast/utils.py +2 -1
  84. sky/provision/vsphere/instance.py +2 -11
  85. sky/resources.py +55 -40
  86. sky/serve/__init__.py +6 -10
  87. sky/serve/client/__init__.py +0 -0
  88. sky/serve/client/sdk.py +366 -0
  89. sky/serve/constants.py +3 -0
  90. sky/serve/replica_managers.py +10 -10
  91. sky/serve/serve_utils.py +56 -36
  92. sky/serve/server/__init__.py +0 -0
  93. sky/serve/{core.py → server/core.py} +37 -17
  94. sky/serve/server/server.py +117 -0
  95. sky/serve/service.py +8 -1
  96. sky/server/__init__.py +1 -0
  97. sky/server/common.py +441 -0
  98. sky/server/constants.py +21 -0
  99. sky/server/html/log.html +174 -0
  100. sky/server/requests/__init__.py +0 -0
  101. sky/server/requests/executor.py +462 -0
  102. sky/server/requests/payloads.py +481 -0
  103. sky/server/requests/queues/__init__.py +0 -0
  104. sky/server/requests/queues/mp_queue.py +76 -0
  105. sky/server/requests/requests.py +567 -0
  106. sky/server/requests/serializers/__init__.py +0 -0
  107. sky/server/requests/serializers/decoders.py +192 -0
  108. sky/server/requests/serializers/encoders.py +166 -0
  109. sky/server/server.py +1095 -0
  110. sky/server/stream_utils.py +144 -0
  111. sky/setup_files/MANIFEST.in +1 -0
  112. sky/setup_files/dependencies.py +12 -4
  113. sky/setup_files/setup.py +1 -1
  114. sky/sky_logging.py +9 -13
  115. sky/skylet/autostop_lib.py +2 -2
  116. sky/skylet/constants.py +46 -12
  117. sky/skylet/events.py +5 -6
  118. sky/skylet/job_lib.py +78 -66
  119. sky/skylet/log_lib.py +17 -11
  120. sky/skypilot_config.py +79 -94
  121. sky/task.py +119 -73
  122. sky/templates/aws-ray.yml.j2 +4 -4
  123. sky/templates/azure-ray.yml.j2 +3 -2
  124. sky/templates/cudo-ray.yml.j2 +3 -2
  125. sky/templates/fluidstack-ray.yml.j2 +3 -2
  126. sky/templates/gcp-ray.yml.j2 +3 -2
  127. sky/templates/ibm-ray.yml.j2 +3 -2
  128. sky/templates/jobs-controller.yaml.j2 +1 -12
  129. sky/templates/kubernetes-ray.yml.j2 +3 -2
  130. sky/templates/lambda-ray.yml.j2 +3 -2
  131. sky/templates/oci-ray.yml.j2 +3 -2
  132. sky/templates/paperspace-ray.yml.j2 +3 -2
  133. sky/templates/runpod-ray.yml.j2 +3 -2
  134. sky/templates/scp-ray.yml.j2 +3 -2
  135. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  136. sky/templates/vsphere-ray.yml.j2 +4 -2
  137. sky/templates/websocket_proxy.py +64 -0
  138. sky/usage/constants.py +8 -0
  139. sky/usage/usage_lib.py +45 -11
  140. sky/utils/accelerator_registry.py +33 -53
  141. sky/utils/admin_policy_utils.py +2 -1
  142. sky/utils/annotations.py +51 -0
  143. sky/utils/cli_utils/status_utils.py +33 -3
  144. sky/utils/cluster_utils.py +356 -0
  145. sky/utils/command_runner.py +69 -14
  146. sky/utils/common.py +74 -0
  147. sky/utils/common_utils.py +133 -93
  148. sky/utils/config_utils.py +204 -0
  149. sky/utils/control_master_utils.py +2 -3
  150. sky/utils/controller_utils.py +133 -147
  151. sky/utils/dag_utils.py +72 -24
  152. sky/utils/kubernetes/deploy_remote_cluster.sh +2 -2
  153. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  154. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  155. sky/utils/log_utils.py +83 -23
  156. sky/utils/message_utils.py +81 -0
  157. sky/utils/registry.py +127 -0
  158. sky/utils/resources_utils.py +2 -2
  159. sky/utils/rich_utils.py +213 -34
  160. sky/utils/schemas.py +19 -2
  161. sky/{status_lib.py → utils/status_lib.py} +12 -7
  162. sky/utils/subprocess_utils.py +51 -35
  163. sky/utils/timeline.py +7 -2
  164. sky/utils/ux_utils.py +95 -25
  165. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/METADATA +8 -3
  166. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/RECORD +170 -132
  167. sky/clouds/cloud_registry.py +0 -76
  168. sky/utils/cluster_yaml_utils.py +0 -24
  169. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/LICENSE +0 -0
  170. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/WHEEL +0 -0
  171. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/entry_points.txt +0 -0
  172. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/top_level.txt +0 -0
sky/data/storage.py CHANGED
@@ -18,7 +18,6 @@ from sky import exceptions
18
18
  from sky import global_user_state
19
19
  from sky import sky_logging
20
20
  from sky import skypilot_config
21
- from sky import status_lib
22
21
  from sky.adaptors import aws
23
22
  from sky.adaptors import azure
24
23
  from sky.adaptors import cloudflare
@@ -34,6 +33,7 @@ from sky.skylet import constants
34
33
  from sky.utils import common_utils
35
34
  from sky.utils import rich_utils
36
35
  from sky.utils import schemas
36
+ from sky.utils import status_lib
37
37
  from sky.utils import ux_utils
38
38
 
39
39
  if typing.TYPE_CHECKING:
@@ -203,9 +203,8 @@ class StoreType(enum.Enum):
203
203
  @classmethod
204
204
  def get_fields_from_store_url(
205
205
  cls, store_url: str
206
- ) -> Tuple['StoreType', Type['AbstractStore'], str, str, Optional[str],
207
- Optional[str]]:
208
- """Returns the store type, store class, bucket name, and sub path from
206
+ ) -> Tuple['StoreType', str, str, Optional[str], Optional[str]]:
207
+ """Returns the store type, bucket name, and sub path from
209
208
  a store URL, and the storage account name and region if applicable.
210
209
 
211
210
  Args:
@@ -221,21 +220,16 @@ class StoreType(enum.Enum):
221
220
  if store_type == StoreType.AZURE:
222
221
  storage_account_name, bucket_name, sub_path = \
223
222
  data_utils.split_az_path(store_url)
224
- store_cls: Type['AbstractStore'] = AzureBlobStore
225
223
  elif store_type == StoreType.IBM:
226
224
  bucket_name, sub_path, region = data_utils.split_cos_path(
227
225
  store_url)
228
- store_cls = IBMCosStore
229
226
  elif store_type == StoreType.R2:
230
227
  bucket_name, sub_path = data_utils.split_r2_path(store_url)
231
- store_cls = R2Store
232
228
  elif store_type == StoreType.GCS:
233
229
  bucket_name, sub_path = data_utils.split_gcs_path(store_url)
234
- store_cls = GcsStore
235
230
  elif store_type == StoreType.S3:
236
231
  bucket_name, sub_path = data_utils.split_s3_path(store_url)
237
- store_cls = S3Store
238
- return store_type, store_cls,bucket_name, \
232
+ return store_type, bucket_name, \
239
233
  sub_path, storage_account_name, region
240
234
  raise ValueError(f'Unknown store URL: {store_url}')
241
235
 
@@ -546,7 +540,7 @@ class Storage(object):
546
540
  self,
547
541
  name: Optional[str] = None,
548
542
  source: Optional[SourceType] = None,
549
- stores: Optional[Dict[StoreType, AbstractStore]] = None,
543
+ stores: Optional[List[StoreType]] = None,
550
544
  persistent: Optional[bool] = True,
551
545
  mode: StorageMode = StorageMode.MOUNT,
552
546
  sync_on_reconstruction: bool = True,
@@ -605,26 +599,47 @@ class Storage(object):
605
599
  _bucket_sub_path: Optional[str]; The subdirectory to use for the
606
600
  storage object.
607
601
  """
608
- self.name: str
602
+ self.name = name
609
603
  self.source = source
610
604
  self.persistent = persistent
611
605
  self.mode = mode
612
606
  assert mode in StorageMode
607
+ self.stores: Dict[StoreType, Optional[AbstractStore]] = {}
608
+ if stores is not None:
609
+ for store in stores:
610
+ self.stores[store] = None
613
611
  self.sync_on_reconstruction = sync_on_reconstruction
614
612
  self._is_sky_managed = _is_sky_managed
615
613
  self._bucket_sub_path = _bucket_sub_path
616
614
 
615
+ self._constructed = False
617
616
  # TODO(romilb, zhwu): This is a workaround to support storage deletion
618
- # for spot. Once sky storage supports forced management for external
619
- # buckets, this can be deprecated.
617
+ # for managed jobs. Once sky storage supports forced management for
618
+ # external buckets, this can be deprecated.
620
619
  self.force_delete = False
621
620
 
622
- # Validate and correct inputs if necessary
623
- self._validate_storage_spec(name)
621
+ def construct(self):
622
+ """Constructs the storage object.
623
+
624
+ The Storage object is lazily initialized, so that when a user
625
+ initializes a Storage object on client side, it does not trigger the
626
+ actual storage creation on the client side.
624
627
 
625
- # Sky optimizer either adds a storage object instance or selects
626
- # from existing ones
627
- self.stores = {} if stores is None else stores
628
+ Instead, once the specification of the storage object is uploaded to the
629
+ SkyPilot API server side, the server side should use this construct()
630
+ method to actually create the storage object. The construct() method
631
+ will:
632
+
633
+ 1. Set the stores field if not specified
634
+ 2. Create the bucket or check the existence of the bucket
635
+ 3. Sync the data from the source to the bucket if necessary
636
+ """
637
+ if self._constructed:
638
+ return
639
+ self._constructed = True
640
+
641
+ # Validate and correct inputs if necessary
642
+ self._validate_storage_spec(self.name)
628
643
 
629
644
  # Logic to rebuild Storage if it is in global user state
630
645
  handle = global_user_state.get_handle_from_storage_name(self.name)
@@ -635,6 +650,28 @@ class Storage(object):
635
650
  f'loading Storage: {self.name}')
636
651
  self._add_store_from_metadata(self.handle.sky_stores)
637
652
 
653
+ # When a storage object is reconstructed from global_user_state,
654
+ # the user may have specified a new store type in the yaml file that
655
+ # was not used with the storage object. We should error out in this
656
+ # case, as we don't support having multiple stores for the same
657
+ # storage object.
658
+ if any(s is None for s in self.stores.values()):
659
+ new_store_type = None
660
+ previous_store_type = None
661
+ for store_type, store in self.stores.items():
662
+ if store is not None:
663
+ previous_store_type = store_type
664
+ else:
665
+ new_store_type = store_type
666
+ with ux_utils.print_exception_no_traceback():
667
+ raise exceptions.StorageBucketCreateError(
668
+ f'Bucket {self.name} was previously created for '
669
+ f'{previous_store_type.value.lower()!r}, but a new '
670
+ f'store type {new_store_type.value.lower()!r} is '
671
+ 'requested. This is not supported yet. Please specify '
672
+ 'the same store type: '
673
+ f'{previous_store_type.value.lower()!r}.')
674
+
638
675
  # TODO(romilb): This logic should likely be in add_store to move
639
676
  # syncing to file_mount stage..
640
677
  if self.sync_on_reconstruction:
@@ -648,15 +685,16 @@ class Storage(object):
648
685
 
649
686
  else:
650
687
  # Storage does not exist in global_user_state, create new stores
651
- sky_managed_stores = {
652
- t: s.get_metadata()
653
- for t, s in self.stores.items()
654
- if s.is_sky_managed
655
- }
688
+ # Sky optimizer either adds a storage object instance or selects
689
+ # from existing ones
690
+ input_stores = self.stores
691
+ self.stores = {}
656
692
  self.handle = self.StorageMetadata(storage_name=self.name,
657
693
  source=self.source,
658
- mode=self.mode,
659
- sky_stores=sky_managed_stores)
694
+ mode=self.mode)
695
+
696
+ for store in input_stores:
697
+ self.add_store(store)
660
698
 
661
699
  if self.source is not None:
662
700
  # If source is a pre-existing bucket, connect to the bucket
@@ -987,10 +1025,13 @@ class Storage(object):
987
1025
  region: str; Region to place the bucket in. Caller must ensure that
988
1026
  the region is valid for the chosen store_type.
989
1027
  """
1028
+ assert self._constructed, self
1029
+ assert self.name is not None, self
1030
+
990
1031
  if isinstance(store_type, str):
991
1032
  store_type = StoreType(store_type)
992
1033
 
993
- if store_type in self.stores:
1034
+ if self.stores.get(store_type) is not None:
994
1035
  if store_type == StoreType.AZURE:
995
1036
  azure_store_obj = self.stores[store_type]
996
1037
  assert isinstance(azure_store_obj, AzureBlobStore)
@@ -999,8 +1040,9 @@ class Storage(object):
999
1040
  f'storage account {storage_account_name!r}.')
1000
1041
  else:
1001
1042
  logger.info(f'Storage type {store_type} already exists.')
1002
-
1003
- return self.stores[store_type]
1043
+ store = self.stores[store_type]
1044
+ assert store is not None, self
1045
+ return store
1004
1046
 
1005
1047
  store_cls: Type[AbstractStore]
1006
1048
  if store_type == StoreType.S3:
@@ -1068,6 +1110,7 @@ class Storage(object):
1068
1110
  if store.is_sky_managed:
1069
1111
  self.handle.add_store(store)
1070
1112
  if not is_reconstructed:
1113
+ assert self.name is not None, self
1071
1114
  global_user_state.add_or_update_storage(self.name, self.handle,
1072
1115
  StorageStatus.INIT)
1073
1116
 
@@ -1083,17 +1126,26 @@ class Storage(object):
1083
1126
  """
1084
1127
  if not self.stores:
1085
1128
  logger.info('No backing stores found. Deleting storage.')
1129
+ assert self.name is not None
1086
1130
  global_user_state.remove_storage(self.name)
1087
1131
  if store_type is not None:
1132
+ assert self.name is not None
1088
1133
  store = self.stores[store_type]
1134
+ assert store is not None, self
1135
+ is_sky_managed = store.is_sky_managed
1089
1136
  # We delete a store from the cloud if it's sky managed. Else just
1090
1137
  # remove handle and return
1091
- if store.is_sky_managed:
1138
+ if is_sky_managed:
1092
1139
  self.handle.remove_store(store)
1093
1140
  store.delete()
1094
1141
  # Check remaining stores - if none is sky managed, remove
1095
1142
  # the storage from global_user_state.
1096
- delete = all(not s.is_sky_managed for s in self.stores.values())
1143
+ delete = True
1144
+ for store in self.stores.values():
1145
+ assert store is not None, self
1146
+ if store.is_sky_managed:
1147
+ delete = False
1148
+ break
1097
1149
  if delete:
1098
1150
  global_user_state.remove_storage(self.name)
1099
1151
  else:
@@ -1104,6 +1156,7 @@ class Storage(object):
1104
1156
  del self.stores[store_type]
1105
1157
  else:
1106
1158
  for _, store in self.stores.items():
1159
+ assert store is not None, self
1107
1160
  if store.is_sky_managed:
1108
1161
  self.handle.remove_store(store)
1109
1162
  store.delete()
@@ -1111,15 +1164,19 @@ class Storage(object):
1111
1164
  store.delete()
1112
1165
  self.stores = {}
1113
1166
  # Remove storage from global_user_state if present
1114
- global_user_state.remove_storage(self.name)
1167
+ if self.name is not None:
1168
+ global_user_state.remove_storage(self.name)
1115
1169
 
1116
1170
  def sync_all_stores(self):
1117
1171
  """Syncs the source and destinations of all stores in the Storage"""
1118
1172
  for _, store in self.stores.items():
1173
+ assert store is not None, self
1119
1174
  self._sync_store(store)
1120
1175
 
1121
1176
  def _sync_store(self, store: AbstractStore):
1122
1177
  """Runs the upload routine for the store and handles failures"""
1178
+ assert self._constructed, self
1179
+ assert self.name is not None, self
1123
1180
 
1124
1181
  def warn_for_git_dir(source: str):
1125
1182
  if os.path.isdir(os.path.join(source, '.git')):
@@ -1175,14 +1232,17 @@ class Storage(object):
1175
1232
  assert not config, f'Invalid storage args: {config.keys()}'
1176
1233
 
1177
1234
  # Validation of the config object happens on instantiation.
1235
+ if store is not None:
1236
+ stores = [StoreType(store.upper())]
1237
+ else:
1238
+ stores = None
1178
1239
  storage_obj = cls(name=name,
1179
1240
  source=source,
1180
1241
  persistent=persistent,
1181
1242
  mode=mode,
1243
+ stores=stores,
1182
1244
  _is_sky_managed=_is_sky_managed,
1183
1245
  _bucket_sub_path=_bucket_sub_path)
1184
- if store is not None:
1185
- storage_obj.add_store(StoreType(store.upper()))
1186
1246
 
1187
1247
  # Add force deletion flag
1188
1248
  storage_obj.force_delete = force_delete
@@ -1207,7 +1267,9 @@ class Storage(object):
1207
1267
  is_sky_managed = self._is_sky_managed
1208
1268
  if self.stores:
1209
1269
  stores = ','.join([store.value for store in self.stores])
1210
- is_sky_managed = list(self.stores.values())[0].is_sky_managed
1270
+ store = list(self.stores.values())[0]
1271
+ if store is not None:
1272
+ is_sky_managed = store.is_sky_managed
1211
1273
  add_if_not_none('store', stores)
1212
1274
  add_if_not_none('_is_sky_managed', is_sky_managed)
1213
1275
  add_if_not_none('persistent', self.persistent)
sky/data/storage_utils.py CHANGED
@@ -1,9 +1,12 @@
1
1
  """Utility functions for the storage module."""
2
2
  import glob
3
3
  import os
4
+ import pathlib
4
5
  import shlex
5
6
  import subprocess
6
- from typing import Any, Dict, List
7
+ from typing import Any, Dict, List, Optional, TextIO, Union
8
+ import warnings
9
+ import zipfile
7
10
 
8
11
  import colorama
9
12
 
@@ -213,13 +216,70 @@ def get_excluded_files(src_dir_path: str) -> List[str]:
213
216
  skyignore_path = os.path.join(expand_src_dir_path,
214
217
  constants.SKY_IGNORE_FILE)
215
218
  if os.path.exists(skyignore_path):
216
- logger.info(f' {colorama.Style.DIM}'
217
- f'Excluded files to sync to cluster based on '
218
- f'{constants.SKY_IGNORE_FILE}.'
219
- f'{colorama.Style.RESET_ALL}')
219
+ logger.debug(f' {colorama.Style.DIM}'
220
+ f'Excluded files to sync to cluster based on '
221
+ f'{constants.SKY_IGNORE_FILE}.'
222
+ f'{colorama.Style.RESET_ALL}')
220
223
  return get_excluded_files_from_skyignore(src_dir_path)
221
- logger.info(f' {colorama.Style.DIM}'
222
- f'Excluded files to sync to cluster based on '
223
- f'{constants.GIT_IGNORE_FILE}.'
224
- f'{colorama.Style.RESET_ALL}')
224
+ logger.debug(f' {colorama.Style.DIM}'
225
+ f'Excluded files to sync to cluster based on '
226
+ f'{constants.GIT_IGNORE_FILE}.'
227
+ f'{colorama.Style.RESET_ALL}')
225
228
  return get_excluded_files_from_gitignore(src_dir_path)
229
+
230
+
231
+ def zip_files_and_folders(items: List[str],
232
+ output_file: Union[str, pathlib.Path],
233
+ log_file: Optional[TextIO] = None):
234
+
235
+ def _store_symlink(zipf, path: str, is_dir: bool):
236
+ # Get the target of the symlink
237
+ target = os.readlink(path)
238
+ # Use relative path as absolute path will not be able to resolve on
239
+ # remote API server.
240
+ if os.path.isabs(target):
241
+ target = os.path.relpath(target, os.path.dirname(path))
242
+ # Create a ZipInfo instance
243
+ zi = zipfile.ZipInfo(path + '/') if is_dir else zipfile.ZipInfo(path)
244
+ # Set external attributes to mark as symlink
245
+ zi.external_attr = 0xA1ED0000
246
+ # Write symlink target as content
247
+ zipf.writestr(zi, target)
248
+
249
+ with warnings.catch_warnings():
250
+ warnings.filterwarnings('ignore',
251
+ category=UserWarning,
252
+ message='Duplicate name:')
253
+ with zipfile.ZipFile(output_file, 'w') as zipf:
254
+ for item in items:
255
+ item = os.path.expanduser(item)
256
+ if not os.path.isfile(item) and not os.path.isdir(item):
257
+ raise ValueError(f'{item} does not exist.')
258
+ excluded_files = set(
259
+ [os.path.join(item, f) for f in get_excluded_files(item)])
260
+ if os.path.isfile(item) and item not in excluded_files:
261
+ zipf.write(item)
262
+ elif os.path.isdir(item):
263
+ for root, dirs, files in os.walk(item, followlinks=False):
264
+ # Store directory entries (important for empty
265
+ # directories)
266
+ for dir_name in dirs:
267
+ dir_path = os.path.join(root, dir_name)
268
+ if dir_path in excluded_files:
269
+ continue
270
+ # If it's a symlink, store it as a symlink
271
+ if os.path.islink(dir_path):
272
+ _store_symlink(zipf, dir_path, is_dir=True)
273
+ else:
274
+ zipf.write(dir_path)
275
+
276
+ for file in files:
277
+ file_path = os.path.join(root, file)
278
+ if file_path in excluded_files:
279
+ continue
280
+ if os.path.islink(file_path):
281
+ _store_symlink(zipf, file_path, is_dir=False)
282
+ else:
283
+ zipf.write(file_path)
284
+ if log_file is not None:
285
+ log_file.write(f'Zipped {item}\n')
sky/exceptions.py CHANGED
@@ -1,13 +1,16 @@
1
1
  """Exceptions."""
2
+ import builtins
2
3
  import enum
4
+ import traceback
5
+ import types
3
6
  import typing
4
- from typing import List, Optional, Sequence
7
+ from typing import Any, Dict, List, Optional, Sequence
5
8
 
6
9
  from sky.utils import env_options
7
10
 
8
11
  if typing.TYPE_CHECKING:
9
- from sky import status_lib
10
12
  from sky.backends import backend
13
+ from sky.utils import status_lib
11
14
 
12
15
  # Return code for keyboard interruption and SIGTSTP
13
16
  KEYBOARD_INTERRUPT_CODE = 130
@@ -21,6 +24,107 @@ INSUFFICIENT_PRIVILEGES_CODE = 52
21
24
  GIT_FATAL_EXIT_CODE = 128
22
25
 
23
26
 
27
+ def is_safe_exception(exc: Exception) -> bool:
28
+ """Returns True if the exception is safe to send to clients.
29
+
30
+ Safe exceptions are:
31
+ 1. Built-in exceptions
32
+ 2. SkyPilot's own exceptions
33
+ """
34
+ module = type(exc).__module__
35
+
36
+ # Builtin exceptions (e.g., ValueError, RuntimeError)
37
+ if module == 'builtins':
38
+ return True
39
+
40
+ # SkyPilot exceptions
41
+ if module.startswith('sky.'):
42
+ return True
43
+
44
+ return False
45
+
46
+
47
+ def wrap_exception(exc: Exception) -> Exception:
48
+ """Wraps non-safe exceptions into SkyPilot exceptions
49
+
50
+ This is used to wrap exceptions that are not safe to deserialize at clients.
51
+
52
+ Examples include exceptions from cloud providers whose packages are not
53
+ available at clients.
54
+ """
55
+ if is_safe_exception(exc):
56
+ return exc
57
+
58
+ return CloudError(message=str(exc),
59
+ cloud_provider=type(exc).__module__.split('.')[0],
60
+ error_type=type(exc).__name__)
61
+
62
+
63
+ def serialize_exception(e: Exception) -> Dict[str, Any]:
64
+ """Serialize the exception.
65
+
66
+ This function also wraps any unsafe exceptions (e.g., cloud exceptions)
67
+ into SkyPilot's CloudError before serialization to ensure clients can
68
+ deserialize them without needing cloud provider packages installed.
69
+ """
70
+ # Wrap unsafe exceptions before serialization
71
+ e = wrap_exception(e)
72
+
73
+ stacktrace = getattr(e, 'stacktrace', None)
74
+ attributes = e.__dict__.copy()
75
+ if 'stacktrace' in attributes:
76
+ del attributes['stacktrace']
77
+ for attr_k in list(attributes.keys()):
78
+ attr_v = attributes[attr_k]
79
+ if isinstance(attr_v, types.TracebackType):
80
+ attributes[attr_k] = traceback.format_tb(attr_v)
81
+
82
+ data = {
83
+ 'type': e.__class__.__name__,
84
+ 'message': str(e),
85
+ 'args': e.args,
86
+ 'attributes': attributes,
87
+ 'stacktrace': stacktrace,
88
+ }
89
+ if isinstance(e, SkyPilotExcludeArgsBaseException):
90
+ data['args'] = tuple()
91
+ return data
92
+
93
+
94
+ def deserialize_exception(serialized: Dict[str, Any]) -> Exception:
95
+ """Deserialize the exception."""
96
+ exception_type = serialized['type']
97
+ if hasattr(builtins, exception_type):
98
+ exception_class = getattr(builtins, exception_type)
99
+ else:
100
+ exception_class = globals().get(exception_type, None)
101
+ if exception_class is None:
102
+ # Unknown exception type.
103
+ return Exception(f'{exception_type}: {serialized["message"]}')
104
+ e = exception_class(*serialized['args'], **serialized['attributes'])
105
+ if serialized['stacktrace'] is not None:
106
+ setattr(e, 'stacktrace', serialized['stacktrace'])
107
+ return e
108
+
109
+
110
+ class CloudError(Exception):
111
+ """Wraps cloud-specific errors into a SkyPilot exception."""
112
+
113
+ def __init__(self, message: str, cloud_provider: str, error_type: str):
114
+ super().__init__(message)
115
+ self.cloud_provider = cloud_provider
116
+ self.error_type = error_type
117
+
118
+ def __str__(self):
119
+ return (f'{self.cloud_provider} error ({self.error_type}): '
120
+ f'{super().__str__()}')
121
+
122
+
123
+ class InvalidSkyPilotConfigError(ValueError):
124
+ """Raised when the SkyPilot config is invalid."""
125
+ pass
126
+
127
+
24
128
  class ResourcesUnavailableError(Exception):
25
129
  """Raised when resources are unavailable.
26
130
 
@@ -94,7 +198,21 @@ class ResourcesMismatchError(Exception):
94
198
  pass
95
199
 
96
200
 
97
- class CommandError(Exception):
201
+ class SkyPilotExcludeArgsBaseException(Exception):
202
+ """Base class for exceptions that don't need args while serialization.
203
+
204
+ Due to our serialization/deserialization logic, when an exception does
205
+ not take `args` as an argument in __init__, `args` should not be included
206
+ in the serialized exception.
207
+
208
+ This is useful when an exception needs to construct the error message based
209
+ on the arguments passed in instead of directly having the error message as
210
+ the first argument in __init__. Refer to `CommandError` for an example.
211
+ """
212
+ pass
213
+
214
+
215
+ class CommandError(SkyPilotExcludeArgsBaseException):
98
216
  """Raised when a command fails.
99
217
 
100
218
  Args:
@@ -128,7 +246,7 @@ class ClusterNotUpError(Exception):
128
246
 
129
247
  def __init__(self,
130
248
  message: str,
131
- cluster_status: Optional['status_lib.ClusterStatus'],
249
+ cluster_status: Optional['status_lib.ClusterStatus'] = None,
132
250
  handle: Optional['backend.ResourceHandle'] = None) -> None:
133
251
  super().__init__(message)
134
252
  self.cluster_status = cluster_status
@@ -265,7 +383,7 @@ class NoCloudAccessError(Exception):
265
383
  pass
266
384
 
267
385
 
268
- class AWSAzFetchingError(Exception):
386
+ class AWSAzFetchingError(SkyPilotExcludeArgsBaseException):
269
387
  """Raised when fetching the AWS availability zone fails."""
270
388
 
271
389
  class Reason(enum.Enum):
@@ -314,3 +432,18 @@ class UserRequestRejectedByPolicy(Exception):
314
432
  class NoClusterLaunchedError(Exception):
315
433
  """No cluster launched, so cleanup can be skipped during failover."""
316
434
  pass
435
+
436
+
437
+ class RequestCancelled(Exception):
438
+ """Raised when a request is cancelled."""
439
+ pass
440
+
441
+
442
+ class ApiServerConnectionError(RuntimeError):
443
+ """Raised when the API server cannot be connected."""
444
+
445
+ def __init__(self, server_url: str):
446
+ super().__init__(
447
+ f'Could not connect to SkyPilot API server at {server_url}. '
448
+ f'Please ensure that the server is running. '
449
+ f'Try: curl {server_url}/api/health')