skypilot-nightly 1.0.0.dev20250215__py3-none-any.whl → 1.0.0.dev20250217__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +48 -22
- sky/adaptors/aws.py +2 -1
- sky/adaptors/azure.py +4 -4
- sky/adaptors/cloudflare.py +4 -4
- sky/adaptors/kubernetes.py +8 -8
- sky/authentication.py +42 -45
- sky/backends/backend.py +2 -2
- sky/backends/backend_utils.py +108 -221
- sky/backends/cloud_vm_ray_backend.py +283 -282
- sky/benchmark/benchmark_utils.py +6 -2
- sky/check.py +40 -28
- sky/cli.py +1213 -1116
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5644 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1757 -0
- sky/cloud_stores.py +12 -6
- sky/clouds/__init__.py +0 -2
- sky/clouds/aws.py +20 -13
- sky/clouds/azure.py +5 -3
- sky/clouds/cloud.py +1 -1
- sky/clouds/cudo.py +2 -1
- sky/clouds/do.py +2 -1
- sky/clouds/fluidstack.py +3 -2
- sky/clouds/gcp.py +10 -8
- sky/clouds/ibm.py +8 -7
- sky/clouds/kubernetes.py +7 -6
- sky/clouds/lambda_cloud.py +8 -7
- sky/clouds/oci.py +4 -3
- sky/clouds/paperspace.py +2 -1
- sky/clouds/runpod.py +2 -1
- sky/clouds/scp.py +8 -7
- sky/clouds/service_catalog/__init__.py +3 -3
- sky/clouds/service_catalog/aws_catalog.py +7 -1
- sky/clouds/service_catalog/common.py +4 -2
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +2 -2
- sky/clouds/utils/oci_utils.py +1 -1
- sky/clouds/vast.py +2 -1
- sky/clouds/vsphere.py +2 -1
- sky/core.py +263 -99
- sky/dag.py +4 -0
- sky/data/mounting_utils.py +2 -1
- sky/data/storage.py +97 -35
- sky/data/storage_utils.py +69 -9
- sky/exceptions.py +138 -5
- sky/execution.py +47 -50
- sky/global_user_state.py +105 -22
- sky/jobs/__init__.py +12 -14
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +296 -0
- sky/jobs/constants.py +30 -1
- sky/jobs/controller.py +12 -6
- sky/jobs/dashboard/dashboard.py +2 -6
- sky/jobs/recovery_strategy.py +22 -29
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/{core.py → server/core.py} +101 -34
- sky/jobs/server/dashboard_utils.py +64 -0
- sky/jobs/server/server.py +182 -0
- sky/jobs/utils.py +32 -23
- sky/models.py +27 -0
- sky/optimizer.py +9 -11
- sky/provision/__init__.py +6 -3
- sky/provision/aws/config.py +2 -2
- sky/provision/aws/instance.py +1 -1
- sky/provision/azure/instance.py +1 -1
- sky/provision/cudo/instance.py +1 -1
- sky/provision/do/instance.py +1 -1
- sky/provision/do/utils.py +0 -5
- sky/provision/fluidstack/fluidstack_utils.py +4 -3
- sky/provision/fluidstack/instance.py +4 -2
- sky/provision/gcp/instance.py +1 -1
- sky/provision/instance_setup.py +2 -2
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +1 -1
- sky/provision/kubernetes/utils.py +67 -76
- sky/provision/lambda_cloud/instance.py +3 -15
- sky/provision/logging.py +1 -1
- sky/provision/oci/instance.py +7 -4
- sky/provision/paperspace/instance.py +1 -1
- sky/provision/provisioner.py +3 -2
- sky/provision/runpod/instance.py +1 -1
- sky/provision/vast/instance.py +1 -1
- sky/provision/vast/utils.py +2 -1
- sky/provision/vsphere/instance.py +2 -11
- sky/resources.py +55 -40
- sky/serve/__init__.py +6 -10
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +3 -0
- sky/serve/replica_managers.py +10 -10
- sky/serve/serve_utils.py +56 -36
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +37 -17
- sky/serve/server/server.py +117 -0
- sky/serve/service.py +8 -1
- sky/server/__init__.py +1 -0
- sky/server/common.py +441 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +462 -0
- sky/server/requests/payloads.py +481 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1095 -0
- sky/server/stream_utils.py +144 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +12 -4
- sky/setup_files/setup.py +1 -1
- sky/sky_logging.py +9 -13
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +46 -12
- sky/skylet/events.py +5 -6
- sky/skylet/job_lib.py +78 -66
- sky/skylet/log_lib.py +17 -11
- sky/skypilot_config.py +79 -94
- sky/task.py +119 -73
- sky/templates/aws-ray.yml.j2 +4 -4
- sky/templates/azure-ray.yml.j2 +3 -2
- sky/templates/cudo-ray.yml.j2 +3 -2
- sky/templates/fluidstack-ray.yml.j2 +3 -2
- sky/templates/gcp-ray.yml.j2 +3 -2
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +1 -12
- sky/templates/kubernetes-ray.yml.j2 +3 -2
- sky/templates/lambda-ray.yml.j2 +3 -2
- sky/templates/oci-ray.yml.j2 +3 -2
- sky/templates/paperspace-ray.yml.j2 +3 -2
- sky/templates/runpod-ray.yml.j2 +3 -2
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vsphere-ray.yml.j2 +4 -2
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +8 -0
- sky/usage/usage_lib.py +45 -11
- sky/utils/accelerator_registry.py +33 -53
- sky/utils/admin_policy_utils.py +2 -1
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +33 -3
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +69 -14
- sky/utils/common.py +74 -0
- sky/utils/common_utils.py +133 -93
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +2 -3
- sky/utils/controller_utils.py +133 -147
- sky/utils/dag_utils.py +72 -24
- sky/utils/kubernetes/deploy_remote_cluster.sh +2 -2
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/log_utils.py +83 -23
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +2 -2
- sky/utils/rich_utils.py +213 -34
- sky/utils/schemas.py +19 -2
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +51 -35
- sky/utils/timeline.py +7 -2
- sky/utils/ux_utils.py +95 -25
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/METADATA +8 -3
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/RECORD +170 -132
- sky/clouds/cloud_registry.py +0 -76
- sky/utils/cluster_yaml_utils.py +0 -24
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/top_level.txt +0 -0
sky/data/storage.py
CHANGED
@@ -18,7 +18,6 @@ from sky import exceptions
|
|
18
18
|
from sky import global_user_state
|
19
19
|
from sky import sky_logging
|
20
20
|
from sky import skypilot_config
|
21
|
-
from sky import status_lib
|
22
21
|
from sky.adaptors import aws
|
23
22
|
from sky.adaptors import azure
|
24
23
|
from sky.adaptors import cloudflare
|
@@ -34,6 +33,7 @@ from sky.skylet import constants
|
|
34
33
|
from sky.utils import common_utils
|
35
34
|
from sky.utils import rich_utils
|
36
35
|
from sky.utils import schemas
|
36
|
+
from sky.utils import status_lib
|
37
37
|
from sky.utils import ux_utils
|
38
38
|
|
39
39
|
if typing.TYPE_CHECKING:
|
@@ -203,9 +203,8 @@ class StoreType(enum.Enum):
|
|
203
203
|
@classmethod
|
204
204
|
def get_fields_from_store_url(
|
205
205
|
cls, store_url: str
|
206
|
-
) -> Tuple['StoreType',
|
207
|
-
|
208
|
-
"""Returns the store type, store class, bucket name, and sub path from
|
206
|
+
) -> Tuple['StoreType', str, str, Optional[str], Optional[str]]:
|
207
|
+
"""Returns the store type, bucket name, and sub path from
|
209
208
|
a store URL, and the storage account name and region if applicable.
|
210
209
|
|
211
210
|
Args:
|
@@ -221,21 +220,16 @@ class StoreType(enum.Enum):
|
|
221
220
|
if store_type == StoreType.AZURE:
|
222
221
|
storage_account_name, bucket_name, sub_path = \
|
223
222
|
data_utils.split_az_path(store_url)
|
224
|
-
store_cls: Type['AbstractStore'] = AzureBlobStore
|
225
223
|
elif store_type == StoreType.IBM:
|
226
224
|
bucket_name, sub_path, region = data_utils.split_cos_path(
|
227
225
|
store_url)
|
228
|
-
store_cls = IBMCosStore
|
229
226
|
elif store_type == StoreType.R2:
|
230
227
|
bucket_name, sub_path = data_utils.split_r2_path(store_url)
|
231
|
-
store_cls = R2Store
|
232
228
|
elif store_type == StoreType.GCS:
|
233
229
|
bucket_name, sub_path = data_utils.split_gcs_path(store_url)
|
234
|
-
store_cls = GcsStore
|
235
230
|
elif store_type == StoreType.S3:
|
236
231
|
bucket_name, sub_path = data_utils.split_s3_path(store_url)
|
237
|
-
|
238
|
-
return store_type, store_cls,bucket_name, \
|
232
|
+
return store_type, bucket_name, \
|
239
233
|
sub_path, storage_account_name, region
|
240
234
|
raise ValueError(f'Unknown store URL: {store_url}')
|
241
235
|
|
@@ -546,7 +540,7 @@ class Storage(object):
|
|
546
540
|
self,
|
547
541
|
name: Optional[str] = None,
|
548
542
|
source: Optional[SourceType] = None,
|
549
|
-
stores: Optional[
|
543
|
+
stores: Optional[List[StoreType]] = None,
|
550
544
|
persistent: Optional[bool] = True,
|
551
545
|
mode: StorageMode = StorageMode.MOUNT,
|
552
546
|
sync_on_reconstruction: bool = True,
|
@@ -605,26 +599,47 @@ class Storage(object):
|
|
605
599
|
_bucket_sub_path: Optional[str]; The subdirectory to use for the
|
606
600
|
storage object.
|
607
601
|
"""
|
608
|
-
self.name
|
602
|
+
self.name = name
|
609
603
|
self.source = source
|
610
604
|
self.persistent = persistent
|
611
605
|
self.mode = mode
|
612
606
|
assert mode in StorageMode
|
607
|
+
self.stores: Dict[StoreType, Optional[AbstractStore]] = {}
|
608
|
+
if stores is not None:
|
609
|
+
for store in stores:
|
610
|
+
self.stores[store] = None
|
613
611
|
self.sync_on_reconstruction = sync_on_reconstruction
|
614
612
|
self._is_sky_managed = _is_sky_managed
|
615
613
|
self._bucket_sub_path = _bucket_sub_path
|
616
614
|
|
615
|
+
self._constructed = False
|
617
616
|
# TODO(romilb, zhwu): This is a workaround to support storage deletion
|
618
|
-
# for
|
619
|
-
# buckets, this can be deprecated.
|
617
|
+
# for managed jobs. Once sky storage supports forced management for
|
618
|
+
# external buckets, this can be deprecated.
|
620
619
|
self.force_delete = False
|
621
620
|
|
622
|
-
|
623
|
-
|
621
|
+
def construct(self):
|
622
|
+
"""Constructs the storage object.
|
623
|
+
|
624
|
+
The Storage object is lazily initialized, so that when a user
|
625
|
+
initializes a Storage object on client side, it does not trigger the
|
626
|
+
actual storage creation on the client side.
|
624
627
|
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
+
Instead, once the specification of the storage object is uploaded to the
|
629
|
+
SkyPilot API server side, the server side should use this construct()
|
630
|
+
method to actually create the storage object. The construct() method
|
631
|
+
will:
|
632
|
+
|
633
|
+
1. Set the stores field if not specified
|
634
|
+
2. Create the bucket or check the existence of the bucket
|
635
|
+
3. Sync the data from the source to the bucket if necessary
|
636
|
+
"""
|
637
|
+
if self._constructed:
|
638
|
+
return
|
639
|
+
self._constructed = True
|
640
|
+
|
641
|
+
# Validate and correct inputs if necessary
|
642
|
+
self._validate_storage_spec(self.name)
|
628
643
|
|
629
644
|
# Logic to rebuild Storage if it is in global user state
|
630
645
|
handle = global_user_state.get_handle_from_storage_name(self.name)
|
@@ -635,6 +650,28 @@ class Storage(object):
|
|
635
650
|
f'loading Storage: {self.name}')
|
636
651
|
self._add_store_from_metadata(self.handle.sky_stores)
|
637
652
|
|
653
|
+
# When a storage object is reconstructed from global_user_state,
|
654
|
+
# the user may have specified a new store type in the yaml file that
|
655
|
+
# was not used with the storage object. We should error out in this
|
656
|
+
# case, as we don't support having multiple stores for the same
|
657
|
+
# storage object.
|
658
|
+
if any(s is None for s in self.stores.values()):
|
659
|
+
new_store_type = None
|
660
|
+
previous_store_type = None
|
661
|
+
for store_type, store in self.stores.items():
|
662
|
+
if store is not None:
|
663
|
+
previous_store_type = store_type
|
664
|
+
else:
|
665
|
+
new_store_type = store_type
|
666
|
+
with ux_utils.print_exception_no_traceback():
|
667
|
+
raise exceptions.StorageBucketCreateError(
|
668
|
+
f'Bucket {self.name} was previously created for '
|
669
|
+
f'{previous_store_type.value.lower()!r}, but a new '
|
670
|
+
f'store type {new_store_type.value.lower()!r} is '
|
671
|
+
'requested. This is not supported yet. Please specify '
|
672
|
+
'the same store type: '
|
673
|
+
f'{previous_store_type.value.lower()!r}.')
|
674
|
+
|
638
675
|
# TODO(romilb): This logic should likely be in add_store to move
|
639
676
|
# syncing to file_mount stage..
|
640
677
|
if self.sync_on_reconstruction:
|
@@ -648,15 +685,16 @@ class Storage(object):
|
|
648
685
|
|
649
686
|
else:
|
650
687
|
# Storage does not exist in global_user_state, create new stores
|
651
|
-
|
652
|
-
|
653
|
-
|
654
|
-
|
655
|
-
}
|
688
|
+
# Sky optimizer either adds a storage object instance or selects
|
689
|
+
# from existing ones
|
690
|
+
input_stores = self.stores
|
691
|
+
self.stores = {}
|
656
692
|
self.handle = self.StorageMetadata(storage_name=self.name,
|
657
693
|
source=self.source,
|
658
|
-
mode=self.mode
|
659
|
-
|
694
|
+
mode=self.mode)
|
695
|
+
|
696
|
+
for store in input_stores:
|
697
|
+
self.add_store(store)
|
660
698
|
|
661
699
|
if self.source is not None:
|
662
700
|
# If source is a pre-existing bucket, connect to the bucket
|
@@ -987,10 +1025,13 @@ class Storage(object):
|
|
987
1025
|
region: str; Region to place the bucket in. Caller must ensure that
|
988
1026
|
the region is valid for the chosen store_type.
|
989
1027
|
"""
|
1028
|
+
assert self._constructed, self
|
1029
|
+
assert self.name is not None, self
|
1030
|
+
|
990
1031
|
if isinstance(store_type, str):
|
991
1032
|
store_type = StoreType(store_type)
|
992
1033
|
|
993
|
-
if store_type
|
1034
|
+
if self.stores.get(store_type) is not None:
|
994
1035
|
if store_type == StoreType.AZURE:
|
995
1036
|
azure_store_obj = self.stores[store_type]
|
996
1037
|
assert isinstance(azure_store_obj, AzureBlobStore)
|
@@ -999,8 +1040,9 @@ class Storage(object):
|
|
999
1040
|
f'storage account {storage_account_name!r}.')
|
1000
1041
|
else:
|
1001
1042
|
logger.info(f'Storage type {store_type} already exists.')
|
1002
|
-
|
1003
|
-
|
1043
|
+
store = self.stores[store_type]
|
1044
|
+
assert store is not None, self
|
1045
|
+
return store
|
1004
1046
|
|
1005
1047
|
store_cls: Type[AbstractStore]
|
1006
1048
|
if store_type == StoreType.S3:
|
@@ -1068,6 +1110,7 @@ class Storage(object):
|
|
1068
1110
|
if store.is_sky_managed:
|
1069
1111
|
self.handle.add_store(store)
|
1070
1112
|
if not is_reconstructed:
|
1113
|
+
assert self.name is not None, self
|
1071
1114
|
global_user_state.add_or_update_storage(self.name, self.handle,
|
1072
1115
|
StorageStatus.INIT)
|
1073
1116
|
|
@@ -1083,17 +1126,26 @@ class Storage(object):
|
|
1083
1126
|
"""
|
1084
1127
|
if not self.stores:
|
1085
1128
|
logger.info('No backing stores found. Deleting storage.')
|
1129
|
+
assert self.name is not None
|
1086
1130
|
global_user_state.remove_storage(self.name)
|
1087
1131
|
if store_type is not None:
|
1132
|
+
assert self.name is not None
|
1088
1133
|
store = self.stores[store_type]
|
1134
|
+
assert store is not None, self
|
1135
|
+
is_sky_managed = store.is_sky_managed
|
1089
1136
|
# We delete a store from the cloud if it's sky managed. Else just
|
1090
1137
|
# remove handle and return
|
1091
|
-
if
|
1138
|
+
if is_sky_managed:
|
1092
1139
|
self.handle.remove_store(store)
|
1093
1140
|
store.delete()
|
1094
1141
|
# Check remaining stores - if none is sky managed, remove
|
1095
1142
|
# the storage from global_user_state.
|
1096
|
-
delete =
|
1143
|
+
delete = True
|
1144
|
+
for store in self.stores.values():
|
1145
|
+
assert store is not None, self
|
1146
|
+
if store.is_sky_managed:
|
1147
|
+
delete = False
|
1148
|
+
break
|
1097
1149
|
if delete:
|
1098
1150
|
global_user_state.remove_storage(self.name)
|
1099
1151
|
else:
|
@@ -1104,6 +1156,7 @@ class Storage(object):
|
|
1104
1156
|
del self.stores[store_type]
|
1105
1157
|
else:
|
1106
1158
|
for _, store in self.stores.items():
|
1159
|
+
assert store is not None, self
|
1107
1160
|
if store.is_sky_managed:
|
1108
1161
|
self.handle.remove_store(store)
|
1109
1162
|
store.delete()
|
@@ -1111,15 +1164,19 @@ class Storage(object):
|
|
1111
1164
|
store.delete()
|
1112
1165
|
self.stores = {}
|
1113
1166
|
# Remove storage from global_user_state if present
|
1114
|
-
|
1167
|
+
if self.name is not None:
|
1168
|
+
global_user_state.remove_storage(self.name)
|
1115
1169
|
|
1116
1170
|
def sync_all_stores(self):
|
1117
1171
|
"""Syncs the source and destinations of all stores in the Storage"""
|
1118
1172
|
for _, store in self.stores.items():
|
1173
|
+
assert store is not None, self
|
1119
1174
|
self._sync_store(store)
|
1120
1175
|
|
1121
1176
|
def _sync_store(self, store: AbstractStore):
|
1122
1177
|
"""Runs the upload routine for the store and handles failures"""
|
1178
|
+
assert self._constructed, self
|
1179
|
+
assert self.name is not None, self
|
1123
1180
|
|
1124
1181
|
def warn_for_git_dir(source: str):
|
1125
1182
|
if os.path.isdir(os.path.join(source, '.git')):
|
@@ -1175,14 +1232,17 @@ class Storage(object):
|
|
1175
1232
|
assert not config, f'Invalid storage args: {config.keys()}'
|
1176
1233
|
|
1177
1234
|
# Validation of the config object happens on instantiation.
|
1235
|
+
if store is not None:
|
1236
|
+
stores = [StoreType(store.upper())]
|
1237
|
+
else:
|
1238
|
+
stores = None
|
1178
1239
|
storage_obj = cls(name=name,
|
1179
1240
|
source=source,
|
1180
1241
|
persistent=persistent,
|
1181
1242
|
mode=mode,
|
1243
|
+
stores=stores,
|
1182
1244
|
_is_sky_managed=_is_sky_managed,
|
1183
1245
|
_bucket_sub_path=_bucket_sub_path)
|
1184
|
-
if store is not None:
|
1185
|
-
storage_obj.add_store(StoreType(store.upper()))
|
1186
1246
|
|
1187
1247
|
# Add force deletion flag
|
1188
1248
|
storage_obj.force_delete = force_delete
|
@@ -1207,7 +1267,9 @@ class Storage(object):
|
|
1207
1267
|
is_sky_managed = self._is_sky_managed
|
1208
1268
|
if self.stores:
|
1209
1269
|
stores = ','.join([store.value for store in self.stores])
|
1210
|
-
|
1270
|
+
store = list(self.stores.values())[0]
|
1271
|
+
if store is not None:
|
1272
|
+
is_sky_managed = store.is_sky_managed
|
1211
1273
|
add_if_not_none('store', stores)
|
1212
1274
|
add_if_not_none('_is_sky_managed', is_sky_managed)
|
1213
1275
|
add_if_not_none('persistent', self.persistent)
|
sky/data/storage_utils.py
CHANGED
@@ -1,9 +1,12 @@
|
|
1
1
|
"""Utility functions for the storage module."""
|
2
2
|
import glob
|
3
3
|
import os
|
4
|
+
import pathlib
|
4
5
|
import shlex
|
5
6
|
import subprocess
|
6
|
-
from typing import Any, Dict, List
|
7
|
+
from typing import Any, Dict, List, Optional, TextIO, Union
|
8
|
+
import warnings
|
9
|
+
import zipfile
|
7
10
|
|
8
11
|
import colorama
|
9
12
|
|
@@ -213,13 +216,70 @@ def get_excluded_files(src_dir_path: str) -> List[str]:
|
|
213
216
|
skyignore_path = os.path.join(expand_src_dir_path,
|
214
217
|
constants.SKY_IGNORE_FILE)
|
215
218
|
if os.path.exists(skyignore_path):
|
216
|
-
logger.
|
217
|
-
|
218
|
-
|
219
|
-
|
219
|
+
logger.debug(f' {colorama.Style.DIM}'
|
220
|
+
f'Excluded files to sync to cluster based on '
|
221
|
+
f'{constants.SKY_IGNORE_FILE}.'
|
222
|
+
f'{colorama.Style.RESET_ALL}')
|
220
223
|
return get_excluded_files_from_skyignore(src_dir_path)
|
221
|
-
logger.
|
222
|
-
|
223
|
-
|
224
|
-
|
224
|
+
logger.debug(f' {colorama.Style.DIM}'
|
225
|
+
f'Excluded files to sync to cluster based on '
|
226
|
+
f'{constants.GIT_IGNORE_FILE}.'
|
227
|
+
f'{colorama.Style.RESET_ALL}')
|
225
228
|
return get_excluded_files_from_gitignore(src_dir_path)
|
229
|
+
|
230
|
+
|
231
|
+
def zip_files_and_folders(items: List[str],
                          output_file: Union[str, pathlib.Path],
                          log_file: Optional[TextIO] = None):
    """Writes the given files and directories into one zip archive.

    Each item is `~`-expanded. A regular file is added as-is; a directory is
    walked (without following symlinks) and its sub-directories and files are
    added one by one, so that empty directories are preserved. Symlinks are
    stored as symlinks rather than as copies of their targets. Paths
    reported by `get_excluded_files()` for the item are left out.

    Args:
        items: Paths of the files and directories to archive.
        output_file: Path of the zip archive to create (overwritten if it
            exists, since the archive is opened in 'w' mode).
        log_file: Optional text stream; a 'Zipped <item>' line is written to
            it after each item has been processed.

    Raises:
        ValueError: If an item is neither an existing file nor an existing
            directory.
    """

    def _store_symlink(zipf, path: str, is_dir: bool):
        """Adds `path` to `zipf` as a symlink entry pointing at its target."""
        # Get the target of the symlink
        target = os.readlink(path)
        # Use relative path as absolute path will not be able to resolve on
        # remote API server.
        if os.path.isabs(target):
            target = os.path.relpath(target, os.path.dirname(path))
        # Create a ZipInfo instance; directory entries end with a slash.
        zi = zipfile.ZipInfo(path + '/') if is_dir else zipfile.ZipInfo(path)
        # Set external attributes to mark as symlink. The upper 16 bits hold
        # the Unix mode: 0xA1ED == 0o120755, i.e. a symlink with 0755 perms.
        zi.external_attr = 0xA1ED0000
        # Write symlink target as content
        zipf.writestr(zi, target)

    with warnings.catch_warnings():
        # The same path may be written more than once (e.g. overlapping
        # items); silence zipfile's 'Duplicate name' warning for that case.
        warnings.filterwarnings('ignore',
                                category=UserWarning,
                                message='Duplicate name:')
        with zipfile.ZipFile(output_file, 'w') as zipf:
            for item in items:
                item = os.path.expanduser(item)
                if not os.path.isfile(item) and not os.path.isdir(item):
                    raise ValueError(f'{item} does not exist.')
                excluded_files = {
                    os.path.join(item, f) for f in get_excluded_files(item)
                }
                if os.path.isfile(item) and item not in excluded_files:
                    zipf.write(item)
                elif os.path.isdir(item):
                    # NOTE: an excluded directory is not pruned from the
                    # walk; its contents are skipped only when they are
                    # themselves listed in excluded_files.
                    for root, dirs, files in os.walk(item, followlinks=False):
                        # Store directory entries (important for empty
                        # directories)
                        for dir_name in dirs:
                            dir_path = os.path.join(root, dir_name)
                            if dir_path in excluded_files:
                                continue
                            # If it's a symlink, store it as a symlink
                            if os.path.islink(dir_path):
                                _store_symlink(zipf, dir_path, is_dir=True)
                            else:
                                zipf.write(dir_path)

                        for file in files:
                            file_path = os.path.join(root, file)
                            if file_path in excluded_files:
                                continue
                            if os.path.islink(file_path):
                                _store_symlink(zipf, file_path, is_dir=False)
                            else:
                                zipf.write(file_path)
                if log_file is not None:
                    log_file.write(f'Zipped {item}\n')
|
sky/exceptions.py
CHANGED
@@ -1,13 +1,16 @@
|
|
1
1
|
"""Exceptions."""
|
2
|
+
import builtins
|
2
3
|
import enum
|
4
|
+
import traceback
|
5
|
+
import types
|
3
6
|
import typing
|
4
|
-
from typing import List, Optional, Sequence
|
7
|
+
from typing import Any, Dict, List, Optional, Sequence
|
5
8
|
|
6
9
|
from sky.utils import env_options
|
7
10
|
|
8
11
|
if typing.TYPE_CHECKING:
|
9
|
-
from sky import status_lib
|
10
12
|
from sky.backends import backend
|
13
|
+
from sky.utils import status_lib
|
11
14
|
|
12
15
|
# Return code for keyboard interruption and SIGTSTP
|
13
16
|
KEYBOARD_INTERRUPT_CODE = 130
|
@@ -21,6 +24,107 @@ INSUFFICIENT_PRIVILEGES_CODE = 52
|
|
21
24
|
GIT_FATAL_EXIT_CODE = 128
|
22
25
|
|
23
26
|
|
27
|
+
def is_safe_exception(exc: Exception) -> bool:
    """Returns True if the exception is safe to send to clients.

    Safe exceptions are:
    1. Built-in exceptions (e.g., ValueError, RuntimeError)
    2. SkyPilot's own exceptions, i.e. classes defined in the `sky` package
       itself or in any `sky.*` submodule

    Args:
        exc: The exception instance to classify.

    Returns:
        True when the defining module of the exception's class is `builtins`,
        `sky`, or starts with `sky.`; False otherwise.
    """
    module = type(exc).__module__
    # Match 'sky' exactly as well as the 'sky.' prefix, so a class defined in
    # the top-level package is accepted while look-alike packages such as
    # 'skyfoo' are not.
    return (module == 'builtins' or module == 'sky' or
            module.startswith('sky.'))
|
45
|
+
|
46
|
+
|
47
|
+
def wrap_exception(exc: Exception) -> Exception:
    """Converts an exception that clients cannot deserialize into CloudError.

    Exceptions accepted by `is_safe_exception()` are returned unchanged. Any
    other exception (for example one raised by a cloud provider SDK that is
    not installed on the client) is replaced by a `CloudError` carrying the
    original message, the top-level package name of the original class and
    the original class name.
    """
    if not is_safe_exception(exc):
        text = str(exc)
        origin = type(exc)
        top_level_package = origin.__module__.split('.')[0]
        return CloudError(message=text,
                          cloud_provider=top_level_package,
                          error_type=origin.__name__)
    return exc
|
61
|
+
|
62
|
+
|
63
|
+
def serialize_exception(e: Exception) -> Dict[str, Any]:
    """Turns an exception into a plain dictionary.

    The exception is first passed through `wrap_exception()`, so unsafe
    exceptions (e.g., cloud exceptions) are serialized as CloudError and
    clients do not need the cloud provider packages to deserialize them.

    Returns:
        A dict with the keys 'type' (class name), 'message' (str of the
        exception), 'args', 'attributes' (the instance `__dict__` without
        'stacktrace', with traceback objects replaced by their formatted
        lines) and 'stacktrace' (the `stacktrace` attribute, or None).
        'args' is an empty tuple for SkyPilotExcludeArgsBaseException
        subclasses.
    """
    # Wrap unsafe exceptions before serialization
    e = wrap_exception(e)

    stacktrace = getattr(e, 'stacktrace', None)

    attributes = {}
    for name, value in e.__dict__.items():
        if name == 'stacktrace':
            # Reported separately under its own key.
            continue
        if isinstance(value, types.TracebackType):
            value = traceback.format_tb(value)
        attributes[name] = value

    type_name = e.__class__.__name__
    message = str(e)
    if isinstance(e, SkyPilotExcludeArgsBaseException):
        args = tuple()
    else:
        args = e.args

    return {
        'type': type_name,
        'message': message,
        'args': args,
        'attributes': attributes,
        'stacktrace': stacktrace,
    }
|
92
|
+
|
93
|
+
|
94
|
+
def deserialize_exception(serialized: Dict[str, Any]) -> Exception:
    """Rebuilds an exception from its serialized dictionary form.

    Args:
        serialized: A dict with the keys 'type', 'message', 'args',
            'attributes' and 'stacktrace'.

    Returns:
        An instance of the named exception class, constructed from 'args'
        (positional) and 'attributes' (keyword), with a `stacktrace`
        attribute set when one was serialized. If 'type' does not name an
        exception class among the builtins or this module's globals, a plain
        `Exception('<type>: <message>')` is returned instead.
    """
    exception_type = serialized['type']
    if hasattr(builtins, exception_type):
        exception_class = getattr(builtins, exception_type)
    else:
        exception_class = globals().get(exception_type, None)
    # The type name comes from the payload, so it may name something that is
    # not an exception at all (e.g. the builtins `eval`/`open`, or a helper
    # function in this module). Never call such objects with payload
    # arguments; treat them like an unknown exception type.
    if not (isinstance(exception_class, type) and
            issubclass(exception_class, BaseException)):
        # Unknown exception type.
        return Exception(f'{exception_type}: {serialized["message"]}')
    e = exception_class(*serialized['args'], **serialized['attributes'])
    if serialized['stacktrace'] is not None:
        setattr(e, 'stacktrace', serialized['stacktrace'])
    return e
|
108
|
+
|
109
|
+
|
110
|
+
class CloudError(Exception):
    """SkyPilot-side stand-in for an error raised by a cloud provider.

    Attributes:
        cloud_provider: Name identifying where the original error came from.
        error_type: Class name of the original error.
    """

    def __init__(self, message: str, cloud_provider: str, error_type: str):
        # Only the message goes into `args`; the other two fields are kept
        # as plain instance attributes.
        super().__init__(message)
        self.cloud_provider = cloud_provider
        self.error_type = error_type

    def __str__(self):
        detail = super().__str__()
        return f'{self.cloud_provider} error ({self.error_type}): {detail}'
|
121
|
+
|
122
|
+
|
123
|
+
class InvalidSkyPilotConfigError(ValueError):
    """Signals that the SkyPilot config is not valid."""
|
126
|
+
|
127
|
+
|
24
128
|
class ResourcesUnavailableError(Exception):
|
25
129
|
"""Raised when resources are unavailable.
|
26
130
|
|
@@ -94,7 +198,21 @@ class ResourcesMismatchError(Exception):
|
|
94
198
|
pass
|
95
199
|
|
96
200
|
|
97
|
-
class
|
201
|
+
class SkyPilotExcludeArgsBaseException(Exception):
    """Base class for exceptions serialized without their `args`.

    The serialization/deserialization logic rebuilds an exception by calling
    its class with the serialized `args` and attributes. When an exception's
    __init__ does not accept `args`, they must be left out of the serialized
    form.

    This applies to exceptions that build their error message from the
    arguments passed to __init__, rather than taking the message itself as
    the first argument. Refer to `CommandError` for an example.
    """
|
213
|
+
|
214
|
+
|
215
|
+
class CommandError(SkyPilotExcludeArgsBaseException):
|
98
216
|
"""Raised when a command fails.
|
99
217
|
|
100
218
|
Args:
|
@@ -128,7 +246,7 @@ class ClusterNotUpError(Exception):
|
|
128
246
|
|
129
247
|
def __init__(self,
|
130
248
|
message: str,
|
131
|
-
cluster_status: Optional['status_lib.ClusterStatus'],
|
249
|
+
cluster_status: Optional['status_lib.ClusterStatus'] = None,
|
132
250
|
handle: Optional['backend.ResourceHandle'] = None) -> None:
|
133
251
|
super().__init__(message)
|
134
252
|
self.cluster_status = cluster_status
|
@@ -265,7 +383,7 @@ class NoCloudAccessError(Exception):
|
|
265
383
|
pass
|
266
384
|
|
267
385
|
|
268
|
-
class AWSAzFetchingError(
|
386
|
+
class AWSAzFetchingError(SkyPilotExcludeArgsBaseException):
|
269
387
|
"""Raised when fetching the AWS availability zone fails."""
|
270
388
|
|
271
389
|
class Reason(enum.Enum):
|
@@ -314,3 +432,18 @@ class UserRequestRejectedByPolicy(Exception):
|
|
314
432
|
class NoClusterLaunchedError(Exception):
|
315
433
|
"""No cluster launched, so cleanup can be skipped during failover."""
|
316
434
|
pass
|
435
|
+
|
436
|
+
|
437
|
+
class RequestCancelled(Exception):
    """Signals that a request has been cancelled."""
|
440
|
+
|
441
|
+
|
442
|
+
class ApiServerConnectionError(RuntimeError):
    """Signals that the SkyPilot API server could not be reached.

    The error message is built from the server URL and includes a `curl`
    command against the server's `/api/health` endpoint as a hint.
    """

    def __init__(self, server_url: str):
        hint = (f'Could not connect to SkyPilot API server at {server_url}. '
                f'Please ensure that the server is running. '
                f'Try: curl {server_url}/api/health')
        super().__init__(hint)
|