skypilot-nightly 1.0.0.dev20241029__py3-none-any.whl → 1.0.0.dev20241031__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +14 -13
- sky/clouds/azure.py +4 -5
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +51 -3
- sky/clouds/utils/gcp_utils.py +0 -8
- sky/execution.py +5 -4
- sky/jobs/controller.py +38 -22
- sky/jobs/recovery_strategy.py +30 -5
- sky/jobs/state.py +33 -5
- sky/jobs/utils.py +28 -4
- sky/provision/azure/instance.py +4 -24
- sky/resources.py +28 -8
- sky/setup_files/setup.py +4 -3
- sky/skylet/job_lib.py +34 -42
- sky/templates/azure-ray.yml.j2 +0 -1
- sky/utils/dag_utils.py +14 -4
- sky/utils/schemas.py +21 -1
- {skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241031.dist-info}/METADATA +13 -11
- {skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241031.dist-info}/RECORD +23 -23
- {skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241031.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241031.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241031.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241031.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = 'c4eeeb5fb3ef64be0f05a727e119ac9266f8940f'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.
|
38
|
+
__version__ = '1.0.0.dev20241031'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
@@ -1950,17 +1950,8 @@ class RetryingVmProvisioner(object):
|
|
1950
1950
|
|
1951
1951
|
failover_history: List[Exception] = list()
|
1952
1952
|
|
1953
|
-
style = colorama.Style
|
1954
|
-
fore = colorama.Fore
|
1955
1953
|
# Retrying launchable resources.
|
1956
1954
|
while True:
|
1957
|
-
if (isinstance(to_provision.cloud, clouds.Azure) and
|
1958
|
-
to_provision.accelerators is not None and
|
1959
|
-
'A10' in to_provision.accelerators and prev_handle is None):
|
1960
|
-
logger.warning(f'{style.BRIGHT}{fore.YELLOW}Trying to launch '
|
1961
|
-
'an A10 cluster on Azure. This may take ~20 '
|
1962
|
-
'minutes due to driver installation.'
|
1963
|
-
f'{style.RESET_ALL}')
|
1964
1955
|
try:
|
1965
1956
|
# Recheck cluster name as the 'except:' block below may
|
1966
1957
|
# change the cloud assignment.
|
@@ -2476,7 +2467,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
|
|
2476
2467
|
"""Returns number of IPs per node in the cluster, handling TPU Pod."""
|
2477
2468
|
is_tpu_vm_pod = gcp_utils.is_tpu_vm_pod(self.launched_resources)
|
2478
2469
|
if is_tpu_vm_pod:
|
2479
|
-
num_ips =
|
2470
|
+
num_ips = len(self.internal_ips())
|
2480
2471
|
else:
|
2481
2472
|
num_ips = 1
|
2482
2473
|
return num_ips
|
@@ -3175,9 +3166,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3175
3166
|
returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
|
3176
3167
|
if returncode == 255:
|
3177
3168
|
is_message_too_long = False
|
3178
|
-
|
3179
|
-
|
3180
|
-
|
3169
|
+
try:
|
3170
|
+
with open(os.path.expanduser(setup_log_path),
|
3171
|
+
'r',
|
3172
|
+
encoding='utf-8') as f:
|
3173
|
+
if 'too long' in f.read():
|
3174
|
+
is_message_too_long = True
|
3175
|
+
except Exception as e: # pylint: disable=broad-except
|
3176
|
+
# We don't crash the setup if we cannot read the log file.
|
3177
|
+
# Instead, we should retry the setup with dumping the script
|
3178
|
+
# to a file to be safe.
|
3179
|
+
logger.debug('Failed to read setup log file '
|
3180
|
+
f'{setup_log_path}: {e}')
|
3181
|
+
is_message_too_long = True
|
3181
3182
|
|
3182
3183
|
if is_message_too_long:
|
3183
3184
|
# If the setup script is too long, we retry it with dumping
|
sky/clouds/azure.py
CHANGED
@@ -44,6 +44,8 @@ _DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2'
|
|
44
44
|
_DEFAULT_V1_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v1'
|
45
45
|
_DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
|
46
46
|
_FALLBACK_IMAGE_ID = 'skypilot:gpu-ubuntu-2204'
|
47
|
+
# This is used by Azure GPU VMs that use grid drivers (e.g. A10).
|
48
|
+
_DEFAULT_GPU_GRID_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2-grid'
|
47
49
|
|
48
50
|
_COMMUNITY_IMAGE_PREFIX = '/CommunityGalleries'
|
49
51
|
|
@@ -220,6 +222,8 @@ class Azure(clouds.Cloud):
|
|
220
222
|
acc_name = list(acc.keys())[0]
|
221
223
|
if acc_name == 'K80':
|
222
224
|
return _DEFAULT_GPU_K80_IMAGE_ID
|
225
|
+
if acc_name == 'A10':
|
226
|
+
return _DEFAULT_GPU_GRID_IMAGE_ID
|
223
227
|
# About Gen V1 vs V2:
|
224
228
|
# In Azure, all instances with K80 (Standard_NC series), some
|
225
229
|
# instances with M60 (Standard_NV series) and some cpu instances
|
@@ -350,10 +354,6 @@ class Azure(clouds.Cloud):
|
|
350
354
|
'image_version': version,
|
351
355
|
}
|
352
356
|
|
353
|
-
# Setup the A10 nvidia driver.
|
354
|
-
need_nvidia_driver_extension = (acc_dict is not None and
|
355
|
-
'A10' in acc_dict)
|
356
|
-
|
357
357
|
# Determine resource group for deploying the instance.
|
358
358
|
resource_group_name = skypilot_config.get_nested(
|
359
359
|
('azure', 'resource_group_vm'), None)
|
@@ -413,7 +413,6 @@ class Azure(clouds.Cloud):
|
|
413
413
|
# Azure does not support specific zones.
|
414
414
|
'zones': None,
|
415
415
|
**image_config,
|
416
|
-
'need_nvidia_driver_extension': need_nvidia_driver_extension,
|
417
416
|
'disk_tier': Azure._get_disk_type(disk_tier),
|
418
417
|
'cloud_init_setup_commands': cloud_init_setup_commands,
|
419
418
|
'azure_subscription_id': self.get_project_id(dryrun),
|
@@ -47,6 +47,10 @@ TPU_RETRY_CNT = 3
|
|
47
47
|
TPU_V4_ZONES = ['us-central2-b']
|
48
48
|
# TPU v3 pods are available in us-east1-d, but hidden in the skus.
|
49
49
|
# We assume the TPU prices are the same as us-central1.
|
50
|
+
# TPU v6e's pricing info is not available on the SKUs. However, in
|
51
|
+
# https://cloud.google.com/tpu/pricing, it listed the price for 4 regions:
|
52
|
+
# us-east1, us-east5, europe-west4, and asia-northeast1. We hardcode them here
|
53
|
+
# and filtered out the other regions (us-central{1,2}, us-south1).
|
50
54
|
HIDDEN_TPU_DF = pd.read_csv(
|
51
55
|
io.StringIO(
|
52
56
|
textwrap.dedent("""\
|
@@ -58,8 +62,50 @@ HIDDEN_TPU_DF = pd.read_csv(
|
|
58
62
|
,tpu-v3-512,1,,,tpu-v3-512,512.0,153.6,us-east1,us-east1-d
|
59
63
|
,tpu-v3-1024,1,,,tpu-v3-1024,1024.0,307.2,us-east1,us-east1-d
|
60
64
|
,tpu-v3-2048,1,,,tpu-v3-2048,2048.0,614.4,us-east1,us-east1-d
|
65
|
+
,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east5,us-east5-b
|
66
|
+
,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east5,us-east5-c
|
67
|
+
,tpu-v6e-1,1,,,tpu-v6e-1,2.97,,europe-west4,europe-west4-a
|
68
|
+
,tpu-v6e-1,1,,,tpu-v6e-1,3.24,,asia-northeast1,asia-northeast1-b
|
69
|
+
,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east1,us-east1-d
|
70
|
+
,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east5,us-east5-b
|
71
|
+
,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east5,us-east5-c
|
72
|
+
,tpu-v6e-4,1,,,tpu-v6e-4,11.88,,europe-west4,europe-west4-a
|
73
|
+
,tpu-v6e-4,1,,,tpu-v6e-4,12.96,,asia-northeast1,asia-northeast1-b
|
74
|
+
,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east1,us-east1-d
|
75
|
+
,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east5,us-east5-b
|
76
|
+
,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east5,us-east5-c
|
77
|
+
,tpu-v6e-8,1,,,tpu-v6e-8,23.76,,europe-west4,europe-west4-a
|
78
|
+
,tpu-v6e-8,1,,,tpu-v6e-8,25.92,,asia-northeast1,asia-northeast1-b
|
79
|
+
,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east1,us-east1-d
|
80
|
+
,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east5,us-east5-b
|
81
|
+
,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east5,us-east5-c
|
82
|
+
,tpu-v6e-16,1,,,tpu-v6e-16,47.52,,europe-west4,europe-west4-a
|
83
|
+
,tpu-v6e-16,1,,,tpu-v6e-16,51.84,,asia-northeast1,asia-northeast1-b
|
84
|
+
,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east1,us-east1-d
|
85
|
+
,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east5,us-east5-b
|
86
|
+
,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east5,us-east5-c
|
87
|
+
,tpu-v6e-32,1,,,tpu-v6e-32,95.04,,europe-west4,europe-west4-a
|
88
|
+
,tpu-v6e-32,1,,,tpu-v6e-32,103.68,,asia-northeast1,asia-northeast1-b
|
89
|
+
,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east1,us-east1-d
|
90
|
+
,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east5,us-east5-b
|
91
|
+
,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east5,us-east5-c
|
92
|
+
,tpu-v6e-64,1,,,tpu-v6e-64,190.08,,europe-west4,europe-west4-a
|
93
|
+
,tpu-v6e-64,1,,,tpu-v6e-64,207.36,,asia-northeast1,asia-northeast1-b
|
94
|
+
,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east1,us-east1-d
|
95
|
+
,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east5,us-east5-b
|
96
|
+
,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east5,us-east5-c
|
97
|
+
,tpu-v6e-128,1,,,tpu-v6e-128,380.16,,europe-west4,europe-west4-a
|
98
|
+
,tpu-v6e-128,1,,,tpu-v6e-128,414.72,,asia-northeast1,asia-northeast1-b
|
99
|
+
,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east1,us-east1-d
|
100
|
+
,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east5,us-east5-b
|
101
|
+
,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east5,us-east5-c
|
102
|
+
,tpu-v6e-256,1,,,tpu-v6e-256,760.32,,europe-west4,europe-west4-a
|
103
|
+
,tpu-v6e-256,1,,,tpu-v6e-256,829.44,,asia-northeast1,asia-northeast1-b
|
104
|
+
,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east1,us-east1-d
|
61
105
|
""")))
|
62
106
|
|
107
|
+
TPU_V6E_MISSING_REGIONS = ['us-central1', 'us-central2', 'us-south1']
|
108
|
+
|
63
109
|
# TPU V5 is not visible in specific zones. We hardcode the missing zones here.
|
64
110
|
# NOTE(dev): Keep the zones and the df in sync.
|
65
111
|
TPU_V5_MISSING_ZONES_DF = {
|
@@ -683,11 +729,13 @@ def get_tpu_df(gce_skus: List[Dict[str, Any]],
|
|
683
729
|
'not found in SKUs or hidden TPU price DF.')
|
684
730
|
# TODO(tian): Hack. Should investigate how to retrieve the price
|
685
731
|
# for TPU-v6e.
|
686
|
-
if
|
732
|
+
if (tpu_name.startswith('tpu-v6e') and
|
733
|
+
tpu_region in TPU_V6E_MISSING_REGIONS):
|
734
|
+
if not spot:
|
735
|
+
tpu_price = 0.0
|
736
|
+
else:
|
687
737
|
assert spot or tpu_price is not None, (row, hidden_tpu,
|
688
738
|
HIDDEN_TPU_DF)
|
689
|
-
else:
|
690
|
-
tpu_price = 0.0
|
691
739
|
return tpu_price
|
692
740
|
|
693
741
|
df['Price'] = df.apply(lambda row: get_tpu_price(row, spot=False), axis=1)
|
sky/clouds/utils/gcp_utils.py
CHANGED
@@ -49,14 +49,6 @@ def is_tpu_vm_pod(resources: Optional['resources_lib.Resources']) -> bool:
|
|
49
49
|
return not acc.endswith('-8')
|
50
50
|
|
51
51
|
|
52
|
-
def get_num_tpu_devices(resources: Optional['resources_lib.Resources']) -> int:
|
53
|
-
if resources is None or not is_tpu(resources):
|
54
|
-
raise ValueError('resources must be a valid TPU resource.')
|
55
|
-
acc, _ = list(resources.accelerators.items())[0]
|
56
|
-
num_tpu_devices = int(int(acc.split('-')[2]) / 8)
|
57
|
-
return num_tpu_devices
|
58
|
-
|
59
|
-
|
60
52
|
@dataclasses.dataclass
|
61
53
|
class SpecificReservation:
|
62
54
|
count: int
|
sky/execution.py
CHANGED
@@ -171,10 +171,11 @@ def _execute(
|
|
171
171
|
task = dag.tasks[0]
|
172
172
|
|
173
173
|
if any(r.job_recovery is not None for r in task.resources):
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
174
|
+
logger.warning(
|
175
|
+
f'{colorama.Style.DIM}The task has `job_recovery` specified, '
|
176
|
+
'but is launched as an unmanaged job. It will be ignored.'
|
177
|
+
'To enable job recovery, use managed jobs: sky jobs launch.'
|
178
|
+
f'{colorama.Style.RESET_ALL}')
|
178
179
|
|
179
180
|
cluster_exists = False
|
180
181
|
if cluster_name is not None:
|
sky/jobs/controller.py
CHANGED
@@ -160,6 +160,11 @@ class JobsController:
|
|
160
160
|
if task_id == 0:
|
161
161
|
submitted_at = backend_utils.get_timestamp_from_run_timestamp(
|
162
162
|
self._backend.run_timestamp)
|
163
|
+
assert task.name is not None, task
|
164
|
+
cluster_name = managed_job_utils.generate_managed_job_cluster_name(
|
165
|
+
task.name, self._job_id)
|
166
|
+
self._strategy_executor = recovery_strategy.StrategyExecutor.make(
|
167
|
+
cluster_name, self._backend, task, self._retry_until_up)
|
163
168
|
managed_job_state.set_submitted(
|
164
169
|
self._job_id,
|
165
170
|
task_id,
|
@@ -167,15 +172,14 @@ class JobsController:
|
|
167
172
|
submitted_at,
|
168
173
|
resources_str=backend_utils.get_task_resources_str(
|
169
174
|
task, is_managed_job=True),
|
175
|
+
specs={
|
176
|
+
'max_restarts_on_errors':
|
177
|
+
self._strategy_executor.max_restarts_on_errors
|
178
|
+
},
|
170
179
|
callback_func=callback_func)
|
171
180
|
logger.info(
|
172
181
|
f'Submitted managed job {self._job_id} (task: {task_id}, name: '
|
173
182
|
f'{task.name!r}); {constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
|
174
|
-
assert task.name is not None, task
|
175
|
-
cluster_name = managed_job_utils.generate_managed_job_cluster_name(
|
176
|
-
task.name, self._job_id)
|
177
|
-
self._strategy_executor = recovery_strategy.StrategyExecutor.make(
|
178
|
-
cluster_name, self._backend, task, self._retry_until_up)
|
179
183
|
|
180
184
|
logger.info('Started monitoring.')
|
181
185
|
managed_job_state.set_starting(job_id=self._job_id,
|
@@ -283,23 +287,35 @@ class JobsController:
|
|
283
287
|
failure_reason = (
|
284
288
|
'To see the details, run: '
|
285
289
|
f'sky jobs logs --controller {self._job_id}')
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
290
|
+
should_restart_on_failure = (
|
291
|
+
self._strategy_executor.should_restart_on_failure())
|
292
|
+
if should_restart_on_failure:
|
293
|
+
max_restarts = (
|
294
|
+
self._strategy_executor.max_restarts_on_errors)
|
295
|
+
logger.info(
|
296
|
+
f'User program crashed '
|
297
|
+
f'({managed_job_status.value}). '
|
298
|
+
f'Retry the job as max_restarts_on_errors is '
|
299
|
+
f'set to {max_restarts}. '
|
300
|
+
f'[{self._strategy_executor.restart_cnt_on_failure}'
|
301
|
+
f'/{max_restarts}]')
|
302
|
+
else:
|
303
|
+
managed_job_state.set_failed(
|
304
|
+
self._job_id,
|
305
|
+
task_id,
|
306
|
+
failure_type=managed_job_status,
|
307
|
+
failure_reason=failure_reason,
|
308
|
+
end_time=end_time,
|
309
|
+
callback_func=callback_func)
|
310
|
+
return False
|
311
|
+
else:
|
312
|
+
# Although the cluster is healthy, we fail to access the
|
313
|
+
# job status. Try to recover the job (will not restart the
|
314
|
+
# cluster, if the cluster is healthy).
|
315
|
+
assert job_status is None, job_status
|
316
|
+
logger.info('Failed to fetch the job status while the '
|
317
|
+
'cluster is healthy. Try to recover the job '
|
318
|
+
'(the cluster will not be restarted).')
|
303
319
|
# When the handle is None, the cluster should be cleaned up already.
|
304
320
|
if handle is not None:
|
305
321
|
resources = handle.launched_resources
|
sky/jobs/recovery_strategy.py
CHANGED
@@ -66,7 +66,8 @@ class StrategyExecutor:
|
|
66
66
|
RETRY_INIT_GAP_SECONDS = 60
|
67
67
|
|
68
68
|
def __init__(self, cluster_name: str, backend: 'backends.Backend',
|
69
|
-
task: 'task_lib.Task', retry_until_up: bool
|
69
|
+
task: 'task_lib.Task', retry_until_up: bool,
|
70
|
+
max_restarts_on_errors: int) -> None:
|
70
71
|
"""Initialize the strategy executor.
|
71
72
|
|
72
73
|
Args:
|
@@ -82,6 +83,8 @@ class StrategyExecutor:
|
|
82
83
|
self.cluster_name = cluster_name
|
83
84
|
self.backend = backend
|
84
85
|
self.retry_until_up = retry_until_up
|
86
|
+
self.max_restarts_on_errors = max_restarts_on_errors
|
87
|
+
self.restart_cnt_on_failure = 0
|
85
88
|
|
86
89
|
def __init_subclass__(cls, name: str, default: bool = False):
|
87
90
|
RECOVERY_STRATEGIES[name] = cls
|
@@ -109,8 +112,17 @@ class StrategyExecutor:
|
|
109
112
|
# set the new_task_resources to be the same type (list or set) as the
|
110
113
|
# original task.resources
|
111
114
|
task.set_resources(type(task.resources)(new_resources_list))
|
112
|
-
|
113
|
-
|
115
|
+
if isinstance(job_recovery, dict):
|
116
|
+
job_recovery_name = job_recovery.pop('strategy',
|
117
|
+
DEFAULT_RECOVERY_STRATEGY)
|
118
|
+
max_restarts_on_errors = job_recovery.pop('max_restarts_on_errors',
|
119
|
+
0)
|
120
|
+
else:
|
121
|
+
job_recovery_name = job_recovery
|
122
|
+
max_restarts_on_errors = 0
|
123
|
+
return RECOVERY_STRATEGIES[job_recovery_name](cluster_name, backend,
|
124
|
+
task, retry_until_up,
|
125
|
+
max_restarts_on_errors)
|
114
126
|
|
115
127
|
def launch(self) -> float:
|
116
128
|
"""Launch the cluster for the first time.
|
@@ -368,6 +380,17 @@ class StrategyExecutor:
|
|
368
380
|
f'{gap_seconds:.1f} seconds.')
|
369
381
|
time.sleep(gap_seconds)
|
370
382
|
|
383
|
+
def should_restart_on_failure(self) -> bool:
|
384
|
+
"""Increments counter & checks if job should be restarted on a failure.
|
385
|
+
|
386
|
+
Returns:
|
387
|
+
True if the job should be restarted, otherwise False.
|
388
|
+
"""
|
389
|
+
self.restart_cnt_on_failure += 1
|
390
|
+
if self.restart_cnt_on_failure > self.max_restarts_on_errors:
|
391
|
+
return False
|
392
|
+
return True
|
393
|
+
|
371
394
|
|
372
395
|
class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
|
373
396
|
default=False):
|
@@ -376,8 +399,10 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
|
|
376
399
|
_MAX_RETRY_CNT = 240 # Retry for 4 hours.
|
377
400
|
|
378
401
|
def __init__(self, cluster_name: str, backend: 'backends.Backend',
|
379
|
-
task: 'task_lib.Task', retry_until_up: bool
|
380
|
-
|
402
|
+
task: 'task_lib.Task', retry_until_up: bool,
|
403
|
+
max_restarts_on_errors: int) -> None:
|
404
|
+
super().__init__(cluster_name, backend, task, retry_until_up,
|
405
|
+
max_restarts_on_errors)
|
381
406
|
# Note down the cloud/region of the launched cluster, so that we can
|
382
407
|
# first retry in the same cloud/region. (Inside recover() we may not
|
383
408
|
# rely on cluster handle, as it can be None if the cluster is
|
sky/jobs/state.py
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
# TODO(zhwu): maybe use file based status instead of database, so
|
3
3
|
# that we can easily switch to a s3-based storage.
|
4
4
|
import enum
|
5
|
+
import json
|
5
6
|
import pathlib
|
6
7
|
import sqlite3
|
7
8
|
import time
|
@@ -65,7 +66,8 @@ _CURSOR.execute("""\
|
|
65
66
|
failure_reason TEXT,
|
66
67
|
spot_job_id INTEGER,
|
67
68
|
task_id INTEGER DEFAULT 0,
|
68
|
-
task_name TEXT
|
69
|
+
task_name TEXT,
|
70
|
+
specs TEXT)""")
|
69
71
|
_CONN.commit()
|
70
72
|
|
71
73
|
db_utils.add_column_to_table(_CURSOR, _CONN, 'spot', 'failure_reason', 'TEXT')
|
@@ -92,6 +94,17 @@ db_utils.add_column_to_table(_CURSOR,
|
|
92
94
|
'TEXT',
|
93
95
|
copy_from='job_name')
|
94
96
|
|
97
|
+
# Specs is some useful information about the task, e.g., the
|
98
|
+
# max_restarts_on_errors value. It is stored in JSON format.
|
99
|
+
db_utils.add_column_to_table(_CURSOR,
|
100
|
+
_CONN,
|
101
|
+
'spot',
|
102
|
+
'specs',
|
103
|
+
'TEXT',
|
104
|
+
value_to_replace_existing_entries=json.dumps({
|
105
|
+
'max_restarts_on_errors': 0,
|
106
|
+
}))
|
107
|
+
|
95
108
|
# `job_info` contains the mapping from job_id to the job_name.
|
96
109
|
# In the future, it may contain more information about each job.
|
97
110
|
_CURSOR.execute("""\
|
@@ -128,9 +141,10 @@ columns = [
|
|
128
141
|
'job_id',
|
129
142
|
'task_id',
|
130
143
|
'task_name',
|
144
|
+
'specs',
|
131
145
|
# columns from the job_info table
|
132
146
|
'_job_info_job_id', # This should be the same as job_id
|
133
|
-
'job_name'
|
147
|
+
'job_name',
|
134
148
|
]
|
135
149
|
|
136
150
|
|
@@ -283,7 +297,8 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
|
|
283
297
|
|
284
298
|
def set_submitted(job_id: int, task_id: int, run_timestamp: str,
|
285
299
|
submit_time: float, resources_str: str,
|
286
|
-
|
300
|
+
specs: Dict[str, Union[str,
|
301
|
+
int]], callback_func: CallbackType):
|
287
302
|
"""Set the task to submitted.
|
288
303
|
|
289
304
|
Args:
|
@@ -293,6 +308,8 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
|
|
293
308
|
determine the log directory of the managed task.
|
294
309
|
submit_time: The time when the managed task is submitted.
|
295
310
|
resources_str: The resources string of the managed task.
|
311
|
+
specs: The specs of the managed task.
|
312
|
+
callback_func: The callback function.
|
296
313
|
"""
|
297
314
|
# Use the timestamp in the `run_timestamp` ('sky-2022-10...'), to make
|
298
315
|
# the log directory and submission time align with each other, so as to
|
@@ -306,11 +323,12 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
|
|
306
323
|
resources=(?),
|
307
324
|
submitted_at=(?),
|
308
325
|
status=(?),
|
309
|
-
run_timestamp=(?)
|
326
|
+
run_timestamp=(?),
|
327
|
+
specs=(?)
|
310
328
|
WHERE spot_job_id=(?) AND
|
311
329
|
task_id=(?)""",
|
312
330
|
(resources_str, submit_time, ManagedJobStatus.SUBMITTED.value,
|
313
|
-
run_timestamp, job_id, task_id))
|
331
|
+
run_timestamp, json.dumps(specs), job_id, task_id))
|
314
332
|
callback_func('SUBMITTED')
|
315
333
|
|
316
334
|
|
@@ -619,3 +637,13 @@ def get_latest_job_id() -> Optional[int]:
|
|
619
637
|
for (job_id,) in rows:
|
620
638
|
return job_id
|
621
639
|
return None
|
640
|
+
|
641
|
+
|
642
|
+
def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
|
643
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
644
|
+
task_specs = cursor.execute(
|
645
|
+
"""\
|
646
|
+
SELECT specs FROM spot
|
647
|
+
WHERE spot_job_id=(?) AND task_id=(?)""",
|
648
|
+
(job_id, task_id)).fetchone()
|
649
|
+
return json.loads(task_specs[0])
|
sky/jobs/utils.py
CHANGED
@@ -70,7 +70,7 @@ _JOB_CANCELLED_MESSAGE = (
|
|
70
70
|
# state, after the job finished. This is a safeguard to avoid the case where
|
71
71
|
# the managed job status fails to be updated and keep the `sky jobs logs`
|
72
72
|
# blocking for a long time.
|
73
|
-
_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS =
|
73
|
+
_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 25
|
74
74
|
|
75
75
|
|
76
76
|
class UserSignal(enum.Enum):
|
@@ -392,8 +392,12 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
|
|
392
392
|
f'INFO: Log for the current task ({task_id}) '
|
393
393
|
'is finished. Waiting for the next task\'s log '
|
394
394
|
'to be started.')
|
395
|
-
|
396
|
-
|
395
|
+
# Add a newline to avoid the status display below
|
396
|
+
# removing the last line of the task output.
|
397
|
+
print()
|
398
|
+
status_display.update(
|
399
|
+
ux_utils.spinner_message(
|
400
|
+
f'Waiting for the next task: {task_id + 1}'))
|
397
401
|
status_display.start()
|
398
402
|
original_task_id = task_id
|
399
403
|
while True:
|
@@ -405,7 +409,27 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
|
|
405
409
|
time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
|
406
410
|
continue
|
407
411
|
else:
|
408
|
-
|
412
|
+
task_specs = managed_job_state.get_task_specs(
|
413
|
+
job_id, task_id)
|
414
|
+
if task_specs.get('max_restarts_on_errors', 0) == 0:
|
415
|
+
# We don't need to wait for the managed job status
|
416
|
+
# update, as the job is guaranteed to be in terminal
|
417
|
+
# state afterwards.
|
418
|
+
break
|
419
|
+
print()
|
420
|
+
status_display.update(
|
421
|
+
ux_utils.spinner_message(
|
422
|
+
'Waiting for next restart for the failed task'))
|
423
|
+
status_display.start()
|
424
|
+
while True:
|
425
|
+
_, managed_job_status = (
|
426
|
+
managed_job_state.get_latest_task_id_status(
|
427
|
+
job_id))
|
428
|
+
if (managed_job_status !=
|
429
|
+
managed_job_state.ManagedJobStatus.RUNNING):
|
430
|
+
break
|
431
|
+
time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
|
432
|
+
continue
|
409
433
|
# The job can be cancelled by the user or the controller (when
|
410
434
|
# the cluster is partially preempted).
|
411
435
|
logger.debug(
|
sky/provision/azure/instance.py
CHANGED
@@ -311,30 +311,10 @@ def _create_vm(
|
|
311
311
|
vm_name=vm_name,
|
312
312
|
parameters=vm_instance,
|
313
313
|
)
|
314
|
-
#
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
# extension. Reference:
|
319
|
-
# https://forums.developer.nvidia.com/t/ubuntu-22-04-installation-driver-error-nvidia-a10/285195/2
|
320
|
-
# This can take more than 20mins for setting up the A10 GPUs
|
321
|
-
if node_config.get('need_nvidia_driver_extension', False):
|
322
|
-
ext_poller = compute_client.virtual_machine_extensions.\
|
323
|
-
begin_create_or_update(
|
324
|
-
resource_group_name=provider_config['resource_group'],
|
325
|
-
vm_name=vm_name,
|
326
|
-
vm_extension_name='NvidiaGpuDriverLinux',
|
327
|
-
extension_parameters=compute.VirtualMachineExtension(
|
328
|
-
location=provider_config['location'],
|
329
|
-
publisher='Microsoft.HpcCompute',
|
330
|
-
type_properties_type='NvidiaGpuDriverLinux',
|
331
|
-
type_handler_version='1.9',
|
332
|
-
auto_upgrade_minor_version=True,
|
333
|
-
settings='{}'))
|
334
|
-
logger.info(
|
335
|
-
f'Created VM extension {ext_poller.result().name} for VM {vm_name}.'
|
336
|
-
)
|
337
|
-
return vm_poller.result()
|
314
|
+
# This line will block until the VM is created or the operation times out.
|
315
|
+
vm = vm_poller.result()
|
316
|
+
logger.info(f'Created VM {vm.name}.')
|
317
|
+
return vm
|
338
318
|
|
339
319
|
|
340
320
|
def _create_instances(compute_client: 'azure_compute.ComputeManagementClient',
|
sky/resources.py
CHANGED
@@ -55,7 +55,7 @@ class Resources:
|
|
55
55
|
accelerators: Union[None, str, Dict[str, int]] = None,
|
56
56
|
accelerator_args: Optional[Dict[str, str]] = None,
|
57
57
|
use_spot: Optional[bool] = None,
|
58
|
-
job_recovery: Optional[str] = None,
|
58
|
+
job_recovery: Optional[Union[Dict[str, Union[str, int]], str]] = None,
|
59
59
|
region: Optional[str] = None,
|
60
60
|
zone: Optional[str] = None,
|
61
61
|
image_id: Union[Dict[str, str], str, None] = None,
|
@@ -111,6 +111,12 @@ class Resources:
|
|
111
111
|
job to recover the cluster from preemption. Refer to
|
112
112
|
`recovery_strategy module <https://github.com/skypilot-org/skypilot/blob/master/sky/jobs/recovery_strategy.py>`__ # pylint: disable=line-too-long
|
113
113
|
for more details.
|
114
|
+
When a dict is provided, it can have the following fields:
|
115
|
+
|
116
|
+
- strategy: the recovery strategy to use.
|
117
|
+
- max_restarts_on_errors: the max number of restarts on user code
|
118
|
+
errors.
|
119
|
+
|
114
120
|
region: the region to use.
|
115
121
|
zone: the zone to use.
|
116
122
|
image_id: the image ID to use. If a str, must be a string
|
@@ -161,10 +167,20 @@ class Resources:
|
|
161
167
|
|
162
168
|
self._use_spot_specified = use_spot is not None
|
163
169
|
self._use_spot = use_spot if use_spot is not None else False
|
164
|
-
self._job_recovery = None
|
170
|
+
self._job_recovery: Optional[Dict[str, Union[str, int]]] = None
|
165
171
|
if job_recovery is not None:
|
166
|
-
if job_recovery
|
167
|
-
|
172
|
+
if isinstance(job_recovery, str):
|
173
|
+
job_recovery = {'strategy': job_recovery}
|
174
|
+
if 'strategy' not in job_recovery:
|
175
|
+
job_recovery['strategy'] = None
|
176
|
+
|
177
|
+
strategy_name = job_recovery['strategy']
|
178
|
+
if strategy_name == 'none':
|
179
|
+
self._job_recovery = None
|
180
|
+
else:
|
181
|
+
if strategy_name is not None:
|
182
|
+
job_recovery['strategy'] = strategy_name.upper()
|
183
|
+
self._job_recovery = job_recovery
|
168
184
|
|
169
185
|
if disk_size is not None:
|
170
186
|
if round(disk_size) != disk_size:
|
@@ -419,7 +435,7 @@ class Resources:
|
|
419
435
|
return self._use_spot_specified
|
420
436
|
|
421
437
|
@property
|
422
|
-
def job_recovery(self) -> Optional[str]:
|
438
|
+
def job_recovery(self) -> Optional[Dict[str, Union[str, int]]]:
|
423
439
|
return self._job_recovery
|
424
440
|
|
425
441
|
@property
|
@@ -586,6 +602,9 @@ class Resources:
|
|
586
602
|
# TPU V5 requires a newer runtime version.
|
587
603
|
if acc.startswith('tpu-v5'):
|
588
604
|
return 'v2-alpha-tpuv5'
|
605
|
+
# TPU V6e requires a newer runtime version.
|
606
|
+
if acc.startswith('tpu-v6e'):
|
607
|
+
return 'v2-alpha-tpuv6e'
|
589
608
|
return 'tpu-vm-base'
|
590
609
|
|
591
610
|
accelerator_args['runtime_version'] = (
|
@@ -814,12 +833,13 @@ class Resources:
|
|
814
833
|
Raises:
|
815
834
|
ValueError: if the attributes are invalid.
|
816
835
|
"""
|
817
|
-
if self._job_recovery is None:
|
836
|
+
if self._job_recovery is None or self._job_recovery['strategy'] is None:
|
818
837
|
return
|
819
|
-
if self._job_recovery
|
838
|
+
if (self._job_recovery['strategy']
|
839
|
+
not in managed_jobs.RECOVERY_STRATEGIES):
|
820
840
|
with ux_utils.print_exception_no_traceback():
|
821
841
|
raise ValueError(
|
822
|
-
f'Spot recovery strategy {self._job_recovery} '
|
842
|
+
f'Spot recovery strategy {self._job_recovery["strategy"]} '
|
823
843
|
'is not supported. The strategy should be among '
|
824
844
|
f'{list(managed_jobs.RECOVERY_STRATEGIES.keys())}')
|
825
845
|
|
sky/setup_files/setup.py
CHANGED
@@ -153,7 +153,7 @@ install_requires = [
|
|
153
153
|
'tabulate',
|
154
154
|
# Light weight requirement, can be replaced with "typing" once
|
155
155
|
# we deprecate Python 3.7 (this will take a while).
|
156
|
-
|
156
|
+
'typing_extensions',
|
157
157
|
'filelock >= 3.6.0',
|
158
158
|
'packaging',
|
159
159
|
'psutil',
|
@@ -216,8 +216,9 @@ extras_require: Dict[str, List[str]] = {
|
|
216
216
|
# We need azure-identity>=1.13.0 to enable the customization of the
|
217
217
|
# timeout of AzureCliCredential.
|
218
218
|
'azure': [
|
219
|
-
'azure-cli>=2.
|
220
|
-
'azure-mgmt-network', 'azure-
|
219
|
+
'azure-cli>=2.65.0', 'azure-core>=1.31.0', 'azure-identity>=1.19.0',
|
220
|
+
'azure-mgmt-network>=27.0.0', 'azure-mgmt-compute>=33.0.0',
|
221
|
+
'azure-storage-blob>=12.23.1', 'msgraph-sdk'
|
221
222
|
] + local_ray,
|
222
223
|
# We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
|
223
224
|
# parameter for stopping instances.
|
sky/skylet/job_lib.py
CHANGED
@@ -512,16 +512,13 @@ def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]:
|
|
512
512
|
return records
|
513
513
|
|
514
514
|
|
515
|
-
def
|
516
|
-
rows = _CURSOR.execute(
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
'submit': submit
|
523
|
-
} for job_id, created_time, submit in rows
|
524
|
-
}
|
515
|
+
def _get_pending_job(job_id: int) -> Optional[Dict[str, Any]]:
|
516
|
+
rows = _CURSOR.execute('SELECT created_time, submit FROM pending_jobs '
|
517
|
+
f'WHERE job_id={job_id!r}')
|
518
|
+
for row in rows:
|
519
|
+
created_time, submit = row
|
520
|
+
return {'created_time': created_time, 'submit': submit}
|
521
|
+
return None
|
525
522
|
|
526
523
|
|
527
524
|
def update_job_status(job_ids: List[int],
|
@@ -535,7 +532,7 @@ def update_job_status(job_ids: List[int],
|
|
535
532
|
during job cancelling, we still need this to handle the staleness problem,
|
536
533
|
caused by instance restarting and other corner cases (if any).
|
537
534
|
|
538
|
-
This function should only be run on the remote instance with ray
|
535
|
+
This function should only be run on the remote instance with ray>=2.4.0.
|
539
536
|
"""
|
540
537
|
if len(job_ids) == 0:
|
541
538
|
return []
|
@@ -547,50 +544,45 @@ def update_job_status(job_ids: List[int],
|
|
547
544
|
|
548
545
|
# In ray 2.4.0, job_client.list_jobs returns a list of JobDetails,
|
549
546
|
# which contains the job status (str) and submission_id (str).
|
547
|
+
ray_job_query_time = time.time()
|
550
548
|
job_detail_lists: List['ray_pydantic.JobDetails'] = job_client.list_jobs()
|
551
549
|
|
552
|
-
pending_jobs = _get_pending_jobs()
|
553
550
|
job_details = {}
|
554
551
|
ray_job_ids_set = set(ray_job_ids)
|
555
552
|
for job_detail in job_detail_lists:
|
556
553
|
if job_detail.submission_id in ray_job_ids_set:
|
557
554
|
job_details[job_detail.submission_id] = job_detail
|
558
|
-
job_statuses: List[Optional[JobStatus]] = [None] * len(ray_job_ids)
|
559
|
-
for i, ray_job_id in enumerate(ray_job_ids):
|
560
|
-
job_id = job_ids[i]
|
561
|
-
if ray_job_id in job_details:
|
562
|
-
ray_status = job_details[ray_job_id].status
|
563
|
-
job_statuses[i] = _RAY_TO_JOB_STATUS_MAP[ray_status]
|
564
|
-
if job_id in pending_jobs:
|
565
|
-
if pending_jobs[job_id]['created_time'] < psutil.boot_time():
|
566
|
-
logger.info(
|
567
|
-
f'Job {job_id} is stale, setting to FAILED: '
|
568
|
-
f'created_time={pending_jobs[job_id]["created_time"]}, '
|
569
|
-
f'boot_time={psutil.boot_time()}')
|
570
|
-
# The job is stale as it is created before the instance
|
571
|
-
# is booted, e.g. the instance is rebooted.
|
572
|
-
job_statuses[i] = JobStatus.FAILED
|
573
|
-
# Gives a 60 second grace period between job being submit from
|
574
|
-
# the pending table until appearing in ray jobs.
|
575
|
-
if (pending_jobs[job_id]['submit'] > 0 and
|
576
|
-
pending_jobs[job_id]['submit'] <
|
577
|
-
time.time() - _PENDING_SUBMIT_GRACE_PERIOD):
|
578
|
-
# For jobs submitted outside of the grace period, we will
|
579
|
-
# consider the ray job status.
|
580
|
-
continue
|
581
|
-
else:
|
582
|
-
# Reset the job status to PENDING even though it may not appear
|
583
|
-
# in the ray jobs, so that it will not be considered as stale.
|
584
|
-
job_statuses[i] = JobStatus.PENDING
|
585
|
-
|
586
|
-
assert len(job_statuses) == len(job_ids), (job_statuses, job_ids)
|
587
555
|
|
588
556
|
statuses = []
|
589
|
-
for job_id,
|
557
|
+
for job_id, ray_job_id in zip(job_ids, ray_job_ids):
|
590
558
|
# Per-job status lock is required because between the job status
|
591
559
|
# query and the job status update, the job status in the databse
|
592
560
|
# can be modified by the generated ray program.
|
593
561
|
with filelock.FileLock(_get_lock_path(job_id)):
|
562
|
+
status = None
|
563
|
+
if ray_job_id in job_details:
|
564
|
+
ray_status = job_details[ray_job_id].status
|
565
|
+
status = _RAY_TO_JOB_STATUS_MAP[ray_status]
|
566
|
+
pending_job = _get_pending_job(job_id)
|
567
|
+
if pending_job is not None:
|
568
|
+
if pending_job['created_time'] < psutil.boot_time():
|
569
|
+
logger.info(f'Job {job_id} is stale, setting to FAILED: '
|
570
|
+
f'created_time={pending_job["created_time"]}, '
|
571
|
+
f'boot_time={psutil.boot_time()}')
|
572
|
+
# The job is stale as it is created before the instance
|
573
|
+
# is booted, e.g. the instance is rebooted.
|
574
|
+
status = JobStatus.FAILED
|
575
|
+
# Gives a 60 second grace period between job being submit from
|
576
|
+
# the pending table until appearing in ray jobs. For jobs
|
577
|
+
# submitted outside of the grace period, we will consider the
|
578
|
+
# ray job status.
|
579
|
+
if not (pending_job['submit'] > 0 and pending_job['submit'] <
|
580
|
+
ray_job_query_time - _PENDING_SUBMIT_GRACE_PERIOD):
|
581
|
+
# Reset the job status to PENDING even though it may not
|
582
|
+
# appear in the ray jobs, so that it will not be considered
|
583
|
+
# as stale.
|
584
|
+
status = JobStatus.PENDING
|
585
|
+
|
594
586
|
original_status = get_status_no_lock(job_id)
|
595
587
|
assert original_status is not None, (job_id, status)
|
596
588
|
if status is None:
|
sky/templates/azure-ray.yml.j2
CHANGED
@@ -83,7 +83,6 @@ available_node_types:
|
|
83
83
|
{%- for cmd in cloud_init_setup_commands %}
|
84
84
|
{{ cmd }}
|
85
85
|
{%- endfor %}
|
86
|
-
need_nvidia_driver_extension: {{need_nvidia_driver_extension}}
|
87
86
|
{%- if disk_performance_tier is not none %}
|
88
87
|
disk_performance_tier: {{disk_performance_tier}}
|
89
88
|
{%- endif %}
|
sky/utils/dag_utils.py
CHANGED
@@ -143,11 +143,21 @@ def fill_default_config_in_dag_for_job_launch(dag: dag_lib.Dag) -> None:
|
|
143
143
|
for task_ in dag.tasks:
|
144
144
|
|
145
145
|
new_resources_list = []
|
146
|
+
default_strategy = jobs.DEFAULT_RECOVERY_STRATEGY
|
147
|
+
assert default_strategy is not None
|
146
148
|
for resources in list(task_.resources):
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
149
|
+
original_job_recovery = resources.job_recovery
|
150
|
+
job_recovery = {'strategy': default_strategy}
|
151
|
+
if isinstance(original_job_recovery, str):
|
152
|
+
job_recovery['strategy'] = original_job_recovery
|
153
|
+
elif isinstance(original_job_recovery, dict):
|
154
|
+
job_recovery.update(original_job_recovery)
|
155
|
+
strategy = job_recovery.get('strategy')
|
156
|
+
if strategy is None:
|
157
|
+
job_recovery['strategy'] = default_strategy
|
158
|
+
change_default_value: Dict[str, Any] = {
|
159
|
+
'job_recovery': job_recovery
|
160
|
+
}
|
151
161
|
|
152
162
|
new_resources = resources.copy(**change_default_value)
|
153
163
|
new_resources_list.append(new_resources)
|
sky/utils/schemas.py
CHANGED
@@ -92,7 +92,27 @@ def _get_single_resources_schema():
|
|
92
92
|
'type': 'string',
|
93
93
|
},
|
94
94
|
'job_recovery': {
|
95
|
-
|
95
|
+
# Either a string or a dict.
|
96
|
+
'anyOf': [{
|
97
|
+
'type': 'string',
|
98
|
+
}, {
|
99
|
+
'type': 'object',
|
100
|
+
'required': [],
|
101
|
+
'additionalProperties': False,
|
102
|
+
'properties': {
|
103
|
+
'strategy': {
|
104
|
+
'anyOf': [{
|
105
|
+
'type': 'string',
|
106
|
+
}, {
|
107
|
+
'type': 'null',
|
108
|
+
}],
|
109
|
+
},
|
110
|
+
'max_restarts_on_errors': {
|
111
|
+
'type': 'integer',
|
112
|
+
'minimum': 0,
|
113
|
+
},
|
114
|
+
}
|
115
|
+
}],
|
96
116
|
},
|
97
117
|
'disk_size': {
|
98
118
|
'type': 'integer',
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: skypilot-nightly
|
3
|
-
Version: 1.0.0.
|
3
|
+
Version: 1.0.0.dev20241031
|
4
4
|
Summary: SkyPilot: An intercloud broker for the clouds
|
5
5
|
Author: SkyPilot Team
|
6
6
|
License: Apache 2.0
|
@@ -46,11 +46,12 @@ Requires-Dist: awscli>=1.27.10; extra == "all"
|
|
46
46
|
Requires-Dist: botocore>=1.29.10; extra == "all"
|
47
47
|
Requires-Dist: boto3>=1.26.1; extra == "all"
|
48
48
|
Requires-Dist: colorama<0.4.5; extra == "all"
|
49
|
-
Requires-Dist: azure-cli>=2.
|
50
|
-
Requires-Dist: azure-core; extra == "all"
|
51
|
-
Requires-Dist: azure-identity>=1.
|
52
|
-
Requires-Dist: azure-mgmt-network; extra == "all"
|
53
|
-
Requires-Dist: azure-
|
49
|
+
Requires-Dist: azure-cli>=2.65.0; extra == "all"
|
50
|
+
Requires-Dist: azure-core>=1.31.0; extra == "all"
|
51
|
+
Requires-Dist: azure-identity>=1.19.0; extra == "all"
|
52
|
+
Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
|
53
|
+
Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
|
54
|
+
Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
|
54
55
|
Requires-Dist: msgraph-sdk; extra == "all"
|
55
56
|
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
|
56
57
|
Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
|
@@ -78,11 +79,12 @@ Requires-Dist: botocore>=1.29.10; extra == "aws"
|
|
78
79
|
Requires-Dist: boto3>=1.26.1; extra == "aws"
|
79
80
|
Requires-Dist: colorama<0.4.5; extra == "aws"
|
80
81
|
Provides-Extra: azure
|
81
|
-
Requires-Dist: azure-cli>=2.
|
82
|
-
Requires-Dist: azure-core; extra == "azure"
|
83
|
-
Requires-Dist: azure-identity>=1.
|
84
|
-
Requires-Dist: azure-mgmt-network; extra == "azure"
|
85
|
-
Requires-Dist: azure-
|
82
|
+
Requires-Dist: azure-cli>=2.65.0; extra == "azure"
|
83
|
+
Requires-Dist: azure-core>=1.31.0; extra == "azure"
|
84
|
+
Requires-Dist: azure-identity>=1.19.0; extra == "azure"
|
85
|
+
Requires-Dist: azure-mgmt-network>=27.0.0; extra == "azure"
|
86
|
+
Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "azure"
|
87
|
+
Requires-Dist: azure-storage-blob>=12.23.1; extra == "azure"
|
86
88
|
Requires-Dist: msgraph-sdk; extra == "azure"
|
87
89
|
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "azure"
|
88
90
|
Provides-Extra: cloudflare
|
{skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241031.dist-info}/RECORD
RENAMED
@@ -1,4 +1,4 @@
|
|
1
|
-
sky/__init__.py,sha256=
|
1
|
+
sky/__init__.py,sha256=yJMQo4fd9D4f1vgLHCoy3S3IWUWqDbw9zbH6TEmTD-Y,5882
|
2
2
|
sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
|
3
3
|
sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
|
4
4
|
sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
|
@@ -7,10 +7,10 @@ sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
|
|
7
7
|
sky/core.py,sha256=DW9OGE2kS2CmsvQ1grrpRnNFS3woMGWSHu5GE99e-I4,38190
|
8
8
|
sky/dag.py,sha256=WLFWr5hfrwjd31uYlNvI-zWUk7tLaT_gzJn4LzbVtkE,2780
|
9
9
|
sky/exceptions.py,sha256=KBIEJHgrw6OMBL8H65o-Gk6qYQEV1SR9gBwMjnMnxxg,8858
|
10
|
-
sky/execution.py,sha256=
|
10
|
+
sky/execution.py,sha256=tDK6JhF_405cjqxRpbdLbHZyxrKTD5oa0UkKDvPJ_9Q,24751
|
11
11
|
sky/global_user_state.py,sha256=PywEmUutF97XBgRMClR6IS5_KM8JJC0oA1LsPUZebp0,28681
|
12
12
|
sky/optimizer.py,sha256=tXGrFpc6xNtKH34qjBAMd4jTuWcDZTPnGFwEtuCQFmk,59702
|
13
|
-
sky/resources.py,sha256=
|
13
|
+
sky/resources.py,sha256=Zt8mCCmdvZ5ZCqY-l3KXlx_lkUesAopRtaEcEsrRFZo,68465
|
14
14
|
sky/sky_logging.py,sha256=oLmTmwkuucIto3LHXLJfMcyRpYSkmZAZa5XzQPA5IHk,4434
|
15
15
|
sky/skypilot_config.py,sha256=E3g65cX3P3dT9b5N0GgFBG6yB0FXwIGpisKoozmJmWU,9094
|
16
16
|
sky/status_lib.py,sha256=J7Jb4_Dz0v2T64ttOdyUgpokvl4S0sBJrMfH7Fvo51A,1457
|
@@ -31,7 +31,7 @@ sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
|
|
31
31
|
sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
|
32
32
|
sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
|
33
33
|
sky/backends/backend_utils.py,sha256=LmLsaLiPuuUyGebOXykdvwZpUY-8sB7n4o2AnmwNmdQ,121714
|
34
|
-
sky/backends/cloud_vm_ray_backend.py,sha256=
|
34
|
+
sky/backends/cloud_vm_ray_backend.py,sha256=veMpX-qNA_qat3CVRYw_wZjeMtBqOpjTCevf4lpw3nU,232582
|
35
35
|
sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
|
36
36
|
sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
|
37
37
|
sky/backends/wheel_utils.py,sha256=3QS4T_Ydvo4DbYhogtyADyNBEf04I6jUCL71M285shQ,7963
|
@@ -41,7 +41,7 @@ sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG
|
|
41
41
|
sky/benchmark/benchmark_utils.py,sha256=eb-i6zYoo-Zkod-T9qtCu1FcYLw--Yyos1SyibUPZNE,26194
|
42
42
|
sky/clouds/__init__.py,sha256=WuNIJEnZmBO72tU5awgaaL3rdvFRSkgaYNNeuY68dXo,1356
|
43
43
|
sky/clouds/aws.py,sha256=dVZ8auaa2z2Ifl9iiRT06IeEFaNtZhANKtHVLT6Gcno,49474
|
44
|
-
sky/clouds/azure.py,sha256
|
44
|
+
sky/clouds/azure.py,sha256=ixw5jCnnMxDLj0hpArljVzq88EKOrqRxk9xm5N9u-mc,30576
|
45
45
|
sky/clouds/cloud.py,sha256=A5F4a71ciPyljWEs6vT-4RmdGT-AE9NkhS8gJ4Vgi_I,35165
|
46
46
|
sky/clouds/cloud_registry.py,sha256=oLoYFjm_SDTgdHokY7b6A5Utq80HXRQNxV0fLjDdVsQ,2361
|
47
47
|
sky/clouds/cudo.py,sha256=UiY273Sln7VOYDYx93yWiWH_RLlOKZ2cm7mA31ld4A8,13094
|
@@ -78,13 +78,13 @@ sky/clouds/service_catalog/data_fetchers/fetch_aws.py,sha256=ro2zazdkDF6z9bE7QFy
|
|
78
78
|
sky/clouds/service_catalog/data_fetchers/fetch_azure.py,sha256=L1JsX1YrhpyI7ylzEPBBNE9XOZM0K0FIXbBUMj9h0MQ,12803
|
79
79
|
sky/clouds/service_catalog/data_fetchers/fetch_cudo.py,sha256=52P48lvWN0s1ArjeLPeLemPRpxjSRcHincRle0nqdm4,3440
|
80
80
|
sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py,sha256=35nO_VaDOgp5W13kt_lIANSk_CNf7gBiZGJ5fGyZu6o,6808
|
81
|
-
sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=
|
81
|
+
sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=mDAN98T58h1g_LLyppSEUVDlsbLhk2454Nhmg5-aw0Q,32670
|
82
82
|
sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=B7H14so38zayuJGgUrD1PJYJKiVZHGnwH6JJop3F7o0,4918
|
83
83
|
sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py,sha256=SF_gTU74qg6L-DSWneCAbqP0lwZXaaDi5otiMIJbrw0,21462
|
84
84
|
sky/clouds/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
85
85
|
sky/clouds/utils/aws_utils.py,sha256=W5BRC-2F_VY4BymRA1kS6-MufsI3V8cfY_hv--4gJBU,1986
|
86
86
|
sky/clouds/utils/azure_utils.py,sha256=NToRBnhEyuUvb-nBnsKTxjhOBRkMcrelL8LK4w6s4t8,3555
|
87
|
-
sky/clouds/utils/gcp_utils.py,sha256=
|
87
|
+
sky/clouds/utils/gcp_utils.py,sha256=QejfgXOIVRv5-fv3Soi96VeVNVyquwVwy3M58N3YfNs,6633
|
88
88
|
sky/clouds/utils/oci_utils.py,sha256=t-5QEQEs8swN683AAp-oDD6yQJOQqVBbsVcHkNyqnbU,4968
|
89
89
|
sky/clouds/utils/scp_utils.py,sha256=RUp7NwyhKygOoVOwvdAOGdoQNSJjryOG6WSExCf-yas,15812
|
90
90
|
sky/data/__init__.py,sha256=Nhaf1NURisXpZuwWANa2IuCyppIuc720FRwqSE2oEwY,184
|
@@ -95,11 +95,11 @@ sky/data/storage.py,sha256=x8YYY4zVBdit_5oAR_MXV-TM9qDefV_ZV4z0irv6ZaU,163102
|
|
95
95
|
sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
|
96
96
|
sky/jobs/__init__.py,sha256=yucibSB_ZimtJMvOhMxn6ZqwBIYNfcwmc6pSXtCqmNQ,1483
|
97
97
|
sky/jobs/constants.py,sha256=YLgcCg_RHSYr_rfsI_4UIdXk78KKKOK29Oem88t5j8I,1350
|
98
|
-
sky/jobs/controller.py,sha256=
|
98
|
+
sky/jobs/controller.py,sha256=sirpi730_GfKfPZeZ2PvCXnJWger0r6AyLSOx2sLd6A,27368
|
99
99
|
sky/jobs/core.py,sha256=RkBFaKDlovmdzqlOAgQ0xAimZFgo4pXq3qaQkAvGsGk,16908
|
100
|
-
sky/jobs/recovery_strategy.py,sha256=
|
101
|
-
sky/jobs/state.py,sha256=
|
102
|
-
sky/jobs/utils.py,sha256=
|
100
|
+
sky/jobs/recovery_strategy.py,sha256=FpPK6e2PT61cZPDUJqIfo6g53uSRTBh7dOTbfR1DLVE,26672
|
101
|
+
sky/jobs/state.py,sha256=exN6BdJlLBzFTccJCSHN4dNjVeYFgTgqgxOaHwLw2IQ,24307
|
102
|
+
sky/jobs/utils.py,sha256=pF4Kyl4v1M_Bmm2jIRlXGTSdII5BJ3f4qwex_oCFgBk,37742
|
103
103
|
sky/jobs/dashboard/dashboard.py,sha256=HFShuaxKir97QTeK2x37h6bsY6ncaFaNEg1USZqJPdc,3050
|
104
104
|
sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
|
105
105
|
sky/jobs/dashboard/templates/index.html,sha256=DBKMYEkkJ6sgLYod9ro7drgL8Y_neDsCx_WbwhWDsWM,9837
|
@@ -118,7 +118,7 @@ sky/provision/aws/utils.py,sha256=m49pS-SHGW7Au3bhDeTPsL8N5iRzbwOXzyEWRCc1Vho,32
|
|
118
118
|
sky/provision/azure/__init__.py,sha256=87cgk1_Ws7n9rqaDDPv-HpfrkVeSQMdFQnhnXwyx9g4,548
|
119
119
|
sky/provision/azure/azure-config-template.json,sha256=jrjAgOtpe0e6FSg3vsVqHKQqJe0w-HeWOFT1HuwzS2c,4712
|
120
120
|
sky/provision/azure/config.py,sha256=V5-0Zelt4Xo0vcqnD6PpsnaCS7vc3xosDelILDAKSW4,8885
|
121
|
-
sky/provision/azure/instance.py,sha256=
|
121
|
+
sky/provision/azure/instance.py,sha256=Xd1paLWVc6eVHzphOjZB4_BeXZNX7GYgPV9kH3GWvsc,48983
|
122
122
|
sky/provision/cudo/__init__.py,sha256=KAEl26MVPsk7IoP9Gg-MOJJRIV6-X9B0fbyHdyJWdLo,741
|
123
123
|
sky/provision/cudo/config.py,sha256=RYOVkV0MoUqVBJRZiKhBZhjFygeyFs7eUdVMdPg1vds,327
|
124
124
|
sky/provision/cudo/cudo_machine_type.py,sha256=_VNXWPELmlFXbtdcnPvkuLuyE9CZ923BUCdiac-ClDY,696
|
@@ -184,7 +184,7 @@ sky/serve/serve_utils.py,sha256=wqBxChpJylZ_qHWyFmMBJqrG8_7xTIOr9nlOeyHs9P8,3943
|
|
184
184
|
sky/serve/service.py,sha256=fkfJvNJ2BO6rfV0TblZG-QkOXaCyZlpkwbGgrsTzf2w,11872
|
185
185
|
sky/serve/service_spec.py,sha256=1aS6b-ku7W4CjyekXKDxjZsDdt-O8ygos-jFeXu31cA,13766
|
186
186
|
sky/setup_files/MANIFEST.in,sha256=CXz8lIJMgWlH9TvYgzIL3vPFtSDoQq-UMfD9K62rtH4,590
|
187
|
-
sky/setup_files/setup.py,sha256=
|
187
|
+
sky/setup_files/setup.py,sha256=G767GNB-jXqyC8MR-IdiojnnI2E6tP4gMYenKU14ZGA,12156
|
188
188
|
sky/skylet/LICENSE,sha256=BnFrJSvUFpMUoH5mOpWnEvaC5R6Uux8W6WXgrte8iYg,12381
|
189
189
|
sky/skylet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
190
190
|
sky/skylet/attempt_skylet.py,sha256=GZ6ITjjA0m-da3IxXXfoHR6n4pjp3X3TOXUqVvSrV0k,2136
|
@@ -192,7 +192,7 @@ sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,44
|
|
192
192
|
sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
|
193
193
|
sky/skylet/constants.py,sha256=OsuJcQp6UgkQ9Yfml6f_raXXbHS7-_h-v4QNv92y0Gw,14642
|
194
194
|
sky/skylet/events.py,sha256=A09E7LmmwzcGrSG0n8K7d3EZ1ZJr1mmmzoGyhnArYJA,12303
|
195
|
-
sky/skylet/job_lib.py,sha256=
|
195
|
+
sky/skylet/job_lib.py,sha256=jqJ4D3UeG6fNMm8xPtdWclnrVHQb6WiRqb1nrBp8TPg,35200
|
196
196
|
sky/skylet/log_lib.py,sha256=Jyj3h2yMBlheFX53AabXEiPaKyCbu06hLEhay5_ZRN0,18734
|
197
197
|
sky/skylet/log_lib.pyi,sha256=AHMkW2DGK2erFovb3ToZWxRiYaATlzkxKb5J9pkgF2Y,4295
|
198
198
|
sky/skylet/skylet.py,sha256=U9plr5hmhD9-Nyy0LMCymlE8DWtRXTFXQvfbFsS746Y,1153
|
@@ -219,7 +219,7 @@ sky/skylet/ray_patches/resource_demand_scheduler.py.patch,sha256=AVV-Hw-Rxw16aFm
|
|
219
219
|
sky/skylet/ray_patches/updater.py.patch,sha256=ZNMGVYICPBB44jLbEx2KvCgIY7BWYdDv3-2b2HJWmAQ,289
|
220
220
|
sky/skylet/ray_patches/worker.py.patch,sha256=_OBhibdr3xOy5Qje6Tt8D1eQVm_msi50TJbCJmOTxVU,565
|
221
221
|
sky/templates/aws-ray.yml.j2,sha256=K0rAuyf1XC_GPFp1BR9df42-Be12A6T2UF0BllVSpYg,8005
|
222
|
-
sky/templates/azure-ray.yml.j2,sha256=
|
222
|
+
sky/templates/azure-ray.yml.j2,sha256=uUneIfT5vTLUCvrZXiv2dsd3gFqLH2FK632oBruOO_k,6237
|
223
223
|
sky/templates/cudo-ray.yml.j2,sha256=SEHVY57iBauCOE2HYJtYVFEKlriAkdwQu_p86a1n_bA,3548
|
224
224
|
sky/templates/fluidstack-ray.yml.j2,sha256=t8TCULgiErCZdtFmBZVsA8ZdcqR7ccwsmQhuDFTBEAU,3541
|
225
225
|
sky/templates/gcp-ray.yml.j2,sha256=y95B-Nk6hFxm6vEIaxI1wFzAIcy_GcKC3XMYo9m-ThI,9662
|
@@ -249,14 +249,14 @@ sky/utils/command_runner.py,sha256=3CDcqRXEmoe3C-t2P58McgcRg6p9m5haUWYj1rOLuqM,3
|
|
249
249
|
sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
|
250
250
|
sky/utils/common_utils.py,sha256=Qy25LuIoTT0qg391EWyT9i5D6fwk1S4OdFwRpCTZ9Vk,24657
|
251
251
|
sky/utils/controller_utils.py,sha256=wF4_y1PCsLAWoo3XEtECwkNYTN6hO3vn_cxGxgQYcd8,43268
|
252
|
-
sky/utils/dag_utils.py,sha256=
|
252
|
+
sky/utils/dag_utils.py,sha256=pVX3lGDDcYTcGoH_1jEWzl9767Y4mwlIEYIzoyHO6gM,6105
|
253
253
|
sky/utils/db_utils.py,sha256=AOvMmBEN9cF4I7CoXihPCtus4mU2VDGjBQSVMMgzKlA,2786
|
254
254
|
sky/utils/env_options.py,sha256=3oAaUPxowL6vI2XmxXrH56V7Myj9IJWsL-MXFmRFVdI,1294
|
255
255
|
sky/utils/kubernetes_enums.py,sha256=imGqHSa8O07zD_6xH1SDMM7dBU5lF5fzFFlQuQy00QM,1384
|
256
256
|
sky/utils/log_utils.py,sha256=ptv2sbsiJSgk4NvdccrMsUR-MvOKnbu4BQiRSishgk0,12472
|
257
257
|
sky/utils/resources_utils.py,sha256=Xqi7gxPYw2y5wl5okUI5zx5LEij0hJF_V3Zi8q7TXYg,7890
|
258
258
|
sky/utils/rich_utils.py,sha256=hmnI1X5dKvRIQzB7EyNb34FT97qFNve-0QHqM5r0mVk,3066
|
259
|
-
sky/utils/schemas.py,sha256=
|
259
|
+
sky/utils/schemas.py,sha256=mogoStpQ77S936VfChinAW2I1DT4q2c5E7qY_qNiO0w,29094
|
260
260
|
sky/utils/subprocess_utils.py,sha256=3R54Elc2n8DQeO6Y8MCDJ6N6v27HDGpbNMIfCquqXYQ,6552
|
261
261
|
sky/utils/timeline.py,sha256=ao_nm0y52ZQILfL7Y92c3pSEFRyPm_ElORC3DrI5BwQ,3936
|
262
262
|
sky/utils/ux_utils.py,sha256=CqyIFGDuSE8fQasPkna_loZMwtboC9KedR09WEQ7qz0,6502
|
@@ -274,9 +274,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
|
|
274
274
|
sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
|
275
275
|
sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
|
276
276
|
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
|
277
|
-
skypilot_nightly-1.0.0.
|
278
|
-
skypilot_nightly-1.0.0.
|
279
|
-
skypilot_nightly-1.0.0.
|
280
|
-
skypilot_nightly-1.0.0.
|
281
|
-
skypilot_nightly-1.0.0.
|
282
|
-
skypilot_nightly-1.0.0.
|
277
|
+
skypilot_nightly-1.0.0.dev20241031.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
|
278
|
+
skypilot_nightly-1.0.0.dev20241031.dist-info/METADATA,sha256=_tflHJA52Q4Bt2-rQMwozR70icXKpmFjdr9H-zkURe8,19708
|
279
|
+
skypilot_nightly-1.0.0.dev20241031.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
|
280
|
+
skypilot_nightly-1.0.0.dev20241031.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
|
281
|
+
skypilot_nightly-1.0.0.dev20241031.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
|
282
|
+
skypilot_nightly-1.0.0.dev20241031.dist-info/RECORD,,
|
File without changes
|
{skypilot_nightly-1.0.0.dev20241029.dist-info → skypilot_nightly-1.0.0.dev20241031.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|
File without changes
|