skypilot-nightly 1.0.0.dev20241029__py3-none-any.whl → 1.0.0.dev20241031__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '47ebae73e972c65de6e87aa7556220e515f2fc5e'
8
+ _SKYPILOT_COMMIT_SHA = 'c4eeeb5fb3ef64be0f05a727e119ac9266f8940f'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20241029'
38
+ __version__ = '1.0.0.dev20241031'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
@@ -1950,17 +1950,8 @@ class RetryingVmProvisioner(object):
1950
1950
 
1951
1951
  failover_history: List[Exception] = list()
1952
1952
 
1953
- style = colorama.Style
1954
- fore = colorama.Fore
1955
1953
  # Retrying launchable resources.
1956
1954
  while True:
1957
- if (isinstance(to_provision.cloud, clouds.Azure) and
1958
- to_provision.accelerators is not None and
1959
- 'A10' in to_provision.accelerators and prev_handle is None):
1960
- logger.warning(f'{style.BRIGHT}{fore.YELLOW}Trying to launch '
1961
- 'an A10 cluster on Azure. This may take ~20 '
1962
- 'minutes due to driver installation.'
1963
- f'{style.RESET_ALL}')
1964
1955
  try:
1965
1956
  # Recheck cluster name as the 'except:' block below may
1966
1957
  # change the cloud assignment.
@@ -2476,7 +2467,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2476
2467
  """Returns number of IPs per node in the cluster, handling TPU Pod."""
2477
2468
  is_tpu_vm_pod = gcp_utils.is_tpu_vm_pod(self.launched_resources)
2478
2469
  if is_tpu_vm_pod:
2479
- num_ips = gcp_utils.get_num_tpu_devices(self.launched_resources)
2470
+ num_ips = len(self.internal_ips())
2480
2471
  else:
2481
2472
  num_ips = 1
2482
2473
  return num_ips
@@ -3175,9 +3166,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3175
3166
  returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
3176
3167
  if returncode == 255:
3177
3168
  is_message_too_long = False
3178
- with open(setup_log_path, 'r', encoding='utf-8') as f:
3179
- if 'too long' in f.read():
3180
- is_message_too_long = True
3169
+ try:
3170
+ with open(os.path.expanduser(setup_log_path),
3171
+ 'r',
3172
+ encoding='utf-8') as f:
3173
+ if 'too long' in f.read():
3174
+ is_message_too_long = True
3175
+ except Exception as e: # pylint: disable=broad-except
3176
+ # We don't crash the setup if we cannot read the log file.
3177
+ # Instead, we should retry the setup with dumping the script
3178
+ # to a file to be safe.
3179
+ logger.debug('Failed to read setup log file '
3180
+ f'{setup_log_path}: {e}')
3181
+ is_message_too_long = True
3181
3182
 
3182
3183
  if is_message_too_long:
3183
3184
  # If the setup script is too long, we retry it with dumping
sky/clouds/azure.py CHANGED
@@ -44,6 +44,8 @@ _DEFAULT_GPU_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2'
44
44
  _DEFAULT_V1_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v1'
45
45
  _DEFAULT_GPU_K80_IMAGE_ID = 'skypilot:k80-ubuntu-2004'
46
46
  _FALLBACK_IMAGE_ID = 'skypilot:gpu-ubuntu-2204'
47
+ # This is used by Azure GPU VMs that use grid drivers (e.g. A10).
48
+ _DEFAULT_GPU_GRID_IMAGE_ID = 'skypilot:custom-gpu-ubuntu-v2-grid'
47
49
 
48
50
  _COMMUNITY_IMAGE_PREFIX = '/CommunityGalleries'
49
51
 
@@ -220,6 +222,8 @@ class Azure(clouds.Cloud):
220
222
  acc_name = list(acc.keys())[0]
221
223
  if acc_name == 'K80':
222
224
  return _DEFAULT_GPU_K80_IMAGE_ID
225
+ if acc_name == 'A10':
226
+ return _DEFAULT_GPU_GRID_IMAGE_ID
223
227
  # About Gen V1 vs V2:
224
228
  # In Azure, all instances with K80 (Standard_NC series), some
225
229
  # instances with M60 (Standard_NV series) and some cpu instances
@@ -350,10 +354,6 @@ class Azure(clouds.Cloud):
350
354
  'image_version': version,
351
355
  }
352
356
 
353
- # Setup the A10 nvidia driver.
354
- need_nvidia_driver_extension = (acc_dict is not None and
355
- 'A10' in acc_dict)
356
-
357
357
  # Determine resource group for deploying the instance.
358
358
  resource_group_name = skypilot_config.get_nested(
359
359
  ('azure', 'resource_group_vm'), None)
@@ -413,7 +413,6 @@ class Azure(clouds.Cloud):
413
413
  # Azure does not support specific zones.
414
414
  'zones': None,
415
415
  **image_config,
416
- 'need_nvidia_driver_extension': need_nvidia_driver_extension,
417
416
  'disk_tier': Azure._get_disk_type(disk_tier),
418
417
  'cloud_init_setup_commands': cloud_init_setup_commands,
419
418
  'azure_subscription_id': self.get_project_id(dryrun),
@@ -47,6 +47,10 @@ TPU_RETRY_CNT = 3
47
47
  TPU_V4_ZONES = ['us-central2-b']
48
48
  # TPU v3 pods are available in us-east1-d, but hidden in the skus.
49
49
  # We assume the TPU prices are the same as us-central1.
50
+ # TPU v6e's pricing info is not available on the SKUs. However, in
51
+ # https://cloud.google.com/tpu/pricing, it lists the price for 4 regions:
52
+ # us-east1, us-east5, europe-west4, and asia-northeast1. We hardcode them here
53
+ # and filter out the other regions (us-central{1,2}, us-south1).
50
54
  HIDDEN_TPU_DF = pd.read_csv(
51
55
  io.StringIO(
52
56
  textwrap.dedent("""\
@@ -58,8 +62,50 @@ HIDDEN_TPU_DF = pd.read_csv(
58
62
  ,tpu-v3-512,1,,,tpu-v3-512,512.0,153.6,us-east1,us-east1-d
59
63
  ,tpu-v3-1024,1,,,tpu-v3-1024,1024.0,307.2,us-east1,us-east1-d
60
64
  ,tpu-v3-2048,1,,,tpu-v3-2048,2048.0,614.4,us-east1,us-east1-d
65
+ ,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east5,us-east5-b
66
+ ,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east5,us-east5-c
67
+ ,tpu-v6e-1,1,,,tpu-v6e-1,2.97,,europe-west4,europe-west4-a
68
+ ,tpu-v6e-1,1,,,tpu-v6e-1,3.24,,asia-northeast1,asia-northeast1-b
69
+ ,tpu-v6e-1,1,,,tpu-v6e-1,2.7,,us-east1,us-east1-d
70
+ ,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east5,us-east5-b
71
+ ,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east5,us-east5-c
72
+ ,tpu-v6e-4,1,,,tpu-v6e-4,11.88,,europe-west4,europe-west4-a
73
+ ,tpu-v6e-4,1,,,tpu-v6e-4,12.96,,asia-northeast1,asia-northeast1-b
74
+ ,tpu-v6e-4,1,,,tpu-v6e-4,10.8,,us-east1,us-east1-d
75
+ ,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east5,us-east5-b
76
+ ,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east5,us-east5-c
77
+ ,tpu-v6e-8,1,,,tpu-v6e-8,23.76,,europe-west4,europe-west4-a
78
+ ,tpu-v6e-8,1,,,tpu-v6e-8,25.92,,asia-northeast1,asia-northeast1-b
79
+ ,tpu-v6e-8,1,,,tpu-v6e-8,21.6,,us-east1,us-east1-d
80
+ ,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east5,us-east5-b
81
+ ,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east5,us-east5-c
82
+ ,tpu-v6e-16,1,,,tpu-v6e-16,47.52,,europe-west4,europe-west4-a
83
+ ,tpu-v6e-16,1,,,tpu-v6e-16,51.84,,asia-northeast1,asia-northeast1-b
84
+ ,tpu-v6e-16,1,,,tpu-v6e-16,43.2,,us-east1,us-east1-d
85
+ ,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east5,us-east5-b
86
+ ,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east5,us-east5-c
87
+ ,tpu-v6e-32,1,,,tpu-v6e-32,95.04,,europe-west4,europe-west4-a
88
+ ,tpu-v6e-32,1,,,tpu-v6e-32,103.68,,asia-northeast1,asia-northeast1-b
89
+ ,tpu-v6e-32,1,,,tpu-v6e-32,86.4,,us-east1,us-east1-d
90
+ ,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east5,us-east5-b
91
+ ,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east5,us-east5-c
92
+ ,tpu-v6e-64,1,,,tpu-v6e-64,190.08,,europe-west4,europe-west4-a
93
+ ,tpu-v6e-64,1,,,tpu-v6e-64,207.36,,asia-northeast1,asia-northeast1-b
94
+ ,tpu-v6e-64,1,,,tpu-v6e-64,172.8,,us-east1,us-east1-d
95
+ ,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east5,us-east5-b
96
+ ,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east5,us-east5-c
97
+ ,tpu-v6e-128,1,,,tpu-v6e-128,380.16,,europe-west4,europe-west4-a
98
+ ,tpu-v6e-128,1,,,tpu-v6e-128,414.72,,asia-northeast1,asia-northeast1-b
99
+ ,tpu-v6e-128,1,,,tpu-v6e-128,345.6,,us-east1,us-east1-d
100
+ ,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east5,us-east5-b
101
+ ,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east5,us-east5-c
102
+ ,tpu-v6e-256,1,,,tpu-v6e-256,760.32,,europe-west4,europe-west4-a
103
+ ,tpu-v6e-256,1,,,tpu-v6e-256,829.44,,asia-northeast1,asia-northeast1-b
104
+ ,tpu-v6e-256,1,,,tpu-v6e-256,691.2,,us-east1,us-east1-d
61
105
  """)))
62
106
 
107
+ TPU_V6E_MISSING_REGIONS = ['us-central1', 'us-central2', 'us-south1']
108
+
63
109
  # TPU V5 is not visible in specific zones. We hardcode the missing zones here.
64
110
  # NOTE(dev): Keep the zones and the df in sync.
65
111
  TPU_V5_MISSING_ZONES_DF = {
@@ -683,11 +729,13 @@ def get_tpu_df(gce_skus: List[Dict[str, Any]],
683
729
  'not found in SKUs or hidden TPU price DF.')
684
730
  # TODO(tian): Hack. Should investigate how to retrieve the price
685
731
  # for TPU-v6e.
686
- if not tpu_name.startswith('tpu-v6e'):
732
+ if (tpu_name.startswith('tpu-v6e') and
733
+ tpu_region in TPU_V6E_MISSING_REGIONS):
734
+ if not spot:
735
+ tpu_price = 0.0
736
+ else:
687
737
  assert spot or tpu_price is not None, (row, hidden_tpu,
688
738
  HIDDEN_TPU_DF)
689
- else:
690
- tpu_price = 0.0
691
739
  return tpu_price
692
740
 
693
741
  df['Price'] = df.apply(lambda row: get_tpu_price(row, spot=False), axis=1)
@@ -49,14 +49,6 @@ def is_tpu_vm_pod(resources: Optional['resources_lib.Resources']) -> bool:
49
49
  return not acc.endswith('-8')
50
50
 
51
51
 
52
- def get_num_tpu_devices(resources: Optional['resources_lib.Resources']) -> int:
53
- if resources is None or not is_tpu(resources):
54
- raise ValueError('resources must be a valid TPU resource.')
55
- acc, _ = list(resources.accelerators.items())[0]
56
- num_tpu_devices = int(int(acc.split('-')[2]) / 8)
57
- return num_tpu_devices
58
-
59
-
60
52
  @dataclasses.dataclass
61
53
  class SpecificReservation:
62
54
  count: int
sky/execution.py CHANGED
@@ -171,10 +171,11 @@ def _execute(
171
171
  task = dag.tasks[0]
172
172
 
173
173
  if any(r.job_recovery is not None for r in task.resources):
174
- with ux_utils.print_exception_no_traceback():
175
- raise ValueError(
176
- 'Job recovery is specified in the task. To launch a '
177
- 'managed job, please use: sky jobs launch')
174
+ logger.warning(
175
+ f'{colorama.Style.DIM}The task has `job_recovery` specified, '
176
+ 'but is launched as an unmanaged job. It will be ignored. '
177
+ 'To enable job recovery, use managed jobs: sky jobs launch.'
178
+ f'{colorama.Style.RESET_ALL}')
178
179
 
179
180
  cluster_exists = False
180
181
  if cluster_name is not None:
sky/jobs/controller.py CHANGED
@@ -160,6 +160,11 @@ class JobsController:
160
160
  if task_id == 0:
161
161
  submitted_at = backend_utils.get_timestamp_from_run_timestamp(
162
162
  self._backend.run_timestamp)
163
+ assert task.name is not None, task
164
+ cluster_name = managed_job_utils.generate_managed_job_cluster_name(
165
+ task.name, self._job_id)
166
+ self._strategy_executor = recovery_strategy.StrategyExecutor.make(
167
+ cluster_name, self._backend, task, self._retry_until_up)
163
168
  managed_job_state.set_submitted(
164
169
  self._job_id,
165
170
  task_id,
@@ -167,15 +172,14 @@ class JobsController:
167
172
  submitted_at,
168
173
  resources_str=backend_utils.get_task_resources_str(
169
174
  task, is_managed_job=True),
175
+ specs={
176
+ 'max_restarts_on_errors':
177
+ self._strategy_executor.max_restarts_on_errors
178
+ },
170
179
  callback_func=callback_func)
171
180
  logger.info(
172
181
  f'Submitted managed job {self._job_id} (task: {task_id}, name: '
173
182
  f'{task.name!r}); {constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
174
- assert task.name is not None, task
175
- cluster_name = managed_job_utils.generate_managed_job_cluster_name(
176
- task.name, self._job_id)
177
- self._strategy_executor = recovery_strategy.StrategyExecutor.make(
178
- cluster_name, self._backend, task, self._retry_until_up)
179
183
 
180
184
  logger.info('Started monitoring.')
181
185
  managed_job_state.set_starting(job_id=self._job_id,
@@ -283,23 +287,35 @@ class JobsController:
283
287
  failure_reason = (
284
288
  'To see the details, run: '
285
289
  f'sky jobs logs --controller {self._job_id}')
286
-
287
- managed_job_state.set_failed(
288
- self._job_id,
289
- task_id,
290
- failure_type=managed_job_status,
291
- failure_reason=failure_reason,
292
- end_time=end_time,
293
- callback_func=callback_func)
294
- return False
295
- # Although the cluster is healthy, we fail to access the
296
- # job status. Try to recover the job (will not restart the
297
- # cluster, if the cluster is healthy).
298
- assert job_status is None, job_status
299
- logger.info('Failed to fetch the job status while the '
300
- 'cluster is healthy. Try to recover the job '
301
- '(the cluster will not be restarted).')
302
-
290
+ should_restart_on_failure = (
291
+ self._strategy_executor.should_restart_on_failure())
292
+ if should_restart_on_failure:
293
+ max_restarts = (
294
+ self._strategy_executor.max_restarts_on_errors)
295
+ logger.info(
296
+ f'User program crashed '
297
+ f'({managed_job_status.value}). '
298
+ f'Retry the job as max_restarts_on_errors is '
299
+ f'set to {max_restarts}. '
300
+ f'[{self._strategy_executor.restart_cnt_on_failure}'
301
+ f'/{max_restarts}]')
302
+ else:
303
+ managed_job_state.set_failed(
304
+ self._job_id,
305
+ task_id,
306
+ failure_type=managed_job_status,
307
+ failure_reason=failure_reason,
308
+ end_time=end_time,
309
+ callback_func=callback_func)
310
+ return False
311
+ else:
312
+ # Although the cluster is healthy, we fail to access the
313
+ # job status. Try to recover the job (will not restart the
314
+ # cluster, if the cluster is healthy).
315
+ assert job_status is None, job_status
316
+ logger.info('Failed to fetch the job status while the '
317
+ 'cluster is healthy. Try to recover the job '
318
+ '(the cluster will not be restarted).')
303
319
  # When the handle is None, the cluster should be cleaned up already.
304
320
  if handle is not None:
305
321
  resources = handle.launched_resources
@@ -66,7 +66,8 @@ class StrategyExecutor:
66
66
  RETRY_INIT_GAP_SECONDS = 60
67
67
 
68
68
  def __init__(self, cluster_name: str, backend: 'backends.Backend',
69
- task: 'task_lib.Task', retry_until_up: bool) -> None:
69
+ task: 'task_lib.Task', retry_until_up: bool,
70
+ max_restarts_on_errors: int) -> None:
70
71
  """Initialize the strategy executor.
71
72
 
72
73
  Args:
@@ -82,6 +83,8 @@ class StrategyExecutor:
82
83
  self.cluster_name = cluster_name
83
84
  self.backend = backend
84
85
  self.retry_until_up = retry_until_up
86
+ self.max_restarts_on_errors = max_restarts_on_errors
87
+ self.restart_cnt_on_failure = 0
85
88
 
86
89
  def __init_subclass__(cls, name: str, default: bool = False):
87
90
  RECOVERY_STRATEGIES[name] = cls
@@ -109,8 +112,17 @@ class StrategyExecutor:
109
112
  # set the new_task_resources to be the same type (list or set) as the
110
113
  # original task.resources
111
114
  task.set_resources(type(task.resources)(new_resources_list))
112
- return RECOVERY_STRATEGIES[job_recovery](cluster_name, backend, task,
113
- retry_until_up)
115
+ if isinstance(job_recovery, dict):
116
+ job_recovery_name = job_recovery.pop('strategy',
117
+ DEFAULT_RECOVERY_STRATEGY)
118
+ max_restarts_on_errors = job_recovery.pop('max_restarts_on_errors',
119
+ 0)
120
+ else:
121
+ job_recovery_name = job_recovery
122
+ max_restarts_on_errors = 0
123
+ return RECOVERY_STRATEGIES[job_recovery_name](cluster_name, backend,
124
+ task, retry_until_up,
125
+ max_restarts_on_errors)
114
126
 
115
127
  def launch(self) -> float:
116
128
  """Launch the cluster for the first time.
@@ -368,6 +380,17 @@ class StrategyExecutor:
368
380
  f'{gap_seconds:.1f} seconds.')
369
381
  time.sleep(gap_seconds)
370
382
 
383
+ def should_restart_on_failure(self) -> bool:
384
+ """Increments counter & checks if job should be restarted on a failure.
385
+
386
+ Returns:
387
+ True if the job should be restarted, otherwise False.
388
+ """
389
+ self.restart_cnt_on_failure += 1
390
+ if self.restart_cnt_on_failure > self.max_restarts_on_errors:
391
+ return False
392
+ return True
393
+
371
394
 
372
395
  class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
373
396
  default=False):
@@ -376,8 +399,10 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
376
399
  _MAX_RETRY_CNT = 240 # Retry for 4 hours.
377
400
 
378
401
  def __init__(self, cluster_name: str, backend: 'backends.Backend',
379
- task: 'task_lib.Task', retry_until_up: bool) -> None:
380
- super().__init__(cluster_name, backend, task, retry_until_up)
402
+ task: 'task_lib.Task', retry_until_up: bool,
403
+ max_restarts_on_errors: int) -> None:
404
+ super().__init__(cluster_name, backend, task, retry_until_up,
405
+ max_restarts_on_errors)
381
406
  # Note down the cloud/region of the launched cluster, so that we can
382
407
  # first retry in the same cloud/region. (Inside recover() we may not
383
408
  # rely on cluster handle, as it can be None if the cluster is
sky/jobs/state.py CHANGED
@@ -2,6 +2,7 @@
2
2
  # TODO(zhwu): maybe use file based status instead of database, so
3
3
  # that we can easily switch to a s3-based storage.
4
4
  import enum
5
+ import json
5
6
  import pathlib
6
7
  import sqlite3
7
8
  import time
@@ -65,7 +66,8 @@ _CURSOR.execute("""\
65
66
  failure_reason TEXT,
66
67
  spot_job_id INTEGER,
67
68
  task_id INTEGER DEFAULT 0,
68
- task_name TEXT)""")
69
+ task_name TEXT,
70
+ specs TEXT)""")
69
71
  _CONN.commit()
70
72
 
71
73
  db_utils.add_column_to_table(_CURSOR, _CONN, 'spot', 'failure_reason', 'TEXT')
@@ -92,6 +94,17 @@ db_utils.add_column_to_table(_CURSOR,
92
94
  'TEXT',
93
95
  copy_from='job_name')
94
96
 
97
+ # Specs is some useful information about the task, e.g., the
98
+ # max_restarts_on_errors value. It is stored in JSON format.
99
+ db_utils.add_column_to_table(_CURSOR,
100
+ _CONN,
101
+ 'spot',
102
+ 'specs',
103
+ 'TEXT',
104
+ value_to_replace_existing_entries=json.dumps({
105
+ 'max_restarts_on_errors': 0,
106
+ }))
107
+
95
108
  # `job_info` contains the mapping from job_id to the job_name.
96
109
  # In the future, it may contain more information about each job.
97
110
  _CURSOR.execute("""\
@@ -128,9 +141,10 @@ columns = [
128
141
  'job_id',
129
142
  'task_id',
130
143
  'task_name',
144
+ 'specs',
131
145
  # columns from the job_info table
132
146
  '_job_info_job_id', # This should be the same as job_id
133
- 'job_name'
147
+ 'job_name',
134
148
  ]
135
149
 
136
150
 
@@ -283,7 +297,8 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
283
297
 
284
298
  def set_submitted(job_id: int, task_id: int, run_timestamp: str,
285
299
  submit_time: float, resources_str: str,
286
- callback_func: CallbackType):
300
+ specs: Dict[str, Union[str,
301
+ int]], callback_func: CallbackType):
287
302
  """Set the task to submitted.
288
303
 
289
304
  Args:
@@ -293,6 +308,8 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
293
308
  determine the log directory of the managed task.
294
309
  submit_time: The time when the managed task is submitted.
295
310
  resources_str: The resources string of the managed task.
311
+ specs: The specs of the managed task.
312
+ callback_func: The callback function.
296
313
  """
297
314
  # Use the timestamp in the `run_timestamp` ('sky-2022-10...'), to make
298
315
  # the log directory and submission time align with each other, so as to
@@ -306,11 +323,12 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
306
323
  resources=(?),
307
324
  submitted_at=(?),
308
325
  status=(?),
309
- run_timestamp=(?)
326
+ run_timestamp=(?),
327
+ specs=(?)
310
328
  WHERE spot_job_id=(?) AND
311
329
  task_id=(?)""",
312
330
  (resources_str, submit_time, ManagedJobStatus.SUBMITTED.value,
313
- run_timestamp, job_id, task_id))
331
+ run_timestamp, json.dumps(specs), job_id, task_id))
314
332
  callback_func('SUBMITTED')
315
333
 
316
334
 
@@ -619,3 +637,13 @@ def get_latest_job_id() -> Optional[int]:
619
637
  for (job_id,) in rows:
620
638
  return job_id
621
639
  return None
640
+
641
+
642
+ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
643
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
644
+ task_specs = cursor.execute(
645
+ """\
646
+ SELECT specs FROM spot
647
+ WHERE spot_job_id=(?) AND task_id=(?)""",
648
+ (job_id, task_id)).fetchone()
649
+ return json.loads(task_specs[0])
sky/jobs/utils.py CHANGED
@@ -70,7 +70,7 @@ _JOB_CANCELLED_MESSAGE = (
70
70
  # state, after the job finished. This is a safeguard to avoid the case where
71
71
  # the managed job status fails to be updated and keep the `sky jobs logs`
72
72
  # blocking for a long time.
73
- _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 20
73
+ _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 25
74
74
 
75
75
 
76
76
  class UserSignal(enum.Enum):
@@ -392,8 +392,12 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
392
392
  f'INFO: Log for the current task ({task_id}) '
393
393
  'is finished. Waiting for the next task\'s log '
394
394
  'to be started.')
395
- status_display.update('Waiting for the next task: '
396
- f'{task_id + 1}.')
395
+ # Add a newline to avoid the status display below
396
+ # removing the last line of the task output.
397
+ print()
398
+ status_display.update(
399
+ ux_utils.spinner_message(
400
+ f'Waiting for the next task: {task_id + 1}'))
397
401
  status_display.start()
398
402
  original_task_id = task_id
399
403
  while True:
@@ -405,7 +409,27 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
405
409
  time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
406
410
  continue
407
411
  else:
408
- break
412
+ task_specs = managed_job_state.get_task_specs(
413
+ job_id, task_id)
414
+ if task_specs.get('max_restarts_on_errors', 0) == 0:
415
+ # We don't need to wait for the managed job status
416
+ # update, as the job is guaranteed to be in terminal
417
+ # state afterwards.
418
+ break
419
+ print()
420
+ status_display.update(
421
+ ux_utils.spinner_message(
422
+ 'Waiting for next restart for the failed task'))
423
+ status_display.start()
424
+ while True:
425
+ _, managed_job_status = (
426
+ managed_job_state.get_latest_task_id_status(
427
+ job_id))
428
+ if (managed_job_status !=
429
+ managed_job_state.ManagedJobStatus.RUNNING):
430
+ break
431
+ time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
432
+ continue
409
433
  # The job can be cancelled by the user or the controller (when
410
434
  # the cluster is partially preempted).
411
435
  logger.debug(
@@ -311,30 +311,10 @@ def _create_vm(
311
311
  vm_name=vm_name,
312
312
  parameters=vm_instance,
313
313
  )
314
- # poller.result() will block on async operation until it's done.
315
- logger.info(f'Created VM {vm_poller.result().name}.')
316
- # Configure driver extension for A10 GPUs. A10 GPUs requires a
317
- # special type of drivers which is available at Microsoft HPC
318
- # extension. Reference:
319
- # https://forums.developer.nvidia.com/t/ubuntu-22-04-installation-driver-error-nvidia-a10/285195/2
320
- # This can take more than 20mins for setting up the A10 GPUs
321
- if node_config.get('need_nvidia_driver_extension', False):
322
- ext_poller = compute_client.virtual_machine_extensions.\
323
- begin_create_or_update(
324
- resource_group_name=provider_config['resource_group'],
325
- vm_name=vm_name,
326
- vm_extension_name='NvidiaGpuDriverLinux',
327
- extension_parameters=compute.VirtualMachineExtension(
328
- location=provider_config['location'],
329
- publisher='Microsoft.HpcCompute',
330
- type_properties_type='NvidiaGpuDriverLinux',
331
- type_handler_version='1.9',
332
- auto_upgrade_minor_version=True,
333
- settings='{}'))
334
- logger.info(
335
- f'Created VM extension {ext_poller.result().name} for VM {vm_name}.'
336
- )
337
- return vm_poller.result()
314
+ # This line will block until the VM is created or the operation times out.
315
+ vm = vm_poller.result()
316
+ logger.info(f'Created VM {vm.name}.')
317
+ return vm
338
318
 
339
319
 
340
320
  def _create_instances(compute_client: 'azure_compute.ComputeManagementClient',
sky/resources.py CHANGED
@@ -55,7 +55,7 @@ class Resources:
55
55
  accelerators: Union[None, str, Dict[str, int]] = None,
56
56
  accelerator_args: Optional[Dict[str, str]] = None,
57
57
  use_spot: Optional[bool] = None,
58
- job_recovery: Optional[str] = None,
58
+ job_recovery: Optional[Union[Dict[str, Union[str, int]], str]] = None,
59
59
  region: Optional[str] = None,
60
60
  zone: Optional[str] = None,
61
61
  image_id: Union[Dict[str, str], str, None] = None,
@@ -111,6 +111,12 @@ class Resources:
111
111
  job to recover the cluster from preemption. Refer to
112
112
  `recovery_strategy module <https://github.com/skypilot-org/skypilot/blob/master/sky/jobs/recovery_strategy.py>`__ # pylint: disable=line-too-long
113
113
  for more details.
114
+ When a dict is provided, it can have the following fields:
115
+
116
+ - strategy: the recovery strategy to use.
117
+ - max_restarts_on_errors: the max number of restarts on user code
118
+ errors.
119
+
114
120
  region: the region to use.
115
121
  zone: the zone to use.
116
122
  image_id: the image ID to use. If a str, must be a string
@@ -161,10 +167,20 @@ class Resources:
161
167
 
162
168
  self._use_spot_specified = use_spot is not None
163
169
  self._use_spot = use_spot if use_spot is not None else False
164
- self._job_recovery = None
170
+ self._job_recovery: Optional[Dict[str, Union[str, int]]] = None
165
171
  if job_recovery is not None:
166
- if job_recovery.strip().lower() != 'none':
167
- self._job_recovery = job_recovery.upper()
172
+ if isinstance(job_recovery, str):
173
+ job_recovery = {'strategy': job_recovery}
174
+ if 'strategy' not in job_recovery:
175
+ job_recovery['strategy'] = None
176
+
177
+ strategy_name = job_recovery['strategy']
178
+ if strategy_name == 'none':
179
+ self._job_recovery = None
180
+ else:
181
+ if strategy_name is not None:
182
+ job_recovery['strategy'] = strategy_name.upper()
183
+ self._job_recovery = job_recovery
168
184
 
169
185
  if disk_size is not None:
170
186
  if round(disk_size) != disk_size:
@@ -419,7 +435,7 @@ class Resources:
419
435
  return self._use_spot_specified
420
436
 
421
437
  @property
422
- def job_recovery(self) -> Optional[str]:
438
+ def job_recovery(self) -> Optional[Dict[str, Union[str, int]]]:
423
439
  return self._job_recovery
424
440
 
425
441
  @property
@@ -586,6 +602,9 @@ class Resources:
586
602
  # TPU V5 requires a newer runtime version.
587
603
  if acc.startswith('tpu-v5'):
588
604
  return 'v2-alpha-tpuv5'
605
+ # TPU V6e requires a newer runtime version.
606
+ if acc.startswith('tpu-v6e'):
607
+ return 'v2-alpha-tpuv6e'
589
608
  return 'tpu-vm-base'
590
609
 
591
610
  accelerator_args['runtime_version'] = (
@@ -814,12 +833,13 @@ class Resources:
814
833
  Raises:
815
834
  ValueError: if the attributes are invalid.
816
835
  """
817
- if self._job_recovery is None:
836
+ if self._job_recovery is None or self._job_recovery['strategy'] is None:
818
837
  return
819
- if self._job_recovery not in managed_jobs.RECOVERY_STRATEGIES:
838
+ if (self._job_recovery['strategy']
839
+ not in managed_jobs.RECOVERY_STRATEGIES):
820
840
  with ux_utils.print_exception_no_traceback():
821
841
  raise ValueError(
822
- f'Spot recovery strategy {self._job_recovery} '
842
+ f'Spot recovery strategy {self._job_recovery["strategy"]} '
823
843
  'is not supported. The strategy should be among '
824
844
  f'{list(managed_jobs.RECOVERY_STRATEGIES.keys())}')
825
845
 
sky/setup_files/setup.py CHANGED
@@ -153,7 +153,7 @@ install_requires = [
153
153
  'tabulate',
154
154
  # Light weight requirement, can be replaced with "typing" once
155
155
  # we deprecate Python 3.7 (this will take a while).
156
- "typing_extensions",
156
+ 'typing_extensions',
157
157
  'filelock >= 3.6.0',
158
158
  'packaging',
159
159
  'psutil',
@@ -216,8 +216,9 @@ extras_require: Dict[str, List[str]] = {
216
216
  # We need azure-identity>=1.13.0 to enable the customization of the
217
217
  # timeout of AzureCliCredential.
218
218
  'azure': [
219
- 'azure-cli>=2.31.0', 'azure-core', 'azure-identity>=1.13.0',
220
- 'azure-mgmt-network', 'azure-storage-blob', 'msgraph-sdk'
219
+ 'azure-cli>=2.65.0', 'azure-core>=1.31.0', 'azure-identity>=1.19.0',
220
+ 'azure-mgmt-network>=27.0.0', 'azure-mgmt-compute>=33.0.0',
221
+ 'azure-storage-blob>=12.23.1', 'msgraph-sdk'
221
222
  ] + local_ray,
222
223
  # We need google-api-python-client>=2.69.0 to enable 'discardLocalSsd'
223
224
  # parameter for stopping instances.
sky/skylet/job_lib.py CHANGED
@@ -512,16 +512,13 @@ def _get_jobs_by_ids(job_ids: List[int]) -> List[Dict[str, Any]]:
512
512
  return records
513
513
 
514
514
 
515
- def _get_pending_jobs():
516
- rows = _CURSOR.execute(
517
- 'SELECT job_id, created_time, submit FROM pending_jobs')
518
- rows = list(rows)
519
- return {
520
- job_id: {
521
- 'created_time': created_time,
522
- 'submit': submit
523
- } for job_id, created_time, submit in rows
524
- }
515
+ def _get_pending_job(job_id: int) -> Optional[Dict[str, Any]]:
516
+ rows = _CURSOR.execute('SELECT created_time, submit FROM pending_jobs '
517
+ f'WHERE job_id={job_id!r}')
518
+ for row in rows:
519
+ created_time, submit = row
520
+ return {'created_time': created_time, 'submit': submit}
521
+ return None
525
522
 
526
523
 
527
524
  def update_job_status(job_ids: List[int],
@@ -535,7 +532,7 @@ def update_job_status(job_ids: List[int],
535
532
  during job cancelling, we still need this to handle the staleness problem,
536
533
  caused by instance restarting and other corner cases (if any).
537
534
 
538
- This function should only be run on the remote instance with ray==2.4.0.
535
+ This function should only be run on the remote instance with ray>=2.4.0.
539
536
  """
540
537
  if len(job_ids) == 0:
541
538
  return []
@@ -547,50 +544,45 @@ def update_job_status(job_ids: List[int],
547
544
 
548
545
  # In ray 2.4.0, job_client.list_jobs returns a list of JobDetails,
549
546
  # which contains the job status (str) and submission_id (str).
547
+ ray_job_query_time = time.time()
550
548
  job_detail_lists: List['ray_pydantic.JobDetails'] = job_client.list_jobs()
551
549
 
552
- pending_jobs = _get_pending_jobs()
553
550
  job_details = {}
554
551
  ray_job_ids_set = set(ray_job_ids)
555
552
  for job_detail in job_detail_lists:
556
553
  if job_detail.submission_id in ray_job_ids_set:
557
554
  job_details[job_detail.submission_id] = job_detail
558
- job_statuses: List[Optional[JobStatus]] = [None] * len(ray_job_ids)
559
- for i, ray_job_id in enumerate(ray_job_ids):
560
- job_id = job_ids[i]
561
- if ray_job_id in job_details:
562
- ray_status = job_details[ray_job_id].status
563
- job_statuses[i] = _RAY_TO_JOB_STATUS_MAP[ray_status]
564
- if job_id in pending_jobs:
565
- if pending_jobs[job_id]['created_time'] < psutil.boot_time():
566
- logger.info(
567
- f'Job {job_id} is stale, setting to FAILED: '
568
- f'created_time={pending_jobs[job_id]["created_time"]}, '
569
- f'boot_time={psutil.boot_time()}')
570
- # The job is stale as it is created before the instance
571
- # is booted, e.g. the instance is rebooted.
572
- job_statuses[i] = JobStatus.FAILED
573
- # Gives a 60 second grace period between job being submit from
574
- # the pending table until appearing in ray jobs.
575
- if (pending_jobs[job_id]['submit'] > 0 and
576
- pending_jobs[job_id]['submit'] <
577
- time.time() - _PENDING_SUBMIT_GRACE_PERIOD):
578
- # For jobs submitted outside of the grace period, we will
579
- # consider the ray job status.
580
- continue
581
- else:
582
- # Reset the job status to PENDING even though it may not appear
583
- # in the ray jobs, so that it will not be considered as stale.
584
- job_statuses[i] = JobStatus.PENDING
585
-
586
- assert len(job_statuses) == len(job_ids), (job_statuses, job_ids)
587
555
 
588
556
  statuses = []
589
- for job_id, status in zip(job_ids, job_statuses):
557
+ for job_id, ray_job_id in zip(job_ids, ray_job_ids):
590
558
  # Per-job status lock is required because between the job status
591
559
  # query and the job status update, the job status in the database
592
560
  # can be modified by the generated ray program.
593
561
  with filelock.FileLock(_get_lock_path(job_id)):
562
+ status = None
563
+ if ray_job_id in job_details:
564
+ ray_status = job_details[ray_job_id].status
565
+ status = _RAY_TO_JOB_STATUS_MAP[ray_status]
566
+ pending_job = _get_pending_job(job_id)
567
+ if pending_job is not None:
568
+ if pending_job['created_time'] < psutil.boot_time():
569
+ logger.info(f'Job {job_id} is stale, setting to FAILED: '
570
+ f'created_time={pending_job["created_time"]}, '
571
+ f'boot_time={psutil.boot_time()}')
572
+ # The job is stale as it is created before the instance
573
+ # is booted, e.g. the instance is rebooted.
574
+ status = JobStatus.FAILED
575
+ # Gives a 60 second grace period between job being submit from
576
+ # the pending table until appearing in ray jobs. For jobs
577
+ # submitted outside of the grace period, we will consider the
578
+ # ray job status.
579
+ if not (pending_job['submit'] > 0 and pending_job['submit'] <
580
+ ray_job_query_time - _PENDING_SUBMIT_GRACE_PERIOD):
581
+ # Reset the job status to PENDING even though it may not
582
+ # appear in the ray jobs, so that it will not be considered
583
+ # as stale.
584
+ status = JobStatus.PENDING
585
+
594
586
  original_status = get_status_no_lock(job_id)
595
587
  assert original_status is not None, (job_id, status)
596
588
  if status is None:
@@ -83,7 +83,6 @@ available_node_types:
83
83
  {%- for cmd in cloud_init_setup_commands %}
84
84
  {{ cmd }}
85
85
  {%- endfor %}
86
- need_nvidia_driver_extension: {{need_nvidia_driver_extension}}
87
86
  {%- if disk_performance_tier is not none %}
88
87
  disk_performance_tier: {{disk_performance_tier}}
89
88
  {%- endif %}
sky/utils/dag_utils.py CHANGED
@@ -143,11 +143,21 @@ def fill_default_config_in_dag_for_job_launch(dag: dag_lib.Dag) -> None:
143
143
  for task_ in dag.tasks:
144
144
 
145
145
  new_resources_list = []
146
+ default_strategy = jobs.DEFAULT_RECOVERY_STRATEGY
147
+ assert default_strategy is not None
146
148
  for resources in list(task_.resources):
147
- change_default_value: Dict[str, Any] = {}
148
- if resources.job_recovery is None:
149
- change_default_value[
150
- 'job_recovery'] = jobs.DEFAULT_RECOVERY_STRATEGY
149
+ original_job_recovery = resources.job_recovery
150
+ job_recovery = {'strategy': default_strategy}
151
+ if isinstance(original_job_recovery, str):
152
+ job_recovery['strategy'] = original_job_recovery
153
+ elif isinstance(original_job_recovery, dict):
154
+ job_recovery.update(original_job_recovery)
155
+ strategy = job_recovery.get('strategy')
156
+ if strategy is None:
157
+ job_recovery['strategy'] = default_strategy
158
+ change_default_value: Dict[str, Any] = {
159
+ 'job_recovery': job_recovery
160
+ }
151
161
 
152
162
  new_resources = resources.copy(**change_default_value)
153
163
  new_resources_list.append(new_resources)
sky/utils/schemas.py CHANGED
@@ -92,7 +92,27 @@ def _get_single_resources_schema():
92
92
  'type': 'string',
93
93
  },
94
94
  'job_recovery': {
95
- 'type': 'string',
95
+ # Either a string or a dict.
96
+ 'anyOf': [{
97
+ 'type': 'string',
98
+ }, {
99
+ 'type': 'object',
100
+ 'required': [],
101
+ 'additionalProperties': False,
102
+ 'properties': {
103
+ 'strategy': {
104
+ 'anyOf': [{
105
+ 'type': 'string',
106
+ }, {
107
+ 'type': 'null',
108
+ }],
109
+ },
110
+ 'max_restarts_on_errors': {
111
+ 'type': 'integer',
112
+ 'minimum': 0,
113
+ },
114
+ }
115
+ }],
96
116
  },
97
117
  'disk_size': {
98
118
  'type': 'integer',
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20241029
3
+ Version: 1.0.0.dev20241031
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -46,11 +46,12 @@ Requires-Dist: awscli>=1.27.10; extra == "all"
46
46
  Requires-Dist: botocore>=1.29.10; extra == "all"
47
47
  Requires-Dist: boto3>=1.26.1; extra == "all"
48
48
  Requires-Dist: colorama<0.4.5; extra == "all"
49
- Requires-Dist: azure-cli>=2.31.0; extra == "all"
50
- Requires-Dist: azure-core; extra == "all"
51
- Requires-Dist: azure-identity>=1.13.0; extra == "all"
52
- Requires-Dist: azure-mgmt-network; extra == "all"
53
- Requires-Dist: azure-storage-blob; extra == "all"
49
+ Requires-Dist: azure-cli>=2.65.0; extra == "all"
50
+ Requires-Dist: azure-core>=1.31.0; extra == "all"
51
+ Requires-Dist: azure-identity>=1.19.0; extra == "all"
52
+ Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
53
+ Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
54
+ Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
54
55
  Requires-Dist: msgraph-sdk; extra == "all"
55
56
  Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
56
57
  Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
@@ -78,11 +79,12 @@ Requires-Dist: botocore>=1.29.10; extra == "aws"
78
79
  Requires-Dist: boto3>=1.26.1; extra == "aws"
79
80
  Requires-Dist: colorama<0.4.5; extra == "aws"
80
81
  Provides-Extra: azure
81
- Requires-Dist: azure-cli>=2.31.0; extra == "azure"
82
- Requires-Dist: azure-core; extra == "azure"
83
- Requires-Dist: azure-identity>=1.13.0; extra == "azure"
84
- Requires-Dist: azure-mgmt-network; extra == "azure"
85
- Requires-Dist: azure-storage-blob; extra == "azure"
82
+ Requires-Dist: azure-cli>=2.65.0; extra == "azure"
83
+ Requires-Dist: azure-core>=1.31.0; extra == "azure"
84
+ Requires-Dist: azure-identity>=1.19.0; extra == "azure"
85
+ Requires-Dist: azure-mgmt-network>=27.0.0; extra == "azure"
86
+ Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "azure"
87
+ Requires-Dist: azure-storage-blob>=12.23.1; extra == "azure"
86
88
  Requires-Dist: msgraph-sdk; extra == "azure"
87
89
  Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "azure"
88
90
  Provides-Extra: cloudflare
@@ -1,4 +1,4 @@
1
- sky/__init__.py,sha256=BxGwYNzkF-X3QWAkY2mXeidbCMkcagQQqkJ-gwBlJiI,5882
1
+ sky/__init__.py,sha256=yJMQo4fd9D4f1vgLHCoy3S3IWUWqDbw9zbH6TEmTD-Y,5882
2
2
  sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
3
3
  sky/authentication.py,sha256=pAdCT60OxxiXI9KXDyP2lQ9u9vMc6aMtq5Xi2h_hbdw,20984
4
4
  sky/check.py,sha256=D3Y3saIFAYVvPxuBHnVgJEO0fUVDxgjwuMBaO-D778k,9472
@@ -7,10 +7,10 @@ sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
7
7
  sky/core.py,sha256=DW9OGE2kS2CmsvQ1grrpRnNFS3woMGWSHu5GE99e-I4,38190
8
8
  sky/dag.py,sha256=WLFWr5hfrwjd31uYlNvI-zWUk7tLaT_gzJn4LzbVtkE,2780
9
9
  sky/exceptions.py,sha256=KBIEJHgrw6OMBL8H65o-Gk6qYQEV1SR9gBwMjnMnxxg,8858
10
- sky/execution.py,sha256=CbrKMgfc2JgLqZqwPvmYKxbWAQKYqHpOLpUEOb-k2m0,24679
10
+ sky/execution.py,sha256=tDK6JhF_405cjqxRpbdLbHZyxrKTD5oa0UkKDvPJ_9Q,24751
11
11
  sky/global_user_state.py,sha256=PywEmUutF97XBgRMClR6IS5_KM8JJC0oA1LsPUZebp0,28681
12
12
  sky/optimizer.py,sha256=tXGrFpc6xNtKH34qjBAMd4jTuWcDZTPnGFwEtuCQFmk,59702
13
- sky/resources.py,sha256=bm004Ms2qlBqEr0N_TEUybDOXJVhLF8yOwkhoqb1t9c,67478
13
+ sky/resources.py,sha256=Zt8mCCmdvZ5ZCqY-l3KXlx_lkUesAopRtaEcEsrRFZo,68465
14
14
  sky/sky_logging.py,sha256=oLmTmwkuucIto3LHXLJfMcyRpYSkmZAZa5XzQPA5IHk,4434
15
15
  sky/skypilot_config.py,sha256=E3g65cX3P3dT9b5N0GgFBG6yB0FXwIGpisKoozmJmWU,9094
16
16
  sky/status_lib.py,sha256=J7Jb4_Dz0v2T64ttOdyUgpokvl4S0sBJrMfH7Fvo51A,1457
@@ -31,7 +31,7 @@ sky/adaptors/vsphere.py,sha256=zJP9SeObEoLrpgHW2VHvZE48EhgVf8GfAEIwBeaDMfM,2129
31
31
  sky/backends/__init__.py,sha256=UDjwbUgpTRApbPJnNfR786GadUuwgRk3vsWoVu5RB_c,536
32
32
  sky/backends/backend.py,sha256=wwfbrxPhjMPs6PSyy3tAHI8WJhl-xhgzWBsAZjmJJ6g,6249
33
33
  sky/backends/backend_utils.py,sha256=LmLsaLiPuuUyGebOXykdvwZpUY-8sB7n4o2AnmwNmdQ,121714
34
- sky/backends/cloud_vm_ray_backend.py,sha256=WX93AnMR_E6e8L0hvXc5eWFdajQo-Sbwfv8Z8lidy9U,232598
34
+ sky/backends/cloud_vm_ray_backend.py,sha256=veMpX-qNA_qat3CVRYw_wZjeMtBqOpjTCevf4lpw3nU,232582
35
35
  sky/backends/docker_utils.py,sha256=Hyw1YY20EyghhEbYx6O2FIMDcGkNzBzV9TM7LFynei8,8358
36
36
  sky/backends/local_docker_backend.py,sha256=0JL5m0YUgOmOL4aWEUe4tmt89dsxjk4_WXkPwgEKEis,16801
37
37
  sky/backends/wheel_utils.py,sha256=3QS4T_Ydvo4DbYhogtyADyNBEf04I6jUCL71M285shQ,7963
@@ -41,7 +41,7 @@ sky/benchmark/benchmark_state.py,sha256=X8CXmuU9KgsDRhKedhFgjeRMUFWtQsjFs1qECvPG
41
41
  sky/benchmark/benchmark_utils.py,sha256=eb-i6zYoo-Zkod-T9qtCu1FcYLw--Yyos1SyibUPZNE,26194
42
42
  sky/clouds/__init__.py,sha256=WuNIJEnZmBO72tU5awgaaL3rdvFRSkgaYNNeuY68dXo,1356
43
43
  sky/clouds/aws.py,sha256=dVZ8auaa2z2Ifl9iiRT06IeEFaNtZhANKtHVLT6Gcno,49474
44
- sky/clouds/azure.py,sha256=-K-VPV2sYJJAfJbDcPAiNhNVhMQYkRBuYHQRb3-MGIQ,30598
44
+ sky/clouds/azure.py,sha256=ixw5jCnnMxDLj0hpArljVzq88EKOrqRxk9xm5N9u-mc,30576
45
45
  sky/clouds/cloud.py,sha256=A5F4a71ciPyljWEs6vT-4RmdGT-AE9NkhS8gJ4Vgi_I,35165
46
46
  sky/clouds/cloud_registry.py,sha256=oLoYFjm_SDTgdHokY7b6A5Utq80HXRQNxV0fLjDdVsQ,2361
47
47
  sky/clouds/cudo.py,sha256=UiY273Sln7VOYDYx93yWiWH_RLlOKZ2cm7mA31ld4A8,13094
@@ -78,13 +78,13 @@ sky/clouds/service_catalog/data_fetchers/fetch_aws.py,sha256=ro2zazdkDF6z9bE7QFy
78
78
  sky/clouds/service_catalog/data_fetchers/fetch_azure.py,sha256=L1JsX1YrhpyI7ylzEPBBNE9XOZM0K0FIXbBUMj9h0MQ,12803
79
79
  sky/clouds/service_catalog/data_fetchers/fetch_cudo.py,sha256=52P48lvWN0s1ArjeLPeLemPRpxjSRcHincRle0nqdm4,3440
80
80
  sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py,sha256=35nO_VaDOgp5W13kt_lIANSk_CNf7gBiZGJ5fGyZu6o,6808
81
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=5CbgU90ldiKVgaagQTnYBJVsgVGE3cMwtF7KpBiTtvU,29873
81
+ sky/clouds/service_catalog/data_fetchers/fetch_gcp.py,sha256=mDAN98T58h1g_LLyppSEUVDlsbLhk2454Nhmg5-aw0Q,32670
82
82
  sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py,sha256=B7H14so38zayuJGgUrD1PJYJKiVZHGnwH6JJop3F7o0,4918
83
83
  sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py,sha256=SF_gTU74qg6L-DSWneCAbqP0lwZXaaDi5otiMIJbrw0,21462
84
84
  sky/clouds/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
85
85
  sky/clouds/utils/aws_utils.py,sha256=W5BRC-2F_VY4BymRA1kS6-MufsI3V8cfY_hv--4gJBU,1986
86
86
  sky/clouds/utils/azure_utils.py,sha256=NToRBnhEyuUvb-nBnsKTxjhOBRkMcrelL8LK4w6s4t8,3555
87
- sky/clouds/utils/gcp_utils.py,sha256=Xc_COjJfDt__oqVwrCw7ejY2B7ptHjMjDVb8obcpJ6s,6968
87
+ sky/clouds/utils/gcp_utils.py,sha256=QejfgXOIVRv5-fv3Soi96VeVNVyquwVwy3M58N3YfNs,6633
88
88
  sky/clouds/utils/oci_utils.py,sha256=t-5QEQEs8swN683AAp-oDD6yQJOQqVBbsVcHkNyqnbU,4968
89
89
  sky/clouds/utils/scp_utils.py,sha256=RUp7NwyhKygOoVOwvdAOGdoQNSJjryOG6WSExCf-yas,15812
90
90
  sky/data/__init__.py,sha256=Nhaf1NURisXpZuwWANa2IuCyppIuc720FRwqSE2oEwY,184
@@ -95,11 +95,11 @@ sky/data/storage.py,sha256=x8YYY4zVBdit_5oAR_MXV-TM9qDefV_ZV4z0irv6ZaU,163102
95
95
  sky/data/storage_utils.py,sha256=cM3kxlffYE7PnJySDu8huyUsMX_JYsf9uer8r5OYsjo,9556
96
96
  sky/jobs/__init__.py,sha256=yucibSB_ZimtJMvOhMxn6ZqwBIYNfcwmc6pSXtCqmNQ,1483
97
97
  sky/jobs/constants.py,sha256=YLgcCg_RHSYr_rfsI_4UIdXk78KKKOK29Oem88t5j8I,1350
98
- sky/jobs/controller.py,sha256=zSdawmXg-9SZ91jJg5_OSFVlntu9xupLs-CiPwG1QdQ,26412
98
+ sky/jobs/controller.py,sha256=sirpi730_GfKfPZeZ2PvCXnJWger0r6AyLSOx2sLd6A,27368
99
99
  sky/jobs/core.py,sha256=RkBFaKDlovmdzqlOAgQ0xAimZFgo4pXq3qaQkAvGsGk,16908
100
- sky/jobs/recovery_strategy.py,sha256=UOEaVGSpRbCnCzlD8cgyjhCPIBIeBeCXCutoSic5aiA,25545
101
- sky/jobs/state.py,sha256=C6R5Yq7ftBqGPa_71tUjflBMKAaJ1FTTdbgjAwmbJsI,23231
102
- sky/jobs/utils.py,sha256=lYfWkEAPVnYcj2nT6VYdM6PCaWKUH6_AD4TAV_sVCkY,36376
100
+ sky/jobs/recovery_strategy.py,sha256=FpPK6e2PT61cZPDUJqIfo6g53uSRTBh7dOTbfR1DLVE,26672
101
+ sky/jobs/state.py,sha256=exN6BdJlLBzFTccJCSHN4dNjVeYFgTgqgxOaHwLw2IQ,24307
102
+ sky/jobs/utils.py,sha256=pF4Kyl4v1M_Bmm2jIRlXGTSdII5BJ3f4qwex_oCFgBk,37742
103
103
  sky/jobs/dashboard/dashboard.py,sha256=HFShuaxKir97QTeK2x37h6bsY6ncaFaNEg1USZqJPdc,3050
104
104
  sky/jobs/dashboard/static/favicon.ico,sha256=uYlvgxSM7gjBmXpZ8wydvZUPAbJiiix-rc2Xe5mma9s,15086
105
105
  sky/jobs/dashboard/templates/index.html,sha256=DBKMYEkkJ6sgLYod9ro7drgL8Y_neDsCx_WbwhWDsWM,9837
@@ -118,7 +118,7 @@ sky/provision/aws/utils.py,sha256=m49pS-SHGW7Au3bhDeTPsL8N5iRzbwOXzyEWRCc1Vho,32
118
118
  sky/provision/azure/__init__.py,sha256=87cgk1_Ws7n9rqaDDPv-HpfrkVeSQMdFQnhnXwyx9g4,548
119
119
  sky/provision/azure/azure-config-template.json,sha256=jrjAgOtpe0e6FSg3vsVqHKQqJe0w-HeWOFT1HuwzS2c,4712
120
120
  sky/provision/azure/config.py,sha256=V5-0Zelt4Xo0vcqnD6PpsnaCS7vc3xosDelILDAKSW4,8885
121
- sky/provision/azure/instance.py,sha256=dq67O6gwvNN0jrBklgJ8AnrNj784aqyLl7PHeB5xVQA,50088
121
+ sky/provision/azure/instance.py,sha256=Xd1paLWVc6eVHzphOjZB4_BeXZNX7GYgPV9kH3GWvsc,48983
122
122
  sky/provision/cudo/__init__.py,sha256=KAEl26MVPsk7IoP9Gg-MOJJRIV6-X9B0fbyHdyJWdLo,741
123
123
  sky/provision/cudo/config.py,sha256=RYOVkV0MoUqVBJRZiKhBZhjFygeyFs7eUdVMdPg1vds,327
124
124
  sky/provision/cudo/cudo_machine_type.py,sha256=_VNXWPELmlFXbtdcnPvkuLuyE9CZ923BUCdiac-ClDY,696
@@ -184,7 +184,7 @@ sky/serve/serve_utils.py,sha256=wqBxChpJylZ_qHWyFmMBJqrG8_7xTIOr9nlOeyHs9P8,3943
184
184
  sky/serve/service.py,sha256=fkfJvNJ2BO6rfV0TblZG-QkOXaCyZlpkwbGgrsTzf2w,11872
185
185
  sky/serve/service_spec.py,sha256=1aS6b-ku7W4CjyekXKDxjZsDdt-O8ygos-jFeXu31cA,13766
186
186
  sky/setup_files/MANIFEST.in,sha256=CXz8lIJMgWlH9TvYgzIL3vPFtSDoQq-UMfD9K62rtH4,590
187
- sky/setup_files/setup.py,sha256=o4IgiwFoTB6Sdn3MmOirUIS0OSkoh6qo_0vrgcmrYA4,12093
187
+ sky/setup_files/setup.py,sha256=G767GNB-jXqyC8MR-IdiojnnI2E6tP4gMYenKU14ZGA,12156
188
188
  sky/skylet/LICENSE,sha256=BnFrJSvUFpMUoH5mOpWnEvaC5R6Uux8W6WXgrte8iYg,12381
189
189
  sky/skylet/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
190
190
  sky/skylet/attempt_skylet.py,sha256=GZ6ITjjA0m-da3IxXXfoHR6n4pjp3X3TOXUqVvSrV0k,2136
@@ -192,7 +192,7 @@ sky/skylet/autostop_lib.py,sha256=JPDHmByuhoNYXSUHl-OnyeJUkOFWn7gDM1FrS7Kr3E8,44
192
192
  sky/skylet/configs.py,sha256=UtnpmEL0F9hH6PSjhsps7xgjGZ6qzPOfW1p2yj9tSng,1887
193
193
  sky/skylet/constants.py,sha256=OsuJcQp6UgkQ9Yfml6f_raXXbHS7-_h-v4QNv92y0Gw,14642
194
194
  sky/skylet/events.py,sha256=A09E7LmmwzcGrSG0n8K7d3EZ1ZJr1mmmzoGyhnArYJA,12303
195
- sky/skylet/job_lib.py,sha256=Nfvefaa3N5IwxfhhOz1XE7ps46l3LY-db6VWF2pC3HQ,35335
195
+ sky/skylet/job_lib.py,sha256=jqJ4D3UeG6fNMm8xPtdWclnrVHQb6WiRqb1nrBp8TPg,35200
196
196
  sky/skylet/log_lib.py,sha256=Jyj3h2yMBlheFX53AabXEiPaKyCbu06hLEhay5_ZRN0,18734
197
197
  sky/skylet/log_lib.pyi,sha256=AHMkW2DGK2erFovb3ToZWxRiYaATlzkxKb5J9pkgF2Y,4295
198
198
  sky/skylet/skylet.py,sha256=U9plr5hmhD9-Nyy0LMCymlE8DWtRXTFXQvfbFsS746Y,1153
@@ -219,7 +219,7 @@ sky/skylet/ray_patches/resource_demand_scheduler.py.patch,sha256=AVV-Hw-Rxw16aFm
219
219
  sky/skylet/ray_patches/updater.py.patch,sha256=ZNMGVYICPBB44jLbEx2KvCgIY7BWYdDv3-2b2HJWmAQ,289
220
220
  sky/skylet/ray_patches/worker.py.patch,sha256=_OBhibdr3xOy5Qje6Tt8D1eQVm_msi50TJbCJmOTxVU,565
221
221
  sky/templates/aws-ray.yml.j2,sha256=K0rAuyf1XC_GPFp1BR9df42-Be12A6T2UF0BllVSpYg,8005
222
- sky/templates/azure-ray.yml.j2,sha256=l8zBUVfMPNRlKpn3l7_D3yXpdrUoSeykUuZRy0UoCLQ,6308
222
+ sky/templates/azure-ray.yml.j2,sha256=uUneIfT5vTLUCvrZXiv2dsd3gFqLH2FK632oBruOO_k,6237
223
223
  sky/templates/cudo-ray.yml.j2,sha256=SEHVY57iBauCOE2HYJtYVFEKlriAkdwQu_p86a1n_bA,3548
224
224
  sky/templates/fluidstack-ray.yml.j2,sha256=t8TCULgiErCZdtFmBZVsA8ZdcqR7ccwsmQhuDFTBEAU,3541
225
225
  sky/templates/gcp-ray.yml.j2,sha256=y95B-Nk6hFxm6vEIaxI1wFzAIcy_GcKC3XMYo9m-ThI,9662
@@ -249,14 +249,14 @@ sky/utils/command_runner.py,sha256=3CDcqRXEmoe3C-t2P58McgcRg6p9m5haUWYj1rOLuqM,3
249
249
  sky/utils/command_runner.pyi,sha256=mJOzCgcYZAfHwnY_6Wf1YwlTEJGb9ihzc2f0rE0Kw98,7751
250
250
  sky/utils/common_utils.py,sha256=Qy25LuIoTT0qg391EWyT9i5D6fwk1S4OdFwRpCTZ9Vk,24657
251
251
  sky/utils/controller_utils.py,sha256=wF4_y1PCsLAWoo3XEtECwkNYTN6hO3vn_cxGxgQYcd8,43268
252
- sky/utils/dag_utils.py,sha256=gjGZiJj4_GYsraXX67e6ElvbmOByJcyjSfvVgYZiXvs,5588
252
+ sky/utils/dag_utils.py,sha256=pVX3lGDDcYTcGoH_1jEWzl9767Y4mwlIEYIzoyHO6gM,6105
253
253
  sky/utils/db_utils.py,sha256=AOvMmBEN9cF4I7CoXihPCtus4mU2VDGjBQSVMMgzKlA,2786
254
254
  sky/utils/env_options.py,sha256=3oAaUPxowL6vI2XmxXrH56V7Myj9IJWsL-MXFmRFVdI,1294
255
255
  sky/utils/kubernetes_enums.py,sha256=imGqHSa8O07zD_6xH1SDMM7dBU5lF5fzFFlQuQy00QM,1384
256
256
  sky/utils/log_utils.py,sha256=ptv2sbsiJSgk4NvdccrMsUR-MvOKnbu4BQiRSishgk0,12472
257
257
  sky/utils/resources_utils.py,sha256=Xqi7gxPYw2y5wl5okUI5zx5LEij0hJF_V3Zi8q7TXYg,7890
258
258
  sky/utils/rich_utils.py,sha256=hmnI1X5dKvRIQzB7EyNb34FT97qFNve-0QHqM5r0mVk,3066
259
- sky/utils/schemas.py,sha256=MTjGcxmc4aAz9QzqZY2pO87uNuWhJ3ss1N9rXcCNYGQ,28357
259
+ sky/utils/schemas.py,sha256=mogoStpQ77S936VfChinAW2I1DT4q2c5E7qY_qNiO0w,29094
260
260
  sky/utils/subprocess_utils.py,sha256=3R54Elc2n8DQeO6Y8MCDJ6N6v27HDGpbNMIfCquqXYQ,6552
261
261
  sky/utils/timeline.py,sha256=ao_nm0y52ZQILfL7Y92c3pSEFRyPm_ElORC3DrI5BwQ,3936
262
262
  sky/utils/ux_utils.py,sha256=CqyIFGDuSE8fQasPkna_loZMwtboC9KedR09WEQ7qz0,6502
@@ -274,9 +274,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=k0TBoQ4zgf79-sVkixKSGYFHQ7Z
274
274
  sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
275
275
  sky/utils/kubernetes/rsync_helper.sh,sha256=hyYDaYSNxYaNvzUQBzC8AidB7nDeojizjkzc_CTxycY,1077
276
276
  sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
277
- skypilot_nightly-1.0.0.dev20241029.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
278
- skypilot_nightly-1.0.0.dev20241029.dist-info/METADATA,sha256=UAIFfOVp0n7QbIlx-vP21aRhzERPIIoEGbE4RcLzR5U,19540
279
- skypilot_nightly-1.0.0.dev20241029.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
280
- skypilot_nightly-1.0.0.dev20241029.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
281
- skypilot_nightly-1.0.0.dev20241029.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
282
- skypilot_nightly-1.0.0.dev20241029.dist-info/RECORD,,
277
+ skypilot_nightly-1.0.0.dev20241031.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
278
+ skypilot_nightly-1.0.0.dev20241031.dist-info/METADATA,sha256=_tflHJA52Q4Bt2-rQMwozR70icXKpmFjdr9H-zkURe8,19708
279
+ skypilot_nightly-1.0.0.dev20241031.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
280
+ skypilot_nightly-1.0.0.dev20241031.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
281
+ skypilot_nightly-1.0.0.dev20241031.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
282
+ skypilot_nightly-1.0.0.dev20241031.dist-info/RECORD,,