skypilot-nightly 1.0.0.dev20241028__py3-none-any.whl → 1.0.0.dev20241030__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. sky/__init__.py +2 -2
  2. sky/adaptors/azure.py +3 -0
  3. sky/backends/backend_utils.py +10 -133
  4. sky/backends/cloud_vm_ray_backend.py +17 -105
  5. sky/clouds/azure.py +10 -1
  6. sky/execution.py +5 -4
  7. sky/jobs/controller.py +38 -22
  8. sky/jobs/recovery_strategy.py +30 -5
  9. sky/jobs/state.py +33 -5
  10. sky/jobs/utils.py +28 -4
  11. sky/optimizer.py +11 -7
  12. sky/provision/azure/azure-config-template.json +7 -1
  13. sky/provision/azure/config.py +65 -45
  14. sky/provision/azure/instance.py +275 -70
  15. sky/provision/constants.py +7 -0
  16. sky/provision/gcp/instance.py +0 -7
  17. sky/resources.py +25 -8
  18. sky/serve/core.py +0 -2
  19. sky/serve/serve_state.py +3 -7
  20. sky/serve/serve_utils.py +2 -14
  21. sky/serve/service_spec.py +0 -28
  22. sky/setup_files/setup.py +4 -3
  23. sky/skylet/job_lib.py +37 -53
  24. sky/skylet/log_lib.py +5 -14
  25. sky/templates/azure-ray.yml.j2 +1 -0
  26. sky/utils/dag_utils.py +14 -4
  27. sky/utils/schemas.py +25 -15
  28. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/METADATA +13 -11
  29. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/RECORD +33 -33
  30. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/WHEEL +1 -1
  31. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/LICENSE +0 -0
  32. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/entry_points.txt +0 -0
  33. {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/top_level.txt +0 -0
sky/jobs/state.py CHANGED
@@ -2,6 +2,7 @@
2
2
  # TODO(zhwu): maybe use file based status instead of database, so
3
3
  # that we can easily switch to a s3-based storage.
4
4
  import enum
5
+ import json
5
6
  import pathlib
6
7
  import sqlite3
7
8
  import time
@@ -65,7 +66,8 @@ _CURSOR.execute("""\
65
66
  failure_reason TEXT,
66
67
  spot_job_id INTEGER,
67
68
  task_id INTEGER DEFAULT 0,
68
- task_name TEXT)""")
69
+ task_name TEXT,
70
+ specs TEXT)""")
69
71
  _CONN.commit()
70
72
 
71
73
  db_utils.add_column_to_table(_CURSOR, _CONN, 'spot', 'failure_reason', 'TEXT')
@@ -92,6 +94,17 @@ db_utils.add_column_to_table(_CURSOR,
92
94
  'TEXT',
93
95
  copy_from='job_name')
94
96
 
97
+ # Specs is some useful information about the task, e.g., the
98
+ # max_restarts_on_errors value. It is stored in JSON format.
99
+ db_utils.add_column_to_table(_CURSOR,
100
+ _CONN,
101
+ 'spot',
102
+ 'specs',
103
+ 'TEXT',
104
+ value_to_replace_existing_entries=json.dumps({
105
+ 'max_restarts_on_errors': 0,
106
+ }))
107
+
95
108
  # `job_info` contains the mapping from job_id to the job_name.
96
109
  # In the future, it may contain more information about each job.
97
110
  _CURSOR.execute("""\
@@ -130,7 +143,8 @@ columns = [
130
143
  'task_name',
131
144
  # columns from the job_info table
132
145
  '_job_info_job_id', # This should be the same as job_id
133
- 'job_name'
146
+ 'job_name',
147
+ 'specs',
134
148
  ]
135
149
 
136
150
 
@@ -283,7 +297,8 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
283
297
 
284
298
  def set_submitted(job_id: int, task_id: int, run_timestamp: str,
285
299
  submit_time: float, resources_str: str,
286
- callback_func: CallbackType):
300
+ specs: Dict[str, Union[str,
301
+ int]], callback_func: CallbackType):
287
302
  """Set the task to submitted.
288
303
 
289
304
  Args:
@@ -293,6 +308,8 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
293
308
  determine the log directory of the managed task.
294
309
  submit_time: The time when the managed task is submitted.
295
310
  resources_str: The resources string of the managed task.
311
+ specs: The specs of the managed task.
312
+ callback_func: The callback function.
296
313
  """
297
314
  # Use the timestamp in the `run_timestamp` ('sky-2022-10...'), to make
298
315
  # the log directory and submission time align with each other, so as to
@@ -306,11 +323,12 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
306
323
  resources=(?),
307
324
  submitted_at=(?),
308
325
  status=(?),
309
- run_timestamp=(?)
326
+ run_timestamp=(?),
327
+ specs=(?)
310
328
  WHERE spot_job_id=(?) AND
311
329
  task_id=(?)""",
312
330
  (resources_str, submit_time, ManagedJobStatus.SUBMITTED.value,
313
- run_timestamp, job_id, task_id))
331
+ run_timestamp, json.dumps(specs), job_id, task_id))
314
332
  callback_func('SUBMITTED')
315
333
 
316
334
 
@@ -619,3 +637,13 @@ def get_latest_job_id() -> Optional[int]:
619
637
  for (job_id,) in rows:
620
638
  return job_id
621
639
  return None
640
+
641
+
642
+ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
643
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
644
+ task_specs = cursor.execute(
645
+ """\
646
+ SELECT specs FROM spot
647
+ WHERE spot_job_id=(?) AND task_id=(?)""",
648
+ (job_id, task_id)).fetchone()
649
+ return json.loads(task_specs[0])
sky/jobs/utils.py CHANGED
@@ -70,7 +70,7 @@ _JOB_CANCELLED_MESSAGE = (
70
70
  # state, after the job finished. This is a safeguard to avoid the case where
71
71
  # the managed job status fails to be updated and keep the `sky jobs logs`
72
72
  # blocking for a long time.
73
- _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 20
73
+ _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 25
74
74
 
75
75
 
76
76
  class UserSignal(enum.Enum):
@@ -392,8 +392,12 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
392
392
  f'INFO: Log for the current task ({task_id}) '
393
393
  'is finished. Waiting for the next task\'s log '
394
394
  'to be started.')
395
- status_display.update('Waiting for the next task: '
396
- f'{task_id + 1}.')
395
+ # Add a newline to avoid the status display below
396
+ # removing the last line of the task output.
397
+ print()
398
+ status_display.update(
399
+ ux_utils.spinner_message(
400
+ f'Waiting for the next task: {task_id + 1}'))
397
401
  status_display.start()
398
402
  original_task_id = task_id
399
403
  while True:
@@ -405,7 +409,27 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
405
409
  time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
406
410
  continue
407
411
  else:
408
- break
412
+ task_specs = managed_job_state.get_task_specs(
413
+ job_id, task_id)
414
+ if task_specs.get('max_restarts_on_errors', 0) == 0:
415
+ # We don't need to wait for the managed job status
416
+ # update, as the job is guaranteed to be in terminal
417
+ # state afterwards.
418
+ break
419
+ print()
420
+ status_display.update(
421
+ ux_utils.spinner_message(
422
+ 'Waiting for next restart for the failed task'))
423
+ status_display.start()
424
+ while True:
425
+ _, managed_job_status = (
426
+ managed_job_state.get_latest_task_id_status(
427
+ job_id))
428
+ if (managed_job_status !=
429
+ managed_job_state.ManagedJobStatus.RUNNING):
430
+ break
431
+ time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
432
+ continue
409
433
  # The job can be cancelled by the user or the controller (when
410
434
  # the cluster is partially preempted).
411
435
  logger.debug(
sky/optimizer.py CHANGED
@@ -831,13 +831,17 @@ class Optimizer:
831
831
  return row
832
832
 
833
833
  def _get_resource_group_hash(resources: 'resources_lib.Resources'):
834
- return json.dumps(
835
- {
836
- 'cloud': f'{resources.cloud}',
837
- 'accelerators': f'{resources.accelerators}',
838
- 'use_spot': resources.use_spot
839
- },
840
- sort_keys=True)
834
+ resource_key_dict = {
835
+ 'cloud': f'{resources.cloud}',
836
+ 'accelerators': f'{resources.accelerators}',
837
+ 'use_spot': resources.use_spot
838
+ }
839
+ if isinstance(resources.cloud, clouds.Kubernetes):
840
+ # Region for Kubernetes is the context name, i.e. different
841
+ # Kubernetes clusters. We add region to the key to show all the
842
+ # Kubernetes clusters in the optimizer table for better UX.
843
+ resource_key_dict['region'] = resources.region
844
+ return json.dumps(resource_key_dict, sort_keys=True)
841
845
 
842
846
  # Print the list of resources that the optimizer considered.
843
847
  resource_fields = [
@@ -14,6 +14,12 @@
14
14
  "description": "Subnet parameters."
15
15
  }
16
16
  },
17
+ "location": {
18
+ "type": "string",
19
+ "metadata": {
20
+ "description": "Location of where the resources are allocated."
21
+ }
22
+ },
17
23
  "nsgName": {
18
24
  "type": "string",
19
25
  "metadata": {
@@ -23,7 +29,7 @@
23
29
  },
24
30
  "variables": {
25
31
  "contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
26
- "location": "[resourceGroup().location]",
32
+ "location": "[parameters('location')]",
27
33
  "msiName": "[concat('sky-', parameters('clusterId'), '-msi')]",
28
34
  "roleAssignmentName": "[concat('sky-', parameters('clusterId'), '-ra')]",
29
35
  "nsgName": "[parameters('nsgName')]",
@@ -14,13 +14,12 @@ from sky import exceptions
14
14
  from sky import sky_logging
15
15
  from sky.adaptors import azure
16
16
  from sky.provision import common
17
+ from sky.provision import constants
17
18
  from sky.utils import common_utils
18
19
 
19
20
  logger = sky_logging.init_logger(__name__)
20
21
 
21
22
  UNIQUE_ID_LEN = 4
22
- _DEPLOYMENT_NAME = 'skypilot-config'
23
- _LEGACY_DEPLOYMENT_NAME = 'ray-config'
24
23
  _RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT = 480 # 8 minutes
25
24
  _CLUSTER_ID = '{cluster_name_on_cloud}-{unique_id}'
26
25
 
@@ -82,46 +81,55 @@ def bootstrap_instances(
82
81
  in provider_config), 'Provider config must include location field'
83
82
  params = {'location': provider_config['location']}
84
83
 
84
+ assert ('use_external_resource_group'
85
+ in provider_config), ('Provider config must include '
86
+ 'use_external_resource_group field')
87
+ use_external_resource_group = provider_config['use_external_resource_group']
88
+
85
89
  if 'tags' in provider_config:
86
90
  params['tags'] = provider_config['tags']
87
91
 
88
- logger.info(f'Creating/Updating resource group: {resource_group}')
89
- rg_create_or_update = get_azure_sdk_function(
90
- client=resource_client.resource_groups,
91
- function_name='create_or_update')
92
- rg_creation_start = time.time()
93
- retry = 0
94
- while (time.time() - rg_creation_start <
95
- _RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT):
96
- try:
97
- rg_create_or_update(resource_group_name=resource_group,
98
- parameters=params)
99
- break
100
- except azure.exceptions().ResourceExistsError as e:
101
- if 'ResourceGroupBeingDeleted' in str(e):
102
- if retry % 5 == 0:
103
- logger.info(
104
- f'Azure resource group {resource_group} of a recent '
105
- f'terminated cluster {cluster_name_on_cloud} is being '
106
- 'deleted. It can only be provisioned after it is fully '
107
- 'deleted. Waiting...')
108
- time.sleep(1)
109
- retry += 1
110
- continue
111
- raise
112
- except azure.exceptions().ClientAuthenticationError as e:
92
+ # When resource group is user specified, it already exists in certain
93
+ # region.
94
+ if not use_external_resource_group:
95
+ logger.info(f'Creating/Updating resource group: {resource_group}')
96
+ rg_create_or_update = get_azure_sdk_function(
97
+ client=resource_client.resource_groups,
98
+ function_name='create_or_update')
99
+ rg_creation_start = time.time()
100
+ retry = 0
101
+ while (time.time() - rg_creation_start <
102
+ _RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT):
103
+ try:
104
+ rg_create_or_update(resource_group_name=resource_group,
105
+ parameters=params)
106
+ break
107
+ except azure.exceptions().ResourceExistsError as e:
108
+ if 'ResourceGroupBeingDeleted' in str(e):
109
+ if retry % 5 == 0:
110
+ logger.info(
111
+ f'Azure resource group {resource_group} of a '
112
+ 'recent terminated cluster '
113
+ f'{cluster_name_on_cloud} is being deleted. It can'
114
+ ' only be provisioned after it is fully deleted. '
115
+ 'Waiting...')
116
+ time.sleep(1)
117
+ retry += 1
118
+ continue
119
+ raise
120
+ except azure.exceptions().ClientAuthenticationError as e:
121
+ message = (
122
+ 'Failed to authenticate with Azure. Please check your '
123
+ 'Azure credentials. Error: '
124
+ f'{common_utils.format_exception(e)}').replace('\n', ' ')
125
+ logger.error(message)
126
+ raise exceptions.NoClusterLaunchedError(message) from e
127
+ else:
113
128
  message = (
114
- 'Failed to authenticate with Azure. Please check your Azure '
115
- f'credentials. Error: {common_utils.format_exception(e)}'
116
- ).replace('\n', ' ')
129
+ f'Timed out waiting for resource group {resource_group} to be '
130
+ 'deleted.')
117
131
  logger.error(message)
118
- raise exceptions.NoClusterLaunchedError(message) from e
119
- else:
120
- message = (
121
- f'Timed out waiting for resource group {resource_group} to be '
122
- 'deleted.')
123
- logger.error(message)
124
- raise TimeoutError(message)
132
+ raise TimeoutError(message)
125
133
 
126
134
  # load the template file
127
135
  current_path = Path(__file__).parent
@@ -155,6 +163,9 @@ def bootstrap_instances(
155
163
  'nsgName': {
156
164
  'value': nsg_name
157
165
  },
166
+ 'location': {
167
+ 'value': params['location']
168
+ }
158
169
  },
159
170
  }
160
171
  }
@@ -164,11 +175,22 @@ def bootstrap_instances(
164
175
  get_deployment = get_azure_sdk_function(client=resource_client.deployments,
165
176
  function_name='get')
166
177
  deployment_exists = False
167
- for deployment_name in [_DEPLOYMENT_NAME, _LEGACY_DEPLOYMENT_NAME]:
178
+ if use_external_resource_group:
179
+ deployment_name = (
180
+ constants.EXTERNAL_RG_BOOTSTRAP_DEPLOYMENT_NAME.format(
181
+ cluster_name_on_cloud=cluster_name_on_cloud))
182
+ deployment_list = [deployment_name]
183
+ else:
184
+ deployment_name = constants.DEPLOYMENT_NAME
185
+ deployment_list = [
186
+ constants.DEPLOYMENT_NAME, constants.LEGACY_DEPLOYMENT_NAME
187
+ ]
188
+
189
+ for deploy_name in deployment_list:
168
190
  try:
169
191
  deployment = get_deployment(resource_group_name=resource_group,
170
- deployment_name=deployment_name)
171
- logger.info(f'Deployment {deployment_name!r} already exists. '
192
+ deployment_name=deploy_name)
193
+ logger.info(f'Deployment {deploy_name!r} already exists. '
172
194
  'Skipping deployment creation.')
173
195
 
174
196
  outputs = deployment.properties.outputs
@@ -179,22 +201,20 @@ def bootstrap_instances(
179
201
  deployment_exists = False
180
202
 
181
203
  if not deployment_exists:
182
- logger.info(f'Creating/Updating deployment: {_DEPLOYMENT_NAME}')
204
+ logger.info(f'Creating/Updating deployment: {deployment_name}')
183
205
  create_or_update = get_azure_sdk_function(
184
206
  client=resource_client.deployments,
185
207
  function_name='create_or_update')
186
208
  # TODO (skypilot): this takes a long time (> 40 seconds) to run.
187
209
  outputs = create_or_update(
188
210
  resource_group_name=resource_group,
189
- deployment_name=_DEPLOYMENT_NAME,
211
+ deployment_name=deployment_name,
190
212
  parameters=parameters,
191
213
  ).result().properties.outputs
192
214
 
193
- nsg_id = outputs['nsg']['value']
194
-
195
215
  # append output resource ids to be used with vm creation
196
216
  provider_config['msi'] = outputs['msi']['value']
197
- provider_config['nsg'] = nsg_id
217
+ provider_config['nsg'] = outputs['nsg']['value']
198
218
  provider_config['subnet'] = outputs['subnet']['value']
199
219
 
200
220
  return config