skypilot-nightly 1.0.0.dev20241028__py3-none-any.whl → 1.0.0.dev20241030__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/azure.py +3 -0
- sky/backends/backend_utils.py +10 -133
- sky/backends/cloud_vm_ray_backend.py +17 -105
- sky/clouds/azure.py +10 -1
- sky/execution.py +5 -4
- sky/jobs/controller.py +38 -22
- sky/jobs/recovery_strategy.py +30 -5
- sky/jobs/state.py +33 -5
- sky/jobs/utils.py +28 -4
- sky/optimizer.py +11 -7
- sky/provision/azure/azure-config-template.json +7 -1
- sky/provision/azure/config.py +65 -45
- sky/provision/azure/instance.py +275 -70
- sky/provision/constants.py +7 -0
- sky/provision/gcp/instance.py +0 -7
- sky/resources.py +25 -8
- sky/serve/core.py +0 -2
- sky/serve/serve_state.py +3 -7
- sky/serve/serve_utils.py +2 -14
- sky/serve/service_spec.py +0 -28
- sky/setup_files/setup.py +4 -3
- sky/skylet/job_lib.py +37 -53
- sky/skylet/log_lib.py +5 -14
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/utils/dag_utils.py +14 -4
- sky/utils/schemas.py +25 -15
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/METADATA +13 -11
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/RECORD +33 -33
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241028.dist-info → skypilot_nightly-1.0.0.dev20241030.dist-info}/top_level.txt +0 -0
sky/jobs/state.py
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
# TODO(zhwu): maybe use file based status instead of database, so
|
3
3
|
# that we can easily switch to a s3-based storage.
|
4
4
|
import enum
|
5
|
+
import json
|
5
6
|
import pathlib
|
6
7
|
import sqlite3
|
7
8
|
import time
|
@@ -65,7 +66,8 @@ _CURSOR.execute("""\
|
|
65
66
|
failure_reason TEXT,
|
66
67
|
spot_job_id INTEGER,
|
67
68
|
task_id INTEGER DEFAULT 0,
|
68
|
-
task_name TEXT
|
69
|
+
task_name TEXT,
|
70
|
+
specs TEXT)""")
|
69
71
|
_CONN.commit()
|
70
72
|
|
71
73
|
db_utils.add_column_to_table(_CURSOR, _CONN, 'spot', 'failure_reason', 'TEXT')
|
@@ -92,6 +94,17 @@ db_utils.add_column_to_table(_CURSOR,
|
|
92
94
|
'TEXT',
|
93
95
|
copy_from='job_name')
|
94
96
|
|
97
|
+
# Specs is some useful information about the task, e.g., the
|
98
|
+
# max_restarts_on_errors value. It is stored in JSON format.
|
99
|
+
db_utils.add_column_to_table(_CURSOR,
|
100
|
+
_CONN,
|
101
|
+
'spot',
|
102
|
+
'specs',
|
103
|
+
'TEXT',
|
104
|
+
value_to_replace_existing_entries=json.dumps({
|
105
|
+
'max_restarts_on_errors': 0,
|
106
|
+
}))
|
107
|
+
|
95
108
|
# `job_info` contains the mapping from job_id to the job_name.
|
96
109
|
# In the future, it may contain more information about each job.
|
97
110
|
_CURSOR.execute("""\
|
@@ -130,7 +143,8 @@ columns = [
|
|
130
143
|
'task_name',
|
131
144
|
# columns from the job_info table
|
132
145
|
'_job_info_job_id', # This should be the same as job_id
|
133
|
-
'job_name'
|
146
|
+
'job_name',
|
147
|
+
'specs',
|
134
148
|
]
|
135
149
|
|
136
150
|
|
@@ -283,7 +297,8 @@ def set_pending(job_id: int, task_id: int, task_name: str, resources_str: str):
|
|
283
297
|
|
284
298
|
def set_submitted(job_id: int, task_id: int, run_timestamp: str,
|
285
299
|
submit_time: float, resources_str: str,
|
286
|
-
|
300
|
+
specs: Dict[str, Union[str,
|
301
|
+
int]], callback_func: CallbackType):
|
287
302
|
"""Set the task to submitted.
|
288
303
|
|
289
304
|
Args:
|
@@ -293,6 +308,8 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
|
|
293
308
|
determine the log directory of the managed task.
|
294
309
|
submit_time: The time when the managed task is submitted.
|
295
310
|
resources_str: The resources string of the managed task.
|
311
|
+
specs: The specs of the managed task.
|
312
|
+
callback_func: The callback function.
|
296
313
|
"""
|
297
314
|
# Use the timestamp in the `run_timestamp` ('sky-2022-10...'), to make
|
298
315
|
# the log directory and submission time align with each other, so as to
|
@@ -306,11 +323,12 @@ def set_submitted(job_id: int, task_id: int, run_timestamp: str,
|
|
306
323
|
resources=(?),
|
307
324
|
submitted_at=(?),
|
308
325
|
status=(?),
|
309
|
-
run_timestamp=(?)
|
326
|
+
run_timestamp=(?),
|
327
|
+
specs=(?)
|
310
328
|
WHERE spot_job_id=(?) AND
|
311
329
|
task_id=(?)""",
|
312
330
|
(resources_str, submit_time, ManagedJobStatus.SUBMITTED.value,
|
313
|
-
run_timestamp, job_id, task_id))
|
331
|
+
run_timestamp, json.dumps(specs), job_id, task_id))
|
314
332
|
callback_func('SUBMITTED')
|
315
333
|
|
316
334
|
|
@@ -619,3 +637,13 @@ def get_latest_job_id() -> Optional[int]:
|
|
619
637
|
for (job_id,) in rows:
|
620
638
|
return job_id
|
621
639
|
return None
|
640
|
+
|
641
|
+
|
642
|
+
def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
|
643
|
+
with db_utils.safe_cursor(_DB_PATH) as cursor:
|
644
|
+
task_specs = cursor.execute(
|
645
|
+
"""\
|
646
|
+
SELECT specs FROM spot
|
647
|
+
WHERE spot_job_id=(?) AND task_id=(?)""",
|
648
|
+
(job_id, task_id)).fetchone()
|
649
|
+
return json.loads(task_specs[0])
|
sky/jobs/utils.py
CHANGED
@@ -70,7 +70,7 @@ _JOB_CANCELLED_MESSAGE = (
|
|
70
70
|
# state, after the job finished. This is a safeguard to avoid the case where
|
71
71
|
# the managed job status fails to be updated and keep the `sky jobs logs`
|
72
72
|
# blocking for a long time.
|
73
|
-
_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 20
|
73
|
+
_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 25
|
74
74
|
|
75
75
|
|
76
76
|
class UserSignal(enum.Enum):
|
@@ -392,8 +392,12 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
|
|
392
392
|
f'INFO: Log for the current task ({task_id}) '
|
393
393
|
'is finished. Waiting for the next task\'s log '
|
394
394
|
'to be started.')
|
395
|
-
|
396
|
-
|
395
|
+
# Add a newline to avoid the status display below
|
396
|
+
# removing the last line of the task output.
|
397
|
+
print()
|
398
|
+
status_display.update(
|
399
|
+
ux_utils.spinner_message(
|
400
|
+
f'Waiting for the next task: {task_id + 1}'))
|
397
401
|
status_display.start()
|
398
402
|
original_task_id = task_id
|
399
403
|
while True:
|
@@ -405,7 +409,27 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> str:
|
|
405
409
|
time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
|
406
410
|
continue
|
407
411
|
else:
|
408
|
-
break
|
412
|
+
task_specs = managed_job_state.get_task_specs(
|
413
|
+
job_id, task_id)
|
414
|
+
if task_specs.get('max_restarts_on_errors', 0) == 0:
|
415
|
+
# We don't need to wait for the managed job status
|
416
|
+
# update, as the job is guaranteed to be in terminal
|
417
|
+
# state afterwards.
|
418
|
+
break
|
419
|
+
print()
|
420
|
+
status_display.update(
|
421
|
+
ux_utils.spinner_message(
|
422
|
+
'Waiting for next restart for the failed task'))
|
423
|
+
status_display.start()
|
424
|
+
while True:
|
425
|
+
_, managed_job_status = (
|
426
|
+
managed_job_state.get_latest_task_id_status(
|
427
|
+
job_id))
|
428
|
+
if (managed_job_status !=
|
429
|
+
managed_job_state.ManagedJobStatus.RUNNING):
|
430
|
+
break
|
431
|
+
time.sleep(JOB_STATUS_CHECK_GAP_SECONDS)
|
432
|
+
continue
|
409
433
|
# The job can be cancelled by the user or the controller (when
|
410
434
|
# the cluster is partially preempted).
|
411
435
|
logger.debug(
|
sky/optimizer.py
CHANGED
@@ -831,13 +831,17 @@ class Optimizer:
|
|
831
831
|
return row
|
832
832
|
|
833
833
|
def _get_resource_group_hash(resources: 'resources_lib.Resources'):
|
834
|
-
|
835
|
-
{
|
836
|
-
|
837
|
-
|
838
|
-
|
839
|
-
|
840
|
-
|
834
|
+
resource_key_dict = {
|
835
|
+
'cloud': f'{resources.cloud}',
|
836
|
+
'accelerators': f'{resources.accelerators}',
|
837
|
+
'use_spot': resources.use_spot
|
838
|
+
}
|
839
|
+
if isinstance(resources.cloud, clouds.Kubernetes):
|
840
|
+
# Region for Kubernetes is the context name, i.e. different
|
841
|
+
# Kubernetes clusters. We add region to the key to show all the
|
842
|
+
# Kubernetes clusters in the optimizer table for better UX.
|
843
|
+
resource_key_dict['region'] = resources.region
|
844
|
+
return json.dumps(resource_key_dict, sort_keys=True)
|
841
845
|
|
842
846
|
# Print the list of resouces that the optimizer considered.
|
843
847
|
resource_fields = [
|
@@ -14,6 +14,12 @@
|
|
14
14
|
"description": "Subnet parameters."
|
15
15
|
}
|
16
16
|
},
|
17
|
+
"location": {
|
18
|
+
"type": "string",
|
19
|
+
"metadata": {
|
20
|
+
"description": "Location of where the resources are allocated."
|
21
|
+
}
|
22
|
+
},
|
17
23
|
"nsgName": {
|
18
24
|
"type": "string",
|
19
25
|
"metadata": {
|
@@ -23,7 +29,7 @@
|
|
23
29
|
},
|
24
30
|
"variables": {
|
25
31
|
"contributor": "[subscriptionResourceId('Microsoft.Authorization/roleDefinitions', 'b24988ac-6180-42a0-ab88-20f7382dd24c')]",
|
26
|
-
"location": "[resourceGroup().location]",
|
32
|
+
"location": "[parameters('location')]",
|
27
33
|
"msiName": "[concat('sky-', parameters('clusterId'), '-msi')]",
|
28
34
|
"roleAssignmentName": "[concat('sky-', parameters('clusterId'), '-ra')]",
|
29
35
|
"nsgName": "[parameters('nsgName')]",
|
sky/provision/azure/config.py
CHANGED
@@ -14,13 +14,12 @@ from sky import exceptions
|
|
14
14
|
from sky import sky_logging
|
15
15
|
from sky.adaptors import azure
|
16
16
|
from sky.provision import common
|
17
|
+
from sky.provision import constants
|
17
18
|
from sky.utils import common_utils
|
18
19
|
|
19
20
|
logger = sky_logging.init_logger(__name__)
|
20
21
|
|
21
22
|
UNIQUE_ID_LEN = 4
|
22
|
-
_DEPLOYMENT_NAME = 'skypilot-config'
|
23
|
-
_LEGACY_DEPLOYMENT_NAME = 'ray-config'
|
24
23
|
_RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT = 480 # 8 minutes
|
25
24
|
_CLUSTER_ID = '{cluster_name_on_cloud}-{unique_id}'
|
26
25
|
|
@@ -82,46 +81,55 @@ def bootstrap_instances(
|
|
82
81
|
in provider_config), 'Provider config must include location field'
|
83
82
|
params = {'location': provider_config['location']}
|
84
83
|
|
84
|
+
assert ('use_external_resource_group'
|
85
|
+
in provider_config), ('Provider config must include '
|
86
|
+
'use_external_resource_group field')
|
87
|
+
use_external_resource_group = provider_config['use_external_resource_group']
|
88
|
+
|
85
89
|
if 'tags' in provider_config:
|
86
90
|
params['tags'] = provider_config['tags']
|
87
91
|
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
92
|
+
# When resource group is user specified, it already exists in certain
|
93
|
+
# region.
|
94
|
+
if not use_external_resource_group:
|
95
|
+
logger.info(f'Creating/Updating resource group: {resource_group}')
|
96
|
+
rg_create_or_update = get_azure_sdk_function(
|
97
|
+
client=resource_client.resource_groups,
|
98
|
+
function_name='create_or_update')
|
99
|
+
rg_creation_start = time.time()
|
100
|
+
retry = 0
|
101
|
+
while (time.time() - rg_creation_start <
|
102
|
+
_RESOURCE_GROUP_WAIT_FOR_DELETION_TIMEOUT):
|
103
|
+
try:
|
104
|
+
rg_create_or_update(resource_group_name=resource_group,
|
105
|
+
parameters=params)
|
106
|
+
break
|
107
|
+
except azure.exceptions().ResourceExistsError as e:
|
108
|
+
if 'ResourceGroupBeingDeleted' in str(e):
|
109
|
+
if retry % 5 == 0:
|
110
|
+
logger.info(
|
111
|
+
f'Azure resource group {resource_group} of a '
|
112
|
+
'recent terminated cluster '
|
113
|
+
f'{cluster_name_on_cloud} is being deleted. It can'
|
114
|
+
' only be provisioned after it is fully deleted. '
|
115
|
+
'Waiting...')
|
116
|
+
time.sleep(1)
|
117
|
+
retry += 1
|
118
|
+
continue
|
119
|
+
raise
|
120
|
+
except azure.exceptions().ClientAuthenticationError as e:
|
121
|
+
message = (
|
122
|
+
'Failed to authenticate with Azure. Please check your '
|
123
|
+
'Azure credentials. Error: '
|
124
|
+
f'{common_utils.format_exception(e)}').replace('\n', ' ')
|
125
|
+
logger.error(message)
|
126
|
+
raise exceptions.NoClusterLaunchedError(message) from e
|
127
|
+
else:
|
113
128
|
message = (
|
114
|
-
'Failed to authenticate with Azure. Please check your '
|
115
|
-
|
116
|
-
).replace('\n', ' ')
|
129
|
+
f'Timed out waiting for resource group {resource_group} to be '
|
130
|
+
'deleted.')
|
117
131
|
logger.error(message)
|
118
|
-
raise
|
119
|
-
else:
|
120
|
-
message = (
|
121
|
-
f'Timed out waiting for resource group {resource_group} to be '
|
122
|
-
'deleted.')
|
123
|
-
logger.error(message)
|
124
|
-
raise TimeoutError(message)
|
132
|
+
raise TimeoutError(message)
|
125
133
|
|
126
134
|
# load the template file
|
127
135
|
current_path = Path(__file__).parent
|
@@ -155,6 +163,9 @@ def bootstrap_instances(
|
|
155
163
|
'nsgName': {
|
156
164
|
'value': nsg_name
|
157
165
|
},
|
166
|
+
'location': {
|
167
|
+
'value': params['location']
|
168
|
+
}
|
158
169
|
},
|
159
170
|
}
|
160
171
|
}
|
@@ -164,11 +175,22 @@ def bootstrap_instances(
|
|
164
175
|
get_deployment = get_azure_sdk_function(client=resource_client.deployments,
|
165
176
|
function_name='get')
|
166
177
|
deployment_exists = False
|
167
|
-
for deployment_name in [_DEPLOYMENT_NAME, _LEGACY_DEPLOYMENT_NAME]:
|
178
|
+
if use_external_resource_group:
|
179
|
+
deployment_name = (
|
180
|
+
constants.EXTERNAL_RG_BOOTSTRAP_DEPLOYMENT_NAME.format(
|
181
|
+
cluster_name_on_cloud=cluster_name_on_cloud))
|
182
|
+
deployment_list = [deployment_name]
|
183
|
+
else:
|
184
|
+
deployment_name = constants.DEPLOYMENT_NAME
|
185
|
+
deployment_list = [
|
186
|
+
constants.DEPLOYMENT_NAME, constants.LEGACY_DEPLOYMENT_NAME
|
187
|
+
]
|
188
|
+
|
189
|
+
for deploy_name in deployment_list:
|
168
190
|
try:
|
169
191
|
deployment = get_deployment(resource_group_name=resource_group,
|
170
|
-
deployment_name=deployment_name)
|
171
|
-
logger.info(f'Deployment {deployment_name!r} already exists. '
|
192
|
+
deployment_name=deploy_name)
|
193
|
+
logger.info(f'Deployment {deploy_name!r} already exists. '
|
172
194
|
'Skipping deployment creation.')
|
173
195
|
|
174
196
|
outputs = deployment.properties.outputs
|
@@ -179,22 +201,20 @@ def bootstrap_instances(
|
|
179
201
|
deployment_exists = False
|
180
202
|
|
181
203
|
if not deployment_exists:
|
182
|
-
logger.info(f'Creating/Updating deployment: {_DEPLOYMENT_NAME}')
|
204
|
+
logger.info(f'Creating/Updating deployment: {deployment_name}')
|
183
205
|
create_or_update = get_azure_sdk_function(
|
184
206
|
client=resource_client.deployments,
|
185
207
|
function_name='create_or_update')
|
186
208
|
# TODO (skypilot): this takes a long time (> 40 seconds) to run.
|
187
209
|
outputs = create_or_update(
|
188
210
|
resource_group_name=resource_group,
|
189
|
-
deployment_name=_DEPLOYMENT_NAME,
|
211
|
+
deployment_name=deployment_name,
|
190
212
|
parameters=parameters,
|
191
213
|
).result().properties.outputs
|
192
214
|
|
193
|
-
nsg_id = outputs['nsg']['value']
|
194
|
-
|
195
215
|
# append output resource ids to be used with vm creation
|
196
216
|
provider_config['msi'] = outputs['msi']['value']
|
197
|
-
provider_config['nsg'] = nsg_id
|
217
|
+
provider_config['nsg'] = outputs['nsg']['value']
|
198
218
|
provider_config['subnet'] = outputs['subnet']['value']
|
199
219
|
|
200
220
|
return config
|