skypilot-nightly 1.0.0.dev20250528__py3-none-any.whl → 1.0.0.dev20250530__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +99 -16
- sky/authentication.py +54 -7
- sky/backends/backend_utils.py +35 -22
- sky/backends/cloud_vm_ray_backend.py +30 -15
- sky/check.py +1 -1
- sky/cli.py +20 -8
- sky/client/cli.py +20 -8
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +60 -10
- sky/clouds/nebius.py +55 -14
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +3 -3
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/Q32Bxr2Pby5tFDW-y5TNg/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/236-ca00738e2f58ea65.js +6 -0
- sky/dashboard/out/_next/static/chunks/37-64efcd0e9c54bff6.js +6 -0
- sky/dashboard/out/_next/static/chunks/{173-7db8607cefc20f70.js → 614-3d29f98e0634b179.js} +2 -2
- sky/dashboard/out/_next/static/chunks/682-f3f1443ed2fba42f.js +6 -0
- sky/dashboard/out/_next/static/chunks/798-c0525dc3f21e488d.js +1 -0
- sky/dashboard/out/_next/static/chunks/843-786c36624d5ff61f.js +11 -0
- sky/dashboard/out/_next/static/chunks/856-02e34c9fc5945066.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-42d3656aba9d2e78.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-20835df7b0c4599c.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/{clusters-943992b84fd6f4ee.js → clusters-f37ff20f0af29aae.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{config-41738d1896fc02fe.js → config-3c6a2dabf56e8cd6.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-342bc15bb78ab2e5.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-7b4b8e7fa9fa0827.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-258decb65e95f520.js +11 -0
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a4efc09e61988f8d.js → jobs-78a6c5ba3e24c0cf.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-b2634885d67c49a6.js → users-89f9212b81d8897e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspace/{new-579b3203c7c19d84.js → new-198b6e00d7d724c5.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-9388e38fac73ee8f.js → [name]-2ce792183b03c341.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces-17d41826537196e7.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-f27c9a32aa3d9c6d.js +1 -0
- sky/dashboard/out/_next/static/css/5411b9fb0a783c1c.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/exceptions.py +11 -1
- sky/global_user_state.py +149 -1
- sky/jobs/client/sdk.py +1 -0
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +3 -5
- sky/jobs/recovery_strategy.py +148 -102
- sky/jobs/scheduler.py +23 -8
- sky/jobs/server/core.py +16 -0
- sky/jobs/state.py +153 -39
- sky/jobs/utils.py +33 -5
- sky/provision/kubernetes/utils.py +2 -1
- sky/provision/provisioner.py +15 -10
- sky/resources.py +16 -1
- sky/serve/controller.py +10 -7
- sky/serve/replica_managers.py +22 -18
- sky/serve/service.py +5 -4
- sky/server/common.py +11 -4
- sky/server/html/token_page.html +32 -6
- sky/server/server.py +3 -1
- sky/server/stream_utils.py +21 -0
- sky/setup_files/dependencies.py +7 -1
- sky/skylet/constants.py +1 -1
- sky/task.py +26 -0
- sky/templates/jobs-controller.yaml.j2 +2 -1
- sky/templates/kubernetes-ray.yml.j2 +19 -1
- sky/utils/common_utils.py +66 -0
- sky/utils/rich_utils.py +5 -0
- sky/utils/schemas.py +32 -1
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/METADATA +3 -1
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/RECORD +84 -81
- sky/dashboard/out/_next/static/Mx1iAbDQn1jMHh3UHmK3R/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-d6900c828331f664.js +0 -6
- sky/dashboard/out/_next/static/chunks/320-afea3ddcc5bd1c6c.js +0 -6
- sky/dashboard/out/_next/static/chunks/578-9146658cead92981.js +0 -6
- sky/dashboard/out/_next/static/chunks/843-256ec920f6d5f41f.js +0 -11
- sky/dashboard/out/_next/static/chunks/856-62b87c68917b08ed.js +0 -1
- sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-159bffb2fa34ed54.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9506c00257d10dbd.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-881fcd902fbbd0e5.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-2c29e97a6aa50dd4.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/workspaces-610c49ae3619ee85.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +0 -1
- sky/dashboard/out/_next/static/css/ffd1cd601648c303.css +0 -3
- /sky/dashboard/out/_next/static/{Mx1iAbDQn1jMHh3UHmK3R → Q32Bxr2Pby5tFDW-y5TNg}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-a631df412d8172de.js → _app-f19ea34b91c33950.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/top_level.txt +0 -0
sky/global_user_state.py
CHANGED
@@ -22,6 +22,7 @@ from sqlalchemy import orm
 from sqlalchemy.dialects import postgresql
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.ext import declarative
+import yaml
 
 from sky import models
 from sky import sky_logging
@@ -96,6 +97,12 @@ cluster_table = sqlalchemy.Table(
     sqlalchemy.Column('workspace',
                       sqlalchemy.Text,
                       server_default=constants.SKYPILOT_DEFAULT_WORKSPACE),
+    sqlalchemy.Column('last_creation_yaml',
+                      sqlalchemy.Text,
+                      server_default=None),
+    sqlalchemy.Column('last_creation_command',
+                      sqlalchemy.Text,
+                      server_default=None),
 )
 
 storage_table = sqlalchemy.Table(
@@ -133,6 +140,21 @@ cluster_history_table = sqlalchemy.Table(
     sqlalchemy.Column('user_hash', sqlalchemy.Text),
 )
 
+ssh_key_table = sqlalchemy.Table(
+    'ssh_key',
+    Base.metadata,
+    sqlalchemy.Column('user_hash', sqlalchemy.Text, primary_key=True),
+    sqlalchemy.Column('ssh_public_key', sqlalchemy.Text),
+    sqlalchemy.Column('ssh_private_key', sqlalchemy.Text),
+)
+
+cluster_yaml_table = sqlalchemy.Table(
+    'cluster_yaml',
+    Base.metadata,
+    sqlalchemy.Column('cluster_name', sqlalchemy.Text, primary_key=True),
+    sqlalchemy.Column('yaml', sqlalchemy.Text),
+)
+
 
 def _glob_to_similar(glob_pattern):
     """Converts a glob pattern to a PostgreSQL LIKE pattern."""
@@ -270,6 +292,19 @@ def create_table():
             default_statement='DEFAULT \'default\'',
             value_to_replace_existing_entries=constants.
             SKYPILOT_DEFAULT_WORKSPACE)
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'clusters',
+            'last_creation_yaml',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT NULL',
+        )
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'clusters',
+            'last_creation_command',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT NULL')
         session.commit()
 
 
@@ -318,7 +353,8 @@ def add_or_update_cluster(cluster_name: str,
                           requested_resources: Optional[Set[Any]],
                           ready: bool,
                           is_launch: bool = True,
-                          config_hash: Optional[str] = None
+                          config_hash: Optional[str] = None,
+                          task_config: Optional[Dict[str, Any]] = None):
     """Adds or updates cluster_name -> cluster_handle mapping.
 
     Args:
@@ -329,6 +365,8 @@ def add_or_update_cluster(cluster_name: str,
            be marked as INIT, otherwise it will be marked as UP.
        is_launch: if the cluster is firstly launched. If True, the launched_at
            and last_use will be updated. Otherwise, use the old value.
+       config_hash: Configuration hash for the cluster.
+       task_config: The config of the task being launched.
     """
     # TODO(zhwu): have to be imported here to avoid circular import.
     from sky import skypilot_config  # pylint: disable=import-outside-toplevel
@@ -404,6 +442,13 @@ def add_or_update_cluster(cluster_name: str,
         conditional_values.update({
             'workspace': active_workspace,
         })
+        if (is_launch and not cluster_row or
+                cluster_row.status != status_lib.ClusterStatus.UP.value):
+            conditional_values.update({
+                'last_creation_yaml': common_utils.dump_yaml_str(task_config)
+                                      if task_config else None,
+                'last_creation_command': last_use,
+            })
 
     if (_SQLALCHEMY_ENGINE.dialect.name ==
             db_utils.SQLAlchemyDialect.SQLITE.value):
@@ -790,6 +835,8 @@ def get_cluster_from_name(
         'user_name': get_user(user_hash).name,
         'config_hash': row.config_hash,
         'workspace': row.workspace,
+        'last_creation_yaml': row.last_creation_yaml,
+        'last_creation_command': row.last_creation_command,
     }
 
     return record
@@ -822,6 +869,8 @@ def get_clusters() -> List[Dict[str, Any]]:
         'user_name': get_user(user_hash).name,
         'config_hash': row.config_hash,
         'workspace': row.workspace,
+        'last_creation_yaml': row.last_creation_yaml,
+        'last_creation_command': row.last_creation_command,
     }
 
     records.append(record)
@@ -1049,3 +1098,102 @@ def get_storage() -> List[Dict[str, Any]]:
             'status': status_lib.StorageStatus[row.status],
         })
     return records
+
+
+def get_ssh_keys(user_hash: str) -> Tuple[str, str, bool]:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(ssh_key_table).filter_by(
+            user_hash=user_hash).first()
+        if row:
+            return row.ssh_public_key, row.ssh_private_key, True
+    return '', '', False
+
+
+def set_ssh_keys(user_hash: str, ssh_public_key: str, ssh_private_key: str):
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(ssh_key_table).values(
+            user_hash=user_hash,
+            ssh_public_key=ssh_public_key,
+            ssh_private_key=ssh_private_key)
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[ssh_key_table.c.user_hash],
+            set_={
+                ssh_key_table.c.ssh_public_key: ssh_public_key,
+                ssh_key_table.c.ssh_private_key: ssh_private_key
+            })
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+def get_cluster_yaml_str(cluster_yaml_path: Optional[str]) -> Optional[str]:
+    """Get the cluster yaml from the database or the local file system.
+    If the cluster yaml is not in the database, check if it exists on the
+    local file system and migrate it to the database.
+
+    It is assumed that the cluster yaml file is named as <cluster_name>.yml.
+    """
+    if cluster_yaml_path is None:
+        raise ValueError('Attempted to read a None YAML.')
+    cluster_file_name = os.path.basename(cluster_yaml_path)
+    cluster_name, _ = os.path.splitext(cluster_file_name)
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_yaml_table).filter_by(
+            cluster_name=cluster_name).first()
+        if row is None:
+            # If the cluster yaml is not in the database, check if it exists
+            # on the local file system and migrate it to the database.
+            # TODO(syang): remove this check once we have a way to migrate the
+            # cluster from file to database. Remove on v0.12.0.
+            if cluster_yaml_path is not None and os.path.exists(cluster_yaml_path):
+                with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
+                    yaml_str = f.read()
+                set_cluster_yaml(cluster_name, yaml_str)
+                return yaml_str
+            return None
+        return row.yaml
+
+
+def get_cluster_yaml_dict(cluster_yaml_path: Optional[str]) -> Dict[str, Any]:
+    """Get the cluster yaml as a dictionary from the database.
+
+    It is assumed that the cluster yaml file is named as <cluster_name>.yml.
+    """
+    yaml_str = get_cluster_yaml_str(cluster_yaml_path)
+    if yaml_str is None:
+        raise ValueError(f'Cluster yaml {cluster_yaml_path} not found.')
+    return yaml.safe_load(yaml_str)
+
+
+def set_cluster_yaml(cluster_name: str, yaml_str: str) -> None:
+    """Set the cluster yaml in the database."""
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(cluster_yaml_table).values(
+            cluster_name=cluster_name, yaml=yaml_str)
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[cluster_yaml_table.c.cluster_name],
+            set_={cluster_yaml_table.c.yaml: yaml_str})
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+def remove_cluster_yaml(cluster_name: str):
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(cluster_yaml_table).filter_by(
+            cluster_name=cluster_name).delete()
+        session.commit()
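
The new `set_ssh_keys` and `set_cluster_yaml` functions above share one dialect-aware upsert pattern: pick the `insert` construct matching the active dialect (SQLite or PostgreSQL), then attach `on_conflict_do_update` keyed on the table's primary key. Below is a minimal, self-contained sketch of that pattern against an in-memory SQLite database; the `kv_table` and `upsert` names are illustrative, not part of skypilot:

import sqlalchemy
from sqlalchemy.dialects import sqlite

metadata = sqlalchemy.MetaData()
kv_table = sqlalchemy.Table(
    'kv', metadata,
    sqlalchemy.Column('key', sqlalchemy.Text, primary_key=True),
    sqlalchemy.Column('value', sqlalchemy.Text))

engine = sqlalchemy.create_engine('sqlite:///:memory:')
metadata.create_all(engine)


def upsert(key: str, value: str) -> None:
    # Dialect-specific INSERT ... ON CONFLICT(key) DO UPDATE SET value = ...
    insert_stmnt = sqlite.insert(kv_table).values(key=key, value=value)
    do_update_stmt = insert_stmnt.on_conflict_do_update(
        index_elements=[kv_table.c.key], set_={kv_table.c.value: value})
    with engine.begin() as conn:
        conn.execute(do_update_stmt)


upsert('my-cluster', 'yaml v1')
upsert('my-cluster', 'yaml v2')  # conflicts on the key; updates in place
with engine.connect() as conn:
    print(conn.execute(sqlalchemy.select(kv_table)).all())
    # [('my-cluster', 'yaml v2')]

For PostgreSQL the only change is importing `insert` from `sqlalchemy.dialects.postgresql`; the `on_conflict_do_update` call is identical, which is why the diff branches solely on `_SQLALCHEMY_ENGINE.dialect.name` when choosing `insert_func`.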
sky/jobs/client/sdk.py
CHANGED
@@ -46,6 +46,7 @@ def launch(
         task: sky.Task, or sky.Dag (experimental; 1-task only) to launch as a
             managed job.
         name: Name of the managed job.
+        priority: Priority of the managed job.
         _need_confirmation: (Internal only) Whether to show a confirmation
             prompt before launching the job.
 
sky/jobs/constants.py
CHANGED
@@ -47,7 +47,7 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
 # The version of the lib files that jobs/utils use. Whenever there is an API
 # change for the jobs/utils, we need to bump this version and update
 # job.utils.ManagedJobCodeGen to handle the version update.
-MANAGED_JOBS_VERSION =
+MANAGED_JOBS_VERSION = 5
 
 # The command for setting up the jobs dashboard on the controller. It firstly
 # checks if the systemd services are available, and if not (e.g., Kubernetes
@@ -70,3 +70,5 @@ DASHBOARD_SETUP_CMD = (
     f'(nohup {skylet_constants.SKY_PYTHON_CMD} -m sky.jobs.dashboard.dashboard '
     '>> ~/.sky/job-dashboard.log 2>&1 &); '
     'fi')
+
+DEFAULT_PRIORITY = 500
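
The new `DEFAULT_PRIORITY = 500` pairs with the priority plumbing added to `sky/jobs/scheduler.py` below, whose `--priority` flag also defaults to 500. A hedged sketch of how such a default is typically resolved; `resolve_priority` is a hypothetical helper, not skypilot code:

from typing import Optional

DEFAULT_PRIORITY = 500  # mirrors the new constant in sky/jobs/constants.py


def resolve_priority(requested: Optional[int]) -> int:
    # An explicitly requested priority wins; otherwise fall back to the
    # default. The 0-1000 bounds follow the scheduler's --priority help text.
    priority = DEFAULT_PRIORITY if requested is None else requested
    if not 0 <= priority <= 1000:
        raise ValueError(f'priority must be in [0, 1000], got {priority}')
    return priority


assert resolve_priority(None) == DEFAULT_PRIORITY
assert resolve_priority(42) == 42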
sky/jobs/controller.py
CHANGED
@@ -179,8 +179,8 @@ class JobsController:
         cluster_name = managed_job_utils.generate_managed_job_cluster_name(
             task.name, self._job_id)
         self._strategy_executor = recovery_strategy.StrategyExecutor.make(
-            cluster_name, self._backend, task, self._job_id)
-        managed_job_state.
+            cluster_name, self._backend, task, self._job_id, task_id)
+        managed_job_state.set_starting(
             self._job_id,
             task_id,
             self._backend.run_timestamp,
@@ -197,9 +197,7 @@ class JobsController:
             f'{task.name!r}); {constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
 
         logger.info('Started monitoring.')
-        [line not preserved in this diff view]
-            task_id=task_id,
-            callback_func=callback_func)
+
         remote_job_submitted_at = self._strategy_executor.launch()
         assert remote_job_submitted_at is not None, remote_job_submitted_at
 
sky/jobs/recovery_strategy.py
CHANGED
@@ -18,6 +18,7 @@ from sky import global_user_state
 from sky import sky_logging
 from sky.backends import backend_utils
 from sky.jobs import scheduler
+from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -49,7 +50,7 @@ class StrategyExecutor:
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int) -> None:
+                 job_id: int, task_id: int) -> None:
         """Initialize the strategy executor.
 
         Args:
@@ -65,11 +66,13 @@ class StrategyExecutor:
         self.backend = backend
         self.max_restarts_on_errors = max_restarts_on_errors
         self.job_id = job_id
+        self.task_id = task_id
         self.restart_cnt_on_failure = 0
 
     @classmethod
     def make(cls, cluster_name: str, backend: 'backends.Backend',
-             task: 'task_lib.Task', job_id: int
+             task: 'task_lib.Task', job_id: int,
+             task_id: int) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
         resource_list = list(task.resources)
@@ -100,7 +103,7 @@ class StrategyExecutor:
                      from_str(job_recovery_name))
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
-                                     max_restarts_on_errors, job_id)
+                                     max_restarts_on_errors, job_id, task_id)
 
     def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -235,7 +238,8 @@ class StrategyExecutor:
 
     def _launch(self,
                 max_retry: Optional[int] = 3,
-                raise_on_failure: bool = True
+                raise_on_failure: bool = True,
+                recovery: bool = False) -> Optional[float]:
        """Implementation of launch().
 
        The function will wait until the job starts running, but will leave the
@@ -275,98 +279,134 @@ class StrategyExecutor:
         backoff = common_utils.Backoff(self.RETRY_INIT_GAP_SECONDS)
         while True:
             retry_cnt += 1
-            [content of removed lines 278-307 not preserved in this view]
-                        exceptions.ResourcesUnavailableError)
-                    for err in e.failover_history):
-                    # _launch() (this function) should fail/exit directly,
-                    # if none of the failover reasons were because of
-                    # resource unavailability or no failover was attempted
-                    # (the optimizer cannot find feasible resources for
-                    # requested resources), i.e., e.failover_history is
-                    # empty. Failing directly avoids the infinite loop of
-                    # retrying the launch when, e.g., an invalid cluster
-                    # name is used and --retry-until-up is specified.
-                    reasons = (e.failover_history
-                               if e.failover_history else [e])
-                    reasons_str = '; '.join(
-                        common_utils.format_exception(err)
-                        for err in reasons)
-                    logger.error(
-                        'Failure happened before provisioning. Failover '
-                        f'reasons: {reasons_str}')
+            try:
+                with scheduler.scheduled_launch(self.job_id):
+                    # The job state may have been PENDING during backoff -
+                    # update to STARTING or RECOVERING.
+                    # On the first attempt (when retry_cnt is 1), we should
+                    # already be in STARTING or RECOVERING.
+                    if retry_cnt > 1:
+                        state.set_restarting(self.job_id, self.task_id,
+                                             recovery)
+                    try:
+                        usage_lib.messages.usage.set_internal()
+                        # Detach setup, so that the setup failure can be
+                        # detected by the controller process (job_status ->
+                        # FAILED_SETUP).
+                        execution.launch(
+                            self.dag,
+                            cluster_name=self.cluster_name,
+                            # We expect to tear down the cluster as soon as the
+                            # job is finished. However, in case the controller
+                            # dies, set autodown to try and avoid a resource
+                            # leak.
+                            idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                            down=True,
+                            _is_launched_by_jobs_controller=True)
+                        logger.info('Managed job cluster launched.')
+                    except (exceptions.InvalidClusterNameError,
+                            exceptions.NoCloudAccessError,
+                            exceptions.ResourcesMismatchError) as e:
+                        logger.error('Failure happened before provisioning. '
+                                     f'{common_utils.format_exception(e)}')
                         if raise_on_failure:
-                        raise exceptions.ProvisionPrechecksError(
-                            [continuation not preserved in this view]
+                            raise exceptions.ProvisionPrechecksError(
+                                reasons=[e])
                         return None
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-            except Exception as e:  # pylint: disable=broad-except
-                # If the launch fails, it will be recovered by the following
-                # code.
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-                with ux_utils.enable_traceback():
-                    logger.info(f'  Traceback: {traceback.format_exc()}')
-            else:  # No exception, the launch succeeds.
-                # At this point, a sky.launch() has succeeded. Cluster may
-                # be UP (no preemption since) or DOWN (newly preempted).
-                job_submitted_at = self._wait_until_job_starts_on_cluster()
-                if job_submitted_at is not None:
-                    return job_submitted_at
-                # The job fails to start on the cluster, retry the launch.
-                # TODO(zhwu): log the unexpected error to usage collection
-                # for future debugging.
-                logger.info(
-                    'Failed to successfully submit the job to the '
-                    'launched cluster, due to unexpected submission errors '
-                    'or the cluster being preempted during job submission.')
-
-            # If we get here, the launch did not succeed. Tear down the
-            # cluster and retry.
-            managed_job_utils.terminate_cluster(self.cluster_name)
-            if max_retry is not None and retry_cnt >= max_retry:
-                # Retry forever if max_retry is None.
-                if raise_on_failure:
-                    with ux_utils.print_exception_no_traceback():
-                        raise exceptions.ManagedJobReachedMaxRetriesError(
-                            'Resources unavailable: failed to launch '
-                            f'clusters after {max_retry} retries.')
-                else:
-                    return None
-            [content of removed lines 364-369 not preserved in this view]
+                    except exceptions.ResourcesUnavailableError as e:
+                        # This is raised when the launch fails due to prechecks
+                        # or after failing over through all the candidates.
+                        # Please refer to the docstring of `sky.launch` for more
+                        # details of how the exception will be structured.
+                        if not any(
+                                isinstance(err,
+                                           exceptions.ResourcesUnavailableError)
+                                for err in e.failover_history):
+                            # _launch() (this function) should fail/exit
+                            # directly, if none of the failover reasons were
+                            # because of resource unavailability or no failover
+                            # was attempted (the optimizer cannot find feasible
+                            # resources for requested resources), i.e.,
+                            # e.failover_history is empty. Failing directly
+                            # avoids the infinite loop of retrying the launch
+                            # when, e.g., an invalid cluster name is used and
+                            # --retry-until-up is specified.
+                            reasons = (e.failover_history
+                                       if e.failover_history else [e])
+                            reasons_str = '; '.join(
+                                common_utils.format_exception(err)
+                                for err in reasons)
+                            logger.error(
+                                'Failure happened before provisioning. '
+                                f'Failover reasons: {reasons_str}')
+                            if raise_on_failure:
+                                raise exceptions.ProvisionPrechecksError(
+                                    reasons)
+                            return None
+                        logger.info('Failed to launch a cluster with error: '
+                                    f'{common_utils.format_exception(e)})')
+                    except Exception as e:  # pylint: disable=broad-except
+                        # If the launch fails, it will be recovered by the
+                        # following code.
+                        logger.info('Failed to launch a cluster with error: '
+                                    f'{common_utils.format_exception(e)})')
+                        with ux_utils.enable_traceback():
+                            logger.info(
+                                f'  Traceback: {traceback.format_exc()}')
+                    else:  # No exception, the launch succeeds.
+                        # At this point, a sky.launch() has succeeded. Cluster
+                        # may be UP (no preemption since) or DOWN (newly
+                        # preempted).
+                        job_submitted_at = (
+                            self._wait_until_job_starts_on_cluster())
+                        if job_submitted_at is not None:
+                            return job_submitted_at
+                        # The job fails to start on the cluster, retry the
+                        # launch.
+                        # TODO(zhwu): log the unexpected error to usage
+                        # collection for future debugging.
+                        logger.info(
+                            'Failed to successfully submit the job to the '
+                            'launched cluster, due to unexpected submission '
+                            'errors or the cluster being preempted during '
+                            'job submission.')
+
+                    # If we get here, the launch did not succeed. Tear down the
+                    # cluster and retry.
+                    managed_job_utils.terminate_cluster(self.cluster_name)
+                    if max_retry is not None and retry_cnt >= max_retry:
+                        # Retry forever if max_retry is None.
+                        if raise_on_failure:
+                            with ux_utils.print_exception_no_traceback():
+                                raise (
+                                    exceptions.ManagedJobReachedMaxRetriesError(
+                                        'Resources unavailable: failed to '
+                                        f'launch clusters after {max_retry} '
+                                        'retries.'))
+                        else:
+                            return None
+
+                    # Raise NoClusterLaunchedError to indicate that the job is
+                    # in retry backoff. This will trigger special handling in
+                    # scheduler.schedule_launched().
+                    # We will exit the scheduled_launch context so that the
+                    # schedule state is ALIVE_BACKOFF during the backoff. This
+                    # allows other jobs to launch.
+                    raise exceptions.NoClusterLaunchedError()
+
+            except exceptions.NoClusterLaunchedError:
+                # Update the status to PENDING during backoff.
+                state.set_backoff_pending(self.job_id, self.task_id)
+                # Calculate the backoff time and sleep.
+                gap_seconds = backoff.current_backoff()
+                logger.info('Retrying to launch the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
+                time.sleep(gap_seconds)
+                continue
+            else:
+                # The inner loop should either return or throw
+                # NoClusterLaunchedError.
+                assert False, 'Unreachable'
 
     def should_restart_on_failure(self) -> bool:
         """Increments counter & checks if job should be restarted on a failure.
@@ -389,9 +429,9 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int) -> None:
+                 job_id: int, task_id: int) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id)
+                         job_id, task_id)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
@@ -400,8 +440,10 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
     def _launch(self,
                 max_retry: Optional[int] = 3,
-                raise_on_failure: bool = True
-                [line not preserved in this diff view]
+                raise_on_failure: bool = True,
+                recovery: bool = False) -> Optional[float]:
+        job_submitted_at = super()._launch(max_retry, raise_on_failure,
+                                           recovery)
         if job_submitted_at is not None:
             # Only record the cloud/region if the launch is successful.
             handle = global_user_state.get_handle_from_cluster_name(
@@ -436,7 +478,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
             cloud=launched_cloud, region=launched_region, zone=None)
         task.set_resources({new_resources})
         # Not using self.launch to avoid the retry until up logic.
-        job_submitted_at = self._launch(raise_on_failure=False
+        job_submitted_at = self._launch(raise_on_failure=False,
+                                        recovery=True)
         # Restore the original dag, i.e. reset the region constraint.
         task.set_resources(original_resources)
         if job_submitted_at is not None:
@@ -452,7 +495,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
                 'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
             job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-                                            raise_on_failure=False
+                                            raise_on_failure=False,
+                                            recovery=True)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
@@ -524,7 +568,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
                     region=launched_region)
             }
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(raise_on_failure=False
+            job_submitted_at = self._launch(raise_on_failure=False,
+                                            recovery=True)
             task.blocked_resources = None
             if job_submitted_at is not None:
                 return job_submitted_at
@@ -535,7 +580,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
                 'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
             job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-                                            raise_on_failure=False
+                                            raise_on_failure=False,
+                                            recovery=True)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
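
The restructured `_launch` wraps each attempt in `scheduler.scheduled_launch` and converts an exhausted attempt into `NoClusterLaunchedError`, which the outer `except` turns into an exponential-backoff sleep before the next loop iteration. A stripped-down sketch of that control flow; `Backoff` here imitates `common_utils.Backoff` in spirit, and `launch_with_backoff`/`attempt` are stand-ins for the real launch and state-tracking logic:

import time


class NoClusterLaunchedError(Exception):
    """Sentinel: this attempt failed; back off instead of retrying hot."""


class Backoff:
    """Exponential backoff with a cap."""

    def __init__(self, initial: float, factor: float = 2.0,
                 cap: float = 300.0):
        self._gap, self._factor, self._cap = initial, factor, cap

    def current_backoff(self) -> float:
        gap = self._gap
        self._gap = min(self._gap * self._factor, self._cap)
        return gap


def launch_with_backoff(attempt, max_retry=3):
    backoff = Backoff(initial=1.0)
    retry_cnt = 0
    while True:
        retry_cnt += 1
        try:
            job_submitted_at = attempt()
            if job_submitted_at is not None:
                return job_submitted_at  # launch + job submission succeeded
            if max_retry is not None and retry_cnt >= max_retry:
                raise RuntimeError(f'failed after {max_retry} retries')
            # Signal backoff; in the real code this also exits the
            # scheduled_launch context so other jobs may launch meanwhile.
            raise NoClusterLaunchedError()
        except NoClusterLaunchedError:
            gap_seconds = backoff.current_backoff()
            print(f'retrying in {gap_seconds:.1f} seconds')
            time.sleep(gap_seconds)


attempts = iter([None, None, 1748500000.0])  # fail twice, then succeed
print(launch_with_backoff(lambda: next(attempts)))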
sky/jobs/scheduler.py
CHANGED
@@ -45,6 +45,7 @@ import typing
 
 import filelock
 
+from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.jobs import constants as managed_job_constants
@@ -190,7 +191,8 @@ def maybe_schedule_next_jobs() -> None:
             pass
 
 
-def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str
+def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str,
+               priority: int) -> None:
     """Submit an existing job to the scheduler.
 
     This should be called after a job is created in the `spot` table as
@@ -202,7 +204,7 @@ def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str) -> None:
     """
     with filelock.FileLock(_get_lock_path()):
         state.scheduler_set_waiting(job_id, dag_yaml_path, env_file_path,
-                                    common_utils.get_user_hash())
+                                    common_utils.get_user_hash(), priority)
     maybe_schedule_next_jobs()
 
 
@@ -240,11 +242,19 @@ def scheduled_launch(job_id: int):
            state.ManagedJobScheduleState.LAUNCHING):
         time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
 
-    [content of removed lines 243-247 not preserved in this view]
+    try:
+        yield
+    except exceptions.NoClusterLaunchedError:
+        # NoClusterLaunchedError is indicates that the job is in retry backoff.
+        # We should transition to ALIVE_BACKOFF instead of ALIVE.
+        with filelock.FileLock(_get_lock_path()):
+            state.scheduler_set_alive_backoff(job_id)
+        raise
+    else:
+        with filelock.FileLock(_get_lock_path()):
+            state.scheduler_set_alive(job_id)
+    finally:
+        maybe_schedule_next_jobs()
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -309,5 +319,10 @@ if __name__ == '__main__':
     parser.add_argument('--env-file',
                         type=str,
                         help='The path to the controller env file.')
+    parser.add_argument(
+        '--priority',
+        type=int,
+        default=500,
+        help='Job priority (0-1000, lower is higher). Default: 500.')
     args = parser.parse_args()
-    submit_job(args.job_id, args.dag_yaml, args.env_file)
+    submit_job(args.job_id, args.dag_yaml, args.env_file, args.priority)
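
`scheduled_launch` is a generator-based context manager, so `NoClusterLaunchedError` raised inside the `with` body surfaces at the `yield`: the `except` arm records the backoff state and re-raises, the `else` arm records the normal ALIVE transition, and `finally` re-checks the queue on both paths. A runnable sketch of that shape; the prints stand in for the real `scheduler_set_alive_backoff`/`scheduler_set_alive`/`maybe_schedule_next_jobs` calls:

import contextlib


class NoClusterLaunchedError(Exception):
    """Stand-in for sky.exceptions.NoClusterLaunchedError."""


@contextlib.contextmanager
def scheduled_launch(job_id: int):
    # (The real function first blocks until the job reaches LAUNCHING.)
    try:
        yield  # the caller's launch attempt runs here
    except NoClusterLaunchedError:
        print(f'job {job_id}: schedule state -> ALIVE_BACKOFF')
        raise  # propagate so the caller can sleep and retry
    else:
        print(f'job {job_id}: schedule state -> ALIVE')
    finally:
        print('maybe schedule the next waiting job')  # runs on both paths


with scheduled_launch(1):
    pass  # normal exit: prints ALIVE, then the finally line

try:
    with scheduled_launch(2):
        raise NoClusterLaunchedError()  # backoff path
except NoClusterLaunchedError:
    pass  # prints ALIVE_BACKOFF, then the finally line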