skypilot-nightly 1.0.0.dev20250528__py3-none-any.whl → 1.0.0.dev20250530__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +99 -16
  3. sky/authentication.py +54 -7
  4. sky/backends/backend_utils.py +35 -22
  5. sky/backends/cloud_vm_ray_backend.py +30 -15
  6. sky/check.py +1 -1
  7. sky/cli.py +20 -8
  8. sky/client/cli.py +20 -8
  9. sky/client/oauth.py +82 -0
  10. sky/client/sdk.py +60 -10
  11. sky/clouds/nebius.py +55 -14
  12. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +3 -3
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/Q32Bxr2Pby5tFDW-y5TNg/_buildManifest.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/236-ca00738e2f58ea65.js +6 -0
  16. sky/dashboard/out/_next/static/chunks/37-64efcd0e9c54bff6.js +6 -0
  17. sky/dashboard/out/_next/static/chunks/{173-7db8607cefc20f70.js → 614-3d29f98e0634b179.js} +2 -2
  18. sky/dashboard/out/_next/static/chunks/682-f3f1443ed2fba42f.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/798-c0525dc3f21e488d.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/843-786c36624d5ff61f.js +11 -0
  21. sky/dashboard/out/_next/static/chunks/856-02e34c9fc5945066.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-42d3656aba9d2e78.js +6 -0
  23. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-20835df7b0c4599c.js +6 -0
  24. sky/dashboard/out/_next/static/chunks/pages/{clusters-943992b84fd6f4ee.js → clusters-f37ff20f0af29aae.js} +1 -1
  25. sky/dashboard/out/_next/static/chunks/pages/{config-41738d1896fc02fe.js → config-3c6a2dabf56e8cd6.js} +2 -2
  26. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-342bc15bb78ab2e5.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/infra-7b4b8e7fa9fa0827.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-258decb65e95f520.js +11 -0
  29. sky/dashboard/out/_next/static/chunks/pages/{jobs-a4efc09e61988f8d.js → jobs-78a6c5ba3e24c0cf.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/{users-b2634885d67c49a6.js → users-89f9212b81d8897e.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/pages/workspace/{new-579b3203c7c19d84.js → new-198b6e00d7d724c5.js} +1 -1
  32. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-9388e38fac73ee8f.js → [name]-2ce792183b03c341.js} +1 -1
  33. sky/dashboard/out/_next/static/chunks/pages/workspaces-17d41826537196e7.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/webpack-f27c9a32aa3d9c6d.js +1 -0
  35. sky/dashboard/out/_next/static/css/5411b9fb0a783c1c.css +3 -0
  36. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  37. sky/dashboard/out/clusters/[cluster].html +1 -1
  38. sky/dashboard/out/clusters.html +1 -1
  39. sky/dashboard/out/config.html +1 -1
  40. sky/dashboard/out/index.html +1 -1
  41. sky/dashboard/out/infra/[context].html +1 -0
  42. sky/dashboard/out/infra.html +1 -1
  43. sky/dashboard/out/jobs/[job].html +1 -1
  44. sky/dashboard/out/jobs.html +1 -1
  45. sky/dashboard/out/users.html +1 -1
  46. sky/dashboard/out/workspace/new.html +1 -1
  47. sky/dashboard/out/workspaces/[name].html +1 -1
  48. sky/dashboard/out/workspaces.html +1 -1
  49. sky/exceptions.py +11 -1
  50. sky/global_user_state.py +149 -1
  51. sky/jobs/client/sdk.py +1 -0
  52. sky/jobs/constants.py +3 -1
  53. sky/jobs/controller.py +3 -5
  54. sky/jobs/recovery_strategy.py +148 -102
  55. sky/jobs/scheduler.py +23 -8
  56. sky/jobs/server/core.py +16 -0
  57. sky/jobs/state.py +153 -39
  58. sky/jobs/utils.py +33 -5
  59. sky/provision/kubernetes/utils.py +2 -1
  60. sky/provision/provisioner.py +15 -10
  61. sky/resources.py +16 -1
  62. sky/serve/controller.py +10 -7
  63. sky/serve/replica_managers.py +22 -18
  64. sky/serve/service.py +5 -4
  65. sky/server/common.py +11 -4
  66. sky/server/html/token_page.html +32 -6
  67. sky/server/server.py +3 -1
  68. sky/server/stream_utils.py +21 -0
  69. sky/setup_files/dependencies.py +7 -1
  70. sky/skylet/constants.py +1 -1
  71. sky/task.py +26 -0
  72. sky/templates/jobs-controller.yaml.j2 +2 -1
  73. sky/templates/kubernetes-ray.yml.j2 +19 -1
  74. sky/utils/common_utils.py +66 -0
  75. sky/utils/rich_utils.py +5 -0
  76. sky/utils/schemas.py +32 -1
  77. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/METADATA +3 -1
  78. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/RECORD +84 -81
  79. sky/dashboard/out/_next/static/Mx1iAbDQn1jMHh3UHmK3R/_buildManifest.js +0 -1
  80. sky/dashboard/out/_next/static/chunks/236-d6900c828331f664.js +0 -6
  81. sky/dashboard/out/_next/static/chunks/320-afea3ddcc5bd1c6c.js +0 -6
  82. sky/dashboard/out/_next/static/chunks/578-9146658cead92981.js +0 -6
  83. sky/dashboard/out/_next/static/chunks/843-256ec920f6d5f41f.js +0 -11
  84. sky/dashboard/out/_next/static/chunks/856-62b87c68917b08ed.js +0 -1
  85. sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +0 -1
  86. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-159bffb2fa34ed54.js +0 -6
  87. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9506c00257d10dbd.js +0 -1
  88. sky/dashboard/out/_next/static/chunks/pages/infra-881fcd902fbbd0e5.js +0 -6
  89. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-2c29e97a6aa50dd4.js +0 -6
  90. sky/dashboard/out/_next/static/chunks/pages/workspaces-610c49ae3619ee85.js +0 -1
  91. sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +0 -1
  92. sky/dashboard/out/_next/static/css/ffd1cd601648c303.css +0 -3
  93. /sky/dashboard/out/_next/static/{Mx1iAbDQn1jMHh3UHmK3R → Q32Bxr2Pby5tFDW-y5TNg}/_ssgManifest.js +0 -0
  94. /sky/dashboard/out/_next/static/chunks/pages/{_app-a631df412d8172de.js → _app-f19ea34b91c33950.js} +0 -0
  95. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/WHEEL +0 -0
  96. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/entry_points.txt +0 -0
  97. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/licenses/LICENSE +0 -0
  98. {skypilot_nightly-1.0.0.dev20250528.dist-info → skypilot_nightly-1.0.0.dev20250530.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -22,6 +22,7 @@ from sqlalchemy import orm
 from sqlalchemy.dialects import postgresql
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.ext import declarative
+import yaml
 
 from sky import models
 from sky import sky_logging
@@ -96,6 +97,12 @@ cluster_table = sqlalchemy.Table(
     sqlalchemy.Column('workspace',
                       sqlalchemy.Text,
                       server_default=constants.SKYPILOT_DEFAULT_WORKSPACE),
+    sqlalchemy.Column('last_creation_yaml',
+                      sqlalchemy.Text,
+                      server_default=None),
+    sqlalchemy.Column('last_creation_command',
+                      sqlalchemy.Text,
+                      server_default=None),
 )
 
 storage_table = sqlalchemy.Table(
@@ -133,6 +140,21 @@ cluster_history_table = sqlalchemy.Table(
     sqlalchemy.Column('user_hash', sqlalchemy.Text),
 )
 
+ssh_key_table = sqlalchemy.Table(
+    'ssh_key',
+    Base.metadata,
+    sqlalchemy.Column('user_hash', sqlalchemy.Text, primary_key=True),
+    sqlalchemy.Column('ssh_public_key', sqlalchemy.Text),
+    sqlalchemy.Column('ssh_private_key', sqlalchemy.Text),
+)
+
+cluster_yaml_table = sqlalchemy.Table(
+    'cluster_yaml',
+    Base.metadata,
+    sqlalchemy.Column('cluster_name', sqlalchemy.Text, primary_key=True),
+    sqlalchemy.Column('yaml', sqlalchemy.Text),
+)
+
 
 def _glob_to_similar(glob_pattern):
     """Converts a glob pattern to a PostgreSQL LIKE pattern."""
@@ -270,6 +292,19 @@ def create_table():
             default_statement='DEFAULT \'default\'',
             value_to_replace_existing_entries=constants.
             SKYPILOT_DEFAULT_WORKSPACE)
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'clusters',
+            'last_creation_yaml',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT NULL',
+        )
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'clusters',
+            'last_creation_command',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT NULL')
         session.commit()
 
 
@@ -318,7 +353,8 @@ def add_or_update_cluster(cluster_name: str,
                           requested_resources: Optional[Set[Any]],
                           ready: bool,
                           is_launch: bool = True,
-                          config_hash: Optional[str] = None):
+                          config_hash: Optional[str] = None,
+                          task_config: Optional[Dict[str, Any]] = None):
     """Adds or updates cluster_name -> cluster_handle mapping.
 
     Args:
@@ -329,6 +365,8 @@ def add_or_update_cluster(cluster_name: str,
             be marked as INIT, otherwise it will be marked as UP.
         is_launch: if the cluster is firstly launched. If True, the launched_at
             and last_use will be updated. Otherwise, use the old value.
+        config_hash: Configuration hash for the cluster.
+        task_config: The config of the task being launched.
     """
     # TODO(zhwu): have to be imported here to avoid circular import.
     from sky import skypilot_config  # pylint: disable=import-outside-toplevel
@@ -404,6 +442,13 @@ def add_or_update_cluster(cluster_name: str,
         conditional_values.update({
             'workspace': active_workspace,
         })
+        if (is_launch and not cluster_row or
+                cluster_row.status != status_lib.ClusterStatus.UP.value):
+            conditional_values.update({
+                'last_creation_yaml': common_utils.dump_yaml_str(task_config)
+                                      if task_config else None,
+                'last_creation_command': last_use,
+            })
 
         if (_SQLALCHEMY_ENGINE.dialect.name ==
                 db_utils.SQLAlchemyDialect.SQLITE.value):
@@ -790,6 +835,8 @@ def get_cluster_from_name(
         'user_name': get_user(user_hash).name,
         'config_hash': row.config_hash,
         'workspace': row.workspace,
+        'last_creation_yaml': row.last_creation_yaml,
+        'last_creation_command': row.last_creation_command,
     }
 
     return record
@@ -822,6 +869,8 @@ def get_clusters() -> List[Dict[str, Any]]:
             'user_name': get_user(user_hash).name,
             'config_hash': row.config_hash,
             'workspace': row.workspace,
+            'last_creation_yaml': row.last_creation_yaml,
+            'last_creation_command': row.last_creation_command,
         }
 
         records.append(record)
@@ -1049,3 +1098,102 @@ def get_storage() -> List[Dict[str, Any]]:
             'status': status_lib.StorageStatus[row.status],
         })
     return records
+
+
+def get_ssh_keys(user_hash: str) -> Tuple[str, str, bool]:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(ssh_key_table).filter_by(
+            user_hash=user_hash).first()
+        if row:
+            return row.ssh_public_key, row.ssh_private_key, True
+        return '', '', False
+
+
+def set_ssh_keys(user_hash: str, ssh_public_key: str, ssh_private_key: str):
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(ssh_key_table).values(
+            user_hash=user_hash,
+            ssh_public_key=ssh_public_key,
+            ssh_private_key=ssh_private_key)
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[ssh_key_table.c.user_hash],
+            set_={
+                ssh_key_table.c.ssh_public_key: ssh_public_key,
+                ssh_key_table.c.ssh_private_key: ssh_private_key
+            })
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+def get_cluster_yaml_str(cluster_yaml_path: Optional[str]) -> Optional[str]:
+    """Get the cluster yaml from the database or the local file system.
+    If the cluster yaml is not in the database, check if it exists on the
+    local file system and migrate it to the database.
+
+    It is assumed that the cluster yaml file is named as <cluster_name>.yml.
+    """
+    if cluster_yaml_path is None:
+        raise ValueError('Attempted to read a None YAML.')
+    cluster_file_name = os.path.basename(cluster_yaml_path)
+    cluster_name, _ = os.path.splitext(cluster_file_name)
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.query(cluster_yaml_table).filter_by(
+            cluster_name=cluster_name).first()
+        if row is None:
+            # If the cluster yaml is not in the database, check if it exists
+            # on the local file system and migrate it to the database.
+            # TODO(syang): remove this check once we have a way to migrate the
+            # cluster from file to database. Remove on v0.12.0.
+            if cluster_yaml_path is not None and os.path.exists(cluster_yaml_path):
+                with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
+                    yaml_str = f.read()
+                set_cluster_yaml(cluster_name, yaml_str)
+                return yaml_str
+            return None
+        return row.yaml
+
+
+def get_cluster_yaml_dict(cluster_yaml_path: Optional[str]) -> Dict[str, Any]:
+    """Get the cluster yaml as a dictionary from the database.
+
+    It is assumed that the cluster yaml file is named as <cluster_name>.yml.
+    """
+    yaml_str = get_cluster_yaml_str(cluster_yaml_path)
+    if yaml_str is None:
+        raise ValueError(f'Cluster yaml {cluster_yaml_path} not found.')
+    return yaml.safe_load(yaml_str)
+
+
+def set_cluster_yaml(cluster_name: str, yaml_str: str) -> None:
+    """Set the cluster yaml in the database."""
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        if (_SQLALCHEMY_ENGINE.dialect.name ==
+                db_utils.SQLAlchemyDialect.SQLITE.value):
+            insert_func = sqlite.insert
+        elif (_SQLALCHEMY_ENGINE.dialect.name ==
+              db_utils.SQLAlchemyDialect.POSTGRESQL.value):
+            insert_func = postgresql.insert
+        else:
+            raise ValueError('Unsupported database dialect')
+        insert_stmnt = insert_func(cluster_yaml_table).values(
+            cluster_name=cluster_name, yaml=yaml_str)
+        do_update_stmt = insert_stmnt.on_conflict_do_update(
+            index_elements=[cluster_yaml_table.c.cluster_name],
+            set_={cluster_yaml_table.c.yaml: yaml_str})
+        session.execute(do_update_stmt)
+        session.commit()
+
+
+def remove_cluster_yaml(cluster_name: str):
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(cluster_yaml_table).filter_by(
+            cluster_name=cluster_name).delete()
+        session.commit()
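Note: the helpers added above move per-cluster YAML files and per-user SSH keys from the local filesystem into the state database. A minimal usage sketch, assuming an installed skypilot-nightly wheel with an initialized state database; the cluster name, user hash and key strings are placeholders:

    # Sketch only: exercises the new DB-backed helpers shown in the diff above.
    from sky import global_user_state

    # Store a cluster YAML keyed by cluster name, then read it back. The
    # lookup path only supplies the <cluster_name>.yml basename.
    global_user_state.set_cluster_yaml('my-cluster', 'cluster_name: my-cluster\n')
    yaml_str = global_user_state.get_cluster_yaml_str(
        '~/.sky/generated/my-cluster.yml')

    # Per-user SSH keys are persisted in the same database.
    global_user_state.set_ssh_keys('user-hash-123', 'ssh-ed25519 AAAA... user',
                                   '-----BEGIN OPENSSH PRIVATE KEY-----...')
    public_key, private_key, exists = global_user_state.get_ssh_keys('user-hash-123')

    # Drop the stored YAML once the cluster is gone.
    global_user_state.remove_cluster_yaml('my-cluster')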
sky/jobs/client/sdk.py CHANGED
@@ -46,6 +46,7 @@ def launch(
         task: sky.Task, or sky.Dag (experimental; 1-task only) to launch as a
             managed job.
         name: Name of the managed job.
+        priority: Priority of the managed job.
         _need_confirmation: (Internal only) Whether to show a confirmation
             prompt before launching the job.
 
sky/jobs/constants.py CHANGED
@@ -47,7 +47,7 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
 # The version of the lib files that jobs/utils use. Whenever there is an API
 # change for the jobs/utils, we need to bump this version and update
 # job.utils.ManagedJobCodeGen to handle the version update.
-MANAGED_JOBS_VERSION = 4
+MANAGED_JOBS_VERSION = 5
 
 # The command for setting up the jobs dashboard on the controller. It firstly
 # checks if the systemd services are available, and if not (e.g., Kubernetes
@@ -70,3 +70,5 @@ DASHBOARD_SETUP_CMD = (
     f'(nohup {skylet_constants.SKY_PYTHON_CMD} -m sky.jobs.dashboard.dashboard '
     '>> ~/.sky/job-dashboard.log 2>&1 &); '
     'fi')
+
+DEFAULT_PRIORITY = 500
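A sketch of how a caller might fall back to the new DEFAULT_PRIORITY constant; the clamping to the 0-1000 range mirrors the --priority help text added to sky/jobs/scheduler.py below and is an assumption, not code from the wheel:

    # Hypothetical helper, not part of the package.
    from typing import Optional

    from sky.jobs import constants as managed_job_constants


    def resolve_priority(priority: Optional[int]) -> int:
        if priority is None:
            return managed_job_constants.DEFAULT_PRIORITY  # 500
        # 0-1000 range taken from the scheduler's --priority help text.
        return max(0, min(1000, priority))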
sky/jobs/controller.py CHANGED
@@ -179,8 +179,8 @@ class JobsController:
         cluster_name = managed_job_utils.generate_managed_job_cluster_name(
             task.name, self._job_id)
         self._strategy_executor = recovery_strategy.StrategyExecutor.make(
-            cluster_name, self._backend, task, self._job_id)
-        managed_job_state.set_submitted(
+            cluster_name, self._backend, task, self._job_id, task_id)
+        managed_job_state.set_starting(
             self._job_id,
             task_id,
             self._backend.run_timestamp,
@@ -197,9 +197,7 @@ class JobsController:
             f'{task.name!r}); {constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
 
         logger.info('Started monitoring.')
-        managed_job_state.set_starting(job_id=self._job_id,
-                                       task_id=task_id,
-                                       callback_func=callback_func)
+
         remote_job_submitted_at = self._strategy_executor.launch()
         assert remote_job_submitted_at is not None, remote_job_submitted_at
 
sky/jobs/recovery_strategy.py CHANGED
@@ -18,6 +18,7 @@ from sky import global_user_state
 from sky import sky_logging
 from sky.backends import backend_utils
 from sky.jobs import scheduler
+from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -49,7 +50,7 @@ class StrategyExecutor:
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int) -> None:
+                 job_id: int, task_id: int) -> None:
         """Initialize the strategy executor.
 
         Args:
@@ -65,11 +66,13 @@ class StrategyExecutor:
         self.backend = backend
         self.max_restarts_on_errors = max_restarts_on_errors
         self.job_id = job_id
+        self.task_id = task_id
         self.restart_cnt_on_failure = 0
 
     @classmethod
     def make(cls, cluster_name: str, backend: 'backends.Backend',
-             task: 'task_lib.Task', job_id: int) -> 'StrategyExecutor':
+             task: 'task_lib.Task', job_id: int,
+             task_id: int) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
         resource_list = list(task.resources)
@@ -100,7 +103,7 @@ class StrategyExecutor:
                              from_str(job_recovery_name))
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
-                                     max_restarts_on_errors, job_id)
+                                     max_restarts_on_errors, job_id, task_id)
 
     def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -235,7 +238,8 @@ class StrategyExecutor:
 
     def _launch(self,
                 max_retry: Optional[int] = 3,
-                raise_on_failure: bool = True) -> Optional[float]:
+                raise_on_failure: bool = True,
+                recovery: bool = False) -> Optional[float]:
         """Implementation of launch().
 
         The function will wait until the job starts running, but will leave the
@@ -275,98 +279,134 @@ class StrategyExecutor:
         backoff = common_utils.Backoff(self.RETRY_INIT_GAP_SECONDS)
         while True:
             retry_cnt += 1
-            with scheduler.scheduled_launch(self.job_id):
-                try:
-                    usage_lib.messages.usage.set_internal()
-                    # Detach setup, so that the setup failure can be detected
-                    # by the controller process (job_status -> FAILED_SETUP).
-                    execution.launch(
-                        self.dag,
-                        cluster_name=self.cluster_name,
-                        # We expect to tear down the cluster as soon as the job
-                        # is finished. However, in case the controller dies, set
-                        # autodown to try and avoid a resource leak.
-                        idle_minutes_to_autostop=_AUTODOWN_MINUTES,
-                        down=True,
-                        _is_launched_by_jobs_controller=True)
-                    logger.info('Managed job cluster launched.')
-                except (exceptions.InvalidClusterNameError,
-                        exceptions.NoCloudAccessError,
-                        exceptions.ResourcesMismatchError) as e:
-                    logger.error('Failure happened before provisioning. '
-                                 f'{common_utils.format_exception(e)}')
-                    if raise_on_failure:
-                        raise exceptions.ProvisionPrechecksError(reasons=[e])
-                    return None
-                except exceptions.ResourcesUnavailableError as e:
-                    # This is raised when the launch fails due to prechecks or
-                    # after failing over through all the candidates.
-                    # Please refer to the docstring of `sky.launch` for more
-                    # details of how the exception will be structured.
-                    if not any(
-                            isinstance(err,
-                                       exceptions.ResourcesUnavailableError)
-                            for err in e.failover_history):
-                        # _launch() (this function) should fail/exit directly,
-                        # if none of the failover reasons were because of
-                        # resource unavailability or no failover was attempted
-                        # (the optimizer cannot find feasible resources for
-                        # requested resources), i.e., e.failover_history is
-                        # empty. Failing directly avoids the infinite loop of
-                        # retrying the launch when, e.g., an invalid cluster
-                        # name is used and --retry-until-up is specified.
-                        reasons = (e.failover_history
-                                   if e.failover_history else [e])
-                        reasons_str = '; '.join(
-                            common_utils.format_exception(err)
-                            for err in reasons)
-                        logger.error(
-                            'Failure happened before provisioning. Failover '
-                            f'reasons: {reasons_str}')
+            try:
+                with scheduler.scheduled_launch(self.job_id):
+                    # The job state may have been PENDING during backoff -
+                    # update to STARTING or RECOVERING.
+                    # On the first attempt (when retry_cnt is 1), we should
+                    # already be in STARTING or RECOVERING.
+                    if retry_cnt > 1:
+                        state.set_restarting(self.job_id, self.task_id,
+                                             recovery)
+                    try:
+                        usage_lib.messages.usage.set_internal()
+                        # Detach setup, so that the setup failure can be
+                        # detected by the controller process (job_status ->
+                        # FAILED_SETUP).
+                        execution.launch(
+                            self.dag,
+                            cluster_name=self.cluster_name,
+                            # We expect to tear down the cluster as soon as the
+                            # job is finished. However, in case the controller
+                            # dies, set autodown to try and avoid a resource
+                            # leak.
+                            idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                            down=True,
+                            _is_launched_by_jobs_controller=True)
+                        logger.info('Managed job cluster launched.')
+                    except (exceptions.InvalidClusterNameError,
+                            exceptions.NoCloudAccessError,
+                            exceptions.ResourcesMismatchError) as e:
+                        logger.error('Failure happened before provisioning. '
+                                     f'{common_utils.format_exception(e)}')
                         if raise_on_failure:
-                            raise exceptions.ProvisionPrechecksError(reasons)
-                        return None
-                    logger.info('Failed to launch a cluster with error: '
-                                f'{common_utils.format_exception(e)})')
-                except Exception as e:  # pylint: disable=broad-except
-                    # If the launch fails, it will be recovered by the following
-                    # code.
-                    logger.info('Failed to launch a cluster with error: '
-                                f'{common_utils.format_exception(e)})')
-                    with ux_utils.enable_traceback():
-                        logger.info(f'  Traceback: {traceback.format_exc()}')
-                else:  # No exception, the launch succeeds.
-                    # At this point, a sky.launch() has succeeded. Cluster may
-                    # be UP (no preemption since) or DOWN (newly preempted).
-                    job_submitted_at = self._wait_until_job_starts_on_cluster()
-                    if job_submitted_at is not None:
-                        return job_submitted_at
-                    # The job fails to start on the cluster, retry the launch.
-                    # TODO(zhwu): log the unexpected error to usage collection
-                    # for future debugging.
-                    logger.info(
-                        'Failed to successfully submit the job to the '
-                        'launched cluster, due to unexpected submission errors '
-                        'or the cluster being preempted during job submission.')
-
-                # If we get here, the launch did not succeed. Tear down the
-                # cluster and retry.
-                managed_job_utils.terminate_cluster(self.cluster_name)
-                if max_retry is not None and retry_cnt >= max_retry:
-                    # Retry forever if max_retry is None.
-                    if raise_on_failure:
-                        with ux_utils.print_exception_no_traceback():
-                            raise exceptions.ManagedJobReachedMaxRetriesError(
-                                'Resources unavailable: failed to launch '
-                                f'clusters after {max_retry} retries.')
-                    else:
+                            raise exceptions.ProvisionPrechecksError(
+                                reasons=[e])
                         return None
-                # Exit the scheduled_launch context so that the scheulde state is
-                # ALIVE during the backoff. This allows other jobs to launch.
-                gap_seconds = backoff.current_backoff()
-                logger.info('Retrying to launch the cluster in '
-                            f'{gap_seconds:.1f} seconds.')
-                time.sleep(gap_seconds)
+                    except exceptions.ResourcesUnavailableError as e:
+                        # This is raised when the launch fails due to prechecks
+                        # or after failing over through all the candidates.
+                        # Please refer to the docstring of `sky.launch` for more
+                        # details of how the exception will be structured.
+                        if not any(
+                                isinstance(err,
+                                           exceptions.ResourcesUnavailableError)
+                                for err in e.failover_history):
+                            # _launch() (this function) should fail/exit
+                            # directly, if none of the failover reasons were
+                            # because of resource unavailability or no failover
+                            # was attempted (the optimizer cannot find feasible
+                            # resources for requested resources), i.e.,
+                            # e.failover_history is empty. Failing directly
+                            # avoids the infinite loop of retrying the launch
+                            # when, e.g., an invalid cluster name is used and
+                            # --retry-until-up is specified.
+                            reasons = (e.failover_history
+                                       if e.failover_history else [e])
+                            reasons_str = '; '.join(
+                                common_utils.format_exception(err)
+                                for err in reasons)
+                            logger.error(
+                                'Failure happened before provisioning. '
+                                f'Failover reasons: {reasons_str}')
+                            if raise_on_failure:
+                                raise exceptions.ProvisionPrechecksError(
+                                    reasons)
+                            return None
+                        logger.info('Failed to launch a cluster with error: '
+                                    f'{common_utils.format_exception(e)})')
+                    except Exception as e:  # pylint: disable=broad-except
+                        # If the launch fails, it will be recovered by the
+                        # following code.
+                        logger.info('Failed to launch a cluster with error: '
+                                    f'{common_utils.format_exception(e)})')
+                        with ux_utils.enable_traceback():
+                            logger.info(
+                                f'  Traceback: {traceback.format_exc()}')
+                    else:  # No exception, the launch succeeds.
+                        # At this point, a sky.launch() has succeeded. Cluster
+                        # may be UP (no preemption since) or DOWN (newly
+                        # preempted).
+                        job_submitted_at = (
+                            self._wait_until_job_starts_on_cluster())
+                        if job_submitted_at is not None:
+                            return job_submitted_at
+                        # The job fails to start on the cluster, retry the
+                        # launch.
+                        # TODO(zhwu): log the unexpected error to usage
+                        # collection for future debugging.
+                        logger.info(
+                            'Failed to successfully submit the job to the '
+                            'launched cluster, due to unexpected submission '
+                            'errors or the cluster being preempted during '
+                            'job submission.')
+
+                    # If we get here, the launch did not succeed. Tear down the
+                    # cluster and retry.
+                    managed_job_utils.terminate_cluster(self.cluster_name)
+                    if max_retry is not None and retry_cnt >= max_retry:
+                        # Retry forever if max_retry is None.
+                        if raise_on_failure:
+                            with ux_utils.print_exception_no_traceback():
+                                raise (
+                                    exceptions.ManagedJobReachedMaxRetriesError(
+                                        'Resources unavailable: failed to '
+                                        f'launch clusters after {max_retry} '
+                                        'retries.'))
+                        else:
+                            return None
+
+                    # Raise NoClusterLaunchedError to indicate that the job is
+                    # in retry backoff. This will trigger special handling in
+                    # scheduler.schedule_launched().
+                    # We will exit the scheduled_launch context so that the
+                    # schedule state is ALIVE_BACKOFF during the backoff. This
+                    # allows other jobs to launch.
+                    raise exceptions.NoClusterLaunchedError()
+
+            except exceptions.NoClusterLaunchedError:
+                # Update the status to PENDING during backoff.
+                state.set_backoff_pending(self.job_id, self.task_id)
+                # Calculate the backoff time and sleep.
+                gap_seconds = backoff.current_backoff()
+                logger.info('Retrying to launch the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
+                time.sleep(gap_seconds)
+                continue
+            else:
+                # The inner loop should either return or throw
+                # NoClusterLaunchedError.
+                assert False, 'Unreachable'
 
     def should_restart_on_failure(self) -> bool:
         """Increments counter & checks if job should be restarted on a failure.
@@ -389,9 +429,9 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int) -> None:
+                 job_id: int, task_id: int) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id)
+                         job_id, task_id)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
@@ -400,8 +440,10 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
     def _launch(self,
                 max_retry: Optional[int] = 3,
-                raise_on_failure: bool = True) -> Optional[float]:
-        job_submitted_at = super()._launch(max_retry, raise_on_failure)
+                raise_on_failure: bool = True,
+                recovery: bool = False) -> Optional[float]:
+        job_submitted_at = super()._launch(max_retry, raise_on_failure,
+                                           recovery)
         if job_submitted_at is not None:
             # Only record the cloud/region if the launch is successful.
             handle = global_user_state.get_handle_from_cluster_name(
@@ -436,7 +478,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
                 cloud=launched_cloud, region=launched_region, zone=None)
             task.set_resources({new_resources})
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(raise_on_failure=False)
+            job_submitted_at = self._launch(raise_on_failure=False,
+                                            recovery=True)
             # Restore the original dag, i.e. reset the region constraint.
             task.set_resources(original_resources)
             if job_submitted_at is not None:
@@ -452,7 +495,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
                 'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
             job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-                                            raise_on_failure=False)
+                                            raise_on_failure=False,
+                                            recovery=True)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
@@ -524,7 +568,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
                                region=launched_region)
             }
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(raise_on_failure=False)
+            job_submitted_at = self._launch(raise_on_failure=False,
+                                            recovery=True)
             task.blocked_resources = None
             if job_submitted_at is not None:
                 return job_submitted_at
@@ -535,7 +580,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
                 'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
             job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-                                            raise_on_failure=False)
+                                            raise_on_failure=False,
+                                            recovery=True)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
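The restructured _launch() above funnels every failed attempt through exceptions.NoClusterLaunchedError so that the scheduled_launch() context can park the job in a backoff state while other jobs get a turn. A simplified, self-contained sketch of that control flow; launch_once(), the retry cap and the print statements are stand-ins, not SkyPilot APIs, and the real code instead uses scheduler.scheduled_launch(), state.set_restarting() and state.set_backoff_pending():

    import random
    import time


    class NoClusterLaunchedError(Exception):
        """Signals that this attempt produced no cluster and backoff is needed."""


    def launch_once() -> bool:
        # Placeholder for execution.launch() plus the post-launch job checks.
        return random.random() < 0.3


    def launch_with_backoff(max_retry: int = 5) -> None:
        gap = 1.0
        retry_cnt = 0
        while True:
            retry_cnt += 1
            try:
                # Everything in this block corresponds to holding the
                # scheduler's LAUNCHING slot (the scheduled_launch() context).
                if launch_once():
                    print('cluster launched')
                    return
                if retry_cnt >= max_retry:
                    raise RuntimeError('exhausted retries')
                # Leaving the slot via this exception lets other jobs launch
                # while this one backs off (ALIVE_BACKOFF in the real scheduler).
                raise NoClusterLaunchedError()
            except NoClusterLaunchedError:
                print(f'backing off for {gap:.1f}s')
                time.sleep(gap)
                gap *= 2


    launch_with_backoff()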
sky/jobs/scheduler.py CHANGED
@@ -45,6 +45,7 @@ import typing
 
 import filelock
 
+from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.jobs import constants as managed_job_constants
@@ -190,7 +191,8 @@ def maybe_schedule_next_jobs() -> None:
         pass
 
 
-def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str) -> None:
+def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str,
+               priority: int) -> None:
     """Submit an existing job to the scheduler.
 
     This should be called after a job is created in the `spot` table as
@@ -202,7 +204,7 @@ def submit_job(job_id: int, dag_yaml_path: str, env_file_path: str) -> None:
     """
     with filelock.FileLock(_get_lock_path()):
        state.scheduler_set_waiting(job_id, dag_yaml_path, env_file_path,
-                                    common_utils.get_user_hash())
+                                    common_utils.get_user_hash(), priority)
     maybe_schedule_next_jobs()
 
 
@@ -240,11 +242,19 @@ def scheduled_launch(job_id: int):
                state.ManagedJobScheduleState.LAUNCHING):
         time.sleep(_ALIVE_JOB_LAUNCH_WAIT_INTERVAL)
 
-    yield
-
-    with filelock.FileLock(_get_lock_path()):
-        state.scheduler_set_alive(job_id)
-    maybe_schedule_next_jobs()
+    try:
+        yield
+    except exceptions.NoClusterLaunchedError:
+        # NoClusterLaunchedError is indicates that the job is in retry backoff.
+        # We should transition to ALIVE_BACKOFF instead of ALIVE.
+        with filelock.FileLock(_get_lock_path()):
+            state.scheduler_set_alive_backoff(job_id)
+        raise
+    else:
+        with filelock.FileLock(_get_lock_path()):
+            state.scheduler_set_alive(job_id)
+    finally:
+        maybe_schedule_next_jobs()
 
 
 def job_done(job_id: int, idempotent: bool = False) -> None:
@@ -309,5 +319,10 @@ if __name__ == '__main__':
     parser.add_argument('--env-file',
                         type=str,
                         help='The path to the controller env file.')
+    parser.add_argument(
+        '--priority',
+        type=int,
+        default=500,
+        help='Job priority (0-1000, lower is higher). Default: 500.')
     args = parser.parse_args()
-    submit_job(args.job_id, args.dag_yaml, args.env_file)
+    submit_job(args.job_id, args.dag_yaml, args.env_file, args.priority)
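For controller-side callers, the new priority argument is threaded straight into the schedule state. A hedged sketch of invoking this internal API (the job id and paths are placeholders; submit_job() and DEFAULT_PRIORITY come from the diffs above, and the call normally happens on the jobs controller after the job row has been created):

    from sky.jobs import constants as managed_job_constants
    from sky.jobs import scheduler

    scheduler.submit_job(
        job_id=42,
        dag_yaml_path='/path/to/dag.yaml',
        env_file_path='/path/to/controller.env',
        priority=managed_job_constants.DEFAULT_PRIORITY)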