skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -4
- sky/backends/backend_utils.py +7 -0
- sky/backends/cloud_vm_ray_backend.py +91 -96
- sky/cli.py +5 -6311
- sky/client/cli.py +66 -639
- sky/client/sdk.py +22 -2
- sky/clouds/kubernetes.py +8 -0
- sky/clouds/scp.py +7 -26
- sky/clouds/utils/scp_utils.py +177 -124
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
- sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
- sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/global_user_state.py +50 -11
- sky/jobs/controller.py +98 -31
- sky/jobs/scheduler.py +37 -29
- sky/jobs/server/core.py +36 -3
- sky/jobs/state.py +69 -9
- sky/jobs/utils.py +11 -0
- sky/logs/__init__.py +17 -0
- sky/logs/agent.py +73 -0
- sky/logs/gcp.py +91 -0
- sky/models.py +1 -0
- sky/provision/__init__.py +1 -0
- sky/provision/instance_setup.py +35 -0
- sky/provision/provisioner.py +11 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +528 -0
- sky/resources.py +164 -29
- sky/server/common.py +21 -9
- sky/server/requests/payloads.py +19 -1
- sky/server/server.py +121 -29
- sky/setup_files/dependencies.py +11 -1
- sky/skylet/constants.py +48 -1
- sky/skylet/job_lib.py +83 -19
- sky/task.py +171 -21
- sky/templates/kubernetes-ray.yml.j2 +60 -4
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/users/permission.py +47 -34
- sky/users/rbac.py +10 -1
- sky/users/server.py +274 -9
- sky/utils/command_runner.py +1 -1
- sky/utils/common_utils.py +16 -14
- sky/utils/context.py +1 -1
- sky/utils/controller_utils.py +12 -3
- sky/utils/dag_utils.py +17 -4
- sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
- sky/utils/schemas.py +83 -5
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +80 -79
- sky/benchmark/__init__.py +0 -0
- sky/benchmark/benchmark_state.py +0 -295
- sky/benchmark/benchmark_utils.py +0 -641
- sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
- sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
- /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
- /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-ebc2404fd6ce581c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c416e87d5c2715cf.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"LRpGymRCqq-feuFyoWz4m","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-ebc2404fd6ce581c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c416e87d5c2715cf.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-d6128fa9e7cae6e6.js" defer=""></script><script src="/dashboard/_next/static/chunks/760-a89d354797ce7af5.js" defer=""></script><script src="/dashboard/_next/static/chunks/799-3625946b2ec2eb30.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-4c9fc53aa74bc191.js" defer=""></script><script src="/dashboard/_next/static/chunks/664-047bc03493fda379.js" defer=""></script><script src="/dashboard/_next/static/chunks/798-c0525dc3f21e488d.js" defer=""></script><script src="/dashboard/_next/static/chunks/947-6620842ef80ae879.js" defer=""></script><script src="/dashboard/_next/static/chunks/470-4d1a5dbe58a8a2b9.js" defer=""></script><script src="/dashboard/_next/static/chunks/901-b424d293275e1fd7.js" defer=""></script><script src="/dashboard/_next/static/chunks/969-20d54a9d998dc102.js" defer=""></script><script src="/dashboard/_next/static/chunks/856-c2c39c0912285e54.js" defer=""></script><script src="/dashboard/_next/static/chunks/973-db3c97c2bfbceb65.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-1493ac755eadeb35.js" defer=""></script><script src="/dashboard/_next/static/chunks/843-b3040e493f6e7947.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-c4ff1ec05e2f3daf.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"LRpGymRCqq-feuFyoWz4m","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-ebc2404fd6ce581c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c416e87d5c2715cf.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"LRpGymRCqq-feuFyoWz4m","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/global_user_state.py
CHANGED
@@ -64,6 +64,7 @@ user_table = sqlalchemy.Table(
     Base.metadata,
     sqlalchemy.Column('id', sqlalchemy.Text, primary_key=True),
     sqlalchemy.Column('name', sqlalchemy.Text),
+    sqlalchemy.Column('password', sqlalchemy.Text),
 )
 
 cluster_table = sqlalchemy.Table(
@@ -301,6 +302,12 @@ def create_table():
             'last_creation_command',
             sqlalchemy.Text(),
             default_statement='DEFAULT NULL')
+        db_utils.add_column_to_table_sqlalchemy(
+            session,
+            'users',
+            'password',
+            sqlalchemy.Text(),
+            default_statement='DEFAULT NULL')
         session.commit()
 
 
@@ -358,7 +365,9 @@ def add_or_update_user(user: models.User) -> bool:
 
             # First try INSERT OR IGNORE - this won't fail if user exists
             insert_stmnt = insert_func(user_table).prefix_with(
-                'OR IGNORE').values(id=user.id, name=user.name)
+                'OR IGNORE').values(id=user.id,
+                                    name=user.name,
+                                    password=user.password)
             result = session.execute(insert_stmnt)
 
             # Check if the INSERT actually inserted a row
@@ -366,8 +375,14 @@ def add_or_update_user(user: models.User) -> bool:
 
             if not was_inserted:
                 # User existed, so update it
-                session.query(user_table).filter_by(id=user.id).update(
-                    {user_table.c.name: user.name})
+                if user.password:
+                    session.query(user_table).filter_by(id=user.id).update({
+                        user_table.c.name: user.name,
+                        user_table.c.password: user.password
+                    })
+                else:
+                    session.query(user_table).filter_by(id=user.id).update(
+                        {user_table.c.name: user.name})
 
             session.commit()
             return was_inserted
@@ -377,15 +392,19 @@ def add_or_update_user(user: models.User) -> bool:
             # For PostgreSQL, use INSERT ... ON CONFLICT with RETURNING to
             # detect insert vs update
             insert_func = postgresql.insert
-            insert_stmnt = insert_func(user_table).values(
-                id=user.id, name=user.name)
+            insert_stmnt = insert_func(user_table).values(
+                id=user.id, name=user.name, password=user.password)
 
             # Use a sentinel in the RETURNING clause to detect insert vs update
+            if user.password:
+                set_ = {
+                    user_table.c.name: user.name,
+                    user_table.c.password: user.password
+                }
+            else:
+                set_ = {user_table.c.name: user.name}
             upsert_stmnt = insert_stmnt.on_conflict_do_update(
-                index_elements=[user_table.c.id],
-                set_={
-                    user_table.c.name: user.name
-                }).returning(
+                index_elements=[user_table.c.id], set_=set_).returning(
                     user_table.c.id,
                     # This will be True for INSERT, False for UPDATE
                     sqlalchemy.literal_column('(xmax = 0)').label('was_inserted'
@@ -407,7 +426,24 @@ def get_user(user_id: str) -> Optional[models.User]:
         row = session.query(user_table).filter_by(id=user_id).first()
         if row is None:
             return None
-        return models.User(id=row.id, name=row.name)
+        return models.User(id=row.id, name=row.name, password=row.password)
+
+
+def get_user_by_name(username: str) -> List[models.User]:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.query(user_table).filter_by(name=username).all()
+        if len(rows) == 0:
+            return []
+        return [
+            models.User(id=row.id, name=row.name, password=row.password)
+            for row in rows
+        ]
+
+
+def delete_user(user_id: str) -> None:
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        session.query(user_table).filter_by(id=user_id).delete()
+        session.commit()
 
 
 @_init_db
@@ -415,7 +451,10 @@ def get_all_users() -> List[models.User]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         rows = session.query(user_table).all()
-        return [models.User(id=row.id, name=row.name) for row in rows]
+        return [
+            models.User(id=row.id, name=row.name, password=row.password)
+            for row in rows
+        ]
 
 
 @_init_db
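Taken together, the global_user_state changes above add an optional password column and two new helpers (get_user_by_name, delete_user). A minimal, hedged usage sketch of the updated API follows; the id, name, and hash values are hypothetical, and it assumes the state database has already been initialized by the server process:

    from sky import global_user_state, models

    # Upsert: the password column is only overwritten when a non-empty value
    # is supplied, matching the SQLite and PostgreSQL branches shown above.
    user = models.User(id='abc12345', name='alice', password='<pbkdf2 hash>')
    was_inserted = global_user_state.add_or_update_user(user)

    # New lookups introduced in this release.
    matches = global_user_state.get_user_by_name('alice')  # -> List[models.User]
    global_user_state.delete_user('abc12345')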
sky/jobs/controller.py
CHANGED
@@ -152,6 +152,20 @@ class JobsController:
         Other exceptions may be raised depending on the backend.
         """
 
+        latest_task_id, last_task_prev_status = (
+            managed_job_state.get_latest_task_id_status(self._job_id))
+        is_resume = False
+        if (latest_task_id is not None and last_task_prev_status !=
+                managed_job_state.ManagedJobStatus.PENDING):
+            assert latest_task_id >= task_id, (latest_task_id, task_id)
+            if latest_task_id > task_id:
+                logger.info(f'Task {task_id} ({task.name}) has already '
+                            'been executed. Skipping...')
+                return True
+            if latest_task_id == task_id:
+                # Start recovery.
+                is_resume = True
+
         callback_func = managed_job_utils.event_callback_func(
             job_id=self._job_id, task_id=task_id, task=task)
         if task.run is None:
@@ -171,42 +185,72 @@ class JobsController:
             return True
         usage_lib.messages.usage.update_task_id(task_id)
         task_id_env_var = task.envs[constants.TASK_ID_ENV_VAR]
-        submitted_at = time.time()
-        if task_id == 0:
-            submitted_at = backend_utils.get_timestamp_from_run_timestamp(
-                self._backend.run_timestamp)
         assert task.name is not None, task
         cluster_name = managed_job_utils.generate_managed_job_cluster_name(
             task.name, self._job_id)
         self._strategy_executor = recovery_strategy.StrategyExecutor.make(
             cluster_name, self._backend, task, self._job_id, task_id)
-
-
-            task_id
-
-
-
-
-
-
-
-
-
-
-
+        if not is_resume:
+            submitted_at = time.time()
+            if task_id == 0:
+                submitted_at = backend_utils.get_timestamp_from_run_timestamp(
+                    self._backend.run_timestamp)
+            managed_job_state.set_starting(
+                self._job_id,
+                task_id,
+                self._backend.run_timestamp,
+                submitted_at,
+                resources_str=backend_utils.get_task_resources_str(
+                    task, is_managed_job=True),
+                specs={
+                    'max_restarts_on_errors':
+                        self._strategy_executor.max_restarts_on_errors
+                },
+                callback_func=callback_func)
+            logger.info(f'Submitted managed job {self._job_id} '
+                        f'(task: {task_id}, name: {task.name!r}); '
+                        f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
 
         logger.info('Started monitoring.')
 
-
-
+        # Only do the initial cluster launch if not resuming from a controller
+        # failure. Otherwise, we will transit to recovering immediately.
+        remote_job_submitted_at = time.time()
+        if not is_resume:
+            remote_job_submitted_at = self._strategy_executor.launch()
+        assert remote_job_submitted_at is not None, remote_job_submitted_at
 
-
-
-
-
+        if not is_resume:
+            managed_job_state.set_started(job_id=self._job_id,
+                                          task_id=task_id,
+                                          start_time=remote_job_submitted_at,
+                                          callback_func=callback_func)
 
         while True:
+            # NOTE: if we are resuming from a controller failure, we only keep
+            # monitoring if the job is in RUNNING state. For all other cases,
+            # we will directly transit to recovering since we have no idea what
+            # the cluster status is.
+            force_transit_to_recovering = False
+            if is_resume:
+                prev_status = managed_job_state.get_job_status_with_task_id(
+                    job_id=self._job_id, task_id=task_id)
+                if prev_status is not None:
+                    if prev_status.is_terminal():
+                        return (prev_status ==
+                                managed_job_state.ManagedJobStatus.SUCCEEDED)
+                    if (prev_status ==
+                            managed_job_state.ManagedJobStatus.CANCELLING):
+                        # If the controller is down when cancelling the job,
+                        # we re-raise the error to run the `_cleanup` function
+                        # again to clean up any remaining resources.
+                        raise exceptions.ManagedJobUserCancelledError(
+                            'Recovering cancel signal.')
+                if prev_status != managed_job_state.ManagedJobStatus.RUNNING:
+                    force_transit_to_recovering = True
+                # This resume logic should only be triggered once.
+                is_resume = False
+
             time.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS)
 
             # Check the network connection to avoid false alarm for job failure.
@@ -221,8 +265,19 @@ class JobsController:
 
             # NOTE: we do not check cluster status first because race condition
             # can occur, i.e. cluster can be down during the job status check.
-            job_status = managed_job_utils.get_job_status(
-                self._backend, cluster_name)
+            # NOTE: If fetching the job status fails or we force to transit to
+            # recovering, we will set the job status to None, which will force
+            # enter the recovering logic.
+            job_status = None
+            if not force_transit_to_recovering:
+                try:
+                    job_status = managed_job_utils.get_job_status(
+                        self._backend, cluster_name)
+                except exceptions.FetchClusterInfoError as fetch_e:
+                    logger.info(
+                        'Failed to fetch the job status. Start recovery.\n'
+                        f'Exception: {common_utils.format_exception(fetch_e)}\n'
+                        f'Traceback: {traceback.format_exc()}')
 
             if job_status == job_lib.JobStatus.SUCCEEDED:
                 success_end_time = managed_job_utils.try_to_get_job_end_time(
@@ -379,7 +434,17 @@ class JobsController:
             if handle is not None:
                 resources = handle.launched_resources
                 assert resources is not None, handle
-                if resources.need_cleanup_after_preemption_or_failure():
+                # If we are forcing to transit to recovering, we need to clean
+                # up the cluster as it is possible that we already submitted the
+                # job to the worker cluster, but state is not updated yet. In
+                # this case, it is possible that we will double-submit the job
+                # to the worker cluster. So we always clean up the cluster here.
+                # TODO(tian,cooperc): We can check if there is a running job on
+                # the worker cluster, and if so, we can skip the cleanup.
+                # Challenge: race condition when the worker cluster thought it
+                # does not have a running job yet but later the job is launched.
+                if (resources.need_cleanup_after_preemption_or_failure() or
+                        force_transit_to_recovering):
                     # Some spot resource (e.g., Spot TPU VM) may need to be
                     # cleaned up after preemption, as running launch again on
                     # those clusters again may fail.
@@ -389,9 +454,11 @@ class JobsController:
 
             # Try to recover the managed jobs, when the cluster is preempted or
             # failed or the job status is failed to be fetched.
-            managed_job_state.set_recovering(
-
-
+            managed_job_state.set_recovering(
+                job_id=self._job_id,
+                task_id=task_id,
+                force_transit_to_recovering=force_transit_to_recovering,
+                callback_func=callback_func)
             recovered_time = self._strategy_executor.recover()
             managed_job_state.set_recovered(self._job_id,
                                             task_id,
sky/jobs/scheduler.py
CHANGED
@@ -84,6 +84,32 @@ def _get_lock_path() -> str:
     return path
 
 
+def _start_controller(job_id: int, dag_yaml_path: str,
+                      env_file_path: str) -> None:
+    activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
+    source_environment_cmd = (f'source {env_file_path};'
+                              if env_file_path else '')
+    run_controller_cmd = ('python -u -m sky.jobs.controller '
+                          f'{dag_yaml_path} --job-id {job_id};')
+
+    # If the command line here is changed, please also update
+    # utils._controller_process_alive. `--job-id X` should be at
+    # the end.
+    run_cmd = (f'{activate_python_env_cmd}'
+               f'{source_environment_cmd}'
+               f'{run_controller_cmd}')
+
+    logs_dir = os.path.expanduser(
+        managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
+    os.makedirs(logs_dir, exist_ok=True)
+    log_path = os.path.join(logs_dir, f'{job_id}.log')
+
+    pid = subprocess_utils.launch_new_process_tree(run_cmd, log_output=log_path)
+    state.set_job_controller_pid(job_id, pid)
+
+    logger.debug(f'Job {job_id} started with pid {pid}')
+
+
 def maybe_schedule_next_jobs() -> None:
     """Determine if any managed jobs can be scheduled, and if so, schedule them.
 
@@ -158,32 +184,9 @@ def maybe_schedule_next_jobs() -> None:
 
                 job_id = maybe_next_job['job_id']
                 dag_yaml_path = maybe_next_job['dag_yaml_path']
+                env_file_path = maybe_next_job['env_file_path']
 
-                activate_python_env_cmd = (
-                    f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
-                env_file = maybe_next_job['env_file_path']
-                source_environment_cmd = (f'source {env_file};'
-                                          if env_file else '')
-                run_controller_cmd = ('python -u -m sky.jobs.controller '
-                                      f'{dag_yaml_path} --job-id {job_id};')
-
-                # If the command line here is changed, please also update
-                # utils._controller_process_alive. `--job-id X` should be at
-                # the end.
-                run_cmd = (f'{activate_python_env_cmd}'
-                           f'{source_environment_cmd}'
-                           f'{run_controller_cmd}')
-
-                logs_dir = os.path.expanduser(
-                    managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
-                os.makedirs(logs_dir, exist_ok=True)
-                log_path = os.path.join(logs_dir, f'{job_id}.log')
-
-                pid = subprocess_utils.launch_new_process_tree(
-                    run_cmd, log_output=log_path)
-                state.set_job_controller_pid(job_id, pid)
-
-                logger.debug(f'Job {job_id} started with pid {pid}')
+                _start_controller(job_id, dag_yaml_path, env_file_path)
 
     except filelock.Timeout:
         # If we can't get the lock, just exit. The process holding the lock
@@ -203,10 +206,15 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
     The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
     """
     with filelock.FileLock(_get_lock_path()):
-        state.scheduler_set_waiting(job_id, dag_yaml_path,
-
-
-
+        is_resume = state.scheduler_set_waiting(job_id, dag_yaml_path,
+                                                original_user_yaml_path,
+                                                env_file_path,
+                                                common_utils.get_user_hash(),
+                                                priority)
+        if is_resume:
+            _start_controller(job_id, dag_yaml_path, env_file_path)
+        else:
+            maybe_schedule_next_jobs()
 
 
 @contextlib.contextmanager
sky/jobs/server/core.py
CHANGED
@@ -102,14 +102,47 @@ def launch(
                     'name only and comment out the task names (so that they '
                     'will be auto-generated) .')
         task_names.add(task_.name)
-
-
+
+        # Check for priority in resources first, then fall back to job priority
+        task_priority = None
+        if task_.resources:
+            # Convert set to list to access elements by index
+            resources_list = list(task_.resources)
+            # Take first resource's priority as reference
+            task_priority = resources_list[0].priority
+
+            # Check all other resources have same priority
+            for resource in resources_list[1:]:
+                if resource.priority != task_priority:
+                    with ux_utils.print_exception_no_traceback():
+                        raise ValueError(
+                            f'Task {task_.name!r}: All resources must have the '
+                            'same priority. Found priority '
+                            f'{resource.priority} but expected {task_priority}.'
+                        )
+
+            # Check for conflict between resources priority and job
+            # priority
+            if task_.job_priority is not None:
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        f'Task {task_.name!r}: Cannot specify both '
+                        f'resources.priority ({task_priority}) and '
+                        f'job.priority ({task_.job_priority}). Please use only '
+                        'one priority specification method.')
+
+        # Fall back to job priority if no resources priority found
+        if task_priority is None:
+            task_priority = task_.job_priority
+
+        if task_priority is not None:
+            if (priority is not None and priority != task_priority):
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
                         'Multiple tasks in the DAG have different priorities. '
                         'Either specify a priority in only one task, or set '
                         'the same priority for each task.')
-            priority =
+            priority = task_priority
 
     if priority is None:
         priority = managed_job_constants.DEFAULT_PRIORITY
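The launch() hunk above establishes a precedence for managed-job priority: a priority set on the task's resources wins (all resources in a task must agree), otherwise the task's job priority is used, otherwise DEFAULT_PRIORITY, and giving both at once is rejected. A hedged, standalone paraphrase of that precedence (hypothetical helper, not code from the package):

    from typing import Optional, Sequence

    DEFAULT_PRIORITY = 500  # placeholder; the real constant lives in managed_job_constants

    def resolve_priority(resource_priorities: Sequence[Optional[int]],
                         job_priority: Optional[int]) -> int:
        # All resources must carry the same priority (or none at all).
        res_priority = resource_priorities[0] if resource_priorities else None
        if any(p != res_priority for p in resource_priorities):
            raise ValueError('All resources must have the same priority.')
        # The launch path rejects specifying both at once.
        if res_priority is not None and job_priority is not None:
            raise ValueError('Cannot specify both resources.priority and job priority.')
        if res_priority is not None:
            return res_priority
        if job_priority is not None:
            return job_priority
        return DEFAULT_PRIORITY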
sky/jobs/state.py
CHANGED
@@ -352,6 +352,16 @@ class ManagedJobStatus(enum.Enum):
             cls.FAILED_NO_RESOURCE, cls.FAILED_CONTROLLER
         ]
 
+    @classmethod
+    def processing_statuses(cls) -> List['ManagedJobStatus']:
+        # Any status that is not terminal and is not CANCELLING.
+        return [
+            cls.PENDING,
+            cls.STARTING,
+            cls.RUNNING,
+            cls.RECOVERING,
+        ]
+
 
 _SPOT_STATUS_TO_COLOR = {
     ManagedJobStatus.PENDING: colorama.Fore.BLUE,
@@ -607,21 +617,49 @@ def set_started(job_id: int, task_id: int, start_time: float,
 
 
 @_init_db
-def set_recovering(job_id: int, task_id: int, callback_func: CallbackType):
+def set_recovering(job_id: int, task_id: int, force_transit_to_recovering: bool,
+                   callback_func: CallbackType):
     """Set the task to recovering state, and update the job duration."""
     assert _DB_PATH is not None
     logger.info('=== Recovering... ===')
+    expected_status: List[str] = [ManagedJobStatus.RUNNING.value]
+    status_str = 'status=(?)'
+    if force_transit_to_recovering:
+        # For the HA job controller, it is possible that the jobs came from any
+        # processing status to recovering. But it should not be any terminal
+        # status as such jobs will not be recovered; and it should not be
+        # CANCELLING as we will directly trigger a cleanup.
+        expected_status = [
+            s.value for s in ManagedJobStatus.processing_statuses()
+        ]
+        question_mark_str = ', '.join(['?'] * len(expected_status))
+        status_str = f'status IN ({question_mark_str})'
+    # NOTE: if we are resuming from a controller failure and the previous status
+    # is STARTING, the initial value of `last_recovered_at` might not be set
+    # yet (default value -1). In this case, we should not add current timestamp.
+    # Otherwise, the job duration will be incorrect (~55 years from 1970).
+    current_time = time.time()
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         cursor.execute(
-            """\
+            f"""\
             UPDATE spot SET
-            status=(?),
+            status=(?),
+            job_duration=CASE
+                WHEN last_recovered_at >= 0
+                THEN job_duration+(?)-last_recovered_at
+                ELSE job_duration
+            END,
+            last_recovered_at=CASE
+                WHEN last_recovered_at < 0
+                THEN (?)
+                ELSE last_recovered_at
+            END
             WHERE spot_job_id=(?) AND
             task_id=(?) AND
-            status=(?) AND
+            {status_str} AND
             end_at IS null""",
-            (ManagedJobStatus.RECOVERING.value,
-
+            (ManagedJobStatus.RECOVERING.value, current_time, current_time,
+             job_id, task_id, *expected_status))
         if cursor.rowcount != 1:
             raise exceptions.ManagedJobStatusError(
                 f'Failed to set the task to recovering. '
@@ -996,6 +1034,19 @@ def _get_all_task_ids_statuses(
         return [(row[0], ManagedJobStatus(row[1])) for row in id_statuses]
 
 
+@_init_db
+def get_job_status_with_task_id(job_id: int,
+                                task_id: int) -> Optional[ManagedJobStatus]:
+    assert _DB_PATH is not None
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        status = cursor.execute(
+            """\
+            SELECT status FROM spot
+            WHERE spot_job_id=(?) AND task_id=(?)""",
+            (job_id, task_id)).fetchone()
+        return ManagedJobStatus(status[0]) if status else None
+
+
 def get_num_tasks(job_id: int) -> int:
     return len(_get_all_task_ids_statuses(job_id))
 
@@ -1156,8 +1207,15 @@ def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
 @_init_db
 def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
                           original_user_yaml_path: str, env_file_path: str,
-                          user_hash: str, priority: int) ->
-    """Do not call without holding the scheduler lock.
+                          user_hash: str, priority: int) -> bool:
+    """Do not call without holding the scheduler lock.
+
+    Returns: Whether this is a recovery run or not.
+        If this is a recovery run, the job may already be in the WAITING
+        state and the update will not change the schedule_state (hence the
+        updated_count will be 0). In this case, we return True.
+        Otherwise, we return False.
+    """
     assert _DB_PATH is not None
     with db_utils.safe_cursor(_DB_PATH) as cursor:
         updated_count = cursor.execute(
@@ -1169,7 +1227,9 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
             (ManagedJobScheduleState.WAITING.value, dag_yaml_path,
              original_user_yaml_path, env_file_path, user_hash, priority,
              job_id, ManagedJobScheduleState.INACTIVE.value)).rowcount
-
+        # For a recovery run, the job may already be in the WAITING state.
+        assert updated_count <= 1, (job_id, updated_count)
+        return updated_count == 0
 
 
 @_init_db
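For readers of the set_recovering() hunk above, the two CASE expressions implement the following bookkeeping, shown here as a hedged Python paraphrase of the SQL (not code from the package):

    import time

    def recovering_bookkeeping(job_duration: float, last_recovered_at: float):
        now = time.time()
        if last_recovered_at >= 0:
            # Normal case: accumulate the time since the last recovery.
            job_duration += now - last_recovered_at
        else:
            # Resuming while still STARTING (sentinel -1): leave the duration
            # untouched and stamp last_recovered_at so later accounting is sane.
            last_recovered_at = now
        return job_duration, last_recovered_at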
sky/jobs/utils.py
CHANGED
@@ -176,6 +176,17 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
     Note: we expect that job_id, if provided, refers to a nonterminal job or a
     job that has not completed its cleanup (schedule state not DONE).
     """
+    # This signal file suggests that the controller is recovering from a
+    # failure. See sky/templates/kubernetes-ray.yml.j2 for more details.
+    # When restarting the controller processes, we don't want this event to
+    # set the job status to FAILED_CONTROLLER.
+    # TODO(tian): Change this to restart the controller process. For now we
+    # disabled it when recovering because we want to avoid caveats of infinite
+    # restart of last controller process that fully occupied the controller VM.
+    if os.path.exists(
+            os.path.expanduser(
+                constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
+        return
 
     def _cleanup_job_clusters(job_id: int) -> Optional[str]:
         """Clean up clusters for a job. Returns error message if any.
sky/logs/__init__.py
ADDED
@@ -0,0 +1,17 @@
+"""Sky logging agents."""
+from typing import Optional
+
+from sky import exceptions
+from sky import skypilot_config
+from sky.logs.agent import LoggingAgent
+from sky.logs.gcp import GCPLoggingAgent
+
+
+def get_logging_agent() -> Optional[LoggingAgent]:
+    store = skypilot_config.get_nested(('logs', 'store'), None)
+    if store is None:
+        return None
+    if store == 'gcp':
+        return GCPLoggingAgent(skypilot_config.get_nested(('logs', 'gcp'), {}))
+    raise exceptions.InvalidSkyPilotConfigError(
+        f'Invalid logging store: {store}')
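The new get_logging_agent() entry point keys off the logs.store field of the SkyPilot config. A hedged usage sketch (assumes a config has already been loaded; the printed strings are illustrative only):

    from sky import logs

    agent = logs.get_logging_agent()
    if agent is None:
        print('logs.store is unset; no external log shipping is configured')
    else:
        # Currently only 'gcp' is recognized; any other value raises
        # exceptions.InvalidSkyPilotConfigError.
        print(f'shipping logs via {type(agent).__name__}')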
|