skypilot-nightly 1.0.0.dev20250616__py3-none-any.whl → 1.0.0.dev20250618__py3-none-any.whl

Files changed (92)
  1. sky/__init__.py +2 -4
  2. sky/backends/backend_utils.py +7 -0
  3. sky/backends/cloud_vm_ray_backend.py +91 -96
  4. sky/cli.py +5 -6311
  5. sky/client/cli.py +66 -639
  6. sky/client/sdk.py +22 -2
  7. sky/clouds/kubernetes.py +8 -0
  8. sky/clouds/scp.py +7 -26
  9. sky/clouds/utils/scp_utils.py +177 -124
  10. sky/dashboard/out/404.html +1 -1
  11. sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_buildManifest.js +1 -1
  12. sky/dashboard/out/_next/static/chunks/641.c8e452bc5070a630.js +1 -0
  13. sky/dashboard/out/_next/static/chunks/984.ae8c08791d274ca0.js +50 -0
  14. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-36bc0962129f72df.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-cf490d1fa38f3740.js +16 -0
  16. sky/dashboard/out/_next/static/chunks/pages/users-928edf039219e47b.js +1 -0
  17. sky/dashboard/out/_next/static/chunks/webpack-ebc2404fd6ce581c.js +1 -0
  18. sky/dashboard/out/_next/static/css/6c12ecc3bd2239b6.css +3 -0
  19. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  20. sky/dashboard/out/clusters/[cluster].html +1 -1
  21. sky/dashboard/out/clusters.html +1 -1
  22. sky/dashboard/out/config.html +1 -1
  23. sky/dashboard/out/index.html +1 -1
  24. sky/dashboard/out/infra/[context].html +1 -1
  25. sky/dashboard/out/infra.html +1 -1
  26. sky/dashboard/out/jobs/[job].html +1 -1
  27. sky/dashboard/out/jobs.html +1 -1
  28. sky/dashboard/out/users.html +1 -1
  29. sky/dashboard/out/workspace/new.html +1 -1
  30. sky/dashboard/out/workspaces/[name].html +1 -1
  31. sky/dashboard/out/workspaces.html +1 -1
  32. sky/global_user_state.py +50 -11
  33. sky/jobs/controller.py +98 -31
  34. sky/jobs/scheduler.py +37 -29
  35. sky/jobs/server/core.py +36 -3
  36. sky/jobs/state.py +69 -9
  37. sky/jobs/utils.py +11 -0
  38. sky/logs/__init__.py +17 -0
  39. sky/logs/agent.py +73 -0
  40. sky/logs/gcp.py +91 -0
  41. sky/models.py +1 -0
  42. sky/provision/__init__.py +1 -0
  43. sky/provision/instance_setup.py +35 -0
  44. sky/provision/provisioner.py +11 -0
  45. sky/provision/scp/__init__.py +15 -0
  46. sky/provision/scp/config.py +93 -0
  47. sky/provision/scp/instance.py +528 -0
  48. sky/resources.py +164 -29
  49. sky/server/common.py +21 -9
  50. sky/server/requests/payloads.py +19 -1
  51. sky/server/server.py +121 -29
  52. sky/setup_files/dependencies.py +11 -1
  53. sky/skylet/constants.py +48 -1
  54. sky/skylet/job_lib.py +83 -19
  55. sky/task.py +171 -21
  56. sky/templates/kubernetes-ray.yml.j2 +60 -4
  57. sky/templates/scp-ray.yml.j2 +3 -50
  58. sky/users/permission.py +47 -34
  59. sky/users/rbac.py +10 -1
  60. sky/users/server.py +274 -9
  61. sky/utils/command_runner.py +1 -1
  62. sky/utils/common_utils.py +16 -14
  63. sky/utils/context.py +1 -1
  64. sky/utils/controller_utils.py +12 -3
  65. sky/utils/dag_utils.py +17 -4
  66. sky/utils/kubernetes/deploy_remote_cluster.py +17 -8
  67. sky/utils/schemas.py +83 -5
  68. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/METADATA +9 -1
  69. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/RECORD +80 -79
  70. sky/benchmark/__init__.py +0 -0
  71. sky/benchmark/benchmark_state.py +0 -295
  72. sky/benchmark/benchmark_utils.py +0 -641
  73. sky/dashboard/out/_next/static/chunks/600.bd2ed8c076b720ec.js +0 -16
  74. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-59950b2f83b66e48.js +0 -6
  75. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b3dbf38b51cb29be.js +0 -16
  76. sky/dashboard/out/_next/static/chunks/pages/users-c69ffcab9d6e5269.js +0 -1
  77. sky/dashboard/out/_next/static/chunks/webpack-1b69b196a4dbffef.js +0 -1
  78. sky/dashboard/out/_next/static/css/8e97adcaacc15293.css +0 -3
  79. sky/skylet/providers/scp/__init__.py +0 -2
  80. sky/skylet/providers/scp/config.py +0 -149
  81. sky/skylet/providers/scp/node_provider.py +0 -578
  82. /sky/dashboard/out/_next/static/{OZxMW3bxAJmqgn5f4MdhO → LRpGymRCqq-feuFyoWz4m}/_ssgManifest.js +0 -0
  83. /sky/dashboard/out/_next/static/chunks/{37-824c707421f6f003.js → 37-3a4d77ad62932eaf.js} +0 -0
  84. /sky/dashboard/out/_next/static/chunks/{843-ab9c4f609239155f.js → 843-b3040e493f6e7947.js} +0 -0
  85. /sky/dashboard/out/_next/static/chunks/{938-385d190b95815e11.js → 938-1493ac755eadeb35.js} +0 -0
  86. /sky/dashboard/out/_next/static/chunks/{973-c807fc34f09c7df3.js → 973-db3c97c2bfbceb65.js} +0 -0
  87. /sky/dashboard/out/_next/static/chunks/pages/{_app-32b2caae3445bf3b.js → _app-c416e87d5c2715cf.js} +0 -0
  88. /sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-c8c2191328532b7d.js → [name]-c4ff1ec05e2f3daf.js} +0 -0
  89. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/WHEEL +0 -0
  90. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/entry_points.txt +0 -0
  91. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/licenses/LICENSE +0 -0
  92. {skypilot_nightly-1.0.0.dev20250616.dist-info → skypilot_nightly-1.0.0.dev20250618.dist-info}/top_level.txt +0 -0
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js" defer=""></script><script src="/dashboard/_next/static/OZxMW3bxAJmqgn5f4MdhO/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/OZxMW3bxAJmqgn5f4MdhO/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"OZxMW3bxAJmqgn5f4MdhO","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-ebc2404fd6ce581c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c416e87d5c2715cf.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspace/new-31aa8bdcb7592635.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspace/new","query":{},"buildId":"LRpGymRCqq-feuFyoWz4m","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/workspaces/[name].html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-d6128fa9e7cae6e6.js" defer=""></script><script src="/dashboard/_next/static/chunks/760-a89d354797ce7af5.js" defer=""></script><script src="/dashboard/_next/static/chunks/799-3625946b2ec2eb30.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-4c9fc53aa74bc191.js" defer=""></script><script src="/dashboard/_next/static/chunks/664-047bc03493fda379.js" defer=""></script><script src="/dashboard/_next/static/chunks/798-c0525dc3f21e488d.js" defer=""></script><script src="/dashboard/_next/static/chunks/947-6620842ef80ae879.js" defer=""></script><script src="/dashboard/_next/static/chunks/470-4d1a5dbe58a8a2b9.js" defer=""></script><script src="/dashboard/_next/static/chunks/901-b424d293275e1fd7.js" defer=""></script><script src="/dashboard/_next/static/chunks/969-20d54a9d998dc102.js" defer=""></script><script src="/dashboard/_next/static/chunks/856-c2c39c0912285e54.js" defer=""></script><script src="/dashboard/_next/static/chunks/973-c807fc34f09c7df3.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-385d190b95815e11.js" defer=""></script><script src="/dashboard/_next/static/chunks/843-ab9c4f609239155f.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-c8c2191328532b7d.js" defer=""></script><script src="/dashboard/_next/static/OZxMW3bxAJmqgn5f4MdhO/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/OZxMW3bxAJmqgn5f4MdhO/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"OZxMW3bxAJmqgn5f4MdhO","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-ebc2404fd6ce581c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c416e87d5c2715cf.js" defer=""></script><script src="/dashboard/_next/static/chunks/616-d6128fa9e7cae6e6.js" defer=""></script><script src="/dashboard/_next/static/chunks/760-a89d354797ce7af5.js" defer=""></script><script src="/dashboard/_next/static/chunks/799-3625946b2ec2eb30.js" defer=""></script><script src="/dashboard/_next/static/chunks/804-4c9fc53aa74bc191.js" defer=""></script><script src="/dashboard/_next/static/chunks/664-047bc03493fda379.js" defer=""></script><script src="/dashboard/_next/static/chunks/798-c0525dc3f21e488d.js" defer=""></script><script src="/dashboard/_next/static/chunks/947-6620842ef80ae879.js" defer=""></script><script src="/dashboard/_next/static/chunks/470-4d1a5dbe58a8a2b9.js" defer=""></script><script src="/dashboard/_next/static/chunks/901-b424d293275e1fd7.js" defer=""></script><script src="/dashboard/_next/static/chunks/969-20d54a9d998dc102.js" defer=""></script><script src="/dashboard/_next/static/chunks/856-c2c39c0912285e54.js" defer=""></script><script src="/dashboard/_next/static/chunks/973-db3c97c2bfbceb65.js" defer=""></script><script src="/dashboard/_next/static/chunks/938-1493ac755eadeb35.js" defer=""></script><script src="/dashboard/_next/static/chunks/843-b3040e493f6e7947.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces/%5Bname%5D-c4ff1ec05e2f3daf.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces/[name]","query":{},"buildId":"LRpGymRCqq-feuFyoWz4m","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/workspaces.html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/8e97adcaacc15293.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/8e97adcaacc15293.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-1b69b196a4dbffef.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-32b2caae3445bf3b.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js" defer=""></script><script src="/dashboard/_next/static/OZxMW3bxAJmqgn5f4MdhO/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/OZxMW3bxAJmqgn5f4MdhO/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"OZxMW3bxAJmqgn5f4MdhO","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/6c12ecc3bd2239b6.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-ebc2404fd6ce581c.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-c416e87d5c2715cf.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/workspaces-82e6601baa5dd280.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/LRpGymRCqq-feuFyoWz4m/_ssgManifest.js" defer=""></script></head><body><div id="__next"></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/workspaces","query":{},"buildId":"LRpGymRCqq-feuFyoWz4m","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/global_user_state.py CHANGED
@@ -64,6 +64,7 @@ user_table = sqlalchemy.Table(
  Base.metadata,
  sqlalchemy.Column('id', sqlalchemy.Text, primary_key=True),
  sqlalchemy.Column('name', sqlalchemy.Text),
+ sqlalchemy.Column('password', sqlalchemy.Text),
  )

  cluster_table = sqlalchemy.Table(
@@ -301,6 +302,12 @@ def create_table():
  'last_creation_command',
  sqlalchemy.Text(),
  default_statement='DEFAULT NULL')
+ db_utils.add_column_to_table_sqlalchemy(
+ session,
+ 'users',
+ 'password',
+ sqlalchemy.Text(),
+ default_statement='DEFAULT NULL')
  session.commit()


@@ -358,7 +365,9 @@ def add_or_update_user(user: models.User) -> bool:

  # First try INSERT OR IGNORE - this won't fail if user exists
  insert_stmnt = insert_func(user_table).prefix_with(
- 'OR IGNORE').values(id=user.id, name=user.name)
+ 'OR IGNORE').values(id=user.id,
+ name=user.name,
+ password=user.password)
  result = session.execute(insert_stmnt)

  # Check if the INSERT actually inserted a row
@@ -366,8 +375,14 @@

  if not was_inserted:
  # User existed, so update it
- session.query(user_table).filter_by(id=user.id).update(
- {user_table.c.name: user.name})
+ if user.password:
+ session.query(user_table).filter_by(id=user.id).update({
+ user_table.c.name: user.name,
+ user_table.c.password: user.password
+ })
+ else:
+ session.query(user_table).filter_by(id=user.id).update(
+ {user_table.c.name: user.name})

  session.commit()
  return was_inserted
@@ -377,15 +392,19 @@ def add_or_update_user(user: models.User) -> bool:
  # For PostgreSQL, use INSERT ... ON CONFLICT with RETURNING to
  # detect insert vs update
  insert_func = postgresql.insert
- insert_stmnt = insert_func(user_table).values(id=user.id,
- name=user.name)
+ insert_stmnt = insert_func(user_table).values(
+ id=user.id, name=user.name, password=user.password)

  # Use a sentinel in the RETURNING clause to detect insert vs update
+ if user.password:
+ set_ = {
+ user_table.c.name: user.name,
+ user_table.c.password: user.password
+ }
+ else:
+ set_ = {user_table.c.name: user.name}
  upsert_stmnt = insert_stmnt.on_conflict_do_update(
- index_elements=[user_table.c.id],
- set_={
- user_table.c.name: user.name
- }).returning(
+ index_elements=[user_table.c.id], set_=set_).returning(
  user_table.c.id,
  # This will be True for INSERT, False for UPDATE
  sqlalchemy.literal_column('(xmax = 0)').label('was_inserted'
@@ -407,7 +426,24 @@ def get_user(user_id: str) -> Optional[models.User]:
  row = session.query(user_table).filter_by(id=user_id).first()
  if row is None:
  return None
- return models.User(id=row.id, name=row.name)
+ return models.User(id=row.id, name=row.name, password=row.password)
+
+
+ def get_user_by_name(username: str) -> List[models.User]:
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
+ rows = session.query(user_table).filter_by(name=username).all()
+ if len(rows) == 0:
+ return []
+ return [
+ models.User(id=row.id, name=row.name, password=row.password)
+ for row in rows
+ ]
+
+
+ def delete_user(user_id: str) -> None:
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
+ session.query(user_table).filter_by(id=user_id).delete()
+ session.commit()


  @_init_db
@@ -415,7 +451,10 @@ def get_all_users() -> List[models.User]:
  assert _SQLALCHEMY_ENGINE is not None
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
  rows = session.query(user_table).all()
- return [models.User(id=row.id, name=row.name) for row in rows]
+ return [
+ models.User(id=row.id, name=row.name, password=row.password)
+ for row in rows
+ ]


  @_init_db
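Note: the SQLite branch above uses the usual 'INSERT OR IGNORE, then UPDATE if nothing was inserted' upsert, and only overwrites the stored password when the caller supplies one. A minimal standalone sqlite3 sketch of that pattern (hypothetical table and helper, not the real users schema or API):

import sqlite3

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE users (id TEXT PRIMARY KEY, name TEXT, password TEXT)')

def add_or_update_user(user_id, name, password=None):
    # INSERT OR IGNORE succeeds silently when the row already exists.
    cur = conn.execute(
        'INSERT OR IGNORE INTO users (id, name, password) VALUES (?, ?, ?)',
        (user_id, name, password))
    was_inserted = cur.rowcount > 0
    if not was_inserted:
        if password:
            conn.execute('UPDATE users SET name=?, password=? WHERE id=?',
                         (name, password, user_id))
        else:
            # Keep the stored password when the caller does not supply one.
            conn.execute('UPDATE users SET name=? WHERE id=?', (name, user_id))
    return was_inserted

print(add_or_update_user('u1', 'alice', 'secret'))  # True: new row inserted
print(add_or_update_user('u1', 'alice2'))           # False: renamed, password kept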
sky/jobs/controller.py CHANGED
@@ -152,6 +152,20 @@ class JobsController:
  Other exceptions may be raised depending on the backend.
  """

+ latest_task_id, last_task_prev_status = (
+ managed_job_state.get_latest_task_id_status(self._job_id))
+ is_resume = False
+ if (latest_task_id is not None and last_task_prev_status !=
+ managed_job_state.ManagedJobStatus.PENDING):
+ assert latest_task_id >= task_id, (latest_task_id, task_id)
+ if latest_task_id > task_id:
+ logger.info(f'Task {task_id} ({task.name}) has already '
+ 'been executed. Skipping...')
+ return True
+ if latest_task_id == task_id:
+ # Start recovery.
+ is_resume = True
+
  callback_func = managed_job_utils.event_callback_func(
  job_id=self._job_id, task_id=task_id, task=task)
  if task.run is None:
@@ -171,42 +185,72 @@
  return True
  usage_lib.messages.usage.update_task_id(task_id)
  task_id_env_var = task.envs[constants.TASK_ID_ENV_VAR]
- submitted_at = time.time()
- if task_id == 0:
- submitted_at = backend_utils.get_timestamp_from_run_timestamp(
- self._backend.run_timestamp)
  assert task.name is not None, task
  cluster_name = managed_job_utils.generate_managed_job_cluster_name(
  task.name, self._job_id)
  self._strategy_executor = recovery_strategy.StrategyExecutor.make(
  cluster_name, self._backend, task, self._job_id, task_id)
- managed_job_state.set_starting(
- self._job_id,
- task_id,
- self._backend.run_timestamp,
- submitted_at,
- resources_str=backend_utils.get_task_resources_str(
- task, is_managed_job=True),
- specs={
- 'max_restarts_on_errors':
- self._strategy_executor.max_restarts_on_errors
- },
- callback_func=callback_func)
- logger.info(
- f'Submitted managed job {self._job_id} (task: {task_id}, name: '
- f'{task.name!r}); {constants.TASK_ID_ENV_VAR}: {task_id_env_var}')
+ if not is_resume:
+ submitted_at = time.time()
+ if task_id == 0:
+ submitted_at = backend_utils.get_timestamp_from_run_timestamp(
+ self._backend.run_timestamp)
+ managed_job_state.set_starting(
+ self._job_id,
+ task_id,
+ self._backend.run_timestamp,
+ submitted_at,
+ resources_str=backend_utils.get_task_resources_str(
+ task, is_managed_job=True),
+ specs={
+ 'max_restarts_on_errors':
+ self._strategy_executor.max_restarts_on_errors
+ },
+ callback_func=callback_func)
+ logger.info(f'Submitted managed job {self._job_id} '
+ f'(task: {task_id}, name: {task.name!r}); '
+ f'{constants.TASK_ID_ENV_VAR}: {task_id_env_var}')

  logger.info('Started monitoring.')

- remote_job_submitted_at = self._strategy_executor.launch()
- assert remote_job_submitted_at is not None, remote_job_submitted_at
+ # Only do the initial cluster launch if not resuming from a controller
+ # failure. Otherwise, we will transit to recovering immediately.
+ remote_job_submitted_at = time.time()
+ if not is_resume:
+ remote_job_submitted_at = self._strategy_executor.launch()
+ assert remote_job_submitted_at is not None, remote_job_submitted_at

- managed_job_state.set_started(job_id=self._job_id,
- task_id=task_id,
- start_time=remote_job_submitted_at,
- callback_func=callback_func)
+ if not is_resume:
+ managed_job_state.set_started(job_id=self._job_id,
+ task_id=task_id,
+ start_time=remote_job_submitted_at,
+ callback_func=callback_func)

  while True:
+ # NOTE: if we are resuming from a controller failure, we only keep
+ # monitoring if the job is in RUNNING state. For all other cases,
+ # we will directly transit to recovering since we have no idea what
+ # the cluster status is.
+ force_transit_to_recovering = False
+ if is_resume:
+ prev_status = managed_job_state.get_job_status_with_task_id(
+ job_id=self._job_id, task_id=task_id)
+ if prev_status is not None:
+ if prev_status.is_terminal():
+ return (prev_status ==
+ managed_job_state.ManagedJobStatus.SUCCEEDED)
+ if (prev_status ==
+ managed_job_state.ManagedJobStatus.CANCELLING):
+ # If the controller is down when cancelling the job,
+ # we re-raise the error to run the `_cleanup` function
+ # again to clean up any remaining resources.
+ raise exceptions.ManagedJobUserCancelledError(
+ 'Recovering cancel signal.')
+ if prev_status != managed_job_state.ManagedJobStatus.RUNNING:
+ force_transit_to_recovering = True
+ # This resume logic should only be triggered once.
+ is_resume = False
+
  time.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS)

  # Check the network connection to avoid false alarm for job failure.
@@ -221,8 +265,19 @@

  # NOTE: we do not check cluster status first because race condition
  # can occur, i.e. cluster can be down during the job status check.
- job_status = managed_job_utils.get_job_status(
- self._backend, cluster_name)
+ # NOTE: If fetching the job status fails or we force to transit to
+ # recovering, we will set the job status to None, which will force
+ # enter the recovering logic.
+ job_status = None
+ if not force_transit_to_recovering:
+ try:
+ job_status = managed_job_utils.get_job_status(
+ self._backend, cluster_name)
+ except exceptions.FetchClusterInfoError as fetch_e:
+ logger.info(
+ 'Failed to fetch the job status. Start recovery.\n'
+ f'Exception: {common_utils.format_exception(fetch_e)}\n'
+ f'Traceback: {traceback.format_exc()}')

  if job_status == job_lib.JobStatus.SUCCEEDED:
  success_end_time = managed_job_utils.try_to_get_job_end_time(
@@ -379,7 +434,17 @@
  if handle is not None:
  resources = handle.launched_resources
  assert resources is not None, handle
- if resources.need_cleanup_after_preemption_or_failure():
+ # If we are forcing to transit to recovering, we need to clean
+ # up the cluster as it is possible that we already submitted the
+ # job to the worker cluster, but state is not updated yet. In
+ # this case, it is possible that we will double-submit the job
+ # to the worker cluster. So we always clean up the cluster here.
+ # TODO(tian,cooperc): We can check if there is a running job on
+ # the worker cluster, and if so, we can skip the cleanup.
+ # Challenge: race condition when the worker cluster thought it
+ # does not have a running job yet but later the job is launched.
+ if (resources.need_cleanup_after_preemption_or_failure() or
+ force_transit_to_recovering):
  # Some spot resource (e.g., Spot TPU VM) may need to be
  # cleaned up after preemption, as running launch again on
  # those clusters again may fail.
@@ -389,9 +454,11 @@

  # Try to recover the managed jobs, when the cluster is preempted or
  # failed or the job status is failed to be fetched.
- managed_job_state.set_recovering(job_id=self._job_id,
- task_id=task_id,
- callback_func=callback_func)
+ managed_job_state.set_recovering(
+ job_id=self._job_id,
+ task_id=task_id,
+ force_transit_to_recovering=force_transit_to_recovering,
+ callback_func=callback_func)
  recovered_time = self._strategy_executor.recover()
  managed_job_state.set_recovered(self._job_id,
  task_id,
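Note: the resume handling added above reduces to a small per-task decision when the controller process restarts. A simplified sketch of that decision (hypothetical helper and status strings, not the controller's actual API):

from typing import Optional

def resume_action(latest_task_id: Optional[int], latest_status: str,
                  task_id: int, prev_status: str,
                  prev_is_terminal: bool) -> str:
    """Roughly what a restarted controller does for `task_id`."""
    if latest_task_id is None or latest_status == 'PENDING':
        return 'fresh-start'      # nothing ran yet: normal submission path
    if latest_task_id > task_id:
        return 'skip'             # task finished before the controller died
    # latest_task_id == task_id: this is the task being resumed.
    if prev_is_terminal:
        return 'report-terminal'  # just report the recorded success/failure
    if prev_status == 'CANCELLING':
        return 'rerun-cleanup'    # re-raise so _cleanup runs again
    if prev_status == 'RUNNING':
        return 'monitor'          # keep monitoring the existing cluster
    return 'force-recover'        # cluster state unknown: go to RECOVERING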
sky/jobs/scheduler.py CHANGED
@@ -84,6 +84,32 @@ def _get_lock_path() -> str:
  return path


+ def _start_controller(job_id: int, dag_yaml_path: str,
+ env_file_path: str) -> None:
+ activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
+ source_environment_cmd = (f'source {env_file_path};'
+ if env_file_path else '')
+ run_controller_cmd = ('python -u -m sky.jobs.controller '
+ f'{dag_yaml_path} --job-id {job_id};')
+
+ # If the command line here is changed, please also update
+ # utils._controller_process_alive. `--job-id X` should be at
+ # the end.
+ run_cmd = (f'{activate_python_env_cmd}'
+ f'{source_environment_cmd}'
+ f'{run_controller_cmd}')
+
+ logs_dir = os.path.expanduser(
+ managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
+ os.makedirs(logs_dir, exist_ok=True)
+ log_path = os.path.join(logs_dir, f'{job_id}.log')
+
+ pid = subprocess_utils.launch_new_process_tree(run_cmd, log_output=log_path)
+ state.set_job_controller_pid(job_id, pid)
+
+ logger.debug(f'Job {job_id} started with pid {pid}')
+
+
  def maybe_schedule_next_jobs() -> None:
  """Determine if any managed jobs can be scheduled, and if so, schedule them.

@@ -158,32 +184,9 @@ def maybe_schedule_next_jobs() -> None:

  job_id = maybe_next_job['job_id']
  dag_yaml_path = maybe_next_job['dag_yaml_path']
+ env_file_path = maybe_next_job['env_file_path']

- activate_python_env_cmd = (
- f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
- env_file = maybe_next_job['env_file_path']
- source_environment_cmd = (f'source {env_file};'
- if env_file else '')
- run_controller_cmd = ('python -u -m sky.jobs.controller '
- f'{dag_yaml_path} --job-id {job_id};')
-
- # If the command line here is changed, please also update
- # utils._controller_process_alive. `--job-id X` should be at
- # the end.
- run_cmd = (f'{activate_python_env_cmd}'
- f'{source_environment_cmd}'
- f'{run_controller_cmd}')
-
- logs_dir = os.path.expanduser(
- managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
- os.makedirs(logs_dir, exist_ok=True)
- log_path = os.path.join(logs_dir, f'{job_id}.log')
-
- pid = subprocess_utils.launch_new_process_tree(
- run_cmd, log_output=log_path)
- state.set_job_controller_pid(job_id, pid)
-
- logger.debug(f'Job {job_id} started with pid {pid}')
+ _start_controller(job_id, dag_yaml_path, env_file_path)

  except filelock.Timeout:
  # If we can't get the lock, just exit. The process holding the lock
@@ -203,10 +206,15 @@ def submit_job(job_id: int, dag_yaml_path: str, original_user_yaml_path: str,
  The user hash should be set (e.g. via SKYPILOT_USER_ID) before calling this.
  """
  with filelock.FileLock(_get_lock_path()):
- state.scheduler_set_waiting(job_id, dag_yaml_path,
- original_user_yaml_path, env_file_path,
- common_utils.get_user_hash(), priority)
- maybe_schedule_next_jobs()
+ is_resume = state.scheduler_set_waiting(job_id, dag_yaml_path,
+ original_user_yaml_path,
+ env_file_path,
+ common_utils.get_user_hash(),
+ priority)
+ if is_resume:
+ _start_controller(job_id, dag_yaml_path, env_file_path)
+ else:
+ maybe_schedule_next_jobs()


  @contextlib.contextmanager
sky/jobs/server/core.py CHANGED
@@ -102,14 +102,47 @@ def launch(
  'name only and comment out the task names (so that they '
  'will be auto-generated) .')
  task_names.add(task_.name)
- if task_.job_priority is not None:
- if (priority is not None and priority != task_.job_priority):
+
+ # Check for priority in resources first, then fall back to job priority
+ task_priority = None
+ if task_.resources:
+ # Convert set to list to access elements by index
+ resources_list = list(task_.resources)
+ # Take first resource's priority as reference
+ task_priority = resources_list[0].priority
+
+ # Check all other resources have same priority
+ for resource in resources_list[1:]:
+ if resource.priority != task_priority:
+ with ux_utils.print_exception_no_traceback():
+ raise ValueError(
+ f'Task {task_.name!r}: All resources must have the '
+ 'same priority. Found priority '
+ f'{resource.priority} but expected {task_priority}.'
+ )
+
+ # Check for conflict between resources priority and job
+ # priority
+ if task_.job_priority is not None:
+ with ux_utils.print_exception_no_traceback():
+ raise ValueError(
+ f'Task {task_.name!r}: Cannot specify both '
+ f'resources.priority ({task_priority}) and '
+ f'job.priority ({task_.job_priority}). Please use only '
+ 'one priority specification method.')
+
+ # Fall back to job priority if no resources priority found
+ if task_priority is None:
+ task_priority = task_.job_priority
+
+ if task_priority is not None:
+ if (priority is not None and priority != task_priority):
  with ux_utils.print_exception_no_traceback():
  raise ValueError(
  'Multiple tasks in the DAG have different priorities. '
  'Either specify a priority in only one task, or set '
  'the same priority for each task.')
- priority = task_.job_priority
+ priority = task_priority

  if priority is None:
  priority = managed_job_constants.DEFAULT_PRIORITY
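Note: the new precedence rule above amounts to: a priority set on resources wins, all resources in a task must agree on it, mixing it with job.priority is an error, and job.priority is only a fallback. A simplified standalone sketch of that rule (hypothetical helper, not the function in core.py):

from typing import List, Optional

def resolve_task_priority(resource_priorities: List[Optional[int]],
                          job_priority: Optional[int]) -> Optional[int]:
    task_priority = resource_priorities[0] if resource_priorities else None
    # All resources must carry the same priority.
    if any(p != task_priority for p in resource_priorities[1:]):
        raise ValueError('All resources must have the same priority.')
    # resources.priority and job.priority are mutually exclusive.
    if task_priority is not None and job_priority is not None:
        raise ValueError('Cannot specify both resources.priority and job.priority.')
    # Fall back to job priority when no resource priority is set.
    return task_priority if task_priority is not None else job_priority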
sky/jobs/state.py CHANGED
@@ -352,6 +352,16 @@ class ManagedJobStatus(enum.Enum):
  cls.FAILED_NO_RESOURCE, cls.FAILED_CONTROLLER
  ]

+ @classmethod
+ def processing_statuses(cls) -> List['ManagedJobStatus']:
+ # Any status that is not terminal and is not CANCELLING.
+ return [
+ cls.PENDING,
+ cls.STARTING,
+ cls.RUNNING,
+ cls.RECOVERING,
+ ]
+

  _SPOT_STATUS_TO_COLOR = {
  ManagedJobStatus.PENDING: colorama.Fore.BLUE,
@@ -607,21 +617,49 @@ def set_started(job_id: int, task_id: int, start_time: float,


  @_init_db
- def set_recovering(job_id: int, task_id: int, callback_func: CallbackType):
+ def set_recovering(job_id: int, task_id: int, force_transit_to_recovering: bool,
+ callback_func: CallbackType):
  """Set the task to recovering state, and update the job duration."""
  assert _DB_PATH is not None
  logger.info('=== Recovering... ===')
+ expected_status: List[str] = [ManagedJobStatus.RUNNING.value]
+ status_str = 'status=(?)'
+ if force_transit_to_recovering:
+ # For the HA job controller, it is possible that the jobs came from any
+ # processing status to recovering. But it should not be any terminal
+ # status as such jobs will not be recovered; and it should not be
+ # CANCELLING as we will directly trigger a cleanup.
+ expected_status = [
+ s.value for s in ManagedJobStatus.processing_statuses()
+ ]
+ question_mark_str = ', '.join(['?'] * len(expected_status))
+ status_str = f'status IN ({question_mark_str})'
+ # NOTE: if we are resuming from a controller failure and the previous status
+ # is STARTING, the initial value of `last_recovered_at` might not be set
+ # yet (default value -1). In this case, we should not add current timestamp.
+ # Otherwise, the job duration will be incorrect (~55 years from 1970).
+ current_time = time.time()
  with db_utils.safe_cursor(_DB_PATH) as cursor:
  cursor.execute(
- """\
+ f"""\
  UPDATE spot SET
- status=(?), job_duration=job_duration+(?)-last_recovered_at
+ status=(?),
+ job_duration=CASE
+ WHEN last_recovered_at >= 0
+ THEN job_duration+(?)-last_recovered_at
+ ELSE job_duration
+ END,
+ last_recovered_at=CASE
+ WHEN last_recovered_at < 0
+ THEN (?)
+ ELSE last_recovered_at
+ END
  WHERE spot_job_id=(?) AND
  task_id=(?) AND
- status=(?) AND
+ {status_str} AND
  end_at IS null""",
- (ManagedJobStatus.RECOVERING.value, time.time(), job_id, task_id,
- ManagedJobStatus.RUNNING.value))
+ (ManagedJobStatus.RECOVERING.value, current_time, current_time,
+ job_id, task_id, *expected_status))
  if cursor.rowcount != 1:
  raise exceptions.ManagedJobStatusError(
  f'Failed to set the task to recovering. '
@@ -996,6 +1034,19 @@ def _get_all_task_ids_statuses(
  return [(row[0], ManagedJobStatus(row[1])) for row in id_statuses]


+ @_init_db
+ def get_job_status_with_task_id(job_id: int,
+ task_id: int) -> Optional[ManagedJobStatus]:
+ assert _DB_PATH is not None
+ with db_utils.safe_cursor(_DB_PATH) as cursor:
+ status = cursor.execute(
+ """\
+ SELECT status FROM spot
+ WHERE spot_job_id=(?) AND task_id=(?)""",
+ (job_id, task_id)).fetchone()
+ return ManagedJobStatus(status[0]) if status else None
+
+
  def get_num_tasks(job_id: int) -> int:
  return len(_get_all_task_ids_statuses(job_id))

@@ -1156,8 +1207,15 @@ def get_local_log_file(job_id: int, task_id: Optional[int]) -> Optional[str]:
  @_init_db
  def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
  original_user_yaml_path: str, env_file_path: str,
- user_hash: str, priority: int) -> None:
- """Do not call without holding the scheduler lock."""
+ user_hash: str, priority: int) -> bool:
+ """Do not call without holding the scheduler lock.
+
+ Returns: Whether this is a recovery run or not.
+ If this is a recovery run, the job may already be in the WAITING
+ state and the update will not change the schedule_state (hence the
+ updated_count will be 0). In this case, we return True.
+ Otherwise, we return False.
+ """
  assert _DB_PATH is not None
  with db_utils.safe_cursor(_DB_PATH) as cursor:
  updated_count = cursor.execute(
@@ -1169,7 +1227,9 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
  (ManagedJobScheduleState.WAITING.value, dag_yaml_path,
  original_user_yaml_path, env_file_path, user_hash, priority,
  job_id, ManagedJobScheduleState.INACTIVE.value)).rowcount
- assert updated_count == 1, (job_id, updated_count)
+ # For a recovery run, the job may already be in the WAITING state.
+ assert updated_count <= 1, (job_id, updated_count)
+ return updated_count == 0


  @_init_db
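Note: the CASE expressions added to set_recovering above guard the duration bookkeeping when last_recovered_at still holds its -1 sentinel (a job resumed while STARTING). A runnable sqlite3 sketch of the same UPDATE against a hypothetical table (not the real spot schema):

import sqlite3
import time

conn = sqlite3.connect(':memory:')
conn.execute('CREATE TABLE spot_demo (job_duration REAL, last_recovered_at REAL)')
# One row recovered before (timestamp set), one still at the -1 sentinel.
conn.executemany('INSERT INTO spot_demo VALUES (?, ?)',
                 [(10.0, time.time() - 30), (0.0, -1)])
now = time.time()
conn.execute(
    """UPDATE spot_demo SET
           job_duration=CASE WHEN last_recovered_at >= 0
                             THEN job_duration + ? - last_recovered_at
                             ELSE job_duration END,
           last_recovered_at=CASE WHEN last_recovered_at < 0
                                  THEN ? ELSE last_recovered_at END""",
    (now, now))
# First row gains ~30s of duration; second keeps 0 instead of ~55 years.
print(conn.execute('SELECT job_duration, last_recovered_at FROM spot_demo').fetchall())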
sky/jobs/utils.py CHANGED
@@ -176,6 +176,17 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
  Note: we expect that job_id, if provided, refers to a nonterminal job or a
  job that has not completed its cleanup (schedule state not DONE).
  """
+ # This signal file suggests that the controller is recovering from a
+ # failure. See sky/templates/kubernetes-ray.yml.j2 for more details.
+ # When restarting the controller processes, we don't want this event to
+ # set the job status to FAILED_CONTROLLER.
+ # TODO(tian): Change this to restart the controller process. For now we
+ # disabled it when recovering because we want to avoid caveats of infinite
+ # restart of last controller process that fully occupied the controller VM.
+ if os.path.exists(
+ os.path.expanduser(
+ constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
+ return

  def _cleanup_job_clusters(job_id: int) -> Optional[str]:
  """Clean up clusters for a job. Returns error message if any.
sky/logs/__init__.py ADDED
@@ -0,0 +1,17 @@
+ """Sky logging agents."""
+ from typing import Optional
+
+ from sky import exceptions
+ from sky import skypilot_config
+ from sky.logs.agent import LoggingAgent
+ from sky.logs.gcp import GCPLoggingAgent
+
+
+ def get_logging_agent() -> Optional[LoggingAgent]:
+ store = skypilot_config.get_nested(('logs', 'store'), None)
+ if store is None:
+ return None
+ if store == 'gcp':
+ return GCPLoggingAgent(skypilot_config.get_nested(('logs', 'gcp'), {}))
+ raise exceptions.InvalidSkyPilotConfigError(
+ f'Invalid logging store: {store}')
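Note: a minimal usage sketch of the new entry point, assuming the wheel is installed and the SkyPilot config carries the logs.store / logs.gcp keys read by get_nested above (the exact config schema is defined in sky/utils/schemas.py and is not reproduced here):

from sky.logs import get_logging_agent

agent = get_logging_agent()
if agent is None:
    print('No external log store configured (no logs.store in the config).')
else:
    # With `logs: {store: gcp}` this resolves to a GCPLoggingAgent built
    # from the `logs.gcp` sub-config.
    print(f'Configured logging agent: {type(agent).__name__}')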