skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (231) hide show
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/global_user_state.py CHANGED
@@ -32,6 +32,7 @@ from sky import sky_logging
32
32
  from sky import skypilot_config
33
33
  from sky.metrics import utils as metrics_lib
34
34
  from sky.skylet import constants
35
+ from sky.utils import annotations
35
36
  from sky.utils import common_utils
36
37
  from sky.utils import context_utils
37
38
  from sky.utils import registry
@@ -342,6 +343,10 @@ def initialize_and_get_db() -> sqlalchemy.engine.Engine:
342
343
 
343
344
  # return engine
344
345
  _SQLALCHEMY_ENGINE = engine
346
+ # Cache the result of _sqlite_supports_returning()
347
+ # ahead of time, as it won't change throughout
348
+ # the lifetime of the engine.
349
+ _sqlite_supports_returning()
345
350
  return _SQLALCHEMY_ENGINE
346
351
 
347
352
 
@@ -372,19 +377,51 @@ def _init_db(func):
372
377
  return wrapper
373
378
 
374
379
 
380
+ @annotations.lru_cache(scope='global', maxsize=1)
381
+ def _sqlite_supports_returning() -> bool:
382
+ """Check if SQLite (3.35.0+) and SQLAlchemy (2.0+) support RETURNING.
383
+
384
+ See https://sqlite.org/lang_returning.html and
385
+ https://docs.sqlalchemy.org/en/20/dialects/sqlite.html#insert-update-delete-returning # pylint: disable=line-too-long
386
+ """
387
+ sqlalchemy_version_parts = sqlalchemy.__version__.split('.')
388
+ assert len(sqlalchemy_version_parts) >= 1, \
389
+ f'Invalid SQLAlchemy version: {sqlalchemy.__version__}'
390
+ sqlalchemy_major = int(sqlalchemy_version_parts[0])
391
+ if sqlalchemy_major < 2:
392
+ return False
393
+
394
+ assert _SQLALCHEMY_ENGINE is not None
395
+ if (_SQLALCHEMY_ENGINE.dialect.name !=
396
+ db_utils.SQLAlchemyDialect.SQLITE.value):
397
+ return False
398
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
399
+ result = session.execute(sqlalchemy.text('SELECT sqlite_version()'))
400
+ version_str = result.scalar()
401
+ version_parts = version_str.split('.')
402
+ assert len(version_parts) >= 2, \
403
+ f'Invalid version string: {version_str}'
404
+ major, minor = int(version_parts[0]), int(version_parts[1])
405
+ return (major > 3) or (major == 3 and minor >= 35)
406
+
407
+
375
408
  @_init_db
376
409
  @metrics_lib.time_me
377
- def add_or_update_user(user: models.User,
378
- allow_duplicate_name: bool = True) -> bool:
410
+ def add_or_update_user(
411
+ user: models.User,
412
+ allow_duplicate_name: bool = True,
413
+ return_user: bool = False
414
+ ) -> typing.Union[bool, typing.Tuple[bool, models.User]]:
379
415
  """Store the mapping from user hash to user name for display purposes.
380
416
 
381
417
  Returns:
382
- Boolean: whether the user is newly added
418
+ If return_user=False: bool (whether the user is newly added)
419
+ If return_user=True: Tuple[bool, models.User]
383
420
  """
384
421
  assert _SQLALCHEMY_ENGINE is not None
385
422
 
386
423
  if user.name is None:
387
- return False
424
+ return (False, user) if return_user else False
388
425
 
389
426
  # Set created_at if not already set
390
427
  created_at = user.created_at
@@ -396,7 +433,7 @@ def add_or_update_user(user: models.User,
396
433
  existing_user = session.query(user_table).filter(
397
434
  user_table.c.name == user.name).first()
398
435
  if existing_user is not None:
399
- return False
436
+ return (False, user) if return_user else False
400
437
 
401
438
  if (_SQLALCHEMY_ENGINE.dialect.name ==
402
439
  db_utils.SQLAlchemyDialect.SQLITE.value):
@@ -410,24 +447,57 @@ def add_or_update_user(user: models.User,
410
447
  name=user.name,
411
448
  password=user.password,
412
449
  created_at=created_at)
450
+ use_returning = return_user and _sqlite_supports_returning()
451
+ if use_returning:
452
+ insert_stmnt = insert_stmnt.returning(
453
+ user_table.c.id,
454
+ user_table.c.name,
455
+ user_table.c.password,
456
+ user_table.c.created_at,
457
+ )
413
458
  result = session.execute(insert_stmnt)
414
459
 
415
- # Check if the INSERT actually inserted a row
416
- was_inserted = result.rowcount > 0
460
+ row = None
461
+ if use_returning:
462
+ # With RETURNING, check if we got a row back.
463
+ row = result.fetchone()
464
+ was_inserted = row is not None
465
+ else:
466
+ # Without RETURNING, use rowcount.
467
+ was_inserted = result.rowcount > 0
417
468
 
418
469
  if not was_inserted:
419
470
  # User existed, so update it (but don't update created_at)
471
+ update_values = {user_table.c.name: user.name}
420
472
  if user.password:
421
- session.query(user_table).filter_by(id=user.id).update({
422
- user_table.c.name: user.name,
423
- user_table.c.password: user.password
424
- })
425
- else:
426
- session.query(user_table).filter_by(id=user.id).update(
427
- {user_table.c.name: user.name})
473
+ update_values[user_table.c.password] = user.password
474
+
475
+ update_stmnt = sqlalchemy.update(user_table).where(
476
+ user_table.c.id == user.id).values(update_values)
477
+ if use_returning:
478
+ update_stmnt = update_stmnt.returning(
479
+ user_table.c.id, user_table.c.name,
480
+ user_table.c.password, user_table.c.created_at)
481
+
482
+ result = session.execute(update_stmnt)
483
+ if use_returning:
484
+ row = result.fetchone()
428
485
 
429
486
  session.commit()
430
- return was_inserted
487
+
488
+ if return_user:
489
+ if row is None:
490
+ # row=None means the sqlite used has no RETURNING support,
491
+ # so we need to do a separate query
492
+ row = session.query(user_table).filter_by(
493
+ id=user.id).first()
494
+ updated_user = models.User(id=row.id,
495
+ name=row.name,
496
+ password=row.password,
497
+ created_at=row.created_at)
498
+ return was_inserted, updated_user
499
+ else:
500
+ return was_inserted
431
501
 
432
502
  elif (_SQLALCHEMY_ENGINE.dialect.name ==
433
503
  db_utils.SQLAlchemyDialect.POSTGRESQL.value):
@@ -452,6 +522,9 @@ def add_or_update_user(user: models.User,
452
522
  upsert_stmnt = insert_stmnt.on_conflict_do_update(
453
523
  index_elements=[user_table.c.id], set_=set_).returning(
454
524
  user_table.c.id,
525
+ user_table.c.name,
526
+ user_table.c.password,
527
+ user_table.c.created_at,
455
528
  # This will be True for INSERT, False for UPDATE
456
529
  sqlalchemy.literal_column('(xmax = 0)').label('was_inserted'
457
530
  ))
@@ -459,10 +532,17 @@ def add_or_update_user(user: models.User,
459
532
  result = session.execute(upsert_stmnt)
460
533
  row = result.fetchone()
461
534
 
462
- ret = bool(row.was_inserted) if row else False
535
+ was_inserted = bool(row.was_inserted) if row else False
463
536
  session.commit()
464
537
 
465
- return ret
538
+ if return_user:
539
+ updated_user = models.User(id=row.id,
540
+ name=row.name,
541
+ password=row.password,
542
+ created_at=row.created_at)
543
+ return was_inserted, updated_user
544
+ else:
545
+ return was_inserted
466
546
  else:
467
547
  raise ValueError('Unsupported database dialect')
468
548
 
@@ -1081,6 +1161,26 @@ def get_handles_from_cluster_names(
1081
1161
  }
1082
1162
 
1083
1163
 
1164
+ @_init_db
1165
+ @metrics_lib.time_me
1166
+ def get_cluster_name_to_handle_map(
1167
+ is_managed: Optional[bool] = None,
1168
+ ) -> Dict[str, Optional['backends.ResourceHandle']]:
1169
+ assert _SQLALCHEMY_ENGINE is not None
1170
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1171
+ query = session.query(cluster_table.c.name, cluster_table.c.handle)
1172
+ if is_managed is not None:
1173
+ query = query.filter(cluster_table.c.is_managed == int(is_managed))
1174
+ rows = query.all()
1175
+ name_to_handle = {}
1176
+ for row in rows:
1177
+ if row.handle and len(row.handle) > 0:
1178
+ name_to_handle[row.name] = pickle.loads(row.handle)
1179
+ else:
1180
+ name_to_handle[row.name] = None
1181
+ return name_to_handle
1182
+
1183
+
1084
1184
  @_init_db_async
1085
1185
  @metrics_lib.time_me
1086
1186
  async def get_status_from_cluster_name_async(
@@ -1494,41 +1594,31 @@ def get_cluster_from_name(
1494
1594
  include_user_info: bool = True,
1495
1595
  summary_response: bool = False) -> Optional[Dict[str, Any]]:
1496
1596
  assert _SQLALCHEMY_ENGINE is not None
1597
+ query_fields = [
1598
+ cluster_table.c.name,
1599
+ cluster_table.c.launched_at,
1600
+ cluster_table.c.handle,
1601
+ cluster_table.c.last_use,
1602
+ cluster_table.c.status,
1603
+ cluster_table.c.autostop,
1604
+ cluster_table.c.to_down,
1605
+ cluster_table.c.owner,
1606
+ cluster_table.c.metadata,
1607
+ cluster_table.c.cluster_hash,
1608
+ cluster_table.c.cluster_ever_up,
1609
+ cluster_table.c.status_updated_at,
1610
+ cluster_table.c.user_hash,
1611
+ cluster_table.c.config_hash,
1612
+ cluster_table.c.workspace,
1613
+ cluster_table.c.is_managed,
1614
+ ]
1615
+ if not summary_response:
1616
+ query_fields.extend([
1617
+ cluster_table.c.last_creation_yaml,
1618
+ cluster_table.c.last_creation_command,
1619
+ ])
1497
1620
  with orm.Session(_SQLALCHEMY_ENGINE) as session:
1498
- if summary_response:
1499
- query = session.query(
1500
- cluster_table.c.name, cluster_table.c.launched_at,
1501
- cluster_table.c.handle, cluster_table.c.last_use,
1502
- cluster_table.c.status, cluster_table.c.autostop,
1503
- cluster_table.c.to_down, cluster_table.c.owner,
1504
- cluster_table.c.metadata, cluster_table.c.cluster_hash,
1505
- cluster_table.c.storage_mounts_metadata,
1506
- cluster_table.c.cluster_ever_up,
1507
- cluster_table.c.status_updated_at, cluster_table.c.user_hash,
1508
- cluster_table.c.config_hash, cluster_table.c.workspace,
1509
- cluster_table.c.is_managed)
1510
- else:
1511
- query = session.query(
1512
- cluster_table.c.name,
1513
- cluster_table.c.launched_at,
1514
- cluster_table.c.handle,
1515
- cluster_table.c.last_use,
1516
- cluster_table.c.status,
1517
- cluster_table.c.autostop,
1518
- cluster_table.c.to_down,
1519
- cluster_table.c.owner,
1520
- cluster_table.c.metadata,
1521
- cluster_table.c.cluster_hash,
1522
- cluster_table.c.storage_mounts_metadata,
1523
- cluster_table.c.cluster_ever_up,
1524
- cluster_table.c.status_updated_at,
1525
- cluster_table.c.user_hash,
1526
- cluster_table.c.config_hash,
1527
- cluster_table.c.workspace,
1528
- cluster_table.c.is_managed,
1529
- # extra fields compared to above query
1530
- cluster_table.c.last_creation_yaml,
1531
- cluster_table.c.last_creation_command)
1621
+ query = session.query(*query_fields)
1532
1622
  row = query.filter_by(name=cluster_name).first()
1533
1623
  if row is None:
1534
1624
  return None
@@ -1551,8 +1641,6 @@ def get_cluster_from_name(
1551
1641
  'owner': _load_owner(row.owner),
1552
1642
  'metadata': json.loads(row.metadata),
1553
1643
  'cluster_hash': row.cluster_hash,
1554
- 'storage_mounts_metadata': _load_storage_mounts_metadata(
1555
- row.storage_mounts_metadata),
1556
1644
  'cluster_ever_up': bool(row.cluster_ever_up),
1557
1645
  'status_updated_at': row.status_updated_at,
1558
1646
  'workspace': row.workspace,
@@ -1609,41 +1697,34 @@ def get_clusters(
1609
1697
  # we treat it as belonging to the current user.
1610
1698
  current_user_hash = common_utils.get_user_hash()
1611
1699
  assert _SQLALCHEMY_ENGINE is not None
1612
- with orm.Session(_SQLALCHEMY_ENGINE) as session:
1613
- if summary_response:
1614
- query = session.query(
1615
- cluster_table.c.name, cluster_table.c.launched_at,
1616
- cluster_table.c.handle, cluster_table.c.last_use,
1617
- cluster_table.c.status, cluster_table.c.autostop,
1618
- cluster_table.c.to_down, cluster_table.c.owner,
1619
- cluster_table.c.metadata, cluster_table.c.cluster_hash,
1620
- cluster_table.c.storage_mounts_metadata,
1621
- cluster_table.c.cluster_ever_up,
1622
- cluster_table.c.status_updated_at, cluster_table.c.user_hash,
1623
- cluster_table.c.config_hash, cluster_table.c.workspace,
1624
- cluster_table.c.is_managed)
1625
- else:
1626
- query = session.query(
1627
- cluster_table.c.name,
1628
- cluster_table.c.launched_at,
1629
- cluster_table.c.handle,
1630
- cluster_table.c.last_use,
1631
- cluster_table.c.status,
1632
- cluster_table.c.autostop,
1633
- cluster_table.c.to_down,
1634
- cluster_table.c.owner,
1635
- cluster_table.c.metadata,
1636
- cluster_table.c.cluster_hash,
1637
- cluster_table.c.storage_mounts_metadata,
1638
- cluster_table.c.cluster_ever_up,
1639
- cluster_table.c.status_updated_at,
1640
- cluster_table.c.user_hash,
1641
- cluster_table.c.config_hash,
1642
- cluster_table.c.workspace,
1643
- cluster_table.c.is_managed,
1644
- # extra fields compared to above query
1645
- cluster_table.c.last_creation_yaml,
1646
- cluster_table.c.last_creation_command)
1700
+ query_fields = [
1701
+ cluster_table.c.name,
1702
+ cluster_table.c.launched_at,
1703
+ cluster_table.c.handle,
1704
+ cluster_table.c.status,
1705
+ cluster_table.c.autostop,
1706
+ cluster_table.c.to_down,
1707
+ cluster_table.c.cluster_hash,
1708
+ cluster_table.c.cluster_ever_up,
1709
+ cluster_table.c.user_hash,
1710
+ cluster_table.c.workspace,
1711
+ user_table.c.name.label('user_name'),
1712
+ ]
1713
+ if not summary_response:
1714
+ query_fields.extend([
1715
+ cluster_table.c.last_creation_yaml,
1716
+ cluster_table.c.last_creation_command,
1717
+ cluster_table.c.config_hash,
1718
+ cluster_table.c.owner,
1719
+ cluster_table.c.metadata,
1720
+ cluster_table.c.last_use,
1721
+ cluster_table.c.status_updated_at,
1722
+ ])
1723
+ if not exclude_managed_clusters:
1724
+ query_fields.append(cluster_table.c.is_managed)
1725
+ with orm.Session(_SQLALCHEMY_ENGINE) as session:
1726
+ query = session.query(*query_fields).outerjoin(
1727
+ user_table, cluster_table.c.user_hash == user_table.c.id)
1647
1728
  if exclude_managed_clusters:
1648
1729
  query = query.filter(cluster_table.c.is_managed == int(False))
1649
1730
  if workspaces_filter is not None:
@@ -1666,55 +1747,50 @@ def get_clusters(
1666
1747
  rows = query.all()
1667
1748
  records = []
1668
1749
 
1669
- # get user hash for each row
1670
- row_to_user_hash = {}
1671
- for row in rows:
1672
- user_hash = (row.user_hash
1673
- if row.user_hash is not None else current_user_hash)
1674
- row_to_user_hash[row.cluster_hash] = user_hash
1675
-
1676
- # get all users needed for the rows at once
1677
- user_hashes = set(row_to_user_hash.values())
1678
- user_hash_to_user = get_users(user_hashes)
1750
+ # Check if we need to fetch the current user's name,
1751
+ # for backwards compatibility, if user_hash is None.
1752
+ current_user_name = None
1753
+ needs_current_user = any(row.user_hash is None for row in rows)
1754
+ if needs_current_user:
1755
+ current_user = get_user(current_user_hash)
1756
+ current_user_name = (current_user.name
1757
+ if current_user is not None else None)
1679
1758
 
1680
1759
  # get last cluster event for each row
1681
- cluster_hashes = set(row_to_user_hash.keys())
1682
1760
  if not summary_response:
1761
+ cluster_hashes = {row.cluster_hash for row in rows}
1683
1762
  last_cluster_event_dict = _get_last_cluster_event_multiple(
1684
1763
  cluster_hashes, ClusterEventType.STATUS_CHANGE)
1685
1764
 
1686
- # get user for each row
1687
1765
  for row in rows:
1688
- user_hash = row_to_user_hash[row.cluster_hash]
1689
- user = user_hash_to_user.get(user_hash, None)
1690
- user_name = user.name if user is not None else None
1691
1766
  # TODO: use namedtuple instead of dict
1692
1767
  record = {
1693
1768
  'name': row.name,
1694
1769
  'launched_at': row.launched_at,
1695
1770
  'handle': pickle.loads(row.handle),
1696
- 'last_use': row.last_use,
1697
1771
  'status': status_lib.ClusterStatus[row.status],
1698
1772
  'autostop': row.autostop,
1699
1773
  'to_down': bool(row.to_down),
1700
- 'owner': _load_owner(row.owner),
1701
- 'metadata': json.loads(row.metadata),
1702
1774
  'cluster_hash': row.cluster_hash,
1703
- 'storage_mounts_metadata': _load_storage_mounts_metadata(
1704
- row.storage_mounts_metadata),
1705
1775
  'cluster_ever_up': bool(row.cluster_ever_up),
1706
- 'status_updated_at': row.status_updated_at,
1707
- 'user_hash': user_hash,
1708
- 'user_name': user_name,
1776
+ 'user_hash': (row.user_hash
1777
+ if row.user_hash is not None else current_user_hash),
1778
+ 'user_name': (row.user_name
1779
+ if row.user_name is not None else current_user_name),
1709
1780
  'workspace': row.workspace,
1710
- 'is_managed': bool(row.is_managed),
1711
- 'config_hash': row.config_hash,
1781
+ 'is_managed': False
1782
+ if exclude_managed_clusters else bool(row.is_managed),
1712
1783
  }
1713
1784
  if not summary_response:
1714
1785
  record['last_creation_yaml'] = row.last_creation_yaml
1715
1786
  record['last_creation_command'] = row.last_creation_command
1716
1787
  record['last_event'] = last_cluster_event_dict.get(
1717
1788
  row.cluster_hash, None)
1789
+ record['config_hash'] = row.config_hash
1790
+ record['owner'] = _load_owner(row.owner)
1791
+ record['metadata'] = json.loads(row.metadata)
1792
+ record['last_use'] = row.last_use
1793
+ record['status_updated_at'] = row.status_updated_at
1718
1794
 
1719
1795
  records.append(record)
1720
1796
  return records
sky/jobs/client/sdk.py CHANGED
@@ -15,6 +15,7 @@ from sky.server import common as server_common
15
15
  from sky.server import rest
16
16
  from sky.server import versions
17
17
  from sky.server.requests import payloads
18
+ from sky.server.requests import request_names
18
19
  from sky.skylet import constants
19
20
  from sky.usage import usage_lib
20
21
  from sky.utils import admin_policy_utils
@@ -84,7 +85,9 @@ def launch(
84
85
 
85
86
  dag = dag_utils.convert_entrypoint_to_dag(task)
86
87
  with admin_policy_utils.apply_and_use_config_in_current_request(
87
- dag, at_client_side=True) as dag:
88
+ dag,
89
+ request_name=request_names.AdminPolicyRequestName.JOBS_LAUNCH,
90
+ at_client_side=True) as dag:
88
91
  sdk.validate(dag)
89
92
  if _need_confirmation:
90
93
  job_identity = 'a managed job'
@@ -130,8 +133,11 @@ def queue(
130
133
  refresh: bool,
131
134
  skip_finished: bool = False,
132
135
  all_users: bool = False,
133
- job_ids: Optional[List[int]] = None
134
- ) -> server_common.RequestId[List[responses.ManagedJobRecord]]:
136
+ job_ids: Optional[List[int]] = None,
137
+ limit: Optional[int] = None,
138
+ fields: Optional[List[str]] = None,
139
+ ) -> server_common.RequestId[Union[List[responses.ManagedJobRecord], Tuple[
140
+ List[responses.ManagedJobRecord], int, Dict[str, int], int]]]:
135
141
  """Gets statuses of managed jobs.
136
142
 
137
143
  Please refer to sky.cli.job_queue for documentation.
@@ -141,6 +147,8 @@ def queue(
141
147
  skip_finished: Whether to skip finished jobs.
142
148
  all_users: Whether to show all users' jobs.
143
149
  job_ids: IDs of the managed jobs to show.
150
+ limit: Number of jobs to show.
151
+ fields: Fields to get for the managed jobs.
144
152
 
145
153
  Returns:
146
154
  The request ID of the queue request.
@@ -173,15 +181,29 @@ def queue(
173
181
  does not exist.
174
182
  RuntimeError: if failed to get the managed jobs with ssh.
175
183
  """
176
- body = payloads.JobsQueueBody(
177
- refresh=refresh,
178
- skip_finished=skip_finished,
179
- all_users=all_users,
180
- job_ids=job_ids,
181
- )
184
+ remote_api_version = versions.get_remote_api_version()
185
+ if remote_api_version and remote_api_version >= 18:
186
+ body = payloads.JobsQueueV2Body(
187
+ refresh=refresh,
188
+ skip_finished=skip_finished,
189
+ all_users=all_users,
190
+ job_ids=job_ids,
191
+ limit=limit,
192
+ fields=fields,
193
+ )
194
+ path = '/jobs/queue/v2'
195
+ else:
196
+ body = payloads.JobsQueueBody(
197
+ refresh=refresh,
198
+ skip_finished=skip_finished,
199
+ all_users=all_users,
200
+ job_ids=job_ids,
201
+ )
202
+ path = '/jobs/queue'
203
+
182
204
  response = server_common.make_authenticated_request(
183
205
  'POST',
184
- '/jobs/queue',
206
+ path,
185
207
  json=json.loads(body.model_dump_json()),
186
208
  timeout=(5, None))
187
209
  return server_common.get_request_id(response=response)
@@ -1,12 +1,13 @@
1
1
  """Async SDK functions for managed jobs."""
2
2
  import typing
3
- from typing import Any, Dict, List, Optional, Tuple, Union
3
+ from typing import Dict, List, Optional, Tuple, Union
4
4
 
5
5
  from sky import backends
6
6
  from sky import sky_logging
7
7
  from sky.adaptors import common as adaptors_common
8
8
  from sky.client import sdk_async
9
9
  from sky.jobs.client import sdk
10
+ from sky.schemas.api import responses
10
11
  from sky.skylet import constants
11
12
  from sky.usage import usage_lib
12
13
  from sky.utils import common_utils
@@ -50,12 +51,17 @@ async def queue(
50
51
  refresh: bool,
51
52
  skip_finished: bool = False,
52
53
  all_users: bool = False,
54
+ job_ids: Optional[List[int]] = None,
55
+ limit: Optional[int] = None,
56
+ fields: Optional[List[str]] = None,
53
57
  stream_logs: Optional[
54
58
  sdk_async.StreamConfig] = sdk_async.DEFAULT_STREAM_CONFIG
55
- ) -> List[Dict[str, Any]]:
59
+ ) -> Union[List[responses.ManagedJobRecord], Tuple[
60
+ List[responses.ManagedJobRecord], int, Dict[str, int], int]]:
56
61
  """Async version of queue() that gets statuses of managed jobs."""
57
62
  request_id = await context_utils.to_thread(sdk.queue, refresh,
58
- skip_finished, all_users)
63
+ skip_finished, all_users,
64
+ job_ids, limit, fields)
59
65
  if stream_logs is not None:
60
66
  return await sdk_async._stream_and_get(request_id, stream_logs) # pylint: disable=protected-access
61
67
  else:
sky/jobs/constants.py CHANGED
@@ -46,7 +46,9 @@ JOBS_CLUSTER_NAME_PREFIX_LENGTH = 25
46
46
  # The version of the lib files that jobs/utils use. Whenever there is an API
47
47
  # change for the jobs/utils, we need to bump this version and update
48
48
  # jobs.utils.ManagedJobCodeGen to handle the version update.
49
- MANAGED_JOBS_VERSION = 10
49
+ # WARNING: If you update this due to a codegen change, make sure to make the
50
+ # corresponding change in the ManagedJobsService AND bump the SKYLET_VERSION.
51
+ MANAGED_JOBS_VERSION = 12
50
52
 
51
53
  # The command for setting up the jobs dashboard on the controller. It firstly
52
54
  # checks if the systemd services are available, and if not (e.g., Kubernetes