skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of skypilot-nightly has been flagged as potentially problematic.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/client/cli/command.py CHANGED
@@ -111,6 +111,24 @@ an autogenerated name."""
 # command.
 _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS = 5
 _NUM_MANAGED_JOBS_TO_SHOW = 50
+_NUM_REQUESTS_TO_SHOW = 50
+_DEFAULT_REQUEST_FIELDS_TO_SHOW = [
+    'request_id', 'name', 'user_id', 'status', 'created_at'
+]
+_VERBOSE_REQUEST_FIELDS_TO_SHOW = _DEFAULT_REQUEST_FIELDS_TO_SHOW + [
+    'cluster_name'
+]
+_DEFAULT_MANAGED_JOB_FIELDS_TO_GET = [
+    'job_id', 'task_id', 'workspace', 'job_name', 'task_name', 'resources',
+    'submitted_at', 'end_at', 'job_duration', 'recovery_count', 'status', 'pool'
+]
+_VERBOSE_MANAGED_JOB_FIELDS_TO_GET = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET + [
+    'current_cluster_name', 'job_id_on_pool_cluster', 'start_at', 'infra',
+    'cloud', 'region', 'zone', 'cluster_resources', 'schedule_state', 'details',
+    'failure_reason', 'metadata'
+]
+_USER_NAME_FIELD = ['user_name']
+_USER_HASH_FIELD = ['user_hash']
 
 _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = (
     '{cluster_num} cluster{plural} {verb}. Please specify {cause} '
@@ -151,12 +169,17 @@ def _get_cluster_records_and_set_ssh_config(
     # Update the SSH config for all clusters
     for record in cluster_records:
         handle = record['handle']
-
+        name = record['name']
         if not (handle is not None and handle.cached_external_ips is not None
                 and 'credentials' in record):
             # If the cluster is not UP or does not have credentials available,
             # we need to remove the cluster from the SSH config.
-            cluster_utils.SSHConfigHelper.remove_cluster(record['name'])
+            cluster_utils.SSHConfigHelper.remove_cluster(name)
+            continue
+        if not record['credentials']:
+            # The credential is missing for some reason, continue.
+            logger.debug(
+                f'Client did not receive SSH credential for cluster {name}')
             continue
 
         # During the failover, even though a cluster does not exist, the handle
@@ -1321,14 +1344,22 @@ def exec(
 
 
 def _handle_jobs_queue_request(
-        request_id: server_common.RequestId[List[responses.ManagedJobRecord]],
-        show_all: bool,
-        show_user: bool,
-        max_num_jobs_to_show: Optional[int],
-        is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
+    request_id: server_common.RequestId[Union[
+        List[responses.ManagedJobRecord],
+        Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]]],
+    show_all: bool,
+    show_user: bool,
+    max_num_jobs_to_show: Optional[int],
+    pool_status_request_id: Optional[server_common.RequestId[List[Dict[
+        str, Any]]]] = None,
+    is_called_by_user: bool = False,
+    only_in_progress: bool = False,
+) -> Tuple[Optional[int], str]:
     """Get the in-progress managed jobs.
 
     Args:
+        request_id: The request ID for managed jobs.
+        pool_status_request_id: The request ID for pool status, or None.
         show_all: Show all information of each job (e.g., region, price).
         show_user: Show the user who submitted the job.
         max_num_jobs_to_show: If not None, limit the number of jobs to show to
@@ -1336,6 +1367,7 @@ def _handle_jobs_queue_request(
             and `sky jobs queue`.
         is_called_by_user: If this function is called by user directly, or an
             internal call.
+        only_in_progress: If True, only return the number of in-progress jobs.
 
     Returns:
         A tuple of (num_in_progress_jobs, msg). If num_in_progress_jobs is None,
@@ -1346,11 +1378,47 @@ def _handle_jobs_queue_request(
     # TODO(SKY-980): remove unnecessary fallbacks on the client side.
     num_in_progress_jobs = None
     msg = ''
+    status_counts: Optional[Dict[str, int]] = None
+    pool_status_result = None
     try:
         if not is_called_by_user:
             usage_lib.messages.usage.set_internal()
-        managed_jobs_ = sdk.stream_and_get(request_id)
-        num_in_progress_jobs = len(set(job['job_id'] for job in managed_jobs_))
+        # Call both stream_and_get functions in parallel
+        def get_jobs_queue_result():
+            return sdk.stream_and_get(request_id)
+
+        def get_pool_status_result():
+            if pool_status_request_id is not None:
+                try:
+                    return sdk.stream_and_get(pool_status_request_id)
+                except Exception:  # pylint: disable=broad-except
+                    # If getting pool status fails, just continue without it
+                    return None
+            return None
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+            jobs_future = executor.submit(get_jobs_queue_result)
+            pool_status_future = executor.submit(get_pool_status_result)
+
+            result = jobs_future.result()
+            pool_status_result = pool_status_future.result()
+
+        if isinstance(result, tuple):
+            managed_jobs_, total, status_counts, _ = result
+            if only_in_progress:
+                num_in_progress_jobs = 0
+                if status_counts:
+                    for status_value, count in status_counts.items():
+                        status_enum = managed_jobs.ManagedJobStatus(
+                            status_value)
+                        if not status_enum.is_terminal():
+                            num_in_progress_jobs += count
+            else:
+                num_in_progress_jobs = total
+        else:
+            managed_jobs_ = result
+            num_in_progress_jobs = len(
+                set(job['job_id'] for job in managed_jobs_))
     except exceptions.ClusterNotUpError as e:
         controller_status = e.cluster_status
         msg = str(e)
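The new code path above resolves the jobs request and the optional pool-status request concurrently instead of serially, and a pool-status failure degrades to None rather than failing the whole query. A minimal standalone sketch of that stdlib pattern (fetch_jobs and fetch_pool_status are hypothetical stand-ins for the sdk.stream_and_get calls):

    import concurrent.futures
    import time

    def fetch_jobs():
        time.sleep(0.1)  # stands in for sdk.stream_and_get(request_id)
        return [{'job_id': 1}, {'job_id': 2}]

    def fetch_pool_status():
        try:
            time.sleep(0.1)  # stands in for the pool-status request
            return [{'pool': 'default'}]
        except Exception:  # pool status is best-effort; errors become None
            return None

    # Submit both calls, then block on both results; total latency is
    # max(t1, t2) instead of t1 + t2.
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        jobs_future = executor.submit(fetch_jobs)
        pool_future = executor.submit(fetch_pool_status)
        jobs = jobs_future.result()
        pool_status = pool_future.result()

    print(len(jobs), pool_status)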
@@ -1394,10 +1462,14 @@ def _handle_jobs_queue_request(
         msg += ('Failed to query managed jobs: '
                 f'{common_utils.format_exception(e, use_bracket=True)}')
     else:
-        msg = table_utils.format_job_table(managed_jobs_,
-                                           show_all=show_all,
-                                           show_user=show_user,
-                                           max_jobs=max_num_jobs_to_show)
+        msg = table_utils.format_job_table(
+            managed_jobs_,
+            pool_status=pool_status_result,
+            show_all=show_all,
+            show_user=show_user,
+            max_jobs=max_num_jobs_to_show,
+            status_counts=status_counts,
+        )
     return num_in_progress_jobs, msg
 
 
@@ -1786,9 +1858,16 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
 
     # Phase 2: Parallel submission of all API requests
     def submit_managed_jobs():
-        return managed_jobs.queue(refresh=False,
-                                  skip_finished=True,
-                                  all_users=all_users)
+        fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET
+        if all_users:
+            fields = fields + _USER_NAME_FIELD
+        return managed_jobs.queue(
+            refresh=False,
+            skip_finished=True,
+            all_users=all_users,
+            fields=fields,
+            limit=_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS,
+        )
 
     def submit_services(
     ) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
@@ -1861,7 +1940,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
     controllers = []
     for cluster_record in cluster_records:
         cluster_name = cluster_record['name']
-        controller = controller_utils.Controllers.from_name(cluster_name)
+        controller = controller_utils.Controllers.from_name(
+            cluster_name, expect_exact_match=False)
         if controller is not None:
             controllers.append(cluster_record)
         else:
@@ -1890,10 +1970,12 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
     try:
         num_in_progress_jobs, msg = _handle_jobs_queue_request(
             managed_jobs_queue_request_id,
+            pool_status_request_id=pool_status_request_id,
             show_all=False,
             show_user=all_users,
             max_num_jobs_to_show=_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS,
-            is_called_by_user=False)
+            is_called_by_user=False,
+            only_in_progress=True)
     except KeyboardInterrupt:
         sdk.api_cancel(managed_jobs_queue_request_id, silent=True)
         managed_jobs_query_interrupted = True
@@ -2027,7 +2109,8 @@ def cost_report(all: bool, days: int):  # pylint: disable=redefined-builtin
     for cluster_record in cluster_records:
         cluster_name = cluster_record['name']
         try:
-            controller = controller_utils.Controllers.from_name(cluster_name)
+            controller = controller_utils.Controllers.from_name(
+                cluster_name, expect_exact_match=False)
         except AssertionError:
             # There could be some old controller clusters from previous
             # versions that we should not show in the cost report.
@@ -2136,6 +2219,12 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
               is_flag=True,
               default=False,
               help='Stream the cluster provisioning logs (provision.log).')
+@click.option('--worker',
+              '-w',
+              default=None,
+              type=int,
+              help='The worker ID to stream the logs from. '
+              'If not set, stream the logs of the head node.')
 @click.option(
     '--sync-down',
     '-s',
@@ -2173,6 +2262,7 @@ def logs(
     cluster: str,
     job_ids: Tuple[str, ...],
     provision: bool,
+    worker: Optional[int],
     sync_down: bool,
     status: bool,  # pylint: disable=redefined-outer-name
     follow: bool,
@@ -2202,6 +2292,13 @@ def logs(
     4. If the job fails or fetching the logs fails, the command will exit with
        a non-zero return code.
     """
+    if worker is not None:
+        if not provision:
+            raise click.UsageError(
+                '--worker can only be used with --provision.')
+        if worker < 1:
+            raise click.UsageError('--worker must be a positive integer.')
+
     if provision and (sync_down or status or job_ids):
         raise click.UsageError(
             '--provision cannot be combined with job log options '
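Per the checks above, the new --worker flag is rejected unless --provision is also set, and worker IDs are 1-based, with the head node remaining the default when --worker is omitted. A plausible invocation (cluster name illustrative) is `sky logs my-cluster --provision --worker 1`.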
@@ -2221,7 +2318,11 @@ def logs(
 
     if provision:
         # Stream provision logs
-        sys.exit(sdk.tail_provision_logs(cluster, follow=follow, tail=tail))
+        sys.exit(
+            sdk.tail_provision_logs(cluster_name=cluster,
+                                    worker=worker,
+                                    follow=follow,
+                                    tail=tail))
 
     if sync_down:
         with rich_utils.client_status(
@@ -2399,7 +2500,8 @@ def cancel(
                                 job_ids=job_ids_to_cancel)
         _async_call_or_wait(request_id, async_call, 'sky.cancel')
     except exceptions.NotSupportedError as e:
-        controller = controller_utils.Controllers.from_name(cluster)
+        controller = controller_utils.Controllers.from_name(
+            cluster, expect_exact_match=False)
         assert controller is not None, cluster
         with ux_utils.print_exception_no_traceback():
             raise click.UsageError(
@@ -2700,7 +2802,8 @@ def start(
         # Get all clusters that are not controllers.
         cluster_records = [
             cluster for cluster in all_clusters
-            if controller_utils.Controllers.from_name(cluster['name']) is None
+            if controller_utils.Controllers.from_name(
+                cluster['name'], expect_exact_match=False) is None
         ]
     if cluster_records is None:
         # Get GLOB cluster names
@@ -2762,7 +2865,8 @@ def start(
     # Checks for controller clusters (jobs controller / sky serve controller).
     controllers, normal_clusters = [], []
     for name in to_start:
-        if controller_utils.Controllers.from_name(name) is not None:
+        if controller_utils.Controllers.from_name(
+                name, expect_exact_match=False) is not None:
             controllers.append(name)
         else:
             normal_clusters.append(name)
@@ -2898,16 +3002,26 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
            to be torn down (e.g., because it has jobs running or
            it is in init state)
     """
-    controller = controller_utils.Controllers.from_name(controller_name)
+    controller = controller_utils.Controllers.from_name(
+        controller_name, expect_exact_match=False)
     assert controller is not None, controller_name
 
+    status_counts: Optional[Dict[str, int]] = None
     with rich_utils.client_status(
             '[bold cyan]Checking for in-progress managed jobs and pools[/]'):
         try:
-            request_id = managed_jobs.queue(refresh=False,
-                                            skip_finished=True,
-                                            all_users=True)
-            managed_jobs_ = sdk.stream_and_get(request_id)
+            fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET + _USER_NAME_FIELD
+            request_id = managed_jobs.queue(
+                refresh=False,
+                skip_finished=True,
+                all_users=True,
+                fields=fields,
+            )
+            result = sdk.stream_and_get(request_id)
+            if isinstance(result, tuple):
+                managed_jobs_, _, status_counts, _ = result
+            else:
+                managed_jobs_ = result
             request_id_pools = managed_jobs.pool_status(pool_names=None)
             pools_ = sdk.stream_and_get(request_id_pools)
         except exceptions.ClusterNotUpError as e:
@@ -2938,10 +3052,17 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
     }}):
         # Check again with the consolidation mode disabled. This is to
         # make sure there is no in-progress managed jobs.
-        request_id = managed_jobs.queue(refresh=False,
-                                        skip_finished=True,
-                                        all_users=True)
-        managed_jobs_ = sdk.stream_and_get(request_id)
+        request_id = managed_jobs.queue(
+            refresh=False,
+            skip_finished=True,
+            all_users=True,
+            fields=fields,
+        )
+        result = sdk.stream_and_get(request_id)
+        if isinstance(result, tuple):
+            managed_jobs_, _, status_counts, _ = result
+        else:
+            managed_jobs_ = result
         request_id_pools = managed_jobs.pool_status(pool_names=None)
         pools_ = sdk.stream_and_get(request_id_pools)
 
@@ -2952,9 +3073,12 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
                'jobs (output of `sky jobs queue`) will be lost.')
         click.echo(msg)
         if managed_jobs_:
-            job_table = table_utils.format_job_table(managed_jobs_,
-                                                     show_all=False,
-                                                     show_user=True)
+            job_table = table_utils.format_job_table(
+                managed_jobs_,
+                show_all=False,
+                show_user=True,
+                status_counts=status_counts,
+            )
             msg = controller.value.decline_down_for_dirty_controller_hint
             # Add prefix to each line to align with the bullet point.
             msg += '\n'.join(
@@ -2997,7 +3121,8 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
            to be torn down (e.g., because it has services running or
            it is in init state)
     """
-    controller = controller_utils.Controllers.from_name(controller_name)
+    controller = controller_utils.Controllers.from_name(
+        controller_name, expect_exact_match=False)
     assert controller is not None, controller_name
     with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
         try:
@@ -3108,14 +3233,15 @@ def _down_or_stop_clusters(
     names = list(names)
     if names:
         controllers = [
-            name for name in names
-            if controller_utils.Controllers.from_name(name) is not None
+            name for name in names if controller_utils.Controllers.from_name(
+                name, expect_exact_match=False) is not None
         ]
         controllers_str = ', '.join(map(repr, controllers))
         names = [
             cluster['name']
             for cluster in _get_cluster_records_and_set_ssh_config(names)
-            if controller_utils.Controllers.from_name(cluster['name']) is None
+            if controller_utils.Controllers.from_name(
+                cluster['name'], expect_exact_match=False) is None
         ]
 
     # Make sure the controllers are explicitly specified without other
@@ -3140,7 +3266,7 @@ def _down_or_stop_clusters(
                     f'{controllers_str} is currently not supported.')
             else:
                 controller = controller_utils.Controllers.from_name(
-                    controller_name)
+                    controller_name, expect_exact_match=False)
                 assert controller is not None
                 hint_or_raise = _controller_to_hint_or_raise(controller)
                 try:
@@ -3188,9 +3314,10 @@ def _down_or_stop_clusters(
         names = [
             record['name']
             for record in all_clusters
-            if controller_utils.Controllers.from_name(record['name']) is None
-            and (down or idle_minutes_to_autostop is not None or
-                 record['status'] != status_lib.ClusterStatus.STOPPED)
+            if controller_utils.Controllers.from_name(
+                record['name'], expect_exact_match=False) is None and
+            (down or idle_minutes_to_autostop is not None or
+             record['status'] != status_lib.ClusterStatus.STOPPED)
         ]
 
     clusters = names
@@ -3220,6 +3347,9 @@ def _down_or_stop_clusters(
 
     request_ids = []
 
+    successes: List[str] = []
+    failures: List[Tuple[str, str]] = []
+
     def _down_or_stop(name: str):
         success_progress = False
         if idle_minutes_to_autostop is not None:
@@ -3227,16 +3357,20 @@ def _down_or_stop_clusters(
                 request_id = sdk.autostop(name, idle_minutes_to_autostop,
                                           wait_for, down)
                 request_ids.append(request_id)
+                progress.stop()
                 _async_call_or_wait(
                     request_id, async_call,
                     server_constants.REQUEST_NAME_PREFIX + operation)
-            except (exceptions.NotSupportedError,
-                    exceptions.ClusterNotUpError) as e:
+                progress.start()
+            except (exceptions.NotSupportedError, exceptions.ClusterNotUpError,
+                    exceptions.CloudError) as e:
                 message = str(e)
+                failures.append((name, str(e)))
             else:  # no exception raised
                 success_progress = True
                 message = (f'{colorama.Fore.GREEN}{operation} '
                            f'cluster {name!r}...done{colorama.Style.RESET_ALL}')
+                successes.append(name)
                 if idle_minutes_to_autostop >= 0:
                     option_str = 'down' if down else 'stop'
                     passive_str = 'downed' if down else 'stopped'
@@ -3256,9 +3390,11 @@ def _down_or_stop_clusters(
             else:
                 request_id = sdk.stop(name, purge=purge)
                 request_ids.append(request_id)
+                progress.stop()
                 _async_call_or_wait(
                     request_id, async_call,
                     server_constants.REQUEST_NAME_PREFIX + operation)
+                progress.start()
                 if not async_call:
                     # Remove the cluster from the SSH config file as soon as it
                     # is stopped or downed.
@@ -3268,13 +3404,17 @@ def _down_or_stop_clusters(
                     f'{colorama.Fore.RED}{operation} cluster {name}...failed. '
                     f'{colorama.Style.RESET_ALL}'
                     f'\nReason: {common_utils.format_exception(e)}.')
+                failures.append((name, str(e)))
             except (exceptions.NotSupportedError,
-                    exceptions.ClusterOwnerIdentityMismatchError) as e:
+                    exceptions.ClusterOwnerIdentityMismatchError,
+                    exceptions.CloudError) as e:
                 message = str(e)
+                failures.append((name, str(e)))
             else:  # no exception raised
                 message = (
                     f'{colorama.Fore.GREEN}{operation} cluster {name}...done.'
                     f'{colorama.Style.RESET_ALL}')
+                successes.append(name)
                 if not down:
                     message += ('\n  To restart the cluster, run: '
                                 f'{colorama.Style.BRIGHT}sky start {name}'
@@ -3288,6 +3428,10 @@ def _down_or_stop_clusters(
         progress.start()
 
     with progress:
+        # we write a new line here to avoid the "Waiting for 'sky.down'
+        # request to be scheduled" message from being printed on the same line
+        # as the "Terminating <num> clusters..." message
+        click.echo('')
         subprocess_utils.run_in_parallel(_down_or_stop, clusters)
         progress.live.transient = False
         # Make sure the progress bar not mess up the terminal.
@@ -3297,6 +3441,31 @@ def _down_or_stop_clusters(
         click.secho(f'{operation} requests are sent. Check the requests\' '
                     'status with `sky request get <request_id>`.')
 
+    show_summary = len(clusters) > 1
+
+    if show_summary:
+        click.echo('\nSummary:')
+        if successes:
+            # Preserve the original order of clusters as provided by user.
+            click.echo(' ✓ Succeeded: ' + ', '.join(successes))
+        if failures:
+            # Format failures: if one failure, keep on same line. If multiple,
+            # indent each failed cluster on its own line for readability.
+            if len(failures) == 1:
+                name, reason = failures[0]
+                first = reason.strip().splitlines()[0]
+                first = first if len(first) <= 120 else first[:120] + '…'
+                click.echo(f' ✗ Failed: {name} ({first})')
+            else:
+                click.echo(' ✗ Failed:')
+                for name, reason in failures:
+                    first = reason.strip().splitlines()[0]
+                    first = first if len(first) <= 120 else first[:120] + '…'
+                    click.echo(f'    {name} ({first})')
+
+    if failures:
+        click.echo('Cluster(s) failed. See details above.')
+
 
 @cli.command(cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
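Derived from the click.echo calls above, a multi-cluster teardown now ends with a summary along these lines (cluster names and the failure reason are illustrative; only the first line of each reason is shown, truncated to 120 characters):

    Summary:
     ✓ Succeeded: dev-1, dev-2
     ✗ Failed: dev-3 (NotSupportedError: ...)
    Cluster(s) failed. See details above.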
@@ -4096,6 +4265,10 @@ def volumes():
     pass
 
 
+# Add 'volume' as an alias for 'volumes'
+cli.add_command(volumes, name='volume')
+
+
 @volumes.command('apply', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @click.argument('entrypoint',
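cli.add_command(volumes, name='volume') registers the existing volumes click group under a second top-level name, so the singular and plural spellings dispatch to the same subcommands (e.g. `sky volume apply` and `sky volumes apply`). A self-contained sketch of this click aliasing pattern, with all names hypothetical:

    import click

    @click.group()
    def cli():
        pass

    @cli.group()
    def things():
        """Manage things."""

    @things.command('apply')
    def apply():
        click.echo('applied')

    # Attach the same Group object under an alias; `prog thing apply`
    # and `prog things apply` now invoke identical code.
    cli.add_command(things, name='thing')

    if __name__ == '__main__':
        cli()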
@@ -4492,21 +4665,6 @@ def jobs_launch(
 
     job_ids = [job_id_handle[0]] if isinstance(job_id_handle[0],
                                                int) else job_id_handle[0]
-    if pool:
-        # Display the worker assignment for the jobs.
-        logger.debug(f'Getting service records for pool: {pool}')
-        records_request_id = managed_jobs.pool_status(pool_names=pool)
-        service_records = _async_call_or_wait(records_request_id, async_call,
-                                              'sky.jobs.pool_status')
-        logger.debug(f'Pool status: {service_records}')
-        replica_infos = service_records[0]['replica_info']
-        for replica_info in replica_infos:
-            job_id = replica_info.get('used_by', None)
-            if job_id in job_ids:
-                worker_id = replica_info['replica_id']
-                version = replica_info['version']
-                logger.info(f'Job ID: {job_id} assigned to pool {pool} '
-                            f'(worker: {worker_id}, version: {version})')
 
     if not detach_run:
         if len(job_ids) == 1:
@@ -4519,7 +4677,8 @@ def jobs_launch(
     else:
         # TODO(tian): This can be very long. Considering have a "group id"
         # and query all job ids with the same group id.
-        job_ids_str = ','.join(map(str, job_ids))
+        # Sort job ids to ensure consistent ordering.
+        job_ids_str = ','.join(map(str, sorted(job_ids)))
         click.secho(
             f'Jobs submitted with IDs: {colorama.Fore.CYAN}'
             f'{job_ids_str}{colorama.Style.RESET_ALL}.'
@@ -4538,6 +4697,14 @@ def jobs_launch(
 @jobs.command('queue', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @flags.verbose_option()
+@click.option(
+    '--limit',
+    '-l',
+    default=_NUM_MANAGED_JOBS_TO_SHOW,
+    type=int,
+    required=False,
+    help=(f'Number of jobs to show, default is {_NUM_MANAGED_JOBS_TO_SHOW},'
+          f' use "-a/--all" to show all jobs.'))
 @click.option(
     '--refresh',
     '-r',
@@ -4557,7 +4724,7 @@ def jobs_launch(
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
 def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
-               all_users: bool, all: bool):
+               all_users: bool, all: bool, limit: int):
     """Show statuses of managed jobs.
 
     Each managed jobs can have one of the following statuses:
@@ -4608,14 +4775,48 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
 
       watch -n60 sky jobs queue
 
+    (Tip) To show only the latest 10 jobs, use ``-l/--limit 10``:
+
+    .. code-block:: bash
+
+      sky jobs queue -l 10
+
     """
     click.secho('Fetching managed job statuses...', fg='cyan')
     with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
-        managed_jobs_request_id = managed_jobs.queue(
-            refresh=refresh, skip_finished=skip_finished, all_users=all_users)
-        max_num_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW if not all else None)
+        max_num_jobs_to_show = (limit if not all else None)
+        fields = _DEFAULT_MANAGED_JOB_FIELDS_TO_GET
+        if verbose:
+            fields = _VERBOSE_MANAGED_JOB_FIELDS_TO_GET
+        if all_users:
+            fields = fields + _USER_NAME_FIELD
+            if verbose:
+                fields = fields + _USER_HASH_FIELD
+        # Call both managed_jobs.queue and managed_jobs.pool_status in parallel
+        def get_managed_jobs_queue():
+            return managed_jobs.queue(refresh=refresh,
+                                      skip_finished=skip_finished,
+                                      all_users=all_users,
+                                      limit=max_num_jobs_to_show,
+                                      fields=fields)
+
+        def get_pool_status():
+            try:
+                return managed_jobs.pool_status(pool_names=None)
+            except Exception:  # pylint: disable=broad-except
+                # If pool_status fails, we'll just skip the worker information
+                return None
+
+        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+            managed_jobs_future = executor.submit(get_managed_jobs_queue)
+            pool_status_future = executor.submit(get_pool_status)
+
+            managed_jobs_request_id = managed_jobs_future.result()
+            pool_status_request_id = pool_status_future.result()
+
         num_jobs, msg = _handle_jobs_queue_request(
             managed_jobs_request_id,
+            pool_status_request_id=pool_status_request_id,
             show_all=verbose,
             show_user=all_users,
             max_num_jobs_to_show=max_num_jobs_to_show,
@@ -4632,7 +4833,8 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
             f'{colorama.Fore.CYAN}'
             f'Only showing the latest {max_num_jobs_to_show} '
             f'managed jobs'
-            f'(use --all to show all managed jobs) {colorama.Style.RESET_ALL} ')
+            f'(use --limit to show more managed jobs or '
+            f'--all to show all managed jobs) {colorama.Style.RESET_ALL} ')
 
 
 @jobs.command('cancel', cls=_DocumentedCodeCommand)
@@ -5212,7 +5414,15 @@ def jobs_pool_logs(
 @flags.config_option(expose_value=False)
 @usage_lib.entrypoint
 def dashboard() -> None:
-    """Starts the dashboard for skypilot."""
+    """Opens the SkyPilot dashboard."""
+    sdk.dashboard()
+
+
+@cli.command(cls=_DocumentedCodeCommand, hidden=True)
+@flags.config_option(expose_value=False)
+@usage_lib.entrypoint
+def ui() -> None:
+    """Opens the SkyPilot dashboard."""
     sdk.dashboard()
 
 
@@ -6120,20 +6330,22 @@ def api_logs(request_id: Optional[str], server_logs: bool,
                 **_get_shell_complete_args(_complete_api_request))
 @flags.all_option('Cancel all your requests.')
 @flags.all_users_option('Cancel all requests from all users.')
+@flags.yes_option()
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
-def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
+def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool,
+               yes: bool):
     """Cancel a request running on SkyPilot API server."""
     if all or all_users:
-        keyword = 'ALL USERS\'' if all_users else 'YOUR'
-        user_input = click.prompt(
-            f'This will cancel all {keyword} requests.\n'
-            f'To proceed, please type {colorama.Style.BRIGHT}'
-            f'\'cancel all requests\'{colorama.Style.RESET_ALL}',
-            type=str)
-        if user_input != 'cancel all requests':
-            raise click.Abort()
-        if all:
+        if not yes:
+            keyword = 'ALL USERS\'' if all_users else 'YOUR'
+            user_input = click.prompt(
+                f'This will cancel all {keyword} requests.\n'
+                f'To proceed, please type {colorama.Style.BRIGHT}'
+                f'\'cancel all requests\'{colorama.Style.RESET_ALL}',
+                type=str)
+            if user_input != 'cancel all requests':
+                raise click.Abort()
         request_ids = None
     cancelled_request_ids = sdk.get(
         sdk.api_cancel(request_ids=request_ids, all_users=all_users))
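With flags.yes_option() added, `sky api cancel --all --yes` now skips the interactive 'cancel all requests' confirmation (assuming the shared yes flag follows the usual -y/--yes convention used elsewhere in this CLI); without it, the typed confirmation behaves exactly as before.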
@@ -6147,9 +6359,28 @@ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
                 fg='green')
 
 
+class IntOrNone(click.ParamType):
+    """Int or None"""
+    name = 'int-or-none'
+
+    def convert(self, value, param, ctx):
+        if isinstance(value, int):
+            return value
+        if isinstance(value, str) and value.lower() in ('none', 'all'):
+            return None
+        try:
+            return int(value)
+        except ValueError:
+            self.fail(f'{value!r} is not a valid integer or "none" or "all"',
+                      param, ctx)
+
+
+INT_OR_NONE = IntOrNone()
+
+
 @api.command('status', cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
-@click.argument('request_ids',
+@click.argument('request_id_prefixes',
                 required=False,
                 type=str,
                 nargs=-1,
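Recent click versions also pass declared defaults through convert, which is presumably why IntOrNone handles the isinstance(value, int) case first. A standalone sketch exercising the conversion rules with click's test runner (the demo command is hypothetical; the param-type body is copied from the hunk above):

    import click
    from click.testing import CliRunner

    class IntOrNone(click.ParamType):
        """Int or None"""
        name = 'int-or-none'

        def convert(self, value, param, ctx):
            if isinstance(value, int):
                return value
            if isinstance(value, str) and value.lower() in ('none', 'all'):
                return None
            try:
                return int(value)
            except ValueError:
                self.fail(
                    f'{value!r} is not a valid integer or "none" or "all"',
                    param, ctx)

    @click.command()
    @click.option('--limit', '-l', type=IntOrNone(), default=50)
    def demo(limit):
        click.echo(repr(limit))

    runner = CliRunner()
    assert runner.invoke(demo, ['-l', '10']).output.strip() == '10'
    assert runner.invoke(demo, ['-l', 'all']).output.strip() == 'None'
    assert runner.invoke(demo, []).output.strip() == '50'  # default passes through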
@@ -6159,16 +6390,30 @@ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
                 is_flag=True,
                 default=False,
                 required=False,
-                help='Show requests of all statuses.')
+                help=('Show requests of all statuses, including finished ones '
+                      '(SUCCEEDED, FAILED, CANCELLED). By default, only active '
+                      'requests (PENDING, RUNNING) are shown.'))
+@click.option(
+    '--limit',
+    '-l',
+    default=_NUM_REQUESTS_TO_SHOW,
+    type=INT_OR_NONE,
+    required=False,
+    help=(f'Number of requests to show, default is {_NUM_REQUESTS_TO_SHOW},'
+          f' set to "none" or "all" to show all requests.'))
 @flags.verbose_option('Show more details.')
 @usage_lib.entrypoint
 # pylint: disable=redefined-builtin
-def api_status(request_ids: Optional[List[str]], all_status: bool,
-               verbose: bool):
+def api_status(request_id_prefixes: Optional[List[str]], all_status: bool,
+               verbose: bool, limit: Optional[int]):
     """List requests on SkyPilot API server."""
-    if not request_ids:
-        request_ids = None
-    request_list = sdk.api_status(request_ids, all_status)
+    if not request_id_prefixes:
+        request_id_prefixes = None
+    fields = _DEFAULT_REQUEST_FIELDS_TO_SHOW
+    if verbose:
+        fields = _VERBOSE_REQUEST_FIELDS_TO_SHOW
+    request_list = sdk.api_status(request_id_prefixes, all_status, limit,
+                                  fields)
     columns = ['ID', 'User', 'Name']
     if verbose:
         columns.append('Cluster')
@@ -6194,8 +6439,12 @@ def api_status(request_ids: Optional[List[str]], all_status: bool,
         if verbose:
             dummy_row.append('-')
         table.add_row(dummy_row)
-    click.echo()
     click.echo(table)
+    if limit and len(request_list) >= limit:
+        click.echo()
+        click.echo(
+            f'Showing {limit} requests. Use "-l none" or "-l all" to show'
+            f' all requests.')
 
 
 @api.command('login', cls=_DocumentedCodeCommand)