skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (231) hide show
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -6,9 +6,8 @@ ManagedJobCodeGen.
6
6
  """
7
7
  import asyncio
8
8
  import collections
9
- import datetime
9
+ from datetime import datetime
10
10
  import enum
11
- import logging
12
11
  import os
13
12
  import pathlib
14
13
  import re
@@ -84,6 +83,7 @@ _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
84
83
 
85
84
  _JOB_STATUS_FETCH_MAX_RETRIES = 3
86
85
  _JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
86
+ _JOB_STATUS_FETCH_TIMEOUT_SECONDS = 30
87
87
 
88
88
  _JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
89
89
  'Waiting for task to start[/]'
@@ -101,6 +101,28 @@ _JOB_CANCELLED_MESSAGE = (
101
101
  # update the state.
102
102
  _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120
103
103
 
104
+ # After enabling consolidation mode, we need to restart the API server to get
105
+ # the jobs refresh daemon and correct number of executors. We use this file to
106
+ # indicate that the API server has been restarted after enabling consolidation
107
+ # mode.
108
+ _JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE = (
109
+ '~/.sky/.jobs_controller_consolidation_reloaded_signal')
110
+
111
+ # The response fields for managed jobs that require cluster handle
112
+ _CLUSTER_HANDLE_FIELDS = [
113
+ 'cluster_resources',
114
+ 'cluster_resources_full',
115
+ 'cloud',
116
+ 'region',
117
+ 'zone',
118
+ 'infra',
119
+ 'accelerators',
120
+ ]
121
+
122
+ # The response fields for managed jobs that are not stored in the database
123
+ # These fields will be mapped to the DB fields in the `_update_fields`.
124
+ _NON_DB_FIELDS = _CLUSTER_HANDLE_FIELDS + ['user_yaml', 'user_name', 'details']
125
+
104
126
 
105
127
  class ManagedJobQueueResultType(enum.Enum):
106
128
  """The type of the managed job queue result."""
@@ -117,9 +139,8 @@ class UserSignal(enum.Enum):
117
139
 
118
140
  # ====== internal functions ======
119
141
  def terminate_cluster(
120
- cluster_name: str,
121
- max_retry: int = 6,
122
- _logger: logging.Logger = logger, # pylint: disable=invalid-name
142
+ cluster_name: str,
143
+ max_retry: int = 6,
123
144
  ) -> None:
124
145
  """Terminate the cluster."""
125
146
  from sky import core # pylint: disable=import-outside-toplevel
@@ -143,18 +164,18 @@ def terminate_cluster(
143
164
  return
144
165
  except exceptions.ClusterDoesNotExist:
145
166
  # The cluster is already down.
146
- _logger.debug(f'The cluster {cluster_name} is already down.')
167
+ logger.debug(f'The cluster {cluster_name} is already down.')
147
168
  return
148
169
  except Exception as e: # pylint: disable=broad-except
149
170
  retry_cnt += 1
150
171
  if retry_cnt >= max_retry:
151
172
  raise RuntimeError(
152
173
  f'Failed to terminate the cluster {cluster_name}.') from e
153
- _logger.error(
174
+ logger.error(
154
175
  f'Failed to terminate the cluster {cluster_name}. Retrying.'
155
176
  f'Details: {common_utils.format_exception(e)}')
156
177
  with ux_utils.enable_traceback():
157
- _logger.error(f' Traceback: {traceback.format_exc()}')
178
+ logger.error(f' Traceback: {traceback.format_exc()}')
158
179
  time.sleep(backoff.current_backoff())
159
180
 
160
181
 
@@ -174,8 +195,8 @@ def _validate_consolidation_mode_config(
174
195
  'terminate the controller cluster first.'
175
196
  f'{colorama.Style.RESET_ALL}')
176
197
  else:
177
- all_jobs = managed_job_state.get_managed_jobs()
178
- if all_jobs:
198
+ total_jobs = managed_job_state.get_managed_jobs_total()
199
+ if total_jobs > 0:
179
200
  nonterminal_jobs = (
180
201
  managed_job_state.get_nonterminal_job_ids_by_name(
181
202
  None, None, all_users=True))
@@ -190,7 +211,7 @@ def _validate_consolidation_mode_config(
190
211
  else:
191
212
  logger.warning(
192
213
  f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
193
- f'but there are {len(all_jobs)} jobs from previous '
214
+ f'but there are {total_jobs} jobs from previous '
194
215
  'consolidation mode. Reset the `jobs.controller.'
195
216
  'consolidation_mode` to `true` and run `sky jobs queue` '
196
217
  'to see those jobs. Switching to normal mode will '
@@ -202,13 +223,39 @@ def _validate_consolidation_mode_config(
202
223
  # API Server. Under the hood, we submit the job monitoring logic as processes
203
224
  # directly in the API Server.
204
225
  # Use LRU Cache so that the check is only done once.
205
- @annotations.lru_cache(scope='request', maxsize=1)
206
- def is_consolidation_mode() -> bool:
226
+ @annotations.lru_cache(scope='request', maxsize=2)
227
+ def is_consolidation_mode(on_api_restart: bool = False) -> bool:
207
228
  if os.environ.get(constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
208
229
  return True
209
230
 
210
- consolidation_mode = skypilot_config.get_nested(
231
+ config_consolidation_mode = skypilot_config.get_nested(
211
232
  ('jobs', 'controller', 'consolidation_mode'), default_value=False)
233
+
234
+ signal_file = pathlib.Path(
235
+ _JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE).expanduser()
236
+
237
+ restart_signal_file_exists = signal_file.exists()
238
+ consolidation_mode = (config_consolidation_mode and
239
+ restart_signal_file_exists)
240
+
241
+ if on_api_restart:
242
+ if config_consolidation_mode:
243
+ signal_file.touch()
244
+ else:
245
+ if not restart_signal_file_exists:
246
+ if config_consolidation_mode:
247
+ logger.warning(f'{colorama.Fore.YELLOW}Consolidation mode for '
248
+ 'managed jobs is enabled in the server config, '
249
+ 'but the API server has not been restarted yet. '
250
+ 'Please restart the API server to enable it.'
251
+ f'{colorama.Style.RESET_ALL}')
252
+ return False
253
+ elif not config_consolidation_mode:
254
+ # Cleanup the signal file if the consolidation mode is disabled in
255
+ # the config. This allow the user to disable the consolidation mode
256
+ # without restarting the API server.
257
+ signal_file.unlink()
258
+
212
259
  # We should only do this check on API server, as the controller will not
213
260
  # have related config and will always seemingly disabled for consolidation
214
261
  # mode. Check #6611 for more details.
@@ -219,6 +266,12 @@ def is_consolidation_mode() -> bool:
219
266
 
220
267
  def ha_recovery_for_consolidation_mode():
221
268
  """Recovery logic for HA mode."""
269
+ # Touch the signal file here to avoid conflict with
270
+ # update_managed_jobs_statuses. Although we run this first and then start
271
+ # the deamon, this function is also called in cancel_jobs_by_id.
272
+ signal_file = pathlib.Path(
273
+ constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE).expanduser()
274
+ signal_file.touch()
222
275
  # No setup recovery is needed in consolidation mode, as the API server
223
276
  # already has all runtime installed. Directly start jobs recovery here.
224
277
  # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
@@ -229,7 +282,9 @@ def ha_recovery_for_consolidation_mode():
229
282
  encoding='utf-8') as f:
230
283
  start = time.time()
231
284
  f.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
232
- for job in managed_job_state.get_managed_jobs():
285
+ jobs, _ = managed_job_state.get_managed_jobs_with_filters(
286
+ fields=['job_id', 'controller_pid', 'schedule_state', 'status'])
287
+ for job in jobs:
233
288
  job_id = job['job_id']
234
289
  controller_pid = job['controller_pid']
235
290
 
@@ -265,12 +320,12 @@ def ha_recovery_for_consolidation_mode():
265
320
  f'{datetime.datetime.now()}\n')
266
321
  f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
267
322
  f.write(f'Total recovery time: {time.time() - start} seconds\n')
323
+ signal_file.unlink()
268
324
 
269
325
 
270
326
  async def get_job_status(
271
327
  backend: 'backends.CloudVmRayBackend', cluster_name: str,
272
- job_id: Optional[int],
273
- job_logger: logging.Logger) -> Optional['job_lib.JobStatus']:
328
+ job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
274
329
  """Check the status of the job running on a managed job cluster.
275
330
 
276
331
  It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
@@ -282,26 +337,28 @@ async def get_job_status(
282
337
  if handle is None:
283
338
  # This can happen if the cluster was preempted and background status
284
339
  # refresh already noticed and cleaned it up.
285
- job_logger.info(f'Cluster {cluster_name} not found.')
340
+ logger.info(f'Cluster {cluster_name} not found.')
286
341
  return None
287
342
  assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
288
343
  job_ids = None if job_id is None else [job_id]
289
344
  for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
290
345
  try:
291
- job_logger.info('=== Checking the job status... ===')
292
- statuses = await context_utils.to_thread(backend.get_job_status,
293
- handle,
294
- job_ids=job_ids,
295
- stream_logs=False)
346
+ logger.info('=== Checking the job status... ===')
347
+ statuses = await asyncio.wait_for(
348
+ context_utils.to_thread(backend.get_job_status,
349
+ handle,
350
+ job_ids=job_ids,
351
+ stream_logs=False),
352
+ timeout=_JOB_STATUS_FETCH_TIMEOUT_SECONDS)
296
353
  status = list(statuses.values())[0]
297
354
  if status is None:
298
- job_logger.info('No job found.')
355
+ logger.info('No job found.')
299
356
  else:
300
- job_logger.info(f'Job status: {status}')
301
- job_logger.info('=' * 34)
357
+ logger.info(f'Job status: {status}')
358
+ logger.info('=' * 34)
302
359
  return status
303
360
  except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
304
- ValueError, TypeError) as e:
361
+ ValueError, TypeError, asyncio.TimeoutError) as e:
305
362
  # Note: Each of these exceptions has some additional conditions to
306
363
  # limit how we handle it and whether or not we catch it.
307
364
  # Retry on k8s transient network errors. This is useful when using
@@ -322,6 +379,9 @@ async def get_job_status(
322
379
  is_transient_error = True
323
380
  elif isinstance(e, grpc.FutureTimeoutError):
324
381
  detailed_reason = 'Timeout'
382
+ elif isinstance(e, asyncio.TimeoutError):
383
+ detailed_reason = ('Job status check timed out after '
384
+ f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
325
385
  # TODO(cooperc): Gracefully handle these exceptions in the backend.
326
386
  elif isinstance(e, ValueError):
327
387
  # If the cluster yaml is deleted in the middle of getting the
@@ -405,7 +465,7 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
405
465
  """
406
466
  managed_job_state.remove_ha_recovery_script(job_id)
407
467
  error_msg = None
408
- tasks = managed_job_state.get_managed_jobs(job_id)
468
+ tasks = managed_job_state.get_managed_job_tasks(job_id)
409
469
  for task in tasks:
410
470
  pool = task.get('pool', None)
411
471
  if pool is None:
@@ -474,7 +534,7 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
474
534
 
475
535
  for job_id in job_ids:
476
536
  assert job_id is not None
477
- tasks = managed_job_state.get_managed_jobs(job_id)
537
+ tasks = managed_job_state.get_managed_job_tasks(job_id)
478
538
  # Note: controller_pid and schedule_state are in the job_info table
479
539
  # which is joined to the spot table, so all tasks with the same job_id
480
540
  # will have the same value for these columns. This is what lets us just
@@ -494,9 +554,9 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
494
554
  if schedule_state == managed_job_state.ManagedJobScheduleState.DONE:
495
555
  # There are two cases where we could get a job that is DONE.
496
556
  # 1. At query time (get_jobs_to_check_status), the job was not yet
497
- # DONE, but since then (before get_managed_jobs is called) it has
498
- # hit a terminal status, marked itself done, and exited. This is
499
- # fine.
557
+ # DONE, but since then (before get_managed_job_tasks is called)
558
+ # it has hit a terminal status, marked itself done, and exited.
559
+ # This is fine.
500
560
  # 2. The job is DONE, but in a non-terminal status. This is
501
561
  # unexpected. For instance, the task status is RUNNING, but the
502
562
  # job schedule_state is DONE.
@@ -850,6 +910,14 @@ def cancel_jobs_by_pool(pool_name: str,
850
910
  return cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
851
911
 
852
912
 
913
+ def controller_log_file_for_job(job_id: int,
914
+ create_if_not_exists: bool = False) -> str:
915
+ log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
916
+ if create_if_not_exists:
917
+ os.makedirs(log_dir, exist_ok=True)
918
+ return os.path.join(log_dir, f'{job_id}.log')
919
+
920
+
853
921
  def stream_logs_by_id(job_id: int,
854
922
  follow: bool = True,
855
923
  tail: Optional[int] = None) -> Tuple[str, int]:
@@ -882,13 +950,20 @@ def stream_logs_by_id(job_id: int,
882
950
  if managed_job_status.is_failed():
883
951
  job_msg = ('\nFailure reason: '
884
952
  f'{managed_job_state.get_failure_reason(job_id)}')
885
- log_file_exists = False
953
+ log_file_ever_existed = False
886
954
  task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
887
955
  job_id)
888
956
  num_tasks = len(task_info)
889
- for task_id, task_name, task_status, log_file in task_info:
957
+ for (task_id, task_name, task_status, log_file,
958
+ logs_cleaned_at) in task_info:
890
959
  if log_file:
891
- log_file_exists = True
960
+ log_file_ever_existed = True
961
+ if logs_cleaned_at is not None:
962
+ ts_str = datetime.fromtimestamp(
963
+ logs_cleaned_at).strftime('%Y-%m-%d %H:%M:%S')
964
+ print(f'Task {task_name}({task_id}) log has been '
965
+ f'cleaned at {ts_str}.')
966
+ continue
892
967
  task_str = (f'Task {task_name}({task_id})'
893
968
  if task_name else f'Task {task_id}')
894
969
  if num_tasks > 1:
@@ -923,7 +998,7 @@ def stream_logs_by_id(job_id: int,
923
998
  f'{task_str} finished '
924
999
  f'(status: {task_status.value}).'),
925
1000
  flush=True)
926
- if log_file_exists:
1001
+ if log_file_ever_existed:
927
1002
  # Add the "Job finished" message for terminal states
928
1003
  if managed_job_status.is_terminal():
929
1004
  print(ux_utils.finishing_message(
@@ -1151,7 +1226,8 @@ def stream_logs(job_id: Optional[int],
1151
1226
  if controller:
1152
1227
  if job_id is None:
1153
1228
  assert job_name is not None
1154
- managed_jobs = managed_job_state.get_managed_jobs()
1229
+ managed_jobs, _ = managed_job_state.get_managed_jobs_with_filters(
1230
+ name_match=job_name, fields=['job_id', 'job_name', 'status'])
1155
1231
  # We manually filter the jobs by name, instead of using
1156
1232
  # get_nonterminal_job_ids_by_name, as with `controller=True`, we
1157
1233
  # should be able to show the logs for jobs in terminal states.
@@ -1174,9 +1250,7 @@ def stream_logs(job_id: Optional[int],
1174
1250
  job_id = managed_job_ids.pop()
1175
1251
  assert job_id is not None, (job_id, job_name)
1176
1252
 
1177
- controller_log_path = os.path.join(
1178
- os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
1179
- f'{job_id}.log')
1253
+ controller_log_path = controller_log_file_for_job(job_id)
1180
1254
  job_status = None
1181
1255
 
1182
1256
  # Wait for the log file to be written
@@ -1277,11 +1351,87 @@ def dump_managed_job_queue(
1277
1351
  limit: Optional[int] = None,
1278
1352
  user_hashes: Optional[List[Optional[str]]] = None,
1279
1353
  statuses: Optional[List[str]] = None,
1354
+ fields: Optional[List[str]] = None,
1280
1355
  ) -> str:
1281
1356
  return message_utils.encode_payload(
1282
1357
  get_managed_job_queue(skip_finished, accessible_workspaces, job_ids,
1283
1358
  workspace_match, name_match, pool_match, page,
1284
- limit, user_hashes, statuses))
1359
+ limit, user_hashes, statuses, fields))
1360
+
1361
+
1362
+ def _update_fields(fields: List[str],) -> Tuple[List[str], bool]:
1363
+ """Update the fields list to include the necessary fields.
1364
+
1365
+ Args:
1366
+ fields: The fields to update.
1367
+
1368
+ It will:
1369
+ - Add the necessary dependent fields to the list.
1370
+ - Remove the fields that are not in the DB.
1371
+ - Determine if cluster handle is required.
1372
+
1373
+ Returns:
1374
+ A tuple containing the updated fields and a boolean indicating if
1375
+ cluster handle is required.
1376
+ """
1377
+ cluster_handle_required = True
1378
+ if _cluster_handle_not_required(fields):
1379
+ cluster_handle_required = False
1380
+ # Copy the list to avoid modifying the original list
1381
+ new_fields = fields.copy()
1382
+ # status and job_id are always included
1383
+ if 'status' not in new_fields:
1384
+ new_fields.append('status')
1385
+ if 'job_id' not in new_fields:
1386
+ new_fields.append('job_id')
1387
+ # user_hash is required if user_name is present
1388
+ if 'user_name' in new_fields and 'user_hash' not in new_fields:
1389
+ new_fields.append('user_hash')
1390
+ if 'job_duration' in new_fields:
1391
+ if 'last_recovered_at' not in new_fields:
1392
+ new_fields.append('last_recovered_at')
1393
+ if 'end_at' not in new_fields:
1394
+ new_fields.append('end_at')
1395
+ if 'job_name' in new_fields and 'task_name' not in new_fields:
1396
+ new_fields.append('task_name')
1397
+ if 'details' in new_fields:
1398
+ if 'schedule_state' not in new_fields:
1399
+ new_fields.append('schedule_state')
1400
+ if 'priority' not in new_fields:
1401
+ new_fields.append('priority')
1402
+ if 'failure_reason' not in new_fields:
1403
+ new_fields.append('failure_reason')
1404
+ if 'user_yaml' in new_fields:
1405
+ if 'original_user_yaml_path' not in new_fields:
1406
+ new_fields.append('original_user_yaml_path')
1407
+ if 'original_user_yaml_content' not in new_fields:
1408
+ new_fields.append('original_user_yaml_content')
1409
+ if cluster_handle_required:
1410
+ if 'task_name' not in new_fields:
1411
+ new_fields.append('task_name')
1412
+ if 'current_cluster_name' not in new_fields:
1413
+ new_fields.append('current_cluster_name')
1414
+ # Remove _NON_DB_FIELDS
1415
+ # These fields have been mapped to the DB fields in the above code, so we
1416
+ # don't need to include them in the updated fields.
1417
+ for field in _NON_DB_FIELDS:
1418
+ if field in new_fields:
1419
+ new_fields.remove(field)
1420
+ return new_fields, cluster_handle_required
1421
+
1422
+
1423
+ def _cluster_handle_not_required(fields: List[str]) -> bool:
1424
+ """Determine if cluster handle is not required.
1425
+
1426
+ Args:
1427
+ fields: The fields to check if they contain any of the cluster handle
1428
+ fields.
1429
+
1430
+ Returns:
1431
+ True if the fields do not contain any of the cluster handle fields,
1432
+ False otherwise.
1433
+ """
1434
+ return not any(field in fields for field in _CLUSTER_HANDLE_FIELDS)
1285
1435
 
1286
1436
 
1287
1437
  def get_managed_job_queue(
@@ -1295,146 +1445,153 @@ def get_managed_job_queue(
1295
1445
  limit: Optional[int] = None,
1296
1446
  user_hashes: Optional[List[Optional[str]]] = None,
1297
1447
  statuses: Optional[List[str]] = None,
1448
+ fields: Optional[List[str]] = None,
1298
1449
  ) -> Dict[str, Any]:
1299
- # Make sure to get all jobs - some logic below (e.g. high priority job
1300
- # detection) requires a full view of the jobs table.
1301
- jobs = managed_job_state.get_managed_jobs()
1450
+ """Get the managed job queue.
1302
1451
 
1303
- # Figure out what the highest priority blocking job is. We need to know in
1304
- # order to determine if other jobs are blocked by a higher priority job, or
1305
- # just by the limited controller resources.
1306
- highest_blocking_priority = constants.MIN_PRIORITY
1307
- for job in jobs:
1308
- if job['schedule_state'] not in (
1309
- # LAUNCHING and ALIVE_BACKOFF jobs will block other jobs with
1310
- # lower priority.
1311
- managed_job_state.ManagedJobScheduleState.LAUNCHING,
1312
- managed_job_state.ManagedJobScheduleState.ALIVE_BACKOFF,
1313
- # It's possible for a WAITING/ALIVE_WAITING job to be ready to
1314
- # launch, but the scheduler just hasn't run yet.
1315
- managed_job_state.ManagedJobScheduleState.WAITING,
1316
- managed_job_state.ManagedJobScheduleState.ALIVE_WAITING):
1317
- # This job will not block others.
1318
- continue
1319
-
1320
- priority = job.get('priority')
1321
- if priority is not None and priority > highest_blocking_priority:
1322
- highest_blocking_priority = priority
1452
+ Args:
1453
+ skip_finished: Whether to skip finished jobs.
1454
+ accessible_workspaces: The accessible workspaces.
1455
+ job_ids: The job ids.
1456
+ workspace_match: The workspace name to match.
1457
+ name_match: The job name to match.
1458
+ pool_match: The pool name to match.
1459
+ page: The page number.
1460
+ limit: The limit number.
1461
+ user_hashes: The user hashes.
1462
+ statuses: The statuses.
1463
+ fields: The fields to include in the response.
1323
1464
 
1324
- total_no_filter = len(jobs)
1465
+ Returns:
1466
+ A dictionary containing the managed job queue.
1467
+ """
1468
+ cluster_handle_required = True
1469
+ updated_fields = None
1470
+ # The caller only need to specify the fields in the
1471
+ # `class ManagedJobRecord` in `response.py`, and the `_update_fields`
1472
+ # function will add the necessary dependent fields to the list, for
1473
+ # example, if the caller specifies `['user_name']`, the `_update_fields`
1474
+ # function will add `['user_hash']` to the list.
1475
+ if fields:
1476
+ updated_fields, cluster_handle_required = _update_fields(fields)
1477
+
1478
+ total_no_filter = managed_job_state.get_managed_jobs_total()
1479
+
1480
+ status_counts = managed_job_state.get_status_count_with_filters(
1481
+ fields=fields,
1482
+ job_ids=job_ids,
1483
+ accessible_workspaces=accessible_workspaces,
1484
+ workspace_match=workspace_match,
1485
+ name_match=name_match,
1486
+ pool_match=pool_match,
1487
+ user_hashes=user_hashes,
1488
+ skip_finished=skip_finished,
1489
+ )
1490
+
1491
+ jobs, total = managed_job_state.get_managed_jobs_with_filters(
1492
+ fields=updated_fields,
1493
+ job_ids=job_ids,
1494
+ accessible_workspaces=accessible_workspaces,
1495
+ workspace_match=workspace_match,
1496
+ name_match=name_match,
1497
+ pool_match=pool_match,
1498
+ user_hashes=user_hashes,
1499
+ statuses=statuses,
1500
+ skip_finished=skip_finished,
1501
+ page=page,
1502
+ limit=limit,
1503
+ )
1504
+
1505
+ if cluster_handle_required:
1506
+ # Fetch the cluster name to handle map for managed clusters only.
1507
+ cluster_name_to_handle = (
1508
+ global_user_state.get_cluster_name_to_handle_map(is_managed=True))
1325
1509
 
1326
- if user_hashes:
1327
- jobs = [
1328
- job for job in jobs if job.get('user_hash', None) in user_hashes
1329
- ]
1330
- if accessible_workspaces:
1331
- jobs = [
1332
- job for job in jobs
1333
- if job.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
1334
- accessible_workspaces
1335
- ]
1336
- if skip_finished:
1337
- # Filter out the finished jobs. If a multi-task job is partially
1338
- # finished, we will include all its tasks.
1339
- non_finished_tasks = list(
1340
- filter(
1341
- lambda job: not managed_job_state.ManagedJobStatus(job[
1342
- 'status']).is_terminal(), jobs))
1343
- non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
1344
- jobs = list(
1345
- filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
1346
- if job_ids:
1347
- jobs = [job for job in jobs if job['job_id'] in job_ids]
1348
-
1349
- jobs, total, status_counts = filter_jobs(jobs,
1350
- workspace_match,
1351
- name_match,
1352
- pool_match,
1353
- page,
1354
- limit,
1355
- statuses=statuses)
1356
-
1357
- job_ids = set(job['job_id'] for job in jobs)
1358
- job_id_to_pool_info = (
1359
- managed_job_state.get_pool_and_submit_info_from_job_ids(job_ids))
1360
- cluster_names: Dict[int, str] = {}
1361
- for job in jobs:
1362
- # pool info is (pool, cluster_name, job_id_on_pool_cluster)
1363
- pool_info = job_id_to_pool_info.get(job['job_id'], None)
1364
- if pool_info and pool_info[0]:
1365
- cluster_name = pool_info[1]
1366
- else:
1367
- cluster_name = generate_managed_job_cluster_name(
1368
- job['task_name'], job['job_id'])
1369
- cluster_names[job['job_id']] = cluster_name
1370
- cluster_name_to_handles = global_user_state.get_handles_from_cluster_names(
1371
- set(cluster_names.values()))
1510
+ highest_blocking_priority = constants.MIN_PRIORITY
1511
+ if not fields or 'details' in fields:
1512
+ # Figure out what the highest priority blocking job is. We need to know
1513
+ # in order to determine if other jobs are blocked by a higher priority
1514
+ # job, or just by the limited controller resources.
1515
+ highest_blocking_priority = (
1516
+ managed_job_state.get_managed_jobs_highest_priority())
1372
1517
 
1373
1518
  for job in jobs:
1374
- end_at = job['end_at']
1375
- if end_at is None:
1376
- end_at = time.time()
1377
-
1378
- job_submitted_at = job['last_recovered_at'] - job['job_duration']
1379
- if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
1380
- # When job is recovering, the duration is exact job['job_duration']
1381
- job_duration = job['job_duration']
1382
- elif job_submitted_at > 0:
1383
- job_duration = end_at - job_submitted_at
1384
- else:
1385
- # When job_start_at <= 0, that means the last_recovered_at is not
1386
- # set yet, i.e. the job is not started.
1387
- job_duration = 0
1388
- job['job_duration'] = job_duration
1519
+ if not fields or 'job_duration' in fields:
1520
+ end_at = job['end_at']
1521
+ if end_at is None:
1522
+ end_at = time.time()
1523
+
1524
+ job_submitted_at = job['last_recovered_at'] - job['job_duration']
1525
+ if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
1526
+ # When job is recovering, the duration is exact
1527
+ # job['job_duration']
1528
+ job_duration = job['job_duration']
1529
+ elif job_submitted_at > 0:
1530
+ job_duration = end_at - job_submitted_at
1531
+ else:
1532
+ # When job_start_at <= 0, that means the last_recovered_at
1533
+ # is not set yet, i.e. the job is not started.
1534
+ job_duration = 0
1535
+ job['job_duration'] = job_duration
1389
1536
  job['status'] = job['status'].value
1390
- job['schedule_state'] = job['schedule_state'].value
1391
-
1392
- cluster_name = cluster_names[job['job_id']]
1393
- handle = cluster_name_to_handles.get(cluster_name, None)
1394
- if isinstance(handle, backends.CloudVmRayResourceHandle):
1395
- resources_str = resources_utils.get_readable_resources_repr(
1396
- handle, simplify=True)
1397
- resources_str_full = resources_utils.get_readable_resources_repr(
1398
- handle, simplify=False)
1399
- job['cluster_resources'] = resources_str
1400
- job['cluster_resources_full'] = resources_str_full
1401
- job['cloud'] = str(handle.launched_resources.cloud)
1402
- job['region'] = handle.launched_resources.region
1403
- job['zone'] = handle.launched_resources.zone
1404
- job['infra'] = infra_utils.InfraInfo(
1405
- str(handle.launched_resources.cloud),
1406
- handle.launched_resources.region,
1407
- handle.launched_resources.zone).formatted_str()
1408
- job['accelerators'] = handle.launched_resources.accelerators
1537
+ if not fields or 'schedule_state' in fields:
1538
+ job['schedule_state'] = job['schedule_state'].value
1409
1539
  else:
1410
- # FIXME(zongheng): display the last cached values for these.
1411
- job['cluster_resources'] = '-'
1412
- job['cluster_resources_full'] = '-'
1413
- job['cloud'] = '-'
1414
- job['region'] = '-'
1415
- job['zone'] = '-'
1416
- job['infra'] = '-'
1417
-
1418
- # Add details about schedule state / backoff.
1419
- state_details = None
1420
- if job['schedule_state'] == 'ALIVE_BACKOFF':
1421
- state_details = 'In backoff, waiting for resources'
1422
- elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
1423
- priority = job.get('priority')
1424
- if (priority is not None and priority < highest_blocking_priority):
1425
- # Job is lower priority than some other blocking job.
1426
- state_details = 'Waiting for higher priority jobs to launch'
1540
+ job['schedule_state'] = None
1541
+
1542
+ if cluster_handle_required:
1543
+ cluster_name = job.get('current_cluster_name', None)
1544
+ if cluster_name is None:
1545
+ cluster_name = generate_managed_job_cluster_name(
1546
+ job['task_name'], job['job_id'])
1547
+ handle = cluster_name_to_handle.get(
1548
+ cluster_name, None) if cluster_name is not None else None
1549
+ if isinstance(handle, backends.CloudVmRayResourceHandle):
1550
+ resources_str_simple, resources_str_full = (
1551
+ resources_utils.get_readable_resources_repr(
1552
+ handle, simplified_only=False))
1553
+ assert resources_str_full is not None
1554
+ job['cluster_resources'] = resources_str_simple
1555
+ job['cluster_resources_full'] = resources_str_full
1556
+ job['cloud'] = str(handle.launched_resources.cloud)
1557
+ job['region'] = handle.launched_resources.region
1558
+ job['zone'] = handle.launched_resources.zone
1559
+ job['infra'] = infra_utils.InfraInfo(
1560
+ str(handle.launched_resources.cloud),
1561
+ handle.launched_resources.region,
1562
+ handle.launched_resources.zone).formatted_str()
1563
+ job['accelerators'] = handle.launched_resources.accelerators
1427
1564
  else:
1428
- state_details = 'Waiting for other jobs to launch'
1429
-
1430
- if state_details and job['failure_reason']:
1431
- job['details'] = f'{state_details} - {job["failure_reason"]}'
1432
- elif state_details:
1433
- job['details'] = state_details
1434
- elif job['failure_reason']:
1435
- job['details'] = f'Failure: {job["failure_reason"]}'
1436
- else:
1437
- job['details'] = None
1565
+ # FIXME(zongheng): display the last cached values for these.
1566
+ job['cluster_resources'] = '-'
1567
+ job['cluster_resources_full'] = '-'
1568
+ job['cloud'] = '-'
1569
+ job['region'] = '-'
1570
+ job['zone'] = '-'
1571
+ job['infra'] = '-'
1572
+
1573
+ if not fields or 'details' in fields:
1574
+ # Add details about schedule state / backoff.
1575
+ state_details = None
1576
+ if job['schedule_state'] == 'ALIVE_BACKOFF':
1577
+ state_details = 'In backoff, waiting for resources'
1578
+ elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
1579
+ priority = job.get('priority')
1580
+ if (priority is not None and
1581
+ priority < highest_blocking_priority):
1582
+ # Job is lower priority than some other blocking job.
1583
+ state_details = 'Waiting for higher priority jobs to launch'
1584
+ else:
1585
+ state_details = 'Waiting for other jobs to launch'
1586
+
1587
+ if state_details and job['failure_reason']:
1588
+ job['details'] = f'{state_details} - {job["failure_reason"]}'
1589
+ elif state_details:
1590
+ job['details'] = state_details
1591
+ elif job['failure_reason']:
1592
+ job['details'] = f'Failure: {job["failure_reason"]}'
1593
+ else:
1594
+ job['details'] = None
1438
1595
 
1439
1596
  return {
1440
1597
  'jobs': jobs,
@@ -1545,21 +1702,14 @@ def load_managed_job_queue(
1545
1702
  total_no_filter = total
1546
1703
  result_type = ManagedJobQueueResultType.LIST
1547
1704
 
1548
- job_id_to_user_hash: Dict[int, str] = {}
1705
+ all_users = global_user_state.get_all_users()
1706
+ all_users_map = {user.id: user.name for user in all_users}
1549
1707
  for job in jobs:
1708
+ job['status'] = managed_job_state.ManagedJobStatus(job['status'])
1550
1709
  if 'user_hash' in job and job['user_hash'] is not None:
1551
1710
  # Skip jobs that do not have user_hash info.
1552
1711
  # TODO(cooperc): Remove check before 0.12.0.
1553
- job_id_to_user_hash[job['job_id']] = job['user_hash']
1554
- user_hash_to_user = global_user_state.get_users(
1555
- job_id_to_user_hash.values())
1556
-
1557
- for job in jobs:
1558
- job['status'] = managed_job_state.ManagedJobStatus(job['status'])
1559
- if job['job_id'] in job_id_to_user_hash:
1560
- user_hash = job_id_to_user_hash[job['job_id']]
1561
- user = user_hash_to_user.get(user_hash, None)
1562
- job['user_name'] = user.name if user is not None else None
1712
+ job['user_name'] = all_users_map.get(job['user_hash'])
1563
1713
  return jobs, total, result_type, total_no_filter, status_counts
1564
1714
 
1565
1715
 
@@ -1584,29 +1734,40 @@ def _get_job_status_from_tasks(
1584
1734
 
1585
1735
 
1586
1736
  @typing.overload
1587
- def format_job_table(tasks: List[Dict[str, Any]],
1588
- show_all: bool,
1589
- show_user: bool,
1590
- return_rows: Literal[False] = False,
1591
- max_jobs: Optional[int] = None) -> str:
1737
+ def format_job_table(
1738
+ tasks: List[Dict[str, Any]],
1739
+ show_all: bool,
1740
+ show_user: bool,
1741
+ return_rows: Literal[False] = False,
1742
+ pool_status: Optional[List[Dict[str, Any]]] = None,
1743
+ max_jobs: Optional[int] = None,
1744
+ job_status_counts: Optional[Dict[str, int]] = None,
1745
+ ) -> str:
1592
1746
  ...
1593
1747
 
1594
1748
 
1595
1749
  @typing.overload
1596
- def format_job_table(tasks: List[Dict[str, Any]],
1597
- show_all: bool,
1598
- show_user: bool,
1599
- return_rows: Literal[True],
1600
- max_jobs: Optional[int] = None) -> List[List[str]]:
1750
+ def format_job_table(
1751
+ tasks: List[Dict[str, Any]],
1752
+ show_all: bool,
1753
+ show_user: bool,
1754
+ return_rows: Literal[True],
1755
+ pool_status: Optional[List[Dict[str, Any]]] = None,
1756
+ max_jobs: Optional[int] = None,
1757
+ job_status_counts: Optional[Dict[str, int]] = None,
1758
+ ) -> List[List[str]]:
1601
1759
  ...
1602
1760
 
1603
1761
 
1604
1762
  def format_job_table(
1605
- tasks: List[Dict[str, Any]],
1606
- show_all: bool,
1607
- show_user: bool,
1608
- return_rows: bool = False,
1609
- max_jobs: Optional[int] = None) -> Union[str, List[List[str]]]:
1763
+ tasks: List[Dict[str, Any]],
1764
+ show_all: bool,
1765
+ show_user: bool,
1766
+ return_rows: bool = False,
1767
+ pool_status: Optional[List[Dict[str, Any]]] = None,
1768
+ max_jobs: Optional[int] = None,
1769
+ job_status_counts: Optional[Dict[str, int]] = None,
1770
+ ) -> Union[str, List[List[str]]]:
1610
1771
  """Returns managed jobs as a formatted string.
1611
1772
 
1612
1773
  Args:
@@ -1615,6 +1776,8 @@ def format_job_table(
1615
1776
  max_jobs: The maximum number of jobs to show in the table.
1616
1777
  return_rows: If True, return the rows as a list of strings instead of
1617
1778
  all rows concatenated into a single string.
1779
+ pool_status: List of pool status dictionaries with replica_info.
1780
+ job_status_counts: The counts of each job status.
1618
1781
 
1619
1782
  Returns: A formatted string of managed jobs, if not `return_rows`; otherwise
1620
1783
  a list of "rows" (each of which is a list of str).
@@ -1631,17 +1794,37 @@ def format_job_table(
1631
1794
  return (task['user'], task['job_id'])
1632
1795
  return task['job_id']
1633
1796
 
1797
+ def _get_job_id_to_worker_map(
1798
+ pool_status: Optional[List[Dict[str, Any]]]) -> Dict[int, int]:
1799
+ """Create a mapping from job_id to worker replica_id.
1800
+
1801
+ Args:
1802
+ pool_status: List of pool status dictionaries with replica_info.
1803
+
1804
+ Returns:
1805
+ Dictionary mapping job_id to replica_id (worker ID).
1806
+ """
1807
+ job_to_worker: Dict[int, int] = {}
1808
+ if pool_status is None:
1809
+ return job_to_worker
1810
+ for pool in pool_status:
1811
+ replica_info = pool.get('replica_info', [])
1812
+ for replica in replica_info:
1813
+ used_by = replica.get('used_by')
1814
+ if used_by is not None:
1815
+ job_to_worker[used_by] = replica.get('replica_id')
1816
+ return job_to_worker
1817
+
1818
+ # Create mapping from job_id to worker replica_id
1819
+ job_to_worker = _get_job_id_to_worker_map(pool_status)
1820
+
1634
1821
  for task in tasks:
1635
1822
  # The tasks within the same job_id are already sorted
1636
1823
  # by the task_id.
1637
1824
  jobs[get_hash(task)].append(task)
1638
1825
 
1639
- status_counts: Dict[str, int] = collections.defaultdict(int)
1640
1826
  workspaces = set()
1641
1827
  for job_tasks in jobs.values():
1642
- managed_job_status = _get_job_status_from_tasks(job_tasks)[0]
1643
- if not managed_job_status.is_terminal():
1644
- status_counts[managed_job_status.value] += 1
1645
1828
  workspaces.add(job_tasks[0].get('workspace',
1646
1829
  constants.SKYPILOT_DEFAULT_WORKSPACE))
1647
1830
 
@@ -1684,9 +1867,15 @@ def format_job_table(
1684
1867
  job_table = log_utils.create_table(columns)
1685
1868
 
1686
1869
  status_counts: Dict[str, int] = collections.defaultdict(int)
1687
- for task in tasks:
1688
- if not task['status'].is_terminal():
1689
- status_counts[task['status'].value] += 1
1870
+ if job_status_counts:
1871
+ for status_value, count in job_status_counts.items():
1872
+ status = managed_job_state.ManagedJobStatus(status_value)
1873
+ if not status.is_terminal():
1874
+ status_counts[status_value] = count
1875
+ else:
1876
+ for task in tasks:
1877
+ if not task['status'].is_terminal():
1878
+ status_counts[task['status'].value] += 1
1690
1879
 
1691
1880
  all_tasks = tasks
1692
1881
  if max_jobs is not None:
@@ -1772,7 +1961,12 @@ def format_job_table(
1772
1961
  if pool is None:
1773
1962
  pool = '-'
1774
1963
 
1964
+ # Add worker information if job is assigned to a worker
1775
1965
  job_id = job_hash[1] if tasks_have_k8s_user else job_hash
1966
+ # job_id is now always an integer, use it to look up worker
1967
+ if job_id in job_to_worker and pool != '-':
1968
+ pool = f'{pool} (worker={job_to_worker[job_id]})'
1969
+
1776
1970
  job_values = [
1777
1971
  job_id,
1778
1972
  '',
@@ -1815,6 +2009,12 @@ def format_job_table(
1815
2009
  pool = task.get('pool')
1816
2010
  if pool is None:
1817
2011
  pool = '-'
2012
+
2013
+ # Add worker information if task is assigned to a worker
2014
+ task_job_id = task['job_id']
2015
+ if task_job_id in job_to_worker and pool != '-':
2016
+ pool = f'{pool} (worker={job_to_worker[task_job_id]})'
2017
+
1818
2018
  values = [
1819
2019
  task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
1820
2020
  task['task_id'] if len(job_tasks) > 1 else '-',
@@ -1934,7 +2134,8 @@ def _job_proto_to_dict(
1934
2134
  # and Protobuf encodes int64 as decimal strings in JSON,
1935
2135
  # so we need to convert them back to ints.
1936
2136
  # https://protobuf.dev/programming-guides/json/#field-representation
1937
- if field.type == descriptor.FieldDescriptor.TYPE_INT64:
2137
+ if (field.type == descriptor.FieldDescriptor.TYPE_INT64 and
2138
+ job_dict.get(field.name) is not None):
1938
2139
  job_dict[field.name] = int(job_dict[field.name])
1939
2140
  job_dict['status'] = managed_job_state.ManagedJobStatus.from_protobuf(
1940
2141
  job_dict['status'])
@@ -1978,6 +2179,7 @@ class ManagedJobCodeGen:
1978
2179
  limit: Optional[int] = None,
1979
2180
  user_hashes: Optional[List[Optional[str]]] = None,
1980
2181
  statuses: Optional[List[str]] = None,
2182
+ fields: Optional[List[str]] = None,
1981
2183
  ) -> str:
1982
2184
  code = textwrap.dedent(f"""\
1983
2185
  if managed_job_version < 9:
@@ -1996,7 +2198,7 @@ class ManagedJobCodeGen:
1996
2198
  page={page!r},
1997
2199
  limit={limit!r},
1998
2200
  user_hashes={user_hashes!r})
1999
- else:
2201
+ elif managed_job_version < 12:
2000
2202
  job_table = utils.dump_managed_job_queue(
2001
2203
  skip_finished={skip_finished},
2002
2204
  accessible_workspaces={accessible_workspaces!r},
@@ -2008,6 +2210,19 @@ class ManagedJobCodeGen:
2008
2210
  limit={limit!r},
2009
2211
  user_hashes={user_hashes!r},
2010
2212
  statuses={statuses!r})
2213
+ else:
2214
+ job_table = utils.dump_managed_job_queue(
2215
+ skip_finished={skip_finished},
2216
+ accessible_workspaces={accessible_workspaces!r},
2217
+ job_ids={job_ids!r},
2218
+ workspace_match={workspace_match!r},
2219
+ name_match={name_match!r},
2220
+ pool_match={pool_match!r},
2221
+ page={page!r},
2222
+ limit={limit!r},
2223
+ user_hashes={user_hashes!r},
2224
+ statuses={statuses!r},
2225
+ fields={fields!r})
2011
2226
  print(job_table, flush=True)
2012
2227
  """)
2013
2228
  return cls._build(code)
@@ -2075,6 +2290,18 @@ class ManagedJobCodeGen:
2075
2290
  """)
2076
2291
  return cls._build(code)
2077
2292
 
2293
+ @classmethod
2294
+ def get_version(cls) -> str:
2295
+ """Generate code to get controller version."""
2296
+ code = textwrap.dedent("""\
2297
+ from sky.skylet import constants as controller_constants
2298
+
2299
+ # Get controller version
2300
+ controller_version = controller_constants.SKYLET_VERSION
2301
+ print(f"controller_version:{controller_version}", flush=True)
2302
+ """)
2303
+ return cls._build(code)
2304
+
2078
2305
  @classmethod
2079
2306
  def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
2080
2307
  code = textwrap.dedent(f"""\
@@ -2112,8 +2339,12 @@ class ManagedJobCodeGen:
2112
2339
  return cls._build(code)
2113
2340
 
2114
2341
  @classmethod
2115
- def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag',
2116
- workspace: str, entrypoint: str) -> str:
2342
+ def set_pending(cls,
2343
+ job_id: int,
2344
+ managed_job_dag: 'dag_lib.Dag',
2345
+ workspace: str,
2346
+ entrypoint: str,
2347
+ user_hash: Optional[str] = None) -> str:
2117
2348
  dag_name = managed_job_dag.name
2118
2349
  pool = managed_job_dag.pool
2119
2350
  # Add the managed job to queue table.
@@ -2130,6 +2361,8 @@ class ManagedJobCodeGen:
2130
2361
  pool_hash = serve_state.get_service_hash({pool!r})
2131
2362
  set_job_info_kwargs['pool'] = {pool!r}
2132
2363
  set_job_info_kwargs['pool_hash'] = pool_hash
2364
+ if managed_job_version >= 11:
2365
+ set_job_info_kwargs['user_hash'] = {user_hash!r}
2133
2366
  managed_job_state.set_job_info(
2134
2367
  {job_id}, {dag_name!r}, **set_job_info_kwargs)
2135
2368
  """)