skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/utils.py CHANGED
@@ -4,21 +4,23 @@ NOTE: whenever an API change is made in this file, we need to bump the
4
4
  jobs.constants.MANAGED_JOBS_VERSION and handle the API change in the
5
5
  ManagedJobCodeGen.
6
6
  """
7
+ import asyncio
7
8
  import collections
8
- import datetime
9
+ from datetime import datetime
9
10
  import enum
10
11
  import os
11
12
  import pathlib
13
+ import re
12
14
  import shlex
13
15
  import textwrap
14
16
  import time
15
17
  import traceback
16
18
  import typing
17
- from typing import Any, Deque, Dict, List, Optional, Set, TextIO, Tuple, Union
19
+ from typing import (Any, Deque, Dict, Iterable, List, Literal, Optional, Set,
20
+ TextIO, Tuple, Union)
18
21
 
19
22
  import colorama
20
23
  import filelock
21
- from typing_extensions import Literal
22
24
 
23
25
  from sky import backends
24
26
  from sky import exceptions
@@ -27,16 +29,18 @@ from sky import sky_logging
27
29
  from sky import skypilot_config
28
30
  from sky.adaptors import common as adaptors_common
29
31
  from sky.backends import backend_utils
32
+ from sky.backends import cloud_vm_ray_backend
30
33
  from sky.jobs import constants as managed_job_constants
31
34
  from sky.jobs import scheduler
32
35
  from sky.jobs import state as managed_job_state
36
+ from sky.schemas.api import responses
33
37
  from sky.skylet import constants
34
38
  from sky.skylet import job_lib
35
39
  from sky.skylet import log_lib
36
40
  from sky.usage import usage_lib
37
41
  from sky.utils import annotations
38
- from sky.utils import command_runner
39
42
  from sky.utils import common_utils
43
+ from sky.utils import context_utils
40
44
  from sky.utils import controller_utils
41
45
  from sky.utils import infra_utils
42
46
  from sky.utils import log_utils
@@ -47,18 +51,29 @@ from sky.utils import subprocess_utils
47
51
  from sky.utils import ux_utils
48
52
 
49
53
  if typing.TYPE_CHECKING:
54
+ from google.protobuf import descriptor
55
+ from google.protobuf import json_format
56
+ import grpc
50
57
  import psutil
51
58
 
52
59
  import sky
53
60
  from sky import dag as dag_lib
61
+ from sky.schemas.generated import jobsv1_pb2
62
+ from sky.schemas.generated import managed_jobsv1_pb2
54
63
  else:
64
+ json_format = adaptors_common.LazyImport('google.protobuf.json_format')
65
+ descriptor = adaptors_common.LazyImport('google.protobuf.descriptor')
55
66
  psutil = adaptors_common.LazyImport('psutil')
67
+ grpc = adaptors_common.LazyImport('grpc')
68
+ jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
69
+ managed_jobsv1_pb2 = adaptors_common.LazyImport(
70
+ 'sky.schemas.generated.managed_jobsv1_pb2')
56
71
 
57
72
  logger = sky_logging.init_logger(__name__)
58
73
 
59
- SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
60
74
  # Controller checks its job's status every this many seconds.
61
- JOB_STATUS_CHECK_GAP_SECONDS = 20
75
+ # This is a tradeoff between the latency and the resource usage.
76
+ JOB_STATUS_CHECK_GAP_SECONDS = 15
62
77
 
63
78
  # Controller checks if its job has started every this many seconds.
64
79
  JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
@@ -67,6 +82,7 @@ _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
67
82
 
68
83
  _JOB_STATUS_FETCH_MAX_RETRIES = 3
69
84
  _JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
85
+ _JOB_STATUS_FETCH_TIMEOUT_SECONDS = 30
70
86
 
71
87
  _JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
72
88
  'Waiting for task to start[/]'
@@ -82,7 +98,29 @@ _JOB_CANCELLED_MESSAGE = (
82
98
  # blocking for a long time. This should be significantly longer than the
83
99
  # JOB_STATUS_CHECK_GAP_SECONDS to avoid timing out before the controller can
84
100
  # update the state.
85
- _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 40
101
+ _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120
102
+
103
+ # After enabling consolidation mode, we need to restart the API server to get
104
+ # the jobs refresh daemon and correct number of executors. We use this file to
105
+ # indicate that the API server has been restarted after enabling consolidation
106
+ # mode.
107
+ _JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE = (
108
+ '~/.sky/.jobs_controller_consolidation_reloaded_signal')
109
+
110
+ # The response fields for managed jobs that require cluster handle
111
+ _CLUSTER_HANDLE_FIELDS = [
112
+ 'cluster_resources',
113
+ 'cluster_resources_full',
114
+ 'cloud',
115
+ 'region',
116
+ 'zone',
117
+ 'infra',
118
+ 'accelerators',
119
+ ]
120
+
121
+ # The response fields for managed jobs that are not stored in the database
122
+ # These fields will be mapped to the DB fields in the `_update_fields`.
123
+ _NON_DB_FIELDS = _CLUSTER_HANDLE_FIELDS + ['user_yaml', 'user_name', 'details']
86
124
 
87
125
 
88
126
  class ManagedJobQueueResultType(enum.Enum):
@@ -99,7 +137,10 @@ class UserSignal(enum.Enum):
99
137
 
100
138
 
101
139
  # ====== internal functions ======
102
- def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
140
+ def terminate_cluster(
141
+ cluster_name: str,
142
+ max_retry: int = 6,
143
+ ) -> None:
103
144
  """Terminate the cluster."""
104
145
  from sky import core # pylint: disable=import-outside-toplevel
105
146
  retry_cnt = 0
@@ -144,32 +185,28 @@ def _validate_consolidation_mode_config(
144
185
  if current_is_consolidation_mode:
145
186
  controller_cn = (
146
187
  controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
147
- if global_user_state.get_cluster_from_name(controller_cn) is not None:
148
- with ux_utils.print_exception_no_traceback():
149
- raise exceptions.InconsistentConsolidationModeError(
150
- f'{colorama.Fore.RED}Consolidation mode for jobs is '
151
- f'enabled, but the controller cluster '
152
- f'{controller_cn} is still running. Please '
153
- 'terminate the controller cluster first.'
154
- f'{colorama.Style.RESET_ALL}')
188
+ if global_user_state.cluster_with_name_exists(controller_cn):
189
+ logger.warning(
190
+ f'{colorama.Fore.RED}Consolidation mode for jobs is enabled, '
191
+ f'but the controller cluster {controller_cn} is still running. '
192
+ 'Please terminate the controller cluster first.'
193
+ f'{colorama.Style.RESET_ALL}')
155
194
  else:
156
- all_jobs = managed_job_state.get_managed_jobs()
157
- if all_jobs:
195
+ total_jobs = managed_job_state.get_managed_jobs_total()
196
+ if total_jobs > 0:
158
197
  nonterminal_jobs = (
159
198
  managed_job_state.get_nonterminal_job_ids_by_name(
160
- None, all_users=True))
199
+ None, None, all_users=True))
161
200
  if nonterminal_jobs:
162
- with ux_utils.print_exception_no_traceback():
163
- raise exceptions.InconsistentConsolidationModeError(
164
- f'{colorama.Fore.RED}Consolidation mode '
165
- 'is disabled, but there are still '
166
- f'{len(nonterminal_jobs)} managed jobs '
167
- 'running. Please terminate those jobs '
168
- f'first.{colorama.Style.RESET_ALL}')
201
+ logger.warning(
202
+ f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
203
+ f'but there are still {len(nonterminal_jobs)} managed jobs '
204
+ 'running. Please terminate those jobs first.'
205
+ f'{colorama.Style.RESET_ALL}')
169
206
  else:
170
207
  logger.warning(
171
208
  f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
172
- f'but there are {len(all_jobs)} jobs from previous '
209
+ f'but there are {total_jobs} jobs from previous '
173
210
  'consolidation mode. Reset the `jobs.controller.'
174
211
  'consolidation_mode` to `true` and run `sky jobs queue` '
175
212
  'to see those jobs. Switching to normal mode will '
@@ -181,75 +218,127 @@ def _validate_consolidation_mode_config(
181
218
  # API Server. Under the hood, we submit the job monitoring logic as processes
182
219
  # directly in the API Server.
183
220
  # Use LRU Cache so that the check is only done once.
184
- @annotations.lru_cache(scope='request', maxsize=1)
185
- def is_consolidation_mode() -> bool:
186
- consolidation_mode = skypilot_config.get_nested(
221
+ @annotations.lru_cache(scope='request', maxsize=2)
222
+ def is_consolidation_mode(on_api_restart: bool = False) -> bool:
223
+ if os.environ.get(constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
224
+ return True
225
+
226
+ config_consolidation_mode = skypilot_config.get_nested(
187
227
  ('jobs', 'controller', 'consolidation_mode'), default_value=False)
228
+
229
+ signal_file = pathlib.Path(
230
+ _JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE).expanduser()
231
+
232
+ if on_api_restart:
233
+ if config_consolidation_mode:
234
+ signal_file.touch()
235
+ else:
236
+ restart_signal_file_exists = signal_file.exists()
237
+ if not restart_signal_file_exists:
238
+ if config_consolidation_mode:
239
+ logger.warning(f'{colorama.Fore.YELLOW}Consolidation mode for '
240
+ 'managed jobs is enabled in the server config, '
241
+ 'but the API server has not been restarted yet. '
242
+ 'Please restart the API server to enable it.'
243
+ f'{colorama.Style.RESET_ALL}')
244
+ return False
245
+ elif not config_consolidation_mode:
246
+ # Cleanup the signal file if the consolidation mode is disabled in
247
+ # the config. This allow the user to disable the consolidation mode
248
+ # without restarting the API server.
249
+ signal_file.unlink()
250
+
188
251
  # We should only do this check on API server, as the controller will not
189
252
  # have related config and will always seemingly disabled for consolidation
190
253
  # mode. Check #6611 for more details.
191
254
  if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
192
- _validate_consolidation_mode_config(consolidation_mode)
193
- return consolidation_mode
255
+ _validate_consolidation_mode_config(config_consolidation_mode)
256
+ return config_consolidation_mode
194
257
 
195
258
 
196
- def ha_recovery_for_consolidation_mode():
197
- """Recovery logic for HA mode."""
259
+ def ha_recovery_for_consolidation_mode() -> None:
260
+ """Recovery logic for consolidation mode.
261
+
262
+ This should only be called from the managed-job-status-refresh-daemon, due
263
+ so that we have correct ordering recovery -> controller start -> job status
264
+ updates. This also should ensure correct operation during a rolling update.
265
+ """
198
266
  # No setup recovery is needed in consolidation mode, as the API server
199
267
  # already has all runtime installed. Directly start jobs recovery here.
200
268
  # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
201
- runner = command_runner.LocalProcessCommandRunner()
269
+ scheduler.maybe_start_controllers()
202
270
  with open(constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format('jobs_'),
203
- 'w',
271
+ 'a',
204
272
  encoding='utf-8') as f:
205
273
  start = time.time()
206
- f.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
207
- for job in managed_job_state.get_managed_jobs():
274
+ f.write(f'Starting HA recovery at {datetime.now()}\n')
275
+ jobs, _ = managed_job_state.get_managed_jobs_with_filters(fields=[
276
+ 'job_id', 'controller_pid', 'controller_pid_started_at',
277
+ 'schedule_state', 'status'
278
+ ])
279
+ for job in jobs:
208
280
  job_id = job['job_id']
209
281
  controller_pid = job['controller_pid']
282
+ controller_pid_started_at = job.get('controller_pid_started_at')
210
283
 
211
284
  # In consolidation mode, it is possible that only the API server
212
285
  # process is restarted, and the controller process is not. In such
213
286
  # case, we don't need to do anything and the controller process will
214
- # just keep running.
287
+ # just keep running. However, in most cases, the controller process
288
+ # will also be stopped - either by a pod restart in k8s API server,
289
+ # or by `sky api stop`, which will stop controllers.
290
+ # TODO(cooperc): Make sure we cannot have a controller process
291
+ # running across API server restarts for consistency.
215
292
  if controller_pid is not None:
216
293
  try:
217
- if _controller_process_alive(controller_pid, job_id):
218
- f.write(f'Controller pid {controller_pid} for '
219
- f'job {job_id} is still running. '
220
- 'Skipping recovery.\n')
294
+ # Note: We provide the legacy job id to the
295
+ # controller_process_alive just in case, but we shouldn't
296
+ # have a running legacy job controller process at this point
297
+ if controller_process_alive(
298
+ managed_job_state.ControllerPidRecord(
299
+ pid=controller_pid,
300
+ started_at=controller_pid_started_at), job_id):
301
+ message = (f'Controller pid {controller_pid} for '
302
+ f'job {job_id} is still running. '
303
+ 'Skipping recovery.\n')
304
+ logger.debug(message)
305
+ f.write(message)
221
306
  continue
222
307
  except Exception: # pylint: disable=broad-except
223
308
  # _controller_process_alive may raise if psutil fails; we
224
309
  # should not crash the recovery logic because of this.
225
- f.write('Error checking controller pid '
226
- f'{controller_pid} for job {job_id}\n')
310
+ message = ('Error checking controller pid '
311
+ f'{controller_pid} for job {job_id}\n')
312
+ logger.warning(message, exc_info=True)
313
+ f.write(message)
227
314
 
315
+ # Controller process is not set or not alive.
228
316
  if job['schedule_state'] not in [
229
317
  managed_job_state.ManagedJobScheduleState.DONE,
230
- managed_job_state.ManagedJobScheduleState.WAITING
318
+ managed_job_state.ManagedJobScheduleState.WAITING,
319
+ # INACTIVE job may be mid-submission, don't set to WAITING.
320
+ managed_job_state.ManagedJobScheduleState.INACTIVE,
231
321
  ]:
232
- script = managed_job_state.get_ha_recovery_script(job_id)
233
- if script is None:
234
- f.write(f'Job {job_id}\'s recovery script does not exist. '
235
- 'Skipping recovery. Job schedule state: '
236
- f'{job["schedule_state"]}\n')
237
- continue
238
- runner.run(script)
239
- f.write(f'Job {job_id} completed recovery at '
240
- f'{datetime.datetime.now()}\n')
241
- f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
322
+ managed_job_state.reset_job_for_recovery(job_id)
323
+ message = (f'Job {job_id} completed recovery at '
324
+ f'{datetime.now()}\n')
325
+ logger.info(message)
326
+ f.write(message)
327
+ f.write(f'HA recovery completed at {datetime.now()}\n')
242
328
  f.write(f'Total recovery time: {time.time() - start} seconds\n')
243
329
 
244
330
 
245
- def get_job_status(backend: 'backends.CloudVmRayBackend', cluster_name: str,
246
- job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
331
+ async def get_job_status(
332
+ backend: 'backends.CloudVmRayBackend', cluster_name: str,
333
+ job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
247
334
  """Check the status of the job running on a managed job cluster.
248
335
 
249
336
  It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
250
337
  FAILED_SETUP or CANCELLED.
251
338
  """
252
- handle = global_user_state.get_handle_from_cluster_name(cluster_name)
339
+ # TODO(luca) make this async
340
+ handle = await context_utils.to_thread(
341
+ global_user_state.get_handle_from_cluster_name, cluster_name)
253
342
  if handle is None:
254
343
  # This can happen if the cluster was preempted and background status
255
344
  # refresh already noticed and cleaned it up.
@@ -260,9 +349,12 @@ def get_job_status(backend: 'backends.CloudVmRayBackend', cluster_name: str,
260
349
  for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
261
350
  try:
262
351
  logger.info('=== Checking the job status... ===')
263
- statuses = backend.get_job_status(handle,
264
- job_ids=job_ids,
265
- stream_logs=False)
352
+ statuses = await asyncio.wait_for(
353
+ context_utils.to_thread(backend.get_job_status,
354
+ handle,
355
+ job_ids=job_ids,
356
+ stream_logs=False),
357
+ timeout=_JOB_STATUS_FETCH_TIMEOUT_SECONDS)
266
358
  status = list(statuses.values())[0]
267
359
  if status is None:
268
360
  logger.info('No job found.')
@@ -270,29 +362,129 @@ def get_job_status(backend: 'backends.CloudVmRayBackend', cluster_name: str,
270
362
  logger.info(f'Job status: {status}')
271
363
  logger.info('=' * 34)
272
364
  return status
273
- except exceptions.CommandError as e:
365
+ except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
366
+ ValueError, TypeError, asyncio.TimeoutError) as e:
367
+ # Note: Each of these exceptions has some additional conditions to
368
+ # limit how we handle it and whether or not we catch it.
274
369
  # Retry on k8s transient network errors. This is useful when using
275
370
  # coreweave which may have transient network issue sometimes.
276
- if (e.detailed_reason is not None and
277
- _JOB_K8S_TRANSIENT_NW_MSG in e.detailed_reason):
371
+ is_transient_error = False
372
+ detailed_reason = None
373
+ if isinstance(e, exceptions.CommandError):
374
+ detailed_reason = e.detailed_reason
375
+ if (detailed_reason is not None and
376
+ _JOB_K8S_TRANSIENT_NW_MSG in detailed_reason):
377
+ is_transient_error = True
378
+ elif isinstance(e, grpc.RpcError):
379
+ detailed_reason = e.details()
380
+ if e.code() in [
381
+ grpc.StatusCode.UNAVAILABLE,
382
+ grpc.StatusCode.DEADLINE_EXCEEDED
383
+ ]:
384
+ is_transient_error = True
385
+ elif isinstance(e, grpc.FutureTimeoutError):
386
+ detailed_reason = 'Timeout'
387
+ elif isinstance(e, asyncio.TimeoutError):
388
+ detailed_reason = ('Job status check timed out after '
389
+ f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
390
+ # TODO(cooperc): Gracefully handle these exceptions in the backend.
391
+ elif isinstance(e, ValueError):
392
+ # If the cluster yaml is deleted in the middle of getting the
393
+ # SSH credentials, we could see this. See
394
+ # sky/global_user_state.py get_cluster_yaml_dict.
395
+ if re.search(r'Cluster yaml .* not found', str(e)):
396
+ detailed_reason = 'Cluster yaml was deleted'
397
+ else:
398
+ raise
399
+ elif isinstance(e, TypeError):
400
+ # We will grab the SSH credentials from the cluster yaml, but if
401
+ # handle.cluster_yaml is None, we will just return an empty dict
402
+ # for the credentials. See
403
+ # backend_utils.ssh_credential_from_yaml. Then, the credentials
404
+ # are passed as kwargs to SSHCommandRunner.__init__ - see
405
+ # cloud_vm_ray_backend.get_command_runners. So we can hit this
406
+ # TypeError if the cluster yaml is removed from the handle right
407
+ # when we pull it before the cluster is fully deleted.
408
+ error_msg_to_check = (
409
+ 'SSHCommandRunner.__init__() missing 2 required positional '
410
+ 'arguments: \'ssh_user\' and \'ssh_private_key\'')
411
+ if str(e) == error_msg_to_check:
412
+ detailed_reason = 'SSH credentials were already cleaned up'
413
+ else:
414
+ raise
415
+ if is_transient_error:
278
416
  logger.info('Failed to connect to the cluster. Retrying '
279
417
  f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
280
418
  logger.info('=' * 34)
281
- time.sleep(1)
419
+ await asyncio.sleep(1)
282
420
  else:
283
- logger.info(f'Failed to get job status: {e.detailed_reason}')
421
+ logger.info(f'Failed to get job status: {detailed_reason}')
284
422
  logger.info('=' * 34)
285
423
  return None
286
424
  return None
287
425
 
288
426
 
289
- def _controller_process_alive(pid: int, job_id: int) -> bool:
290
- """Check if the controller process is alive."""
427
+ def controller_process_alive(record: managed_job_state.ControllerPidRecord,
428
+ legacy_job_id: Optional[int] = None,
429
+ quiet: bool = True) -> bool:
430
+ """Check if the controller process is alive.
431
+
432
+ If legacy_job_id is provided, this will also return True for a legacy
433
+ single-job controller process with that job id, based on the cmdline. This
434
+ is how the old check worked before #7051.
435
+ """
291
436
  try:
292
- process = psutil.Process(pid)
293
- cmd_str = ' '.join(process.cmdline())
294
- return process.is_running() and f'--job-id {job_id}' in cmd_str
295
- except psutil.NoSuchProcess:
437
+ process = psutil.Process(record.pid)
438
+
439
+ if record.started_at is not None:
440
+ if process.create_time() != record.started_at:
441
+ if not quiet:
442
+ logger.debug(f'Controller process {record.pid} has started '
443
+ f'at {record.started_at} but process has '
444
+ f'started at {process.create_time()}')
445
+ return False
446
+ else:
447
+ # If we can't check the create_time try to check the cmdline instead
448
+ cmd_str = ' '.join(process.cmdline())
449
+ # pylint: disable=line-too-long
450
+ # Pre-#7051 cmdline: /path/to/python -u -m sky.jobs.controller <dag.yaml_path> --job-id <job_id>
451
+ # Post-#7051 cmdline: /path/to/python -u -msky.jobs.controller
452
+ # pylint: enable=line-too-long
453
+ if ('-m sky.jobs.controller' not in cmd_str and
454
+ '-msky.jobs.controller' not in cmd_str):
455
+ if not quiet:
456
+ logger.debug(f'Process {record.pid} is not a controller '
457
+ 'process - missing "-m sky.jobs.controller" '
458
+ f'from cmdline: {cmd_str}')
459
+ return False
460
+ if (legacy_job_id is not None and '--job-id' in cmd_str and
461
+ f'--job-id {legacy_job_id}' not in cmd_str):
462
+ if not quiet:
463
+ logger.debug(f'Controller process {record.pid} has the '
464
+ f'wrong --job-id (expected {legacy_job_id}) '
465
+ f'in cmdline: {cmd_str}')
466
+ return False
467
+
468
+ # On linux, psutil.Process(pid) will return a valid process object
469
+ # even if the pid is actually a thread ID within the process. This
470
+ # hugely inflates the number of valid-looking pids, increasing the
471
+ # chance that we will falsely believe a controller is alive. The pid
472
+ # file should never contain thread IDs, just process IDs. We can
473
+ # check this with psutil.pid_exists(pid), which is false for TIDs.
474
+ # See pid_exists in psutil/_pslinux.py
475
+ if not psutil.pid_exists(record.pid):
476
+ if not quiet:
477
+ logger.debug(
478
+ f'Controller process {record.pid} is not a valid '
479
+ 'process id.')
480
+ return False
481
+
482
+ return process.is_running()
483
+
484
+ except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess,
485
+ OSError) as e:
486
+ if not quiet:
487
+ logger.debug(f'Controller process {record.pid} is not running: {e}')
296
488
  return False
297
489
 
298
490
 
@@ -326,9 +518,8 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
326
518
  This function should not throw any exception. If it fails, it will
327
519
  capture the error message, and log/return it.
328
520
  """
329
- managed_job_state.remove_ha_recovery_script(job_id)
330
521
  error_msg = None
331
- tasks = managed_job_state.get_managed_jobs(job_id)
522
+ tasks = managed_job_state.get_managed_job_tasks(job_id)
332
523
  for task in tasks:
333
524
  pool = task.get('pool', None)
334
525
  if pool is None:
@@ -351,43 +542,6 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
351
542
  logger.exception(error_msg, exc_info=e)
352
543
  return error_msg
353
544
 
354
- # For backwards compatible jobs
355
- # TODO(cooperc): Remove before 0.11.0.
356
- def _handle_legacy_job(job_id: int):
357
- controller_status = job_lib.get_status(job_id)
358
- if controller_status is None or controller_status.is_terminal():
359
- logger.error(f'Controller process for legacy job {job_id} is '
360
- 'in an unexpected state.')
361
-
362
- cleanup_error = _cleanup_job_clusters(job_id)
363
- if cleanup_error:
364
- # Unconditionally set the job to failed_controller if the
365
- # cleanup fails.
366
- managed_job_state.set_failed(
367
- job_id,
368
- task_id=None,
369
- failure_type=managed_job_state.ManagedJobStatus.
370
- FAILED_CONTROLLER,
371
- failure_reason=
372
- 'Legacy controller process has exited abnormally, and '
373
- f'cleanup failed: {cleanup_error}. For more details, run: '
374
- f'sky jobs logs --controller {job_id}',
375
- override_terminal=True)
376
- return
377
-
378
- # It's possible for the job to have transitioned to
379
- # another terminal state while between when we checked its
380
- # state and now. In that case, set_failed won't do
381
- # anything, which is fine.
382
- managed_job_state.set_failed(
383
- job_id,
384
- task_id=None,
385
- failure_type=managed_job_state.ManagedJobStatus.
386
- FAILED_CONTROLLER,
387
- failure_reason=(
388
- 'Legacy controller process has exited abnormally. For '
389
- f'more details, run: sky jobs logs --controller {job_id}'))
390
-
391
545
  # Get jobs that need checking (non-terminal or not DONE)
392
546
  job_ids = managed_job_state.get_jobs_to_check_status(job_id)
393
547
  if not job_ids:
@@ -397,29 +551,22 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
397
551
 
398
552
  for job_id in job_ids:
399
553
  assert job_id is not None
400
- tasks = managed_job_state.get_managed_jobs(job_id)
554
+ tasks = managed_job_state.get_managed_job_tasks(job_id)
401
555
  # Note: controller_pid and schedule_state are in the job_info table
402
556
  # which is joined to the spot table, so all tasks with the same job_id
403
557
  # will have the same value for these columns. This is what lets us just
404
558
  # take tasks[0]['controller_pid'] and tasks[0]['schedule_state'].
405
559
  schedule_state = tasks[0]['schedule_state']
406
560
 
407
- # Backwards compatibility: this job was submitted when ray was still
408
- # used for managing the parallelism of job controllers, before #4485.
409
- # TODO(cooperc): Remove before 0.11.0.
410
- if (schedule_state is
411
- managed_job_state.ManagedJobScheduleState.INVALID):
412
- _handle_legacy_job(job_id)
413
- continue
414
-
415
561
  # Handle jobs with schedule state (non-legacy jobs):
416
562
  pid = tasks[0]['controller_pid']
563
+ pid_started_at = tasks[0].get('controller_pid_started_at')
417
564
  if schedule_state == managed_job_state.ManagedJobScheduleState.DONE:
418
565
  # There are two cases where we could get a job that is DONE.
419
566
  # 1. At query time (get_jobs_to_check_status), the job was not yet
420
- # DONE, but since then (before get_managed_jobs is called) it has
421
- # hit a terminal status, marked itself done, and exited. This is
422
- # fine.
567
+ # DONE, but since then (before get_managed_job_tasks is called)
568
+ # it has hit a terminal status, marked itself done, and exited.
569
+ # This is fine.
423
570
  # 2. The job is DONE, but in a non-terminal status. This is
424
571
  # unexpected. For instance, the task status is RUNNING, but the
425
572
  # job schedule_state is DONE.
@@ -466,7 +613,9 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
466
613
  failure_reason = f'No controller pid set for {schedule_state.value}'
467
614
  else:
468
615
  logger.debug(f'Checking controller pid {pid}')
469
- if _controller_process_alive(pid, job_id):
616
+ if controller_process_alive(
617
+ managed_job_state.ControllerPidRecord(
618
+ pid=pid, started_at=pid_started_at), job_id):
470
619
  # The controller is still running, so this job is fine.
471
620
  continue
472
621
 
@@ -526,9 +675,32 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
526
675
  def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
527
676
  job_id: Optional[int], get_end_time: bool) -> float:
528
677
  """Get the submitted/ended time of the job."""
529
- code = job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
530
- job_id=job_id, get_ended_time=get_end_time)
531
678
  handle = global_user_state.get_handle_from_cluster_name(cluster_name)
679
+ assert handle is not None, (
680
+ f'handle for cluster {cluster_name!r} should not be None')
681
+ if handle.is_grpc_enabled_with_flag:
682
+ try:
683
+ if get_end_time:
684
+ end_ts_request = jobsv1_pb2.GetJobEndedTimestampRequest(
685
+ job_id=job_id)
686
+ end_ts_response = backend_utils.invoke_skylet_with_retries(
687
+ lambda: cloud_vm_ray_backend.SkyletClient(
688
+ handle.get_grpc_channel()).get_job_ended_timestamp(
689
+ end_ts_request))
690
+ return end_ts_response.timestamp
691
+ else:
692
+ submit_ts_request = jobsv1_pb2.GetJobSubmittedTimestampRequest(
693
+ job_id=job_id)
694
+ submit_ts_response = backend_utils.invoke_skylet_with_retries(
695
+ lambda: cloud_vm_ray_backend.SkyletClient(
696
+ handle.get_grpc_channel()).get_job_submitted_timestamp(
697
+ submit_ts_request))
698
+ return submit_ts_response.timestamp
699
+ except exceptions.SkyletMethodNotImplementedError:
700
+ pass
701
+
702
+ code = (job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
703
+ job_id=job_id, get_ended_time=get_end_time))
532
704
  returncode, stdout, stderr = backend.run_on_head(handle,
533
705
  code,
534
706
  stream_logs=False,
@@ -552,8 +724,13 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
552
724
  cluster_name,
553
725
  job_id=job_id,
554
726
  get_end_time=True)
555
- except exceptions.CommandError as e:
556
- if e.returncode == 255:
727
+ except (exceptions.CommandError, grpc.RpcError,
728
+ grpc.FutureTimeoutError) as e:
729
+ if isinstance(e, exceptions.CommandError) and e.returncode == 255 or \
730
+ (isinstance(e, grpc.RpcError) and e.code() in [
731
+ grpc.StatusCode.UNAVAILABLE,
732
+ grpc.StatusCode.DEADLINE_EXCEEDED,
733
+ ]) or isinstance(e, grpc.FutureTimeoutError):
557
734
  # Failed to connect - probably the instance was preempted since the
558
735
  # job completed. We shouldn't crash here, so just log and use the
559
736
  # current time.
@@ -565,7 +742,9 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
565
742
  raise
566
743
 
567
744
 
568
- def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
745
+ def event_callback_func(
746
+ job_id: int, task_id: Optional[int],
747
+ task: Optional['sky.Task']) -> managed_job_state.AsyncCallbackType:
569
748
  """Run event callback for the task."""
570
749
 
571
750
  def callback_func(status: str):
@@ -604,7 +783,10 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
604
783
  f'Bash:{event_callback},log_path:{log_path},result:{result}')
605
784
  logger.info(f'=== END: event callback for {status!r} ===')
606
785
 
607
- return callback_func
786
+ async def async_callback_func(status: str):
787
+ return await context_utils.to_thread(callback_func, status)
788
+
789
+ return async_callback_func
608
790
 
609
791
 
610
792
  # ======== user functions ========
@@ -624,14 +806,15 @@ def generate_managed_job_cluster_name(task_name: str, job_id: int) -> str:
624
806
 
625
807
  def cancel_jobs_by_id(job_ids: Optional[List[int]],
626
808
  all_users: bool = False,
627
- current_workspace: Optional[str] = None) -> str:
809
+ current_workspace: Optional[str] = None,
810
+ user_hash: Optional[str] = None) -> str:
628
811
  """Cancel jobs by id.
629
812
 
630
813
  If job_ids is None, cancel all jobs.
631
814
  """
632
815
  if job_ids is None:
633
816
  job_ids = managed_job_state.get_nonterminal_job_ids_by_name(
634
- None, all_users)
817
+ None, user_hash, all_users)
635
818
  job_ids = list(set(job_ids))
636
819
  if not job_ids:
637
820
  return 'No job to cancel.'
@@ -651,6 +834,12 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]],
651
834
  logger.info(f'Job {job_id} is already in terminal state '
652
835
  f'{job_status.value}. Skipped.')
653
836
  continue
837
+ elif job_status == managed_job_state.ManagedJobStatus.PENDING:
838
+ # the "if PENDING" is a short circuit, this will be atomic.
839
+ cancelled = managed_job_state.set_pending_cancelled(job_id)
840
+ if cancelled:
841
+ cancelled_job_ids.append(job_id)
842
+ continue
654
843
 
655
844
  update_managed_jobs_statuses(job_id)
656
845
 
@@ -659,14 +848,30 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]],
659
848
  wrong_workspace_job_ids.append(job_id)
660
849
  continue
661
850
 
662
- # Send the signal to the jobs controller.
663
- signal_file = pathlib.Path(SIGNAL_FILE_PREFIX.format(job_id))
664
- # Filelock is needed to prevent race condition between signal
665
- # check/removal and signal writing.
666
- with filelock.FileLock(str(signal_file) + '.lock'):
667
- with signal_file.open('w', encoding='utf-8') as f:
668
- f.write(UserSignal.CANCEL.value)
669
- f.flush()
851
+ if managed_job_state.is_legacy_controller_process(job_id):
852
+ # The job is running on a legacy single-job controller process.
853
+ # TODO(cooperc): Remove this handling for 0.13.0
854
+
855
+ # Send the signal to the jobs controller.
856
+ signal_file = (pathlib.Path(
857
+ managed_job_constants.SIGNAL_FILE_PREFIX.format(job_id)))
858
+ # Filelock is needed to prevent race condition between signal
859
+ # check/removal and signal writing.
860
+ with filelock.FileLock(str(signal_file) + '.lock'):
861
+ with signal_file.open('w', encoding='utf-8') as f:
862
+ f.write(UserSignal.CANCEL.value)
863
+ f.flush()
864
+ else:
865
+ # New controller process.
866
+ try:
867
+ signal_file = pathlib.Path(
868
+ managed_job_constants.CONSOLIDATED_SIGNAL_PATH, f'{job_id}')
869
+ signal_file.touch()
870
+ except OSError as e:
871
+ logger.error(f'Failed to cancel job {job_id}: {e}')
872
+ # Don't add it to the to be cancelled job ids
873
+ continue
874
+
670
875
  cancelled_job_ids.append(job_id)
671
876
 
672
877
  wrong_workspace_job_str = ''
@@ -714,6 +919,14 @@ def cancel_jobs_by_pool(pool_name: str,
714
919
  return cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
715
920
 
716
921
 
922
+ def controller_log_file_for_job(job_id: int,
923
+ create_if_not_exists: bool = False) -> str:
924
+ log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
925
+ if create_if_not_exists:
926
+ os.makedirs(log_dir, exist_ok=True)
927
+ return os.path.join(log_dir, f'{job_id}.log')
928
+
929
+
717
930
  def stream_logs_by_id(job_id: int,
718
931
  follow: bool = True,
719
932
  tail: Optional[int] = None) -> Tuple[str, int]:
@@ -746,13 +959,20 @@ def stream_logs_by_id(job_id: int,
746
959
  if managed_job_status.is_failed():
747
960
  job_msg = ('\nFailure reason: '
748
961
  f'{managed_job_state.get_failure_reason(job_id)}')
749
- log_file_exists = False
962
+ log_file_ever_existed = False
750
963
  task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
751
964
  job_id)
752
965
  num_tasks = len(task_info)
753
- for task_id, task_name, task_status, log_file in task_info:
966
+ for (task_id, task_name, task_status, log_file,
967
+ logs_cleaned_at) in task_info:
754
968
  if log_file:
755
- log_file_exists = True
969
+ log_file_ever_existed = True
970
+ if logs_cleaned_at is not None:
971
+ ts_str = datetime.fromtimestamp(
972
+ logs_cleaned_at).strftime('%Y-%m-%d %H:%M:%S')
973
+ print(f'Task {task_name}({task_id}) log has been '
974
+ f'cleaned at {ts_str}.')
975
+ continue
756
976
  task_str = (f'Task {task_name}({task_id})'
757
977
  if task_name else f'Task {task_id}')
758
978
  if num_tasks > 1:
@@ -787,7 +1007,7 @@ def stream_logs_by_id(job_id: int,
787
1007
  f'{task_str} finished '
788
1008
  f'(status: {task_status.value}).'),
789
1009
  flush=True)
790
- if log_file_exists:
1010
+ if log_file_ever_existed:
791
1011
  # Add the "Job finished" message for terminal states
792
1012
  if managed_job_status.is_terminal():
793
1013
  print(ux_utils.finishing_message(
@@ -1015,7 +1235,8 @@ def stream_logs(job_id: Optional[int],
1015
1235
  if controller:
1016
1236
  if job_id is None:
1017
1237
  assert job_name is not None
1018
- managed_jobs = managed_job_state.get_managed_jobs()
1238
+ managed_jobs, _ = managed_job_state.get_managed_jobs_with_filters(
1239
+ name_match=job_name, fields=['job_id', 'job_name', 'status'])
1019
1240
  # We manually filter the jobs by name, instead of using
1020
1241
  # get_nonterminal_job_ids_by_name, as with `controller=True`, we
1021
1242
  # should be able to show the logs for jobs in terminal states.
@@ -1038,9 +1259,7 @@ def stream_logs(job_id: Optional[int],
1038
1259
  job_id = managed_job_ids.pop()
1039
1260
  assert job_id is not None, (job_id, job_name)
1040
1261
 
1041
- controller_log_path = os.path.join(
1042
- os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
1043
- f'{job_id}.log')
1262
+ controller_log_path = controller_log_file_for_job(job_id)
1044
1263
  job_status = None
1045
1264
 
1046
1265
  # Wait for the log file to be written
@@ -1141,144 +1360,254 @@ def dump_managed_job_queue(
1141
1360
  limit: Optional[int] = None,
1142
1361
  user_hashes: Optional[List[Optional[str]]] = None,
1143
1362
  statuses: Optional[List[str]] = None,
1363
+ fields: Optional[List[str]] = None,
1144
1364
  ) -> str:
1145
- # Make sure to get all jobs - some logic below (e.g. high priority job
1146
- # detection) requires a full view of the jobs table.
1147
- jobs = managed_job_state.get_managed_jobs()
1365
+ return message_utils.encode_payload(
1366
+ get_managed_job_queue(skip_finished, accessible_workspaces, job_ids,
1367
+ workspace_match, name_match, pool_match, page,
1368
+ limit, user_hashes, statuses, fields))
1148
1369
 
1149
- # Figure out what the highest priority blocking job is. We need to know in
1150
- # order to determine if other jobs are blocked by a higher priority job, or
1151
- # just by the limited controller resources.
1152
- highest_blocking_priority = constants.MIN_PRIORITY
1153
- for job in jobs:
1154
- if job['schedule_state'] not in (
1155
- # LAUNCHING and ALIVE_BACKOFF jobs will block other jobs with
1156
- # lower priority.
1157
- managed_job_state.ManagedJobScheduleState.LAUNCHING,
1158
- managed_job_state.ManagedJobScheduleState.ALIVE_BACKOFF,
1159
- # It's possible for a WAITING/ALIVE_WAITING job to be ready to
1160
- # launch, but the scheduler just hasn't run yet.
1161
- managed_job_state.ManagedJobScheduleState.WAITING,
1162
- managed_job_state.ManagedJobScheduleState.ALIVE_WAITING,
1163
- ):
1164
- # This job will not block others.
1165
- continue
1166
1370
 
1167
- priority = job.get('priority')
1168
- if priority is not None and priority > highest_blocking_priority:
1169
- highest_blocking_priority = priority
1371
+ def _update_fields(fields: List[str],) -> Tuple[List[str], bool]:
1372
+ """Update the fields list to include the necessary fields.
1170
1373
 
1171
- total_no_filter = len(jobs)
1374
+ Args:
1375
+ fields: The fields to update.
1376
+
1377
+ It will:
1378
+ - Add the necessary dependent fields to the list.
1379
+ - Remove the fields that are not in the DB.
1380
+ - Determine if cluster handle is required.
1381
+
1382
+ Returns:
1383
+ A tuple containing the updated fields and a boolean indicating if
1384
+ cluster handle is required.
1385
+ """
1386
+ cluster_handle_required = True
1387
+ if _cluster_handle_not_required(fields):
1388
+ cluster_handle_required = False
1389
+ # Copy the list to avoid modifying the original list
1390
+ new_fields = fields.copy()
1391
+ # status and job_id are always included
1392
+ if 'status' not in new_fields:
1393
+ new_fields.append('status')
1394
+ if 'job_id' not in new_fields:
1395
+ new_fields.append('job_id')
1396
+ # user_hash is required if user_name is present
1397
+ if 'user_name' in new_fields and 'user_hash' not in new_fields:
1398
+ new_fields.append('user_hash')
1399
+ if 'job_duration' in new_fields:
1400
+ if 'last_recovered_at' not in new_fields:
1401
+ new_fields.append('last_recovered_at')
1402
+ if 'end_at' not in new_fields:
1403
+ new_fields.append('end_at')
1404
+ if 'job_name' in new_fields and 'task_name' not in new_fields:
1405
+ new_fields.append('task_name')
1406
+ if 'details' in new_fields:
1407
+ if 'schedule_state' not in new_fields:
1408
+ new_fields.append('schedule_state')
1409
+ if 'priority' not in new_fields:
1410
+ new_fields.append('priority')
1411
+ if 'failure_reason' not in new_fields:
1412
+ new_fields.append('failure_reason')
1413
+ if 'user_yaml' in new_fields:
1414
+ if 'original_user_yaml_path' not in new_fields:
1415
+ new_fields.append('original_user_yaml_path')
1416
+ if 'original_user_yaml_content' not in new_fields:
1417
+ new_fields.append('original_user_yaml_content')
1418
+ if cluster_handle_required:
1419
+ if 'task_name' not in new_fields:
1420
+ new_fields.append('task_name')
1421
+ if 'current_cluster_name' not in new_fields:
1422
+ new_fields.append('current_cluster_name')
1423
+ # Remove _NON_DB_FIELDS
1424
+ # These fields have been mapped to the DB fields in the above code, so we
1425
+ # don't need to include them in the updated fields.
1426
+ for field in _NON_DB_FIELDS:
1427
+ if field in new_fields:
1428
+ new_fields.remove(field)
1429
+ return new_fields, cluster_handle_required
1430
+
1431
+
1432
+ def _cluster_handle_not_required(fields: List[str]) -> bool:
1433
+ """Determine if cluster handle is not required.
1434
+
1435
+ Args:
1436
+ fields: The fields to check if they contain any of the cluster handle
1437
+ fields.
1438
+
1439
+ Returns:
1440
+ True if the fields do not contain any of the cluster handle fields,
1441
+ False otherwise.
1442
+ """
1443
+ return not any(field in fields for field in _CLUSTER_HANDLE_FIELDS)
1444
+
1445
+
1446
+ def get_managed_job_queue(
1447
+ skip_finished: bool = False,
1448
+ accessible_workspaces: Optional[List[str]] = None,
1449
+ job_ids: Optional[List[int]] = None,
1450
+ workspace_match: Optional[str] = None,
1451
+ name_match: Optional[str] = None,
1452
+ pool_match: Optional[str] = None,
1453
+ page: Optional[int] = None,
1454
+ limit: Optional[int] = None,
1455
+ user_hashes: Optional[List[Optional[str]]] = None,
1456
+ statuses: Optional[List[str]] = None,
1457
+ fields: Optional[List[str]] = None,
1458
+ ) -> Dict[str, Any]:
1459
+ """Get the managed job queue.
1460
+
1461
+ Args:
1462
+ skip_finished: Whether to skip finished jobs.
1463
+ accessible_workspaces: The accessible workspaces.
1464
+ job_ids: The job ids.
1465
+ workspace_match: The workspace name to match.
1466
+ name_match: The job name to match.
1467
+ pool_match: The pool name to match.
1468
+ page: The page number.
1469
+ limit: The limit number.
1470
+ user_hashes: The user hashes.
1471
+ statuses: The statuses.
1472
+ fields: The fields to include in the response.
1473
+
1474
+ Returns:
1475
+ A dictionary containing the managed job queue.
1476
+ """
1477
+ cluster_handle_required = True
1478
+ updated_fields = None
1479
+ # The caller only need to specify the fields in the
1480
+ # `class ManagedJobRecord` in `response.py`, and the `_update_fields`
1481
+ # function will add the necessary dependent fields to the list, for
1482
+ # example, if the caller specifies `['user_name']`, the `_update_fields`
1483
+ # function will add `['user_hash']` to the list.
1484
+ if fields:
1485
+ updated_fields, cluster_handle_required = _update_fields(fields)
1486
+
1487
+ total_no_filter = managed_job_state.get_managed_jobs_total()
1488
+
1489
+ status_counts = managed_job_state.get_status_count_with_filters(
1490
+ fields=fields,
1491
+ job_ids=job_ids,
1492
+ accessible_workspaces=accessible_workspaces,
1493
+ workspace_match=workspace_match,
1494
+ name_match=name_match,
1495
+ pool_match=pool_match,
1496
+ user_hashes=user_hashes,
1497
+ skip_finished=skip_finished,
1498
+ )
1499
+
1500
+ jobs, total = managed_job_state.get_managed_jobs_with_filters(
1501
+ fields=updated_fields,
1502
+ job_ids=job_ids,
1503
+ accessible_workspaces=accessible_workspaces,
1504
+ workspace_match=workspace_match,
1505
+ name_match=name_match,
1506
+ pool_match=pool_match,
1507
+ user_hashes=user_hashes,
1508
+ statuses=statuses,
1509
+ skip_finished=skip_finished,
1510
+ page=page,
1511
+ limit=limit,
1512
+ )
1513
+
1514
+ if cluster_handle_required:
1515
+ # Fetch the cluster name to handle map for managed clusters only.
1516
+ cluster_name_to_handle = (
1517
+ global_user_state.get_cluster_name_to_handle_map(is_managed=True))
1518
+
1519
+ highest_blocking_priority = constants.MIN_PRIORITY
1520
+ if not fields or 'details' in fields:
1521
+ # Figure out what the highest priority blocking job is. We need to know
1522
+ # in order to determine if other jobs are blocked by a higher priority
1523
+ # job, or just by the limited controller resources.
1524
+ highest_blocking_priority = (
1525
+ managed_job_state.get_managed_jobs_highest_priority())
1172
1526
 
1173
- if user_hashes:
1174
- jobs = [
1175
- job for job in jobs if job.get('user_hash', None) in user_hashes
1176
- ]
1177
- if accessible_workspaces:
1178
- jobs = [
1179
- job for job in jobs
1180
- if job.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
1181
- accessible_workspaces
1182
- ]
1183
- if skip_finished:
1184
- # Filter out the finished jobs. If a multi-task job is partially
1185
- # finished, we will include all its tasks.
1186
- non_finished_tasks = list(
1187
- filter(
1188
- lambda job: not managed_job_state.ManagedJobStatus(job[
1189
- 'status']).is_terminal(), jobs))
1190
- non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
1191
- jobs = list(
1192
- filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
1193
- if job_ids:
1194
- jobs = [job for job in jobs if job['job_id'] in job_ids]
1195
-
1196
- jobs, total, status_counts = filter_jobs(jobs,
1197
- workspace_match,
1198
- name_match,
1199
- pool_match,
1200
- page,
1201
- limit,
1202
- statuses=statuses)
1203
1527
  for job in jobs:
1204
- end_at = job['end_at']
1205
- if end_at is None:
1206
- end_at = time.time()
1207
-
1208
- job_submitted_at = job['last_recovered_at'] - job['job_duration']
1209
- if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
1210
- # When job is recovering, the duration is exact job['job_duration']
1211
- job_duration = job['job_duration']
1212
- elif job_submitted_at > 0:
1213
- job_duration = end_at - job_submitted_at
1214
- else:
1215
- # When job_start_at <= 0, that means the last_recovered_at is not
1216
- # set yet, i.e. the job is not started.
1217
- job_duration = 0
1218
- job['job_duration'] = job_duration
1528
+ if not fields or 'job_duration' in fields:
1529
+ end_at = job['end_at']
1530
+ if end_at is None:
1531
+ end_at = time.time()
1532
+
1533
+ job_submitted_at = job['last_recovered_at'] - job['job_duration']
1534
+ if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
1535
+ # When job is recovering, the duration is exact
1536
+ # job['job_duration']
1537
+ job_duration = job['job_duration']
1538
+ elif job_submitted_at > 0:
1539
+ job_duration = end_at - job_submitted_at
1540
+ else:
1541
+ # When job_start_at <= 0, that means the last_recovered_at
1542
+ # is not set yet, i.e. the job is not started.
1543
+ job_duration = 0
1544
+ job['job_duration'] = job_duration
1219
1545
  job['status'] = job['status'].value
1220
- job['schedule_state'] = job['schedule_state'].value
1221
-
1222
- pool = managed_job_state.get_pool_from_job_id(job['job_id'])
1223
- if pool is not None:
1224
- cluster_name, _ = managed_job_state.get_pool_submit_info(
1225
- job['job_id'])
1226
- else:
1227
- cluster_name = generate_managed_job_cluster_name(
1228
- job['task_name'], job['job_id'])
1229
- handle = global_user_state.get_handle_from_cluster_name(
1230
- cluster_name) if cluster_name is not None else None
1231
- if isinstance(handle, backends.CloudVmRayResourceHandle):
1232
- resources_str = resources_utils.get_readable_resources_repr(
1233
- handle, simplify=True)
1234
- resources_str_full = resources_utils.get_readable_resources_repr(
1235
- handle, simplify=False)
1236
- job['cluster_resources'] = resources_str
1237
- job['cluster_resources_full'] = resources_str_full
1238
- job['cloud'] = str(handle.launched_resources.cloud)
1239
- job['region'] = handle.launched_resources.region
1240
- job['zone'] = handle.launched_resources.zone
1241
- job['infra'] = infra_utils.InfraInfo(
1242
- str(handle.launched_resources.cloud),
1243
- handle.launched_resources.region,
1244
- handle.launched_resources.zone).formatted_str()
1245
- job['accelerators'] = handle.launched_resources.accelerators
1546
+ if not fields or 'schedule_state' in fields:
1547
+ job['schedule_state'] = job['schedule_state'].value
1246
1548
  else:
1247
- # FIXME(zongheng): display the last cached values for these.
1248
- job['cluster_resources'] = '-'
1249
- job['cluster_resources_full'] = '-'
1250
- job['cloud'] = '-'
1251
- job['region'] = '-'
1252
- job['zone'] = '-'
1253
- job['infra'] = '-'
1254
-
1255
- # Add details about schedule state / backoff.
1256
- state_details = None
1257
- if job['schedule_state'] == 'ALIVE_BACKOFF':
1258
- state_details = 'In backoff, waiting for resources'
1259
- elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
1260
- priority = job.get('priority')
1261
- if (priority is not None and priority < highest_blocking_priority):
1262
- # Job is lower priority than some other blocking job.
1263
- state_details = 'Waiting for higher priority jobs to launch'
1549
+ job['schedule_state'] = None
1550
+
1551
+ if cluster_handle_required:
1552
+ cluster_name = job.get('current_cluster_name', None)
1553
+ if cluster_name is None:
1554
+ cluster_name = generate_managed_job_cluster_name(
1555
+ job['task_name'], job['job_id'])
1556
+ handle = cluster_name_to_handle.get(
1557
+ cluster_name, None) if cluster_name is not None else None
1558
+ if isinstance(handle, backends.CloudVmRayResourceHandle):
1559
+ resources_str_simple, resources_str_full = (
1560
+ resources_utils.get_readable_resources_repr(
1561
+ handle, simplified_only=False))
1562
+ assert resources_str_full is not None
1563
+ job['cluster_resources'] = resources_str_simple
1564
+ job['cluster_resources_full'] = resources_str_full
1565
+ job['cloud'] = str(handle.launched_resources.cloud)
1566
+ job['region'] = handle.launched_resources.region
1567
+ job['zone'] = handle.launched_resources.zone
1568
+ job['infra'] = infra_utils.InfraInfo(
1569
+ str(handle.launched_resources.cloud),
1570
+ handle.launched_resources.region,
1571
+ handle.launched_resources.zone).formatted_str()
1572
+ job['accelerators'] = handle.launched_resources.accelerators
1264
1573
  else:
1265
- state_details = 'Waiting for other jobs to launch'
1266
-
1267
- if state_details and job['failure_reason']:
1268
- job['details'] = f'{state_details} - {job["failure_reason"]}'
1269
- elif state_details:
1270
- job['details'] = state_details
1271
- elif job['failure_reason']:
1272
- job['details'] = f'Failure: {job["failure_reason"]}'
1273
- else:
1274
- job['details'] = None
1574
+ # FIXME(zongheng): display the last cached values for these.
1575
+ job['cluster_resources'] = '-'
1576
+ job['cluster_resources_full'] = '-'
1577
+ job['cloud'] = '-'
1578
+ job['region'] = '-'
1579
+ job['zone'] = '-'
1580
+ job['infra'] = '-'
1581
+
1582
+ if not fields or 'details' in fields:
1583
+ # Add details about schedule state / backoff.
1584
+ state_details = None
1585
+ if job['schedule_state'] == 'ALIVE_BACKOFF':
1586
+ state_details = 'In backoff, waiting for resources'
1587
+ elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
1588
+ priority = job.get('priority')
1589
+ if (priority is not None and
1590
+ priority < highest_blocking_priority):
1591
+ # Job is lower priority than some other blocking job.
1592
+ state_details = 'Waiting for higher priority jobs to launch'
1593
+ else:
1594
+ state_details = 'Waiting for other jobs to launch'
1595
+
1596
+ if state_details and job['failure_reason']:
1597
+ job['details'] = f'{state_details} - {job["failure_reason"]}'
1598
+ elif state_details:
1599
+ job['details'] = state_details
1600
+ elif job['failure_reason']:
1601
+ job['details'] = f'Failure: {job["failure_reason"]}'
1602
+ else:
1603
+ job['details'] = None
1275
1604
 
1276
- return message_utils.encode_payload({
1605
+ return {
1277
1606
  'jobs': jobs,
1278
1607
  'total': total,
1279
1608
  'total_no_filter': total_no_filter,
1280
1609
  'status_counts': status_counts
1281
- })
1610
+ }
1282
1611
 
1283
1612
 
1284
1613
  def filter_jobs(
@@ -1370,30 +1699,31 @@ def load_managed_job_queue(
1370
1699
  """Load job queue from json string."""
1371
1700
  result = message_utils.decode_payload(payload)
1372
1701
  result_type = ManagedJobQueueResultType.DICT
1373
- status_counts = {}
1702
+ status_counts: Dict[str, int] = {}
1374
1703
  if isinstance(result, dict):
1375
- jobs = result['jobs']
1376
- total = result['total']
1704
+ jobs: List[Dict[str, Any]] = result['jobs']
1705
+ total: int = result['total']
1377
1706
  status_counts = result.get('status_counts', {})
1378
- total_no_filter = result.get('total_no_filter', total)
1707
+ total_no_filter: int = result.get('total_no_filter', total)
1379
1708
  else:
1380
1709
  jobs = result
1381
1710
  total = len(jobs)
1382
1711
  total_no_filter = total
1383
1712
  result_type = ManagedJobQueueResultType.LIST
1384
1713
 
1714
+ all_users = global_user_state.get_all_users()
1715
+ all_users_map = {user.id: user.name for user in all_users}
1385
1716
  for job in jobs:
1386
1717
  job['status'] = managed_job_state.ManagedJobStatus(job['status'])
1387
1718
  if 'user_hash' in job and job['user_hash'] is not None:
1388
1719
  # Skip jobs that do not have user_hash info.
1389
1720
  # TODO(cooperc): Remove check before 0.12.0.
1390
- user = global_user_state.get_user(job['user_hash'])
1391
- job['user_name'] = user.name if user is not None else None
1721
+ job['user_name'] = all_users_map.get(job['user_hash'])
1392
1722
  return jobs, total, result_type, total_no_filter, status_counts
1393
1723
 
1394
1724
 
1395
1725
  def _get_job_status_from_tasks(
1396
- job_tasks: List[Dict[str, Any]]
1726
+ job_tasks: Union[List[responses.ManagedJobRecord], List[Dict[str, Any]]]
1397
1727
  ) -> Tuple[managed_job_state.ManagedJobStatus, int]:
1398
1728
  """Get the current task status and the current task id for a job."""
1399
1729
  managed_task_status = managed_job_state.ManagedJobStatus.SUCCEEDED
@@ -1413,29 +1743,40 @@ def _get_job_status_from_tasks(
1413
1743
 
1414
1744
 
1415
1745
  @typing.overload
1416
- def format_job_table(tasks: List[Dict[str, Any]],
1417
- show_all: bool,
1418
- show_user: bool,
1419
- return_rows: Literal[False] = False,
1420
- max_jobs: Optional[int] = None) -> str:
1746
+ def format_job_table(
1747
+ tasks: List[Dict[str, Any]],
1748
+ show_all: bool,
1749
+ show_user: bool,
1750
+ return_rows: Literal[False] = False,
1751
+ pool_status: Optional[List[Dict[str, Any]]] = None,
1752
+ max_jobs: Optional[int] = None,
1753
+ job_status_counts: Optional[Dict[str, int]] = None,
1754
+ ) -> str:
1421
1755
  ...
1422
1756
 
1423
1757
 
1424
1758
  @typing.overload
1425
- def format_job_table(tasks: List[Dict[str, Any]],
1426
- show_all: bool,
1427
- show_user: bool,
1428
- return_rows: Literal[True],
1429
- max_jobs: Optional[int] = None) -> List[List[str]]:
1759
+ def format_job_table(
1760
+ tasks: List[Dict[str, Any]],
1761
+ show_all: bool,
1762
+ show_user: bool,
1763
+ return_rows: Literal[True],
1764
+ pool_status: Optional[List[Dict[str, Any]]] = None,
1765
+ max_jobs: Optional[int] = None,
1766
+ job_status_counts: Optional[Dict[str, int]] = None,
1767
+ ) -> List[List[str]]:
1430
1768
  ...
1431
1769
 
1432
1770
 
1433
1771
  def format_job_table(
1434
- tasks: List[Dict[str, Any]],
1435
- show_all: bool,
1436
- show_user: bool,
1437
- return_rows: bool = False,
1438
- max_jobs: Optional[int] = None) -> Union[str, List[List[str]]]:
1772
+ tasks: List[Dict[str, Any]],
1773
+ show_all: bool,
1774
+ show_user: bool,
1775
+ return_rows: bool = False,
1776
+ pool_status: Optional[List[Dict[str, Any]]] = None,
1777
+ max_jobs: Optional[int] = None,
1778
+ job_status_counts: Optional[Dict[str, int]] = None,
1779
+ ) -> Union[str, List[List[str]]]:
1439
1780
  """Returns managed jobs as a formatted string.
1440
1781
 
1441
1782
  Args:
@@ -1444,13 +1785,15 @@ def format_job_table(
1444
1785
  max_jobs: The maximum number of jobs to show in the table.
1445
1786
  return_rows: If True, return the rows as a list of strings instead of
1446
1787
  all rows concatenated into a single string.
1788
+ pool_status: List of pool status dictionaries with replica_info.
1789
+ job_status_counts: The counts of each job status.
1447
1790
 
1448
1791
  Returns: A formatted string of managed jobs, if not `return_rows`; otherwise
1449
1792
  a list of "rows" (each of which is a list of str).
1450
1793
  """
1451
1794
  jobs = collections.defaultdict(list)
1452
1795
  # Check if the tasks have user information from kubernetes.
1453
- # This is only used for sky status --kubernetes.
1796
+ # This is only used for sky status-kubernetes.
1454
1797
  tasks_have_k8s_user = any([task.get('user') for task in tasks])
1455
1798
  if max_jobs and tasks_have_k8s_user:
1456
1799
  raise ValueError('max_jobs is not supported when tasks have user info.')
@@ -1460,17 +1803,37 @@ def format_job_table(
1460
1803
  return (task['user'], task['job_id'])
1461
1804
  return task['job_id']
1462
1805
 
1806
+ def _get_job_id_to_worker_map(
1807
+ pool_status: Optional[List[Dict[str, Any]]]) -> Dict[int, int]:
1808
+ """Create a mapping from job_id to worker replica_id.
1809
+
1810
+ Args:
1811
+ pool_status: List of pool status dictionaries with replica_info.
1812
+
1813
+ Returns:
1814
+ Dictionary mapping job_id to replica_id (worker ID).
1815
+ """
1816
+ job_to_worker: Dict[int, int] = {}
1817
+ if pool_status is None:
1818
+ return job_to_worker
1819
+ for pool in pool_status:
1820
+ replica_info = pool.get('replica_info', [])
1821
+ for replica in replica_info:
1822
+ used_by = replica.get('used_by')
1823
+ if used_by is not None:
1824
+ job_to_worker[used_by] = replica.get('replica_id')
1825
+ return job_to_worker
1826
+
1827
+ # Create mapping from job_id to worker replica_id
1828
+ job_to_worker = _get_job_id_to_worker_map(pool_status)
1829
+
1463
1830
  for task in tasks:
1464
1831
  # The tasks within the same job_id are already sorted
1465
1832
  # by the task_id.
1466
1833
  jobs[get_hash(task)].append(task)
1467
1834
 
1468
- status_counts: Dict[str, int] = collections.defaultdict(int)
1469
1835
  workspaces = set()
1470
1836
  for job_tasks in jobs.values():
1471
- managed_job_status = _get_job_status_from_tasks(job_tasks)[0]
1472
- if not managed_job_status.is_terminal():
1473
- status_counts[managed_job_status.value] += 1
1474
1837
  workspaces.add(job_tasks[0].get('workspace',
1475
1838
  constants.SKYPILOT_DEFAULT_WORKSPACE))
1476
1839
 
@@ -1513,9 +1876,15 @@ def format_job_table(
1513
1876
  job_table = log_utils.create_table(columns)
1514
1877
 
1515
1878
  status_counts: Dict[str, int] = collections.defaultdict(int)
1516
- for task in tasks:
1517
- if not task['status'].is_terminal():
1518
- status_counts[task['status'].value] += 1
1879
+ if job_status_counts:
1880
+ for status_value, count in job_status_counts.items():
1881
+ status = managed_job_state.ManagedJobStatus(status_value)
1882
+ if not status.is_terminal():
1883
+ status_counts[status_value] = count
1884
+ else:
1885
+ for task in tasks:
1886
+ if not task['status'].is_terminal():
1887
+ status_counts[task['status'].value] += 1
1519
1888
 
1520
1889
  all_tasks = tasks
1521
1890
  if max_jobs is not None:
@@ -1601,7 +1970,12 @@ def format_job_table(
1601
1970
  if pool is None:
1602
1971
  pool = '-'
1603
1972
 
1973
+ # Add worker information if job is assigned to a worker
1604
1974
  job_id = job_hash[1] if tasks_have_k8s_user else job_hash
1975
+ # job_id is now always an integer, use it to look up worker
1976
+ if job_id in job_to_worker and pool != '-':
1977
+ pool = f'{pool} (worker={job_to_worker[job_id]})'
1978
+
1605
1979
  job_values = [
1606
1980
  job_id,
1607
1981
  '',
@@ -1644,6 +2018,12 @@ def format_job_table(
1644
2018
  pool = task.get('pool')
1645
2019
  if pool is None:
1646
2020
  pool = '-'
2021
+
2022
+ # Add worker information if task is assigned to a worker
2023
+ task_job_id = task['job_id']
2024
+ if task_job_id in job_to_worker and pool != '-':
2025
+ pool = f'{pool} (worker={job_to_worker[task_job_id]})'
2026
+
1647
2027
  values = [
1648
2028
  task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
1649
2029
  task['task_id'] if len(job_tasks) > 1 else '-',
@@ -1726,6 +2106,59 @@ def format_job_table(
1726
2106
  return output
1727
2107
 
1728
2108
 
2109
+ def decode_managed_job_protos(
2110
+ job_protos: Iterable['managed_jobsv1_pb2.ManagedJobInfo']
2111
+ ) -> List[Dict[str, Any]]:
2112
+ """Decode job protos to dicts. Similar to load_managed_job_queue."""
2113
+ user_hash_to_user = global_user_state.get_users(
2114
+ set(job.user_hash for job in job_protos if job.user_hash))
2115
+
2116
+ jobs = []
2117
+ for job_proto in job_protos:
2118
+ job_dict = _job_proto_to_dict(job_proto)
2119
+ user_hash = job_dict.get('user_hash', None)
2120
+ if user_hash is not None:
2121
+ # Skip jobs that do not have user_hash info.
2122
+ # TODO(cooperc): Remove check before 0.12.0.
2123
+ user = user_hash_to_user.get(user_hash, None)
2124
+ job_dict['user_name'] = user.name if user is not None else None
2125
+ jobs.append(job_dict)
2126
+ return jobs
2127
+
2128
+
2129
+ def _job_proto_to_dict(
2130
+ job_proto: 'managed_jobsv1_pb2.ManagedJobInfo') -> Dict[str, Any]:
2131
+ job_dict = json_format.MessageToDict(
2132
+ job_proto,
2133
+ always_print_fields_with_no_presence=True,
2134
+ # Our API returns fields in snake_case.
2135
+ preserving_proto_field_name=True,
2136
+ use_integers_for_enums=True)
2137
+ for field in job_proto.DESCRIPTOR.fields:
2138
+ # Ensure optional fields are present with None values for
2139
+ # backwards compatibility with older clients.
2140
+ if field.has_presence and field.name not in job_dict:
2141
+ job_dict[field.name] = None
2142
+ # json_format.MessageToDict is meant for encoding to JSON,
2143
+ # and Protobuf encodes int64 as decimal strings in JSON,
2144
+ # so we need to convert them back to ints.
2145
+ # https://protobuf.dev/programming-guides/json/#field-representation
2146
+ if (field.type == descriptor.FieldDescriptor.TYPE_INT64 and
2147
+ job_dict.get(field.name) is not None):
2148
+ job_dict[field.name] = int(job_dict[field.name])
2149
+ job_dict['status'] = managed_job_state.ManagedJobStatus.from_protobuf(
2150
+ job_dict['status'])
2151
+ # For backwards compatibility, convert schedule_state to a string,
2152
+ # as we don't have the logic to handle it in our request
2153
+ # encoder/decoder, unlike status.
2154
+ schedule_state_enum = (
2155
+ managed_job_state.ManagedJobScheduleState.from_protobuf(
2156
+ job_dict['schedule_state']))
2157
+ job_dict['schedule_state'] = (schedule_state_enum.value
2158
+ if schedule_state_enum is not None else None)
2159
+ return job_dict
2160
+
2161
+
1729
2162
  class ManagedJobCodeGen:
1730
2163
  """Code generator for managed job utility functions.
1731
2164
 
@@ -1755,6 +2188,7 @@ class ManagedJobCodeGen:
1755
2188
  limit: Optional[int] = None,
1756
2189
  user_hashes: Optional[List[Optional[str]]] = None,
1757
2190
  statuses: Optional[List[str]] = None,
2191
+ fields: Optional[List[str]] = None,
1758
2192
  ) -> str:
1759
2193
  code = textwrap.dedent(f"""\
1760
2194
  if managed_job_version < 9:
@@ -1773,7 +2207,7 @@ class ManagedJobCodeGen:
1773
2207
  page={page!r},
1774
2208
  limit={limit!r},
1775
2209
  user_hashes={user_hashes!r})
1776
- else:
2210
+ elif managed_job_version < 12:
1777
2211
  job_table = utils.dump_managed_job_queue(
1778
2212
  skip_finished={skip_finished},
1779
2213
  accessible_workspaces={accessible_workspaces!r},
@@ -1785,6 +2219,19 @@ class ManagedJobCodeGen:
1785
2219
  limit={limit!r},
1786
2220
  user_hashes={user_hashes!r},
1787
2221
  statuses={statuses!r})
2222
+ else:
2223
+ job_table = utils.dump_managed_job_queue(
2224
+ skip_finished={skip_finished},
2225
+ accessible_workspaces={accessible_workspaces!r},
2226
+ job_ids={job_ids!r},
2227
+ workspace_match={workspace_match!r},
2228
+ name_match={name_match!r},
2229
+ pool_match={pool_match!r},
2230
+ page={page!r},
2231
+ limit={limit!r},
2232
+ user_hashes={user_hashes!r},
2233
+ statuses={statuses!r},
2234
+ fields={fields!r})
1788
2235
  print(job_table, flush=True)
1789
2236
  """)
1790
2237
  return cls._build(code)
@@ -1852,6 +2299,18 @@ class ManagedJobCodeGen:
1852
2299
  """)
1853
2300
  return cls._build(code)
1854
2301
 
2302
+ @classmethod
2303
+ def get_version(cls) -> str:
2304
+ """Generate code to get controller version."""
2305
+ code = textwrap.dedent("""\
2306
+ from sky.skylet import constants as controller_constants
2307
+
2308
+ # Get controller version
2309
+ controller_version = controller_constants.SKYLET_VERSION
2310
+ print(f"controller_version:{controller_version}", flush=True)
2311
+ """)
2312
+ return cls._build(code)
2313
+
1855
2314
  @classmethod
1856
2315
  def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
1857
2316
  code = textwrap.dedent(f"""\
@@ -1889,8 +2348,12 @@ class ManagedJobCodeGen:
1889
2348
  return cls._build(code)
1890
2349
 
1891
2350
  @classmethod
1892
- def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag',
1893
- workspace: str, entrypoint: str) -> str:
2351
+ def set_pending(cls,
2352
+ job_id: int,
2353
+ managed_job_dag: 'dag_lib.Dag',
2354
+ workspace: str,
2355
+ entrypoint: str,
2356
+ user_hash: Optional[str] = None) -> str:
1894
2357
  dag_name = managed_job_dag.name
1895
2358
  pool = managed_job_dag.pool
1896
2359
  # Add the managed job to queue table.
@@ -1907,6 +2370,8 @@ class ManagedJobCodeGen:
1907
2370
  pool_hash = serve_state.get_service_hash({pool!r})
1908
2371
  set_job_info_kwargs['pool'] = {pool!r}
1909
2372
  set_job_info_kwargs['pool_hash'] = pool_hash
2373
+ if managed_job_version >= 11:
2374
+ set_job_info_kwargs['user_hash'] = {user_hash!r}
1910
2375
  managed_job_state.set_job_info(
1911
2376
  {job_id}, {dag_name!r}, **set_job_info_kwargs)
1912
2377
  """)