skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/utils.py CHANGED
@@ -4,60 +4,86 @@ NOTE: whenever an API change is made in this file, we need to bump the
 jobs.constants.MANAGED_JOBS_VERSION and handle the API change in the
 ManagedJobCodeGen.
 """
+import asyncio
 import collections
+from datetime import datetime
 import enum
 import os
 import pathlib
+import re
 import shlex
 import textwrap
 import time
 import traceback
 import typing
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import (Any, Deque, Dict, Iterable, List, Literal, Optional, Set,
+                    TextIO, Tuple, Union)
 
 import colorama
 import filelock
-from typing_extensions import Literal
 
 from sky import backends
 from sky import exceptions
 from sky import global_user_state
 from sky import sky_logging
+from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
+from sky.backends import cloud_vm_ray_backend
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
+from sky.schemas.api import responses
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib
 from sky.usage import usage_lib
+from sky.utils import annotations
 from sky.utils import common_utils
+from sky.utils import context_utils
+from sky.utils import controller_utils
+from sky.utils import infra_utils
 from sky.utils import log_utils
 from sky.utils import message_utils
+from sky.utils import resources_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
+    from google.protobuf import descriptor
+    from google.protobuf import json_format
+    import grpc
     import psutil
 
     import sky
     from sky import dag as dag_lib
+    from sky.schemas.generated import jobsv1_pb2
+    from sky.schemas.generated import managed_jobsv1_pb2
 else:
+    json_format = adaptors_common.LazyImport('google.protobuf.json_format')
+    descriptor = adaptors_common.LazyImport('google.protobuf.descriptor')
     psutil = adaptors_common.LazyImport('psutil')
+    grpc = adaptors_common.LazyImport('grpc')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
 
 logger = sky_logging.init_logger(__name__)
 
-SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
 # Controller checks its job's status every this many seconds.
-JOB_STATUS_CHECK_GAP_SECONDS = 20
+# This is a tradeoff between the latency and the resource usage.
+JOB_STATUS_CHECK_GAP_SECONDS = 15
 
 # Controller checks if its job has started every this many seconds.
 JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
 
 _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
 
+_JOB_STATUS_FETCH_MAX_RETRIES = 3
+_JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
+_JOB_STATUS_FETCH_TIMEOUT_SECONDS = 30
+
 _JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
     'Waiting for task to start[/]'
     '{status_str}. It may take a few minutes.\n'
@@ -72,7 +98,35 @@ _JOB_CANCELLED_MESSAGE = (
 # blocking for a long time. This should be significantly longer than the
 # JOB_STATUS_CHECK_GAP_SECONDS to avoid timing out before the controller can
 # update the state.
-_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 40
+_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120
+
+# After enabling consolidation mode, we need to restart the API server to get
+# the jobs refresh daemon and the correct number of executors. We use this
+# file to indicate that the API server has been restarted after enabling
+# consolidation mode.
+_JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE = (
+    '~/.sky/.jobs_controller_consolidation_reloaded_signal')
+
+# The response fields for managed jobs that require the cluster handle.
+_CLUSTER_HANDLE_FIELDS = [
+    'cluster_resources',
+    'cluster_resources_full',
+    'cloud',
+    'region',
+    'zone',
+    'infra',
+    'accelerators',
+]
+
+# The response fields for managed jobs that are not stored in the database.
+# These fields will be mapped to the DB fields in `_update_fields`.
+_NON_DB_FIELDS = _CLUSTER_HANDLE_FIELDS + ['user_yaml', 'user_name', 'details']
+
+
+class ManagedJobQueueResultType(enum.Enum):
+    """The type of the managed job queue result."""
+    DICT = 'DICT'
+    LIST = 'LIST'
 
 
 class UserSignal(enum.Enum):
@@ -83,7 +137,10 @@ class UserSignal(enum.Enum):
 
 
 # ====== internal functions ======
-def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
+def terminate_cluster(
+    cluster_name: str,
+    max_retry: int = 6,
+) -> None:
     """Terminate the cluster."""
     from sky import core  # pylint: disable=import-outside-toplevel
     retry_cnt = 0
@@ -121,43 +178,313 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
             time.sleep(backoff.current_backoff())
 
 
-def get_job_status(backend: 'backends.CloudVmRayBackend',
-                   cluster_name: str) -> Optional['job_lib.JobStatus']:
+def _validate_consolidation_mode_config(
+        current_is_consolidation_mode: bool) -> None:
+    """Validate the consolidation mode config."""
+    # Check whether the consolidation mode config is changed.
+    if current_is_consolidation_mode:
+        controller_cn = (
+            controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
+        if global_user_state.cluster_with_name_exists(controller_cn):
+            logger.warning(
+                f'{colorama.Fore.RED}Consolidation mode for jobs is enabled, '
+                f'but the controller cluster {controller_cn} is still running. '
+                'Please terminate the controller cluster first.'
+                f'{colorama.Style.RESET_ALL}')
+    else:
+        total_jobs = managed_job_state.get_managed_jobs_total()
+        if total_jobs > 0:
+            nonterminal_jobs = (
+                managed_job_state.get_nonterminal_job_ids_by_name(
+                    None, None, all_users=True))
+            if nonterminal_jobs:
+                logger.warning(
+                    f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
+                    f'but there are still {len(nonterminal_jobs)} managed jobs '
+                    'running. Please terminate those jobs first.'
+                    f'{colorama.Style.RESET_ALL}')
+            else:
+                logger.warning(
+                    f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
+                    f'but there are {total_jobs} jobs from previous '
+                    'consolidation mode. Reset the `jobs.controller.'
+                    'consolidation_mode` to `true` and run `sky jobs queue` '
+                    'to see those jobs. Switching to normal mode will '
+                    f'lose the job history.{colorama.Style.RESET_ALL}')
+
+
+# Whether to use consolidation mode or not. When this is enabled, the managed
+# jobs controller will not be running on a separate cluster, but locally on the
+# API Server. Under the hood, we submit the job monitoring logic as processes
+# directly in the API Server.
+# Use LRU Cache so that the check is only done once.
+@annotations.lru_cache(scope='request', maxsize=2)
+def is_consolidation_mode(on_api_restart: bool = False) -> bool:
+    if os.environ.get(constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
+        return True
+
+    config_consolidation_mode = skypilot_config.get_nested(
+        ('jobs', 'controller', 'consolidation_mode'), default_value=False)
+
+    signal_file = pathlib.Path(
+        _JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE).expanduser()
+
+    if on_api_restart:
+        if config_consolidation_mode:
+            signal_file.touch()
+    else:
+        restart_signal_file_exists = signal_file.exists()
+        if not restart_signal_file_exists:
+            if config_consolidation_mode:
+                logger.warning(f'{colorama.Fore.YELLOW}Consolidation mode for '
+                               'managed jobs is enabled in the server config, '
+                               'but the API server has not been restarted yet. '
+                               'Please restart the API server to enable it.'
+                               f'{colorama.Style.RESET_ALL}')
+            return False
+        elif not config_consolidation_mode:
+            # Clean up the signal file if consolidation mode is disabled in
+            # the config. This allows the user to disable consolidation mode
+            # without restarting the API server.
+            signal_file.unlink()
+
+    # We should only do this check on the API server, as the controller will
+    # not have the related config and would always appear to have consolidation
+    # mode disabled. Check #6611 for more details.
+    if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
+        _validate_consolidation_mode_config(config_consolidation_mode)
+    return config_consolidation_mode
+
+
+def ha_recovery_for_consolidation_mode() -> None:
+    """Recovery logic for consolidation mode.
+
+    This should only be called from the managed-job-status-refresh daemon, so
+    that we have the correct ordering: recovery -> controller start -> job
+    status updates. This also ensures correct operation during a rolling
+    update.
+    """
+    # No setup recovery is needed in consolidation mode, as the API server
+    # already has all runtime installed. Directly start jobs recovery here.
+    # Refer to sky/templates/kubernetes-ray.yml.j2 for more details.
+    scheduler.maybe_start_controllers()
+    with open(constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format('jobs_'),
+              'a',
+              encoding='utf-8') as f:
+        start = time.time()
+        f.write(f'Starting HA recovery at {datetime.now()}\n')
+        jobs, _ = managed_job_state.get_managed_jobs_with_filters(fields=[
+            'job_id', 'controller_pid', 'controller_pid_started_at',
+            'schedule_state', 'status'
+        ])
+        for job in jobs:
+            job_id = job['job_id']
+            controller_pid = job['controller_pid']
+            controller_pid_started_at = job.get('controller_pid_started_at')
+
+            # In consolidation mode, it is possible that only the API server
+            # process is restarted, and the controller process is not. In that
+            # case, we don't need to do anything and the controller process
+            # will just keep running. However, in most cases, the controller
+            # process will also be stopped - either by a pod restart of the
+            # k8s API server, or by `sky api stop`, which will stop
+            # controllers.
+            # TODO(cooperc): Make sure we cannot have a controller process
+            # running across API server restarts for consistency.
+            if controller_pid is not None:
+                try:
+                    # Note: We provide the legacy job id to
+                    # controller_process_alive just in case, but we shouldn't
+                    # have a running legacy job controller process at this
+                    # point.
+                    if controller_process_alive(
+                            managed_job_state.ControllerPidRecord(
+                                pid=controller_pid,
+                                started_at=controller_pid_started_at), job_id):
+                        message = (f'Controller pid {controller_pid} for '
+                                   f'job {job_id} is still running. '
+                                   'Skipping recovery.\n')
+                        logger.debug(message)
+                        f.write(message)
+                        continue
+                except Exception:  # pylint: disable=broad-except
+                    # controller_process_alive may raise if psutil fails; we
+                    # should not crash the recovery logic because of this.
+                    message = ('Error checking controller pid '
+                               f'{controller_pid} for job {job_id}\n')
+                    logger.warning(message, exc_info=True)
+                    f.write(message)
+
+            # Controller process is not set or not alive.
+            if job['schedule_state'] not in [
+                    managed_job_state.ManagedJobScheduleState.DONE,
+                    managed_job_state.ManagedJobScheduleState.WAITING,
+                    # INACTIVE job may be mid-submission, don't set to WAITING.
+                    managed_job_state.ManagedJobScheduleState.INACTIVE,
+            ]:
+                managed_job_state.reset_job_for_recovery(job_id)
+                message = (f'Job {job_id} completed recovery at '
+                           f'{datetime.now()}\n')
+                logger.info(message)
+                f.write(message)
+        f.write(f'HA recovery completed at {datetime.now()}\n')
+        f.write(f'Total recovery time: {time.time() - start} seconds\n')
+
+
+async def get_job_status(
+        backend: 'backends.CloudVmRayBackend', cluster_name: str,
+        job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
     """Check the status of the job running on a managed job cluster.
 
     It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
     FAILED_SETUP or CANCELLED.
     """
-    handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+    # TODO(luca) make this async
+    handle = await context_utils.to_thread(
+        global_user_state.get_handle_from_cluster_name, cluster_name)
     if handle is None:
         # This can happen if the cluster was preempted and background status
         # refresh already noticed and cleaned it up.
         logger.info(f'Cluster {cluster_name} not found.')
         return None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
-    status = None
+    job_ids = None if job_id is None else [job_id]
+    for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
+        try:
+            logger.info('=== Checking the job status... ===')
+            statuses = await asyncio.wait_for(
+                context_utils.to_thread(backend.get_job_status,
+                                        handle,
+                                        job_ids=job_ids,
+                                        stream_logs=False),
+                timeout=_JOB_STATUS_FETCH_TIMEOUT_SECONDS)
+            status = list(statuses.values())[0]
+            if status is None:
+                logger.info('No job found.')
+            else:
+                logger.info(f'Job status: {status}')
+            logger.info('=' * 34)
+            return status
+        except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
+                ValueError, TypeError, asyncio.TimeoutError) as e:
+            # Note: Each of these exceptions has some additional conditions to
+            # limit how we handle it and whether or not we catch it.
+            # Retry on k8s transient network errors. This is useful when using
+            # CoreWeave, which may sometimes have transient network issues.
+            is_transient_error = False
+            detailed_reason = None
+            if isinstance(e, exceptions.CommandError):
+                detailed_reason = e.detailed_reason
+                if (detailed_reason is not None and
+                        _JOB_K8S_TRANSIENT_NW_MSG in detailed_reason):
+                    is_transient_error = True
+            elif isinstance(e, grpc.RpcError):
+                detailed_reason = e.details()
+                if e.code() in [
+                        grpc.StatusCode.UNAVAILABLE,
+                        grpc.StatusCode.DEADLINE_EXCEEDED
+                ]:
+                    is_transient_error = True
+            elif isinstance(e, grpc.FutureTimeoutError):
+                detailed_reason = 'Timeout'
+            elif isinstance(e, asyncio.TimeoutError):
+                detailed_reason = ('Job status check timed out after '
+                                   f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
+            # TODO(cooperc): Gracefully handle these exceptions in the backend.
+            elif isinstance(e, ValueError):
+                # If the cluster yaml is deleted in the middle of getting the
+                # SSH credentials, we could see this. See
+                # sky/global_user_state.py get_cluster_yaml_dict.
+                if re.search(r'Cluster yaml .* not found', str(e)):
+                    detailed_reason = 'Cluster yaml was deleted'
+                else:
+                    raise
+            elif isinstance(e, TypeError):
+                # We will grab the SSH credentials from the cluster yaml, but if
+                # handle.cluster_yaml is None, we will just return an empty dict
+                # for the credentials. See
+                # backend_utils.ssh_credential_from_yaml. Then, the credentials
+                # are passed as kwargs to SSHCommandRunner.__init__ - see
+                # cloud_vm_ray_backend.get_command_runners. So we can hit this
+                # TypeError if the cluster yaml is removed from the handle right
+                # when we pull it, before the cluster is fully deleted.
+                error_msg_to_check = (
+                    'SSHCommandRunner.__init__() missing 2 required positional '
+                    'arguments: \'ssh_user\' and \'ssh_private_key\'')
+                if str(e) == error_msg_to_check:
+                    detailed_reason = 'SSH credentials were already cleaned up'
+                else:
+                    raise
+            if is_transient_error:
+                logger.info('Failed to connect to the cluster. Retrying '
+                            f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
+                logger.info('=' * 34)
+                await asyncio.sleep(1)
+            else:
+                logger.info(f'Failed to get job status: {detailed_reason}')
+                logger.info('=' * 34)
+                return None
+    return None
+
+
+def controller_process_alive(record: managed_job_state.ControllerPidRecord,
+                             legacy_job_id: Optional[int] = None,
+                             quiet: bool = True) -> bool:
+    """Check if the controller process is alive.
+
+    If legacy_job_id is provided, this will also return True for a legacy
+    single-job controller process with that job id, based on the cmdline. This
+    is how the old check worked before #7051.
+    """
     try:
-        logger.info('=== Checking the job status... ===')
-        statuses = backend.get_job_status(handle, stream_logs=False)
-        status = list(statuses.values())[0]
-        if status is None:
-            logger.info('No job found.')
+        process = psutil.Process(record.pid)
+
+        if record.started_at is not None:
+            if process.create_time() != record.started_at:
+                if not quiet:
+                    logger.debug(f'Controller process {record.pid} has started '
+                                 f'at {record.started_at} but process has '
+                                 f'started at {process.create_time()}')
+                return False
         else:
-            logger.info(f'Job status: {status}')
-    except exceptions.CommandError:
-        logger.info('Failed to connect to the cluster.')
-    logger.info('=' * 34)
-    return status
+            # If we can't check the create_time, check the cmdline instead.
+            cmd_str = ' '.join(process.cmdline())
+            # pylint: disable=line-too-long
+            # Pre-#7051 cmdline: /path/to/python -u -m sky.jobs.controller <dag.yaml_path> --job-id <job_id>
+            # Post-#7051 cmdline: /path/to/python -u -msky.jobs.controller
+            # pylint: enable=line-too-long
+            if ('-m sky.jobs.controller' not in cmd_str and
+                    '-msky.jobs.controller' not in cmd_str):
+                if not quiet:
+                    logger.debug(f'Process {record.pid} is not a controller '
+                                 'process - missing "-m sky.jobs.controller" '
+                                 f'from cmdline: {cmd_str}')
+                return False
+            if (legacy_job_id is not None and '--job-id' in cmd_str and
+                    f'--job-id {legacy_job_id}' not in cmd_str):
+                if not quiet:
+                    logger.debug(f'Controller process {record.pid} has the '
+                                 f'wrong --job-id (expected {legacy_job_id}) '
+                                 f'in cmdline: {cmd_str}')
+                return False
+
+        # On Linux, psutil.Process(pid) will return a valid process object
+        # even if the pid is actually a thread ID within the process. This
+        # hugely inflates the number of valid-looking pids, increasing the
+        # chance that we will falsely believe a controller is alive. The pid
+        # file should never contain thread IDs, just process IDs. We can
+        # check this with psutil.pid_exists(pid), which is false for TIDs.
+        # See pid_exists in psutil/_pslinux.py.
+        if not psutil.pid_exists(record.pid):
+            if not quiet:
+                logger.debug(
+                    f'Controller process {record.pid} is not a valid '
+                    'process id.')
+            return False
 
+        return process.is_running()
 
-def _controller_process_alive(pid: int, job_id: int) -> bool:
-    """Check if the controller process is alive."""
-    try:
-        process = psutil.Process(pid)
-        # The last two args of the command line should be --job-id <id>
-        job_args = process.cmdline()[-2:]
-        return process.is_running() and job_args == ['--job-id', str(job_id)]
-    except psutil.NoSuchProcess:
+    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess,
+            OSError) as e:
+        if not quiet:
+            logger.debug(f'Controller process {record.pid} is not running: {e}')
         return False
 
 
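The new controller_process_alive above combines three liveness signals: the process must exist, its create time must match the recorded start time (guarding against PID reuse), and psutil.pid_exists must accept the pid so that Linux thread IDs are not mistaken for controller processes. Below is a minimal standalone sketch of that idea, assuming only that psutil is installed; pid_alive and expected_start_time are illustrative names, not SkyPilot APIs.

import subprocess

import psutil


def pid_alive(pid: int, expected_start_time: float) -> bool:
    """Return True only if `pid` is running and started when we recorded it.

    Comparing psutil's create_time() against the timestamp captured at launch
    protects against PID reuse: a different process that later receives the
    same PID will report a different create time.
    """
    try:
        # pid_exists() is False for thread IDs on Linux, filtering out TIDs
        # that psutil.Process() would otherwise happily wrap.
        if not psutil.pid_exists(pid):
            return False
        proc = psutil.Process(pid)
        return proc.create_time() == expected_start_time and proc.is_running()
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return False


if __name__ == '__main__':
    child = subprocess.Popen(['sleep', '2'])
    started_at = psutil.Process(child.pid).create_time()
    print(pid_alive(child.pid, started_at))  # True while the child runs
    child.wait()
    print(pid_alive(child.pid, started_at))  # False once it has exited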
@@ -173,6 +500,17 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
     Note: we expect that job_id, if provided, refers to a nonterminal job or a
     job that has not completed its cleanup (schedule state not DONE).
     """
+    # This signal file suggests that the controller is recovering from a
+    # failure. See sky/templates/kubernetes-ray.yml.j2 for more details.
+    # When restarting the controller processes, we don't want this event to
+    # set the job status to FAILED_CONTROLLER.
+    # TODO(tian): Change this to restart the controller process. For now we
+    # disable it during recovery to avoid the caveat of infinitely restarting
+    # a controller process that has fully occupied the controller VM.
+    if os.path.exists(
+            os.path.expanduser(
+                constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
+        return
 
     def _cleanup_job_clusters(job_id: int) -> Optional[str]:
         """Clean up clusters for a job. Returns error message if any.
@@ -181,15 +519,22 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
         capture the error message, and log/return it.
         """
         error_msg = None
-        tasks = managed_job_state.get_managed_jobs(job_id)
+        tasks = managed_job_state.get_managed_job_tasks(job_id)
         for task in tasks:
-            task_name = task['job_name']
-            cluster_name = generate_managed_job_cluster_name(task_name, job_id)
+            pool = task.get('pool', None)
+            if pool is None:
+                task_name = task['job_name']
+                cluster_name = generate_managed_job_cluster_name(
+                    task_name, job_id)
+            else:
+                cluster_name, _ = (
+                    managed_job_state.get_pool_submit_info(job_id))
             handle = global_user_state.get_handle_from_cluster_name(
                 cluster_name)
             if handle is not None:
                 try:
-                    terminate_cluster(cluster_name)
+                    if pool is None:
+                        terminate_cluster(cluster_name)
                 except Exception as e:  # pylint: disable=broad-except
                     error_msg = (
                         f'Failed to terminate cluster {cluster_name}: '
@@ -197,43 +542,6 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
                     logger.exception(error_msg, exc_info=e)
         return error_msg
 
-    # For backwards compatible jobs
-    # TODO(cooperc): Remove before 0.11.0.
-    def _handle_legacy_job(job_id: int):
-        controller_status = job_lib.get_status(job_id)
-        if controller_status is None or controller_status.is_terminal():
-            logger.error(f'Controller process for legacy job {job_id} is '
-                         'in an unexpected state.')
-
-            cleanup_error = _cleanup_job_clusters(job_id)
-            if cleanup_error:
-                # Unconditionally set the job to failed_controller if the
-                # cleanup fails.
-                managed_job_state.set_failed(
-                    job_id,
-                    task_id=None,
-                    failure_type=managed_job_state.ManagedJobStatus.
-                    FAILED_CONTROLLER,
-                    failure_reason=
-                    'Legacy controller process has exited abnormally, and '
-                    f'cleanup failed: {cleanup_error}. For more details, run: '
-                    f'sky jobs logs --controller {job_id}',
-                    override_terminal=True)
-                return
-
-            # It's possible for the job to have transitioned to
-            # another terminal state while between when we checked its
-            # state and now. In that case, set_failed won't do
-            # anything, which is fine.
-            managed_job_state.set_failed(
-                job_id,
-                task_id=None,
-                failure_type=managed_job_state.ManagedJobStatus.
-                FAILED_CONTROLLER,
-                failure_reason=(
-                    'Legacy controller process has exited abnormally. For '
-                    f'more details, run: sky jobs logs --controller {job_id}'))
-
 
     # Get jobs that need checking (non-terminal or not DONE)
     job_ids = managed_job_state.get_jobs_to_check_status(job_id)
@@ -242,29 +550,23 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
         return
 
     for job_id in job_ids:
-        tasks = managed_job_state.get_managed_jobs(job_id)
+        assert job_id is not None
+        tasks = managed_job_state.get_managed_job_tasks(job_id)
         # Note: controller_pid and schedule_state are in the job_info table
         # which is joined to the spot table, so all tasks with the same job_id
         # will have the same value for these columns. This is what lets us just
         # take tasks[0]['controller_pid'] and tasks[0]['schedule_state'].
         schedule_state = tasks[0]['schedule_state']
 
-        # Backwards compatibility: this job was submitted when ray was still
-        # used for managing the parallelism of job controllers, before #4485.
-        # TODO(cooperc): Remove before 0.11.0.
-        if (schedule_state is
-                managed_job_state.ManagedJobScheduleState.INVALID):
-            _handle_legacy_job(job_id)
-            continue
-
         # Handle jobs with schedule state (non-legacy jobs):
         pid = tasks[0]['controller_pid']
+        pid_started_at = tasks[0].get('controller_pid_started_at')
         if schedule_state == managed_job_state.ManagedJobScheduleState.DONE:
             # There are two cases where we could get a job that is DONE.
             # 1. At query time (get_jobs_to_check_status), the job was not yet
-            # DONE, but since then (before get_managed_jobs is called) it has
-            # hit a terminal status, marked itself done, and exited. This is
-            # fine.
+            # DONE, but since then (before get_managed_job_tasks is called)
+            # it has hit a terminal status, marked itself done, and exited.
+            # This is fine.
             # 2. The job is DONE, but in a non-terminal status. This is
             # unexpected. For instance, the task status is RUNNING, but the
             # job schedule_state is DONE.
@@ -311,7 +613,9 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
             failure_reason = f'No controller pid set for {schedule_state.value}'
         else:
             logger.debug(f'Checking controller pid {pid}')
-            if _controller_process_alive(pid, job_id):
+            if controller_process_alive(
+                    managed_job_state.ControllerPidRecord(
+                        pid=pid, started_at=pid_started_at), job_id):
                 # The controller is still running, so this job is fine.
                 continue
 
@@ -369,11 +673,34 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
 
 
 def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
-                      get_end_time: bool) -> float:
+                      job_id: Optional[int], get_end_time: bool) -> float:
     """Get the submitted/ended time of the job."""
-    code = job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
-        job_id=None, get_ended_time=get_end_time)
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+    assert handle is not None, (
+        f'handle for cluster {cluster_name!r} should not be None')
+    if handle.is_grpc_enabled_with_flag:
+        try:
+            if get_end_time:
+                end_ts_request = jobsv1_pb2.GetJobEndedTimestampRequest(
+                    job_id=job_id)
+                end_ts_response = backend_utils.invoke_skylet_with_retries(
+                    lambda: cloud_vm_ray_backend.SkyletClient(
+                        handle.get_grpc_channel()).get_job_ended_timestamp(
+                            end_ts_request))
+                return end_ts_response.timestamp
+            else:
+                submit_ts_request = jobsv1_pb2.GetJobSubmittedTimestampRequest(
+                    job_id=job_id)
+                submit_ts_response = backend_utils.invoke_skylet_with_retries(
+                    lambda: cloud_vm_ray_backend.SkyletClient(
+                        handle.get_grpc_channel()).get_job_submitted_timestamp(
+                            submit_ts_request))
+                return submit_ts_response.timestamp
+        except exceptions.SkyletMethodNotImplementedError:
+            pass
+
+    code = (job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
+        job_id=job_id, get_ended_time=get_end_time))
     returncode, stdout, stderr = backend.run_on_head(handle,
                                                      code,
                                                      stream_logs=False,
@@ -386,16 +713,24 @@ def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
386
713
 
387
714
 
388
715
  def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
389
- cluster_name: str) -> float:
716
+ cluster_name: str, job_id: Optional[int]) -> float:
390
717
  """Try to get the end time of the job.
391
718
 
392
719
  If the job is preempted or we can't connect to the instance for whatever
393
720
  reason, fall back to the current time.
394
721
  """
395
722
  try:
396
- return get_job_timestamp(backend, cluster_name, get_end_time=True)
397
- except exceptions.CommandError as e:
398
- if e.returncode == 255:
723
+ return get_job_timestamp(backend,
724
+ cluster_name,
725
+ job_id=job_id,
726
+ get_end_time=True)
727
+ except (exceptions.CommandError, grpc.RpcError,
728
+ grpc.FutureTimeoutError) as e:
729
+ if isinstance(e, exceptions.CommandError) and e.returncode == 255 or \
730
+ (isinstance(e, grpc.RpcError) and e.code() in [
731
+ grpc.StatusCode.UNAVAILABLE,
732
+ grpc.StatusCode.DEADLINE_EXCEEDED,
733
+ ]) or isinstance(e, grpc.FutureTimeoutError):
399
734
  # Failed to connect - probably the instance was preempted since the
400
735
  # job completed. We shouldn't crash here, so just log and use the
401
736
  # current time.
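
The widened `except` clause above treats an SSH exit code of 255, gRPC `UNAVAILABLE`/`DEADLINE_EXCEEDED`, and a gRPC future timeout as the same condition: the instance is unreachable. A hedged sketch of pulling that classification into a single predicate (the helper name is illustrative):

    import grpc


    def is_transient_connection_error(exc: BaseException) -> bool:
        """Heuristic: does this error mean the VM could not be reached?"""
        # SSH exits with 255 when the connection itself fails (for example the
        # instance was preempted), as opposed to the remote command failing.
        if getattr(exc, 'returncode', None) == 255:
            return True
        if isinstance(exc, grpc.RpcError):
            return exc.code() in (grpc.StatusCode.UNAVAILABLE,
                                  grpc.StatusCode.DEADLINE_EXCEEDED)
        return isinstance(exc, grpc.FutureTimeoutError)
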
@@ -407,7 +742,9 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
407
742
  raise
408
743
 
409
744
 
410
- def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
745
+ def event_callback_func(
746
+ job_id: int, task_id: Optional[int],
747
+ task: Optional['sky.Task']) -> managed_job_state.AsyncCallbackType:
411
748
  """Run event callback for the task."""
412
749
 
413
750
  def callback_func(status: str):
@@ -415,8 +752,12 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
415
752
  if event_callback is None or task is None:
416
753
  return
417
754
  event_callback = event_callback.strip()
418
- cluster_name = generate_managed_job_cluster_name(
419
- task.name, job_id) if task.name else None
755
+ pool = managed_job_state.get_pool_from_job_id(job_id)
756
+ if pool is not None:
757
+ cluster_name, _ = (managed_job_state.get_pool_submit_info(job_id))
758
+ else:
759
+ cluster_name = generate_managed_job_cluster_name(
760
+ task.name, job_id) if task.name else None
420
761
  logger.info(f'=== START: event callback for {status!r} ===')
421
762
  log_path = os.path.join(constants.SKY_LOGS_DIRECTORY,
422
763
  'managed_job_event',
@@ -442,7 +783,10 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
442
783
  f'Bash:{event_callback},log_path:{log_path},result:{result}')
443
784
  logger.info(f'=== END: event callback for {status!r} ===')
444
785
 
445
- return callback_func
786
+ async def async_callback_func(status: str):
787
+ return await context_utils.to_thread(callback_func, status)
788
+
789
+ return async_callback_func
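
`callback_func` stays synchronous (it shells out to the user's event callback), so the new wrapper pushes it onto a worker thread before returning an awaitable. A self-contained sketch of the same pattern using the standard library's `asyncio.to_thread`, which `context_utils.to_thread` is assumed to resemble; the environment variable name below is illustrative:

    import asyncio
    import subprocess


    def run_hook(command: str, status: str) -> int:
        """Blocking helper: run a user-provided shell hook for a status change."""
        completed = subprocess.run(command,
                                   shell=True,
                                   env={'JOB_STATUS': status},  # illustrative
                                   check=False)
        return completed.returncode


    async def run_hook_async(command: str, status: str) -> int:
        # Off-load the blocking subprocess call so the controller's event loop
        # stays responsive while the hook runs.
        return await asyncio.to_thread(run_hook, command, status)


    if __name__ == '__main__':
        print(asyncio.run(run_hook_async('echo "status=$JOB_STATUS"', 'RUNNING')))
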
446
790
 
447
791
 
448
792
  # ======== user functions ========
@@ -461,20 +805,24 @@ def generate_managed_job_cluster_name(task_name: str, job_id: int) -> str:
461
805
 
462
806
 
463
807
  def cancel_jobs_by_id(job_ids: Optional[List[int]],
464
- all_users: bool = False) -> str:
808
+ all_users: bool = False,
809
+ current_workspace: Optional[str] = None,
810
+ user_hash: Optional[str] = None) -> str:
465
811
  """Cancel jobs by id.
466
812
 
467
813
  If job_ids is None, cancel all jobs.
468
814
  """
469
815
  if job_ids is None:
470
816
  job_ids = managed_job_state.get_nonterminal_job_ids_by_name(
471
- None, all_users)
817
+ None, user_hash, all_users)
472
818
  job_ids = list(set(job_ids))
473
819
  if not job_ids:
474
820
  return 'No job to cancel.'
475
- job_id_str = ', '.join(map(str, job_ids))
476
- logger.info(f'Cancelling jobs {job_id_str}.')
821
+ if current_workspace is None:
822
+ current_workspace = constants.SKYPILOT_DEFAULT_WORKSPACE
823
+
477
824
  cancelled_job_ids: List[int] = []
825
+ wrong_workspace_job_ids: List[int] = []
478
826
  for job_id in job_ids:
479
827
  # Check the status of the managed job status. If it is in
480
828
  # terminal state, we can safely skip it.
@@ -486,30 +834,70 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]],
486
834
  logger.info(f'Job {job_id} is already in terminal state '
487
835
  f'{job_status.value}. Skipped.')
488
836
  continue
837
+ elif job_status == managed_job_state.ManagedJobStatus.PENDING:
838
+ # the "if PENDING" is a short circuit, this will be atomic.
839
+ cancelled = managed_job_state.set_pending_cancelled(job_id)
840
+ if cancelled:
841
+ cancelled_job_ids.append(job_id)
842
+ continue
489
843
 
490
844
  update_managed_jobs_statuses(job_id)
491
845
 
492
- # Send the signal to the jobs controller.
493
- signal_file = pathlib.Path(SIGNAL_FILE_PREFIX.format(job_id))
494
- # Filelock is needed to prevent race condition between signal
495
- # check/removal and signal writing.
496
- with filelock.FileLock(str(signal_file) + '.lock'):
497
- with signal_file.open('w', encoding='utf-8') as f:
498
- f.write(UserSignal.CANCEL.value)
499
- f.flush()
846
+ job_workspace = managed_job_state.get_workspace(job_id)
847
+ if current_workspace is not None and job_workspace != current_workspace:
848
+ wrong_workspace_job_ids.append(job_id)
849
+ continue
850
+
851
+ if managed_job_state.is_legacy_controller_process(job_id):
852
+ # The job is running on a legacy single-job controller process.
853
+ # TODO(cooperc): Remove this handling for 0.13.0
854
+
855
+ # Send the signal to the jobs controller.
856
+ signal_file = (pathlib.Path(
857
+ managed_job_constants.SIGNAL_FILE_PREFIX.format(job_id)))
858
+ # Filelock is needed to prevent race condition between signal
859
+ # check/removal and signal writing.
860
+ with filelock.FileLock(str(signal_file) + '.lock'):
861
+ with signal_file.open('w', encoding='utf-8') as f:
862
+ f.write(UserSignal.CANCEL.value)
863
+ f.flush()
864
+ else:
865
+ # New controller process.
866
+ try:
867
+ signal_file = pathlib.Path(
868
+ managed_job_constants.CONSOLIDATED_SIGNAL_PATH, f'{job_id}')
869
+ signal_file.touch()
870
+ except OSError as e:
871
+ logger.error(f'Failed to cancel job {job_id}: {e}')
872
+ # Don't add it to the to be cancelled job ids
873
+ continue
874
+
500
875
  cancelled_job_ids.append(job_id)
501
876
 
877
+ wrong_workspace_job_str = ''
878
+ if wrong_workspace_job_ids:
879
+ plural = 's' if len(wrong_workspace_job_ids) > 1 else ''
880
+ plural_verb = 'are' if len(wrong_workspace_job_ids) > 1 else 'is'
881
+ wrong_workspace_job_str = (
882
+ f' Job{plural} with ID{plural}'
883
+ f' {", ".join(map(str, wrong_workspace_job_ids))} '
884
+ f'{plural_verb} skipped as they are not in the active workspace '
885
+ f'{current_workspace!r}. Check the workspace of the job with: '
886
+ f'sky jobs queue')
887
+
502
888
  if not cancelled_job_ids:
503
- return 'No job to cancel.'
889
+ return f'No job to cancel.{wrong_workspace_job_str}'
504
890
  identity_str = f'Job with ID {cancelled_job_ids[0]} is'
505
891
  if len(cancelled_job_ids) > 1:
506
892
  cancelled_job_ids_str = ', '.join(map(str, cancelled_job_ids))
507
893
  identity_str = f'Jobs with IDs {cancelled_job_ids_str} are'
508
894
 
509
- return f'{identity_str} scheduled to be cancelled.'
895
+ msg = f'{identity_str} scheduled to be cancelled.{wrong_workspace_job_str}'
896
+ return msg
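
So a legacy single-job controller is still cancelled through the filelock-guarded signal file, while the consolidated controller only needs a per-job file to be touched under `CONSOLIDATED_SIGNAL_PATH`. A rough sketch of that touch-and-poll handshake, with an illustrative signal directory (not SkyPilot's actual path):

    import pathlib

    SIGNAL_DIR = pathlib.Path('~/.jobs/signals').expanduser()  # illustrative


    def request_cancel(job_id: int) -> bool:
        """Client side: ask the controller to cancel a job."""
        try:
            SIGNAL_DIR.mkdir(parents=True, exist_ok=True)
            (SIGNAL_DIR / str(job_id)).touch()
            return True
        except OSError:
            return False


    def cancel_requested(job_id: int) -> bool:
        """Controller side: check for, and consume, the cancel signal."""
        signal_file = SIGNAL_DIR / str(job_id)
        if signal_file.exists():
            signal_file.unlink(missing_ok=True)
            return True
        return False
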
510
897
 
511
898
 
512
- def cancel_job_by_name(job_name: str) -> str:
899
+ def cancel_job_by_name(job_name: str,
900
+ current_workspace: Optional[str] = None) -> str:
513
901
  """Cancel a job by name."""
514
902
  job_ids = managed_job_state.get_nonterminal_job_ids_by_name(job_name)
515
903
  if not job_ids:
@@ -518,11 +906,30 @@ def cancel_job_by_name(job_name: str) -> str:
518
906
  return (f'{colorama.Fore.RED}Multiple running jobs found '
519
907
  f'with name {job_name!r}.\n'
520
908
  f'Job IDs: {job_ids}{colorama.Style.RESET_ALL}')
521
- cancel_jobs_by_id(job_ids)
522
- return f'Job {job_name!r} is scheduled to be cancelled.'
909
+ msg = cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
910
+ return f'{job_name!r} {msg}'
523
911
 
524
912
 
525
- def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:
913
+ def cancel_jobs_by_pool(pool_name: str,
914
+ current_workspace: Optional[str] = None) -> str:
915
+ """Cancel all jobs in a pool."""
916
+ job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(pool_name)
917
+ if not job_ids:
918
+ return f'No running job found in pool {pool_name!r}.'
919
+ return cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
920
+
921
+
922
+ def controller_log_file_for_job(job_id: int,
923
+ create_if_not_exists: bool = False) -> str:
924
+ log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
925
+ if create_if_not_exists:
926
+ os.makedirs(log_dir, exist_ok=True)
927
+ return os.path.join(log_dir, f'{job_id}.log')
928
+
929
+
930
+ def stream_logs_by_id(job_id: int,
931
+ follow: bool = True,
932
+ tail: Optional[int] = None) -> Tuple[str, int]:
526
933
  """Stream logs by job id.
527
934
 
528
935
  Returns:
@@ -552,18 +959,60 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:
552
959
  if managed_job_status.is_failed():
553
960
  job_msg = ('\nFailure reason: '
554
961
  f'{managed_job_state.get_failure_reason(job_id)}')
555
- log_file = managed_job_state.get_local_log_file(job_id, None)
556
- if log_file is not None:
557
- with open(os.path.expanduser(log_file), 'r',
558
- encoding='utf-8') as f:
559
- # Stream the logs to the console without reading the whole
560
- # file into memory.
561
- start_streaming = False
562
- for line in f:
563
- if log_lib.LOG_FILE_START_STREAMING_AT in line:
962
+ log_file_ever_existed = False
963
+ task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
964
+ job_id)
965
+ num_tasks = len(task_info)
966
+ for (task_id, task_name, task_status, log_file,
967
+ logs_cleaned_at) in task_info:
968
+ if log_file:
969
+ log_file_ever_existed = True
970
+ if logs_cleaned_at is not None:
971
+ ts_str = datetime.fromtimestamp(
972
+ logs_cleaned_at).strftime('%Y-%m-%d %H:%M:%S')
973
+ print(f'Task {task_name}({task_id}) log has been '
974
+ f'cleaned at {ts_str}.')
975
+ continue
976
+ task_str = (f'Task {task_name}({task_id})'
977
+ if task_name else f'Task {task_id}')
978
+ if num_tasks > 1:
979
+ print(f'=== {task_str} ===')
980
+ with open(os.path.expanduser(log_file),
981
+ 'r',
982
+ encoding='utf-8') as f:
983
+ # Stream the logs to the console without reading the
984
+ # whole file into memory.
985
+ start_streaming = False
986
+ read_from: Union[TextIO, Deque[str]] = f
987
+ if tail is not None:
988
+ assert tail > 0
989
+ # Read only the last 'tail' lines using deque
990
+ read_from = collections.deque(f, maxlen=tail)
991
+ # We set start_streaming to True here in case
992
+ # truncating the log file removes the line that
993
+ # contains LOG_FILE_START_STREAMING_AT. This does
994
+ # not cause issues for log files shorter than tail
995
+ # because tail_logs in sky/skylet/log_lib.py also
996
+ # handles LOG_FILE_START_STREAMING_AT.
564
997
  start_streaming = True
565
- if start_streaming:
566
- print(line, end='', flush=True)
998
+ for line in read_from:
999
+ if log_lib.LOG_FILE_START_STREAMING_AT in line:
1000
+ start_streaming = True
1001
+ if start_streaming:
1002
+ print(line, end='', flush=True)
1003
+ if num_tasks > 1:
1004
+ # Add the "Task finished" message for terminal states
1005
+ if task_status.is_terminal():
1006
+ print(ux_utils.finishing_message(
1007
+ f'{task_str} finished '
1008
+ f'(status: {task_status.value}).'),
1009
+ flush=True)
1010
+ if log_file_ever_existed:
1011
+ # Add the "Job finished" message for terminal states
1012
+ if managed_job_status.is_terminal():
1013
+ print(ux_utils.finishing_message(
1014
+ f'Job finished (status: {managed_job_status.value}).'),
1015
+ flush=True)
567
1016
  return '', exceptions.JobExitCode.from_managed_job_status(
568
1017
  managed_job_status)
569
1018
  return (f'{colorama.Fore.YELLOW}'
@@ -585,12 +1034,19 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:
585
1034
 
586
1035
  while should_keep_logging(managed_job_status):
587
1036
  handle = None
1037
+ job_id_to_tail = None
588
1038
  if task_id is not None:
589
- task_name = managed_job_state.get_task_name(job_id, task_id)
590
- cluster_name = generate_managed_job_cluster_name(
591
- task_name, job_id)
592
- handle = global_user_state.get_handle_from_cluster_name(
593
- cluster_name)
1039
+ pool = managed_job_state.get_pool_from_job_id(job_id)
1040
+ if pool is not None:
1041
+ cluster_name, job_id_to_tail = (
1042
+ managed_job_state.get_pool_submit_info(job_id))
1043
+ else:
1044
+ task_name = managed_job_state.get_task_name(job_id, task_id)
1045
+ cluster_name = generate_managed_job_cluster_name(
1046
+ task_name, job_id)
1047
+ if cluster_name is not None:
1048
+ handle = global_user_state.get_handle_from_cluster_name(
1049
+ cluster_name)
594
1050
 
595
1051
  # Check the handle: The cluster can be preempted and removed from
596
1052
  # the table before the managed job state is updated by the
@@ -620,10 +1076,12 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:
620
1076
  managed_job_state.ManagedJobStatus.RUNNING)
621
1077
  assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
622
1078
  status_display.stop()
1079
+ tail_param = tail if tail is not None else 0
623
1080
  returncode = backend.tail_logs(handle,
624
- job_id=None,
1081
+ job_id=job_id_to_tail,
625
1082
  managed_job_id=job_id,
626
- follow=follow)
1083
+ follow=follow,
1084
+ tail=tail_param)
627
1085
  if returncode in [rc.value for rc in exceptions.JobExitCode]:
628
1086
  # If the log tailing exits with a known exit code we can safely
629
1087
  # break the loop because it indicates the tailing process
@@ -760,7 +1218,8 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:
760
1218
  def stream_logs(job_id: Optional[int],
761
1219
  job_name: Optional[str],
762
1220
  controller: bool = False,
763
- follow: bool = True) -> Tuple[str, int]:
1221
+ follow: bool = True,
1222
+ tail: Optional[int] = None) -> Tuple[str, int]:
764
1223
  """Stream logs by job id or job name.
765
1224
 
766
1225
  Returns:
@@ -776,7 +1235,8 @@ def stream_logs(job_id: Optional[int],
776
1235
  if controller:
777
1236
  if job_id is None:
778
1237
  assert job_name is not None
779
- managed_jobs = managed_job_state.get_managed_jobs()
1238
+ managed_jobs, _ = managed_job_state.get_managed_jobs_with_filters(
1239
+ name_match=job_name, fields=['job_id', 'job_name', 'status'])
780
1240
  # We manually filter the jobs by name, instead of using
781
1241
  # get_nonterminal_job_ids_by_name, as with `controller=True`, we
782
1242
  # should be able to show the logs for jobs in terminal states.
@@ -799,9 +1259,7 @@ def stream_logs(job_id: Optional[int],
799
1259
  job_id = managed_job_ids.pop()
800
1260
  assert job_id is not None, (job_id, job_name)
801
1261
 
802
- controller_log_path = os.path.join(
803
- os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
804
- f'{job_id}.log')
1262
+ controller_log_path = controller_log_file_for_job(job_id)
805
1263
  job_status = None
806
1264
 
807
1265
  # Wait for the log file to be written
@@ -831,7 +1289,12 @@ def stream_logs(job_id: Optional[int],
831
1289
  with open(controller_log_path, 'r', newline='', encoding='utf-8') as f:
832
1290
  # Note: we do not need to care about start_stream_at here, since
833
1291
  # that should be in the job log printed above.
834
- for line in f:
1292
+ read_from: Union[TextIO, Deque[str]] = f
1293
+ if tail is not None:
1294
+ assert tail > 0
1295
+ # Read only the last 'tail' lines efficiently using deque
1296
+ read_from = collections.deque(f, maxlen=tail)
1297
+ for line in read_from:
835
1298
  print(line, end='')
836
1299
  # Flush.
837
1300
  print(end='', flush=True)
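
Both the per-task log streaming above and this controller-log path use the same trick for tailing: wrap the open file in `collections.deque(f, maxlen=tail)`, which iterates the file once but only keeps the newest `tail` lines in memory. A standalone sketch:

    import collections
    from typing import List


    def tail_lines(path: str, tail: int = 0) -> List[str]:
        """Return the last `tail` lines of a text file (all lines if tail <= 0)."""
        with open(path, 'r', encoding='utf-8') as f:
            if tail > 0:
                # The deque drops older lines as newer ones arrive, so memory
                # use is bounded by `tail` regardless of the file size.
                return list(collections.deque(f, maxlen=tail))
            return list(f)


    # Example: print the last 100 lines of a controller log.
    # for line in tail_lines('/tmp/controller.log', tail=100):
    #     print(line, end='')
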
@@ -883,61 +1346,384 @@ def stream_logs(job_id: Optional[int],
883
1346
  f'Multiple running jobs found with name {job_name!r}.')
884
1347
  job_id = job_ids[0]
885
1348
 
886
- return stream_logs_by_id(job_id, follow)
1349
+ return stream_logs_by_id(job_id, follow, tail)
1350
+
1351
+
1352
+ def dump_managed_job_queue(
1353
+ skip_finished: bool = False,
1354
+ accessible_workspaces: Optional[List[str]] = None,
1355
+ job_ids: Optional[List[int]] = None,
1356
+ workspace_match: Optional[str] = None,
1357
+ name_match: Optional[str] = None,
1358
+ pool_match: Optional[str] = None,
1359
+ page: Optional[int] = None,
1360
+ limit: Optional[int] = None,
1361
+ user_hashes: Optional[List[Optional[str]]] = None,
1362
+ statuses: Optional[List[str]] = None,
1363
+ fields: Optional[List[str]] = None,
1364
+ ) -> str:
1365
+ return message_utils.encode_payload(
1366
+ get_managed_job_queue(skip_finished, accessible_workspaces, job_ids,
1367
+ workspace_match, name_match, pool_match, page,
1368
+ limit, user_hashes, statuses, fields))
887
1369
 
888
1370
 
889
- def dump_managed_job_queue() -> str:
890
- jobs = managed_job_state.get_managed_jobs()
1371
+ def _update_fields(fields: List[str],) -> Tuple[List[str], bool]:
1372
+ """Update the fields list to include the necessary fields.
1373
+
1374
+ Args:
1375
+ fields: The fields to update.
1376
+
1377
+ It will:
1378
+ - Add the necessary dependent fields to the list.
1379
+ - Remove the fields that are not in the DB.
1380
+ - Determine if cluster handle is required.
1381
+
1382
+ Returns:
1383
+ A tuple containing the updated fields and a boolean indicating if
1384
+ cluster handle is required.
1385
+ """
1386
+ cluster_handle_required = True
1387
+ if _cluster_handle_not_required(fields):
1388
+ cluster_handle_required = False
1389
+ # Copy the list to avoid modifying the original list
1390
+ new_fields = fields.copy()
1391
+ # status and job_id are always included
1392
+ if 'status' not in new_fields:
1393
+ new_fields.append('status')
1394
+ if 'job_id' not in new_fields:
1395
+ new_fields.append('job_id')
1396
+ # user_hash is required if user_name is present
1397
+ if 'user_name' in new_fields and 'user_hash' not in new_fields:
1398
+ new_fields.append('user_hash')
1399
+ if 'job_duration' in new_fields:
1400
+ if 'last_recovered_at' not in new_fields:
1401
+ new_fields.append('last_recovered_at')
1402
+ if 'end_at' not in new_fields:
1403
+ new_fields.append('end_at')
1404
+ if 'job_name' in new_fields and 'task_name' not in new_fields:
1405
+ new_fields.append('task_name')
1406
+ if 'details' in new_fields:
1407
+ if 'schedule_state' not in new_fields:
1408
+ new_fields.append('schedule_state')
1409
+ if 'priority' not in new_fields:
1410
+ new_fields.append('priority')
1411
+ if 'failure_reason' not in new_fields:
1412
+ new_fields.append('failure_reason')
1413
+ if 'user_yaml' in new_fields:
1414
+ if 'original_user_yaml_path' not in new_fields:
1415
+ new_fields.append('original_user_yaml_path')
1416
+ if 'original_user_yaml_content' not in new_fields:
1417
+ new_fields.append('original_user_yaml_content')
1418
+ if cluster_handle_required:
1419
+ if 'task_name' not in new_fields:
1420
+ new_fields.append('task_name')
1421
+ if 'current_cluster_name' not in new_fields:
1422
+ new_fields.append('current_cluster_name')
1423
+ # Remove _NON_DB_FIELDS
1424
+ # These fields have been mapped to the DB fields in the above code, so we
1425
+ # don't need to include them in the updated fields.
1426
+ for field in _NON_DB_FIELDS:
1427
+ if field in new_fields:
1428
+ new_fields.remove(field)
1429
+ return new_fields, cluster_handle_required
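
A simplified, self-contained illustration of the dependency expansion that `_update_fields` performs; the real `_NON_DB_FIELDS` and `_CLUSTER_HANDLE_FIELDS` lists live elsewhere in this module, so the lists below are assumptions for the sketch only:

    from typing import Dict, List

    # Assumed, simplified stand-ins for the module-level constants.
    _DEPENDENCIES: Dict[str, List[str]] = {
        'user_name': ['user_hash'],
        'job_duration': ['last_recovered_at', 'end_at'],
        'job_name': ['task_name'],
    }
    _ALWAYS_INCLUDED = ['status', 'job_id']
    _DERIVED_ONLY = ['user_name', 'job_duration', 'job_name']  # not DB columns


    def expand_fields(fields: List[str]) -> List[str]:
        expanded = list(fields)
        for required in _ALWAYS_INCLUDED:
            if required not in expanded:
                expanded.append(required)
        for field, deps in _DEPENDENCIES.items():
            if field in fields:
                expanded.extend(dep for dep in deps if dep not in expanded)
        # Derived fields are computed from DB columns, so they are dropped
        # before the query is built.
        return [f for f in expanded if f not in _DERIVED_ONLY]


    print(expand_fields(['user_name', 'job_duration']))
    # -> ['status', 'job_id', 'user_hash', 'last_recovered_at', 'end_at']
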
1430
+
1431
+
1432
+ def _cluster_handle_not_required(fields: List[str]) -> bool:
1433
+ """Determine if cluster handle is not required.
1434
+
1435
+ Args:
1436
+ fields: The fields to check if they contain any of the cluster handle
1437
+ fields.
1438
+
1439
+ Returns:
1440
+ True if the fields do not contain any of the cluster handle fields,
1441
+ False otherwise.
1442
+ """
1443
+ return not any(field in fields for field in _CLUSTER_HANDLE_FIELDS)
1444
+
1445
+
1446
+ def get_managed_job_queue(
1447
+ skip_finished: bool = False,
1448
+ accessible_workspaces: Optional[List[str]] = None,
1449
+ job_ids: Optional[List[int]] = None,
1450
+ workspace_match: Optional[str] = None,
1451
+ name_match: Optional[str] = None,
1452
+ pool_match: Optional[str] = None,
1453
+ page: Optional[int] = None,
1454
+ limit: Optional[int] = None,
1455
+ user_hashes: Optional[List[Optional[str]]] = None,
1456
+ statuses: Optional[List[str]] = None,
1457
+ fields: Optional[List[str]] = None,
1458
+ ) -> Dict[str, Any]:
1459
+ """Get the managed job queue.
1460
+
1461
+ Args:
1462
+ skip_finished: Whether to skip finished jobs.
1463
+ accessible_workspaces: The accessible workspaces.
1464
+ job_ids: The job ids.
1465
+ workspace_match: The workspace name to match.
1466
+ name_match: The job name to match.
1467
+ pool_match: The pool name to match.
1468
+ page: The page number.
1469
+ limit: The limit number.
1470
+ user_hashes: The user hashes.
1471
+ statuses: The statuses.
1472
+ fields: The fields to include in the response.
1473
+
1474
+ Returns:
1475
+ A dictionary containing the managed job queue.
1476
+ """
1477
+ cluster_handle_required = True
1478
+ updated_fields = None
1479
+ # The caller only needs to specify the fields in the
1481
+ # `class ManagedJobRecord` in `response.py`; the `_update_fields`
1482
+ # function will add the necessary dependent fields to the list. For
1483
+ # example, if the caller specifies `['user_name']`, `_update_fields`
1484
+ # will add `'user_hash'` to the list.
1484
+ if fields:
1485
+ updated_fields, cluster_handle_required = _update_fields(fields)
1486
+
1487
+ total_no_filter = managed_job_state.get_managed_jobs_total()
1488
+
1489
+ status_counts = managed_job_state.get_status_count_with_filters(
1490
+ fields=fields,
1491
+ job_ids=job_ids,
1492
+ accessible_workspaces=accessible_workspaces,
1493
+ workspace_match=workspace_match,
1494
+ name_match=name_match,
1495
+ pool_match=pool_match,
1496
+ user_hashes=user_hashes,
1497
+ skip_finished=skip_finished,
1498
+ )
1499
+
1500
+ jobs, total = managed_job_state.get_managed_jobs_with_filters(
1501
+ fields=updated_fields,
1502
+ job_ids=job_ids,
1503
+ accessible_workspaces=accessible_workspaces,
1504
+ workspace_match=workspace_match,
1505
+ name_match=name_match,
1506
+ pool_match=pool_match,
1507
+ user_hashes=user_hashes,
1508
+ statuses=statuses,
1509
+ skip_finished=skip_finished,
1510
+ page=page,
1511
+ limit=limit,
1512
+ )
1513
+
1514
+ if cluster_handle_required:
1515
+ # Fetch the cluster name to handle map for managed clusters only.
1516
+ cluster_name_to_handle = (
1517
+ global_user_state.get_cluster_name_to_handle_map(is_managed=True))
1518
+
1519
+ highest_blocking_priority = constants.MIN_PRIORITY
1520
+ if not fields or 'details' in fields:
1521
+ # Figure out what the highest priority blocking job is. We need to know
1522
+ # this in order to determine if other jobs are blocked by a higher-priority
1523
+ # job, or just by the limited controller resources.
1524
+ highest_blocking_priority = (
1525
+ managed_job_state.get_managed_jobs_highest_priority())
891
1526
 
892
1527
  for job in jobs:
893
- end_at = job['end_at']
894
- if end_at is None:
895
- end_at = time.time()
896
-
897
- job_submitted_at = job['last_recovered_at'] - job['job_duration']
898
- if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
899
- # When job is recovering, the duration is exact job['job_duration']
900
- job_duration = job['job_duration']
901
- elif job_submitted_at > 0:
902
- job_duration = end_at - job_submitted_at
903
- else:
904
- # When job_start_at <= 0, that means the last_recovered_at is not
905
- # set yet, i.e. the job is not started.
906
- job_duration = 0
907
- job['job_duration'] = job_duration
1528
+ if not fields or 'job_duration' in fields:
1529
+ end_at = job['end_at']
1530
+ if end_at is None:
1531
+ end_at = time.time()
1532
+
1533
+ job_submitted_at = job['last_recovered_at'] - job['job_duration']
1534
+ if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
1535
+ # When job is recovering, the duration is exact
1536
+ # job['job_duration']
1537
+ job_duration = job['job_duration']
1538
+ elif job_submitted_at > 0:
1539
+ job_duration = end_at - job_submitted_at
1540
+ else:
1541
+ # When job_start_at <= 0, that means the last_recovered_at
1542
+ # is not set yet, i.e. the job is not started.
1543
+ job_duration = 0
1544
+ job['job_duration'] = job_duration
908
1545
  job['status'] = job['status'].value
909
- job['schedule_state'] = job['schedule_state'].value
910
-
911
- cluster_name = generate_managed_job_cluster_name(
912
- job['task_name'], job['job_id'])
913
- handle = global_user_state.get_handle_from_cluster_name(cluster_name)
914
- if handle is not None:
915
- assert isinstance(handle, backends.CloudVmRayResourceHandle)
916
- job['cluster_resources'] = (
917
- f'{handle.launched_nodes}x {handle.launched_resources}')
918
- job['region'] = handle.launched_resources.region
1546
+ if not fields or 'schedule_state' in fields:
1547
+ job['schedule_state'] = job['schedule_state'].value
919
1548
  else:
920
- # FIXME(zongheng): display the last cached values for these.
921
- job['cluster_resources'] = '-'
922
- job['region'] = '-'
1549
+ job['schedule_state'] = None
923
1550
 
924
- return message_utils.encode_payload(jobs)
1551
+ if cluster_handle_required:
1552
+ cluster_name = job.get('current_cluster_name', None)
1553
+ if cluster_name is None:
1554
+ cluster_name = generate_managed_job_cluster_name(
1555
+ job['task_name'], job['job_id'])
1556
+ handle = cluster_name_to_handle.get(
1557
+ cluster_name, None) if cluster_name is not None else None
1558
+ if isinstance(handle, backends.CloudVmRayResourceHandle):
1559
+ resources_str_simple, resources_str_full = (
1560
+ resources_utils.get_readable_resources_repr(
1561
+ handle, simplified_only=False))
1562
+ assert resources_str_full is not None
1563
+ job['cluster_resources'] = resources_str_simple
1564
+ job['cluster_resources_full'] = resources_str_full
1565
+ job['cloud'] = str(handle.launched_resources.cloud)
1566
+ job['region'] = handle.launched_resources.region
1567
+ job['zone'] = handle.launched_resources.zone
1568
+ job['infra'] = infra_utils.InfraInfo(
1569
+ str(handle.launched_resources.cloud),
1570
+ handle.launched_resources.region,
1571
+ handle.launched_resources.zone).formatted_str()
1572
+ job['accelerators'] = handle.launched_resources.accelerators
1573
+ else:
1574
+ # FIXME(zongheng): display the last cached values for these.
1575
+ job['cluster_resources'] = '-'
1576
+ job['cluster_resources_full'] = '-'
1577
+ job['cloud'] = '-'
1578
+ job['region'] = '-'
1579
+ job['zone'] = '-'
1580
+ job['infra'] = '-'
1581
+
1582
+ if not fields or 'details' in fields:
1583
+ # Add details about schedule state / backoff.
1584
+ state_details = None
1585
+ if job['schedule_state'] == 'ALIVE_BACKOFF':
1586
+ state_details = 'In backoff, waiting for resources'
1587
+ elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
1588
+ priority = job.get('priority')
1589
+ if (priority is not None and
1590
+ priority < highest_blocking_priority):
1591
+ # Job is lower priority than some other blocking job.
1592
+ state_details = 'Waiting for higher priority jobs to launch'
1593
+ else:
1594
+ state_details = 'Waiting for other jobs to launch'
1595
+
1596
+ if state_details and job['failure_reason']:
1597
+ job['details'] = f'{state_details} - {job["failure_reason"]}'
1598
+ elif state_details:
1599
+ job['details'] = state_details
1600
+ elif job['failure_reason']:
1601
+ job['details'] = f'Failure: {job["failure_reason"]}'
1602
+ else:
1603
+ job['details'] = None
1604
+
1605
+ return {
1606
+ 'jobs': jobs,
1607
+ 'total': total,
1608
+ 'total_no_filter': total_no_filter,
1609
+ 'status_counts': status_counts
1610
+ }
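
A hedged usage sketch of the paginated queue API (runnable only on the jobs controller, where this module and its database are available; the filter values below are purely illustrative):

    # Page 2 of non-finished RUNNING jobs whose workspace contains 'default',
    # 50 rows per page, fetching only the columns needed for a compact table.
    result = get_managed_job_queue(
        skip_finished=True,
        workspace_match='default',
        statuses=['RUNNING'],
        page=2,
        limit=50,
        fields=['job_name', 'user_name', 'status', 'job_duration'],
    )
    print(result['total'])            # jobs matching the filters, all pages
    print(result['total_no_filter'])  # every job in the DB, ignoring filters
    print(result['status_counts'])    # e.g. {'RUNNING': 12, 'PENDING': 3}
    for job in result['jobs']:
        print(job['job_id'], job['status'], job.get('job_duration'))
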
1611
+
1612
+
1613
+ def filter_jobs(
1614
+ jobs: List[Dict[str, Any]],
1615
+ workspace_match: Optional[str],
1616
+ name_match: Optional[str],
1617
+ pool_match: Optional[str],
1618
+ page: Optional[int],
1619
+ limit: Optional[int],
1620
+ user_match: Optional[str] = None,
1621
+ enable_user_match: bool = False,
1622
+ statuses: Optional[List[str]] = None,
1623
+ ) -> Tuple[List[Dict[str, Any]], int, Dict[str, int]]:
1624
+ """Filter jobs based on the given criteria.
1625
+
1626
+ Args:
1627
+ jobs: List of jobs to filter.
1628
+ workspace_match: Workspace name to filter.
1629
+ name_match: Job name to filter.
1630
+ pool_match: Pool name to filter.
1631
+ page: Page to filter.
1632
+ limit: Limit to filter.
1633
+ user_match: User name to filter.
1634
+ enable_user_match: Whether to enable user match.
1635
+ statuses: Statuses to filter.
1636
+
1637
+ Returns:
1638
+ List of filtered jobs
1639
+ Total number of jobs
1640
+ Dictionary of status counts
1641
+ """
1642
+
1643
+ # TODO(hailong): refactor the whole function including the
1644
+ # `dump_managed_job_queue()` to use DB filtering.
1645
+
1646
+ def _pattern_matches(job: Dict[str, Any], key: str,
1647
+ pattern: Optional[str]) -> bool:
1648
+ if pattern is None:
1649
+ return True
1650
+ if key not in job:
1651
+ return False
1652
+ value = job[key]
1653
+ if not value:
1654
+ return False
1655
+ return pattern in str(value)
1656
+
1657
+ def _handle_page_and_limit(
1658
+ result: List[Dict[str, Any]],
1659
+ page: Optional[int],
1660
+ limit: Optional[int],
1661
+ ) -> List[Dict[str, Any]]:
1662
+ if page is None and limit is None:
1663
+ return result
1664
+ assert page is not None and limit is not None, (page, limit)
1665
+ # page starts from 1
1666
+ start = (page - 1) * limit
1667
+ end = min(start + limit, len(result))
1668
+ return result[start:end]
925
1669
 
1670
+ status_counts: Dict[str, int] = collections.defaultdict(int)
1671
+ result = []
1672
+ checks = [
1673
+ ('workspace', workspace_match),
1674
+ ('job_name', name_match),
1675
+ ('pool', pool_match),
1676
+ ]
1677
+ if enable_user_match:
1678
+ checks.append(('user_name', user_match))
1679
+
1680
+ for job in jobs:
1681
+ if not all(
1682
+ _pattern_matches(job, key, pattern) for key, pattern in checks):
1683
+ continue
1684
+ status_counts[job['status'].value] += 1
1685
+ if statuses:
1686
+ if job['status'].value not in statuses:
1687
+ continue
1688
+ result.append(job)
1689
+
1690
+ total = len(result)
1691
+
1692
+ return _handle_page_and_limit(result, page, limit), total, status_counts
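
A self-contained toy version of the same substring-match-then-paginate behaviour, with plain string statuses so the sketch runs on its own (the real function also tallies `status_counts` before applying the status filter):

    from typing import Any, Dict, List, Optional, Tuple


    def filter_and_paginate(jobs: List[Dict[str, Any]],
                            name_match: Optional[str] = None,
                            statuses: Optional[List[str]] = None,
                            page: Optional[int] = None,
                            limit: Optional[int] = None
                           ) -> Tuple[List[Dict[str, Any]], int]:
        matched = [
            job for job in jobs
            if (name_match is None or name_match in str(job.get('job_name', '')))
            and (not statuses or job['status'] in statuses)
        ]
        total = len(matched)
        if page is None or limit is None:
            return matched, total
        start = (page - 1) * limit  # pages are 1-indexed, as above
        return matched[start:start + limit], total


    jobs = [
        {'job_name': 'train-llm', 'status': 'RUNNING'},
        {'job_name': 'train-vision', 'status': 'SUCCEEDED'},
        {'job_name': 'eval', 'status': 'RUNNING'},
    ]
    print(filter_and_paginate(jobs, name_match='train', page=1, limit=1))
    # -> ([{'job_name': 'train-llm', 'status': 'RUNNING'}], 2)
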
926
1693
 
927
- def load_managed_job_queue(payload: str) -> List[Dict[str, Any]]:
1694
+
1695
+ def load_managed_job_queue(
1696
+ payload: str
1697
+ ) -> Tuple[List[Dict[str, Any]], int, ManagedJobQueueResultType, int, Dict[
1698
+ str, int]]:
928
1699
  """Load job queue from json string."""
929
- jobs = message_utils.decode_payload(payload)
1700
+ result = message_utils.decode_payload(payload)
1701
+ result_type = ManagedJobQueueResultType.DICT
1702
+ status_counts: Dict[str, int] = {}
1703
+ if isinstance(result, dict):
1704
+ jobs: List[Dict[str, Any]] = result['jobs']
1705
+ total: int = result['total']
1706
+ status_counts = result.get('status_counts', {})
1707
+ total_no_filter: int = result.get('total_no_filter', total)
1708
+ else:
1709
+ jobs = result
1710
+ total = len(jobs)
1711
+ total_no_filter = total
1712
+ result_type = ManagedJobQueueResultType.LIST
1713
+
1714
+ all_users = global_user_state.get_all_users()
1715
+ all_users_map = {user.id: user.name for user in all_users}
930
1716
  for job in jobs:
931
1717
  job['status'] = managed_job_state.ManagedJobStatus(job['status'])
932
1718
  if 'user_hash' in job and job['user_hash'] is not None:
933
1719
  # Skip jobs that do not have user_hash info.
934
1720
  # TODO(cooperc): Remove check before 0.12.0.
935
- job['user_name'] = global_user_state.get_user(job['user_hash']).name
936
- return jobs
1721
+ job['user_name'] = all_users_map.get(job['user_hash'])
1722
+ return jobs, total, result_type, total_no_filter, status_counts
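
The decoder has to accept both the new dict payload (`{'jobs': ..., 'total': ...}`) and the bare list emitted by older controllers. A standalone sketch of that shape handling, assuming the payload has already been JSON-decoded:

    from typing import Any, Dict, List, Tuple


    def split_queue_payload(
            decoded: Any) -> Tuple[List[Dict[str, Any]], int, int, Dict[str, int]]:
        """Normalize old (list) and new (dict) queue payloads to one shape."""
        if isinstance(decoded, dict):
            jobs = decoded['jobs']
            total = decoded['total']
            total_no_filter = decoded.get('total_no_filter', total)
            status_counts = decoded.get('status_counts', {})
        else:
            # Legacy controllers return a bare list of job dicts.
            jobs = decoded
            total = len(jobs)
            total_no_filter = total
            status_counts = {}
        return jobs, total, total_no_filter, status_counts


    print(split_queue_payload([{'job_id': 1, 'status': 'RUNNING'}]))
    print(split_queue_payload({'jobs': [], 'total': 0, 'status_counts': {}}))
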
937
1723
 
938
1724
 
939
1725
  def _get_job_status_from_tasks(
940
- job_tasks: List[Dict[str, Any]]
1726
+ job_tasks: Union[List[responses.ManagedJobRecord], List[Dict[str, Any]]]
941
1727
  ) -> Tuple[managed_job_state.ManagedJobStatus, int]:
942
1728
  """Get the current task status and the current task id for a job."""
943
1729
  managed_task_status = managed_job_state.ManagedJobStatus.SUCCEEDED
@@ -949,7 +1735,7 @@ def _get_job_status_from_tasks(
949
1735
  # Use the first non-succeeded status.
950
1736
  if managed_task_status != managed_job_state.ManagedJobStatus.SUCCEEDED:
951
1737
  # TODO(zhwu): we should not blindly use the first non-
952
- # succeeded as the status could be changed to SUBMITTED
1738
+ # succeeded as the status could be changed to PENDING
953
1739
  # when going from one task to the next one, which can be
954
1740
  # confusing.
955
1741
  break
@@ -957,29 +1743,40 @@ def _get_job_status_from_tasks(
957
1743
 
958
1744
 
959
1745
  @typing.overload
960
- def format_job_table(tasks: List[Dict[str, Any]],
961
- show_all: bool,
962
- show_user: bool,
963
- return_rows: Literal[False] = False,
964
- max_jobs: Optional[int] = None) -> str:
1746
+ def format_job_table(
1747
+ tasks: List[Dict[str, Any]],
1748
+ show_all: bool,
1749
+ show_user: bool,
1750
+ return_rows: Literal[False] = False,
1751
+ pool_status: Optional[List[Dict[str, Any]]] = None,
1752
+ max_jobs: Optional[int] = None,
1753
+ job_status_counts: Optional[Dict[str, int]] = None,
1754
+ ) -> str:
965
1755
  ...
966
1756
 
967
1757
 
968
1758
  @typing.overload
969
- def format_job_table(tasks: List[Dict[str, Any]],
970
- show_all: bool,
971
- show_user: bool,
972
- return_rows: Literal[True],
973
- max_jobs: Optional[int] = None) -> List[List[str]]:
1759
+ def format_job_table(
1760
+ tasks: List[Dict[str, Any]],
1761
+ show_all: bool,
1762
+ show_user: bool,
1763
+ return_rows: Literal[True],
1764
+ pool_status: Optional[List[Dict[str, Any]]] = None,
1765
+ max_jobs: Optional[int] = None,
1766
+ job_status_counts: Optional[Dict[str, int]] = None,
1767
+ ) -> List[List[str]]:
974
1768
  ...
975
1769
 
976
1770
 
977
1771
  def format_job_table(
978
- tasks: List[Dict[str, Any]],
979
- show_all: bool,
980
- show_user: bool,
981
- return_rows: bool = False,
982
- max_jobs: Optional[int] = None) -> Union[str, List[List[str]]]:
1772
+ tasks: List[Dict[str, Any]],
1773
+ show_all: bool,
1774
+ show_user: bool,
1775
+ return_rows: bool = False,
1776
+ pool_status: Optional[List[Dict[str, Any]]] = None,
1777
+ max_jobs: Optional[int] = None,
1778
+ job_status_counts: Optional[Dict[str, int]] = None,
1779
+ ) -> Union[str, List[List[str]]]:
983
1780
  """Returns managed jobs as a formatted string.
984
1781
 
985
1782
  Args:
@@ -988,13 +1785,15 @@ def format_job_table(
988
1785
  max_jobs: The maximum number of jobs to show in the table.
989
1786
  return_rows: If True, return the rows as a list of strings instead of
990
1787
  all rows concatenated into a single string.
1788
+ pool_status: List of pool status dictionaries with replica_info.
1789
+ job_status_counts: The counts of each job status.
991
1790
 
992
1791
  Returns: A formatted string of managed jobs, if not `return_rows`; otherwise
993
1792
  a list of "rows" (each of which is a list of str).
994
1793
  """
995
1794
  jobs = collections.defaultdict(list)
996
1795
  # Check if the tasks have user information from kubernetes.
997
- # This is only used for sky status --kubernetes.
1796
+ # This is only used for sky status-kubernetes.
998
1797
  tasks_have_k8s_user = any([task.get('user') for task in tasks])
999
1798
  if max_jobs and tasks_have_k8s_user:
1000
1799
  raise ValueError('max_jobs is not supported when tasks have user info.')
@@ -1004,16 +1803,41 @@ def format_job_table(
1004
1803
  return (task['user'], task['job_id'])
1005
1804
  return task['job_id']
1006
1805
 
1806
+ def _get_job_id_to_worker_map(
1807
+ pool_status: Optional[List[Dict[str, Any]]]) -> Dict[int, int]:
1808
+ """Create a mapping from job_id to worker replica_id.
1809
+
1810
+ Args:
1811
+ pool_status: List of pool status dictionaries with replica_info.
1812
+
1813
+ Returns:
1814
+ Dictionary mapping job_id to replica_id (worker ID).
1815
+ """
1816
+ job_to_worker: Dict[int, int] = {}
1817
+ if pool_status is None:
1818
+ return job_to_worker
1819
+ for pool in pool_status:
1820
+ replica_info = pool.get('replica_info', [])
1821
+ for replica in replica_info:
1822
+ used_by = replica.get('used_by')
1823
+ if used_by is not None:
1824
+ job_to_worker[used_by] = replica.get('replica_id')
1825
+ return job_to_worker
1826
+
1827
+ # Create mapping from job_id to worker replica_id
1828
+ job_to_worker = _get_job_id_to_worker_map(pool_status)
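
For instance, with an illustrative pool status payload shaped like the one `_get_job_id_to_worker_map` expects, the mapping works out as follows:

    pool_status = [{
        'replica_info': [
            {'replica_id': 1, 'used_by': 42},    # worker 1 is running job 42
            {'replica_id': 2, 'used_by': None},  # worker 2 is idle
        ],
    }]

    job_to_worker = {}
    for pool in pool_status:
        for replica in pool.get('replica_info', []):
            if replica.get('used_by') is not None:
                job_to_worker[replica['used_by']] = replica.get('replica_id')

    print(job_to_worker)  # -> {42: 1}
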
1829
+
1007
1830
  for task in tasks:
1008
1831
  # The tasks within the same job_id are already sorted
1009
1832
  # by the task_id.
1010
1833
  jobs[get_hash(task)].append(task)
1011
1834
 
1012
- status_counts: Dict[str, int] = collections.defaultdict(int)
1835
+ workspaces = set()
1013
1836
  for job_tasks in jobs.values():
1014
- managed_job_status = _get_job_status_from_tasks(job_tasks)[0]
1015
- if not managed_job_status.is_terminal():
1016
- status_counts[managed_job_status.value] += 1
1837
+ workspaces.add(job_tasks[0].get('workspace',
1838
+ constants.SKYPILOT_DEFAULT_WORKSPACE))
1839
+
1840
+ show_workspace = len(workspaces) > 1 or show_all
1017
1841
 
1018
1842
  user_cols: List[str] = []
1019
1843
  if show_user:
@@ -1024,26 +1848,43 @@ def format_job_table(
1024
1848
  columns = [
1025
1849
  'ID',
1026
1850
  'TASK',
1851
+ *(['WORKSPACE'] if show_workspace else []),
1027
1852
  'NAME',
1028
1853
  *user_cols,
1029
- 'RESOURCES',
1854
+ 'REQUESTED',
1030
1855
  'SUBMITTED',
1031
1856
  'TOT. DURATION',
1032
1857
  'JOB DURATION',
1033
1858
  '#RECOVERIES',
1034
1859
  'STATUS',
1860
+ 'POOL',
1035
1861
  ]
1036
1862
  if show_all:
1037
1863
  # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
1038
- columns += ['STARTED', 'CLUSTER', 'REGION', 'SCHED. STATE', 'DETAILS']
1864
+ columns += [
1865
+ 'WORKER_CLUSTER',
1866
+ 'WORKER_JOB_ID',
1867
+ 'STARTED',
1868
+ 'INFRA',
1869
+ 'RESOURCES',
1870
+ 'SCHED. STATE',
1871
+ 'DETAILS',
1872
+ 'GIT_COMMIT',
1873
+ ]
1039
1874
  if tasks_have_k8s_user:
1040
1875
  columns.insert(0, 'USER')
1041
1876
  job_table = log_utils.create_table(columns)
1042
1877
 
1043
1878
  status_counts: Dict[str, int] = collections.defaultdict(int)
1044
- for task in tasks:
1045
- if not task['status'].is_terminal():
1046
- status_counts[task['status'].value] += 1
1879
+ if job_status_counts:
1880
+ for status_value, count in job_status_counts.items():
1881
+ status = managed_job_state.ManagedJobStatus(status_value)
1882
+ if not status.is_terminal():
1883
+ status_counts[status_value] = count
1884
+ else:
1885
+ for task in tasks:
1886
+ if not task['status'].is_terminal():
1887
+ status_counts[task['status'].value] += 1
1047
1888
 
1048
1889
  all_tasks = tasks
1049
1890
  if max_jobs is not None:
@@ -1054,7 +1895,10 @@ def format_job_table(
1054
1895
  # by the task_id.
1055
1896
  jobs[get_hash(task)].append(task)
1056
1897
 
1057
- def generate_details(failure_reason: Optional[str]) -> str:
1898
+ def generate_details(details: Optional[str],
1899
+ failure_reason: Optional[str]) -> str:
1900
+ if details is not None:
1901
+ return details
1058
1902
  if failure_reason is not None:
1059
1903
  return f'Failure: {failure_reason}'
1060
1904
  return '-'
@@ -1083,6 +1927,8 @@ def format_job_table(
1083
1927
  for job_hash, job_tasks in jobs.items():
1084
1928
  if show_all:
1085
1929
  schedule_state = job_tasks[0]['schedule_state']
1930
+ workspace = job_tasks[0].get('workspace',
1931
+ constants.SKYPILOT_DEFAULT_WORKSPACE)
1086
1932
 
1087
1933
  if len(job_tasks) > 1:
1088
1934
  # Aggregate the tasks into a new row in the table.
@@ -1120,10 +1966,20 @@ def format_job_table(
1120
1966
 
1121
1967
  user_values = get_user_column_values(job_tasks[0])
1122
1968
 
1969
+ pool = job_tasks[0].get('pool')
1970
+ if pool is None:
1971
+ pool = '-'
1972
+
1973
+ # Add worker information if job is assigned to a worker
1123
1974
  job_id = job_hash[1] if tasks_have_k8s_user else job_hash
1975
+ # job_id is now always an integer; use it to look up the worker
1976
+ if job_id in job_to_worker and pool != '-':
1977
+ pool = f'{pool} (worker={job_to_worker[job_id]})'
1978
+
1124
1979
  job_values = [
1125
1980
  job_id,
1126
1981
  '',
1982
+ *([''] if show_workspace else []),
1127
1983
  job_name,
1128
1984
  *user_values,
1129
1985
  '-',
@@ -1132,15 +1988,20 @@ def format_job_table(
1132
1988
  job_duration,
1133
1989
  recovery_cnt,
1134
1990
  status_str,
1991
+ pool,
1135
1992
  ]
1136
1993
  if show_all:
1994
+ details = job_tasks[current_task_id].get('details')
1137
1995
  failure_reason = job_tasks[current_task_id]['failure_reason']
1138
1996
  job_values.extend([
1997
+ '-',
1998
+ '-',
1139
1999
  '-',
1140
2000
  '-',
1141
2001
  '-',
1142
2002
  job_tasks[0]['schedule_state'],
1143
- generate_details(failure_reason),
2003
+ generate_details(details, failure_reason),
2004
+ job_tasks[0].get('metadata', {}).get('git_commit', '-'),
1144
2005
  ])
1145
2006
  if tasks_have_k8s_user:
1146
2007
  job_values.insert(0, job_tasks[0].get('user', '-'))
@@ -1153,9 +2014,20 @@ def format_job_table(
1153
2014
  0, task['job_duration'], absolute=True)
1154
2015
  submitted = log_utils.readable_time_duration(task['submitted_at'])
1155
2016
  user_values = get_user_column_values(task)
2017
+ task_workspace = '-' if len(job_tasks) > 1 else workspace
2018
+ pool = task.get('pool')
2019
+ if pool is None:
2020
+ pool = '-'
2021
+
2022
+ # Add worker information if task is assigned to a worker
2023
+ task_job_id = task['job_id']
2024
+ if task_job_id in job_to_worker and pool != '-':
2025
+ pool = f'{pool} (worker={job_to_worker[task_job_id]})'
2026
+
1156
2027
  values = [
1157
2028
  task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
1158
2029
  task['task_id'] if len(job_tasks) > 1 else '-',
2030
+ *([task_workspace] if show_workspace else []),
1159
2031
  task['task_name'],
1160
2032
  *user_values,
1161
2033
  task['resources'],
@@ -1168,20 +2040,50 @@ def format_job_table(
1168
2040
  job_duration,
1169
2041
  task['recovery_count'],
1170
2042
  task['status'].colored_str(),
2043
+ pool,
1171
2044
  ]
1172
2045
  if show_all:
1173
2046
  # schedule_state is only set at the job level, so if we have
1174
2047
  # more than one task, only display on the aggregated row.
1175
2048
  schedule_state = (task['schedule_state']
1176
2049
  if len(job_tasks) == 1 else '-')
2050
+ infra_str = task.get('infra')
2051
+ if infra_str is None:
2052
+ cloud = task.get('cloud')
2053
+ if cloud is None:
2054
+ # Backward compatibility for old jobs controllers that do not
2055
+ # return cloud info; we parse it from the cluster
2056
+ # resources.
2057
+ # TODO(zhwu): remove this after 0.12.0
2058
+ cloud = task['cluster_resources'].split('(')[0].split(
2059
+ 'x')[-1]
2060
+ task['cluster_resources'] = task[
2061
+ 'cluster_resources'].replace(f'{cloud}(',
2062
+ '(').replace(
2063
+ 'x ', 'x')
2064
+ region = task['region']
2065
+ zone = task.get('zone')
2066
+ if cloud == '-':
2067
+ cloud = None
2068
+ if region == '-':
2069
+ region = None
2070
+ if zone == '-':
2071
+ zone = None
2072
+ infra_str = infra_utils.InfraInfo(cloud, region,
2073
+ zone).formatted_str()
1177
2074
  values.extend([
2075
+ task.get('current_cluster_name', '-'),
2076
+ task.get('job_id_on_pool_cluster', '-'),
1178
2077
  # STARTED
1179
2078
  log_utils.readable_time_duration(task['start_at']),
2079
+ infra_str,
1180
2080
  task['cluster_resources'],
1181
- task['region'],
1182
2081
  schedule_state,
1183
- generate_details(task['failure_reason']),
2082
+ generate_details(task.get('details'),
2083
+ task['failure_reason']),
1184
2084
  ])
2085
+
2086
+ values.append(task.get('metadata', {}).get('git_commit', '-'))
1185
2087
  if tasks_have_k8s_user:
1186
2088
  values.insert(0, task.get('user', '-'))
1187
2089
  job_table.add_row(values)
@@ -1204,6 +2106,59 @@ def format_job_table(
1204
2106
  return output
1205
2107
 
1206
2108
 
2109
+ def decode_managed_job_protos(
2110
+ job_protos: Iterable['managed_jobsv1_pb2.ManagedJobInfo']
2111
+ ) -> List[Dict[str, Any]]:
2112
+ """Decode job protos to dicts. Similar to load_managed_job_queue."""
2113
+ user_hash_to_user = global_user_state.get_users(
2114
+ set(job.user_hash for job in job_protos if job.user_hash))
2115
+
2116
+ jobs = []
2117
+ for job_proto in job_protos:
2118
+ job_dict = _job_proto_to_dict(job_proto)
2119
+ user_hash = job_dict.get('user_hash', None)
2120
+ if user_hash is not None:
2121
+ # Skip jobs that do not have user_hash info.
2122
+ # TODO(cooperc): Remove check before 0.12.0.
2123
+ user = user_hash_to_user.get(user_hash, None)
2124
+ job_dict['user_name'] = user.name if user is not None else None
2125
+ jobs.append(job_dict)
2126
+ return jobs
2127
+
2128
+
2129
+ def _job_proto_to_dict(
2130
+ job_proto: 'managed_jobsv1_pb2.ManagedJobInfo') -> Dict[str, Any]:
2131
+ job_dict = json_format.MessageToDict(
2132
+ job_proto,
2133
+ always_print_fields_with_no_presence=True,
2134
+ # Our API returns fields in snake_case.
2135
+ preserving_proto_field_name=True,
2136
+ use_integers_for_enums=True)
2137
+ for field in job_proto.DESCRIPTOR.fields:
2138
+ # Ensure optional fields are present with None values for
2139
+ # backwards compatibility with older clients.
2140
+ if field.has_presence and field.name not in job_dict:
2141
+ job_dict[field.name] = None
2142
+ # json_format.MessageToDict is meant for encoding to JSON,
2143
+ # and Protobuf encodes int64 as decimal strings in JSON,
2144
+ # so we need to convert them back to ints.
2145
+ # https://protobuf.dev/programming-guides/json/#field-representation
2146
+ if (field.type == descriptor.FieldDescriptor.TYPE_INT64 and
2147
+ job_dict.get(field.name) is not None):
2148
+ job_dict[field.name] = int(job_dict[field.name])
2149
+ job_dict['status'] = managed_job_state.ManagedJobStatus.from_protobuf(
2150
+ job_dict['status'])
2151
+ # For backwards compatibility, convert schedule_state to a string,
2152
+ # as we don't have the logic to handle it in our request
2153
+ # encoder/decoder, unlike status.
2154
+ schedule_state_enum = (
2155
+ managed_job_state.ManagedJobScheduleState.from_protobuf(
2156
+ job_dict['schedule_state']))
2157
+ job_dict['schedule_state'] = (schedule_state_enum.value
2158
+ if schedule_state_enum is not None else None)
2159
+ return job_dict
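
The int64-to-int conversion above is needed because `json_format.MessageToDict` follows the proto3 JSON mapping, which renders 64-bit integers as decimal strings. A quick standalone demonstration with a stock wrapper type:

    from google.protobuf import json_format
    from google.protobuf.wrappers_pb2 import Int64Value

    msg = Int64Value(value=1_099_511_627_776)  # int64 field holding 2**40
    as_dict = json_format.MessageToDict(msg)
    print(as_dict)                              # {'value': '1099511627776'}
    print(int(as_dict['value']) == msg.value)   # True after converting back
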
2160
+
2161
+
1207
2162
  class ManagedJobCodeGen:
1208
2163
  """Code generator for managed job utility functions.
1209
2164
 
@@ -1221,9 +2176,62 @@ class ManagedJobCodeGen:
1221
2176
  """)
1222
2177
 
1223
2178
  @classmethod
1224
- def get_job_table(cls) -> str:
1225
- code = textwrap.dedent("""\
1226
- job_table = utils.dump_managed_job_queue()
2179
+ def get_job_table(
2180
+ cls,
2181
+ skip_finished: bool = False,
2182
+ accessible_workspaces: Optional[List[str]] = None,
2183
+ job_ids: Optional[List[int]] = None,
2184
+ workspace_match: Optional[str] = None,
2185
+ name_match: Optional[str] = None,
2186
+ pool_match: Optional[str] = None,
2187
+ page: Optional[int] = None,
2188
+ limit: Optional[int] = None,
2189
+ user_hashes: Optional[List[Optional[str]]] = None,
2190
+ statuses: Optional[List[str]] = None,
2191
+ fields: Optional[List[str]] = None,
2192
+ ) -> str:
2193
+ code = textwrap.dedent(f"""\
2194
+ if managed_job_version < 9:
2195
+ # For backward compatibility, since filtering is not supported
2196
+ # before #6652.
2197
+ # TODO(hailong): Remove compatibility before 0.12.0
2198
+ job_table = utils.dump_managed_job_queue()
2199
+ elif managed_job_version < 10:
2200
+ job_table = utils.dump_managed_job_queue(
2201
+ skip_finished={skip_finished},
2202
+ accessible_workspaces={accessible_workspaces!r},
2203
+ job_ids={job_ids!r},
2204
+ workspace_match={workspace_match!r},
2205
+ name_match={name_match!r},
2206
+ pool_match={pool_match!r},
2207
+ page={page!r},
2208
+ limit={limit!r},
2209
+ user_hashes={user_hashes!r})
2210
+ elif managed_job_version < 12:
2211
+ job_table = utils.dump_managed_job_queue(
2212
+ skip_finished={skip_finished},
2213
+ accessible_workspaces={accessible_workspaces!r},
2214
+ job_ids={job_ids!r},
2215
+ workspace_match={workspace_match!r},
2216
+ name_match={name_match!r},
2217
+ pool_match={pool_match!r},
2218
+ page={page!r},
2219
+ limit={limit!r},
2220
+ user_hashes={user_hashes!r},
2221
+ statuses={statuses!r})
2222
+ else:
2223
+ job_table = utils.dump_managed_job_queue(
2224
+ skip_finished={skip_finished},
2225
+ accessible_workspaces={accessible_workspaces!r},
2226
+ job_ids={job_ids!r},
2227
+ workspace_match={workspace_match!r},
2228
+ name_match={name_match!r},
2229
+ pool_match={pool_match!r},
2230
+ page={page!r},
2231
+ limit={limit!r},
2232
+ user_hashes={user_hashes!r},
2233
+ statuses={statuses!r},
2234
+ fields={fields!r})
1227
2235
  print(job_table, flush=True)
1228
2236
  """)
1229
2237
  return cls._build(code)
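
The method above emits remote Python that branches on `managed_job_version`, so a new client can still drive an older controller. A reduced, illustrative sketch of the same pattern (not SkyPilot's API; the `< 9` cutoff mirrors the fallback branch above):

    import textwrap


    def build_queue_snippet(remote_version: int, name_match: str) -> str:
        """Build the Python snippet to run on a (possibly older) controller."""
        if remote_version < 9:
            # Older controllers do not accept filter arguments; dump everything
            # and let the client filter the decoded payload instead.
            return textwrap.dedent("""\
                job_table = utils.dump_managed_job_queue()
                print(job_table, flush=True)
                """)
        return textwrap.dedent(f"""\
            job_table = utils.dump_managed_job_queue(name_match={name_match!r})
            print(job_table, flush=True)
            """)


    print(build_queue_snippet(8, 'train'))
    print(build_queue_snippet(12, 'train'))
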
@@ -1232,26 +2240,77 @@ class ManagedJobCodeGen:
1232
2240
  def cancel_jobs_by_id(cls,
1233
2241
  job_ids: Optional[List[int]],
1234
2242
  all_users: bool = False) -> str:
2243
+ active_workspace = skypilot_config.get_active_workspace()
1235
2244
  code = textwrap.dedent(f"""\
1236
2245
  if managed_job_version < 2:
1237
2246
  # For backward compatibility, since all_users is not supported
1238
- # before #4787. Assume th
2247
+ # before #4787.
1239
2248
  # TODO(cooperc): Remove compatibility before 0.12.0
1240
2249
  msg = utils.cancel_jobs_by_id({job_ids})
1241
- else:
2250
+ elif managed_job_version < 4:
2251
+ # For backward compatibility, since current_workspace is not
2252
+ # supported before #5660. Don't check the workspace.
2253
+ # TODO(zhwu): Remove compatibility before 0.12.0
1242
2254
  msg = utils.cancel_jobs_by_id({job_ids}, all_users={all_users})
2255
+ else:
2256
+ msg = utils.cancel_jobs_by_id({job_ids}, all_users={all_users},
2257
+ current_workspace={active_workspace!r})
1243
2258
  print(msg, end="", flush=True)
1244
2259
  """)
1245
2260
  return cls._build(code)
1246
2261
 
1247
2262
  @classmethod
1248
2263
  def cancel_job_by_name(cls, job_name: str) -> str:
2264
+ active_workspace = skypilot_config.get_active_workspace()
1249
2265
  code = textwrap.dedent(f"""\
1250
- msg = utils.cancel_job_by_name({job_name!r})
2266
+ if managed_job_version < 4:
2267
+ # For backward compatibility, since current_workspace is not
2268
+ # supported before #5660. Don't check the workspace.
2269
+ # TODO(zhwu): Remove compatibility before 0.12.0
2270
+ msg = utils.cancel_job_by_name({job_name!r})
2271
+ else:
2272
+ msg = utils.cancel_job_by_name({job_name!r}, {active_workspace!r})
1251
2273
  print(msg, end="", flush=True)
1252
2274
  """)
1253
2275
  return cls._build(code)
1254
2276
 
2277
+ @classmethod
2278
+ def cancel_jobs_by_pool(cls, pool_name: str) -> str:
2279
+ active_workspace = skypilot_config.get_active_workspace()
2280
+ code = textwrap.dedent(f"""\
2281
+ msg = utils.cancel_jobs_by_pool({pool_name!r}, {active_workspace!r})
2282
+ print(msg, end="", flush=True)
2283
+ """)
2284
+ return cls._build(code)
2285
+
2286
+ @classmethod
2287
+ def get_version_and_job_table(cls) -> str:
2288
+ """Generate code to get controller version and raw job table."""
2289
+ code = textwrap.dedent("""\
2290
+ from sky.skylet import constants as controller_constants
2291
+
2292
+ # Get controller version
2293
+ controller_version = controller_constants.SKYLET_VERSION
2294
+ print(f"controller_version:{controller_version}", flush=True)
2295
+
2296
+ # Get and print raw job table (load_managed_job_queue can parse this directly)
2297
+ job_table = utils.dump_managed_job_queue()
2298
+ print(job_table, flush=True)
2299
+ """)
2300
+ return cls._build(code)
2301
+
2302
+ @classmethod
2303
+ def get_version(cls) -> str:
2304
+ """Generate code to get controller version."""
2305
+ code = textwrap.dedent("""\
2306
+ from sky.skylet import constants as controller_constants
2307
+
2308
+ # Get controller version
2309
+ controller_version = controller_constants.SKYLET_VERSION
2310
+ print(f"controller_version:{controller_version}", flush=True)
2311
+ """)
2312
+ return cls._build(code)
2313
+
1255
2314
  @classmethod
1256
2315
  def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
1257
2316
  code = textwrap.dedent(f"""\
@@ -1266,10 +2325,16 @@ class ManagedJobCodeGen:
1266
2325
  job_name: Optional[str],
1267
2326
  job_id: Optional[int],
1268
2327
  follow: bool = True,
1269
- controller: bool = False) -> str:
2328
+ controller: bool = False,
2329
+ tail: Optional[int] = None) -> str:
1270
2330
  code = textwrap.dedent(f"""\
1271
- result = utils.stream_logs(job_id={job_id!r}, job_name={job_name!r},
1272
- follow={follow}, controller={controller})
2331
+ if managed_job_version < 6:
2332
+ # Versions before 6 did not support the tail parameter
2333
+ result = utils.stream_logs(job_id={job_id!r}, job_name={job_name!r},
2334
+ follow={follow}, controller={controller})
2335
+ else:
2336
+ result = utils.stream_logs(job_id={job_id!r}, job_name={job_name!r},
2337
+ follow={follow}, controller={controller}, tail={tail!r})
1273
2338
  if managed_job_version < 3:
1274
2339
  # Versions 2 and older did not return a retcode, so we just print
1275
2340
  # the result.
@@ -1283,18 +2348,44 @@ class ManagedJobCodeGen:
1283
2348
  return cls._build(code)
1284
2349
 
1285
2350
  @classmethod
1286
- def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag') -> str:
2351
+ def set_pending(cls,
2352
+ job_id: int,
2353
+ managed_job_dag: 'dag_lib.Dag',
2354
+ workspace: str,
2355
+ entrypoint: str,
2356
+ user_hash: Optional[str] = None) -> str:
1287
2357
  dag_name = managed_job_dag.name
2358
+ pool = managed_job_dag.pool
1288
2359
  # Add the managed job to queue table.
1289
2360
  code = textwrap.dedent(f"""\
1290
- managed_job_state.set_job_info({job_id}, {dag_name!r})
2361
+ set_job_info_kwargs = {{'workspace': {workspace!r}}}
2362
+ if managed_job_version < 4:
2363
+ set_job_info_kwargs = {{}}
2364
+ if managed_job_version >= 5:
2365
+ set_job_info_kwargs['entrypoint'] = {entrypoint!r}
2366
+ if managed_job_version >= 8:
2367
+ from sky.serve import serve_state
2368
+ pool_hash = None
2369
+ if {pool!r} != None:
2370
+ pool_hash = serve_state.get_service_hash({pool!r})
2371
+ set_job_info_kwargs['pool'] = {pool!r}
2372
+ set_job_info_kwargs['pool_hash'] = pool_hash
2373
+ if managed_job_version >= 11:
2374
+ set_job_info_kwargs['user_hash'] = {user_hash!r}
2375
+ managed_job_state.set_job_info(
2376
+ {job_id}, {dag_name!r}, **set_job_info_kwargs)
1291
2377
  """)
1292
2378
  for task_id, task in enumerate(managed_job_dag.tasks):
1293
2379
  resources_str = backend_utils.get_task_resources_str(
1294
2380
  task, is_managed_job=True)
1295
2381
  code += textwrap.dedent(f"""\
1296
- managed_job_state.set_pending({job_id}, {task_id},
1297
- {task.name!r}, {resources_str!r})
2382
+ if managed_job_version < 7:
2383
+ managed_job_state.set_pending({job_id}, {task_id},
2384
+ {task.name!r}, {resources_str!r})
2385
+ else:
2386
+ managed_job_state.set_pending({job_id}, {task_id},
2387
+ {task.name!r}, {resources_str!r},
2388
+ {task.metadata_json!r})
1298
2389
  """)
1299
2390
  return cls._build(code)
1300
2391