skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/server/core.py CHANGED
@@ -1,11 +1,13 @@
 """SDK functions for managed jobs."""
+import concurrent.futures
+import copy
+import ipaddress
 import os
-import signal
-import subprocess
+import pathlib
 import tempfile
-import time
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
+from urllib import parse as urlparse
 import uuid
 
 import colorama
@@ -17,13 +19,23 @@ from sky import execution
 from sky import global_user_state
 from sky import provision as provision_lib
 from sky import sky_logging
+from sky import skypilot_config
 from sky import task as task_lib
+from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
-from sky.clouds.service_catalog import common as service_catalog_common
+from sky.backends import cloud_vm_ray_backend
+from sky.catalog import common as service_catalog_common
 from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
+from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
+from sky.metrics import utils as metrics_lib
 from sky.provision import common as provision_common
+from sky.schemas.api import responses
+from sky.serve import serve_state
+from sky.serve import serve_utils
+from sky.serve.server import impl
+from sky.server.requests import request_names
 from sky.skylet import constants as skylet_constants
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
@@ -36,21 +48,153 @@ from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
+from sky.workspaces import core as workspaces_core
 
 if typing.TYPE_CHECKING:
+    from google.protobuf import json_format
+
     import sky
-    from sky.backends import cloud_vm_ray_backend
+    from sky.schemas.generated import managed_jobsv1_pb2
+else:
+    json_format = adaptors_common.LazyImport('google.protobuf.json_format')
+
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
 
 logger = sky_logging.init_logger(__name__)
 
+_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES = [
+    'job_id',
+    'task_id',
+    'workspace',
+    'job_name',
+    'task_name',
+    'resources',
+    'submitted_at',
+    'end_at',
+    'job_duration',
+    'recovery_count',
+    'status',
+    'pool',
+    'current_cluster_name',
+    'job_id_on_pool_cluster',
+    'start_at',
+    'infra',
+    'cloud',
+    'region',
+    'zone',
+    'cluster_resources',
+    'schedule_state',
+    'details',
+    'failure_reason',
+    'metadata',
+    'user_name',
+    'user_hash',
+]
+
+
+def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
+    """Upload files to the controller.
+
+    In consolidation mode, we still need to upload files to the controller as
+    we should keep a separate workdir for each jobs. Assuming two jobs using
+    the same workdir, if there are some modifications to the workdir after job 1
+    is submitted, on recovery of job 1, the modifications should not be applied.
+    """
+    local_to_controller_file_mounts: Dict[str, str] = {}
+
+    # For consolidation mode, we don't need to use cloud storage,
+    # as uploading to the controller is only a local copy.
+    storage_clouds = (
+        storage_lib.get_cached_enabled_storage_cloud_names_or_refresh())
+    force_disable_cloud_bucket = skypilot_config.get_nested(
+        ('jobs', 'force_disable_cloud_bucket'), False)
+    if (not managed_job_utils.is_consolidation_mode() and storage_clouds and
+            not force_disable_cloud_bucket):
+        for task_ in dag.tasks:
+            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
+                task_, task_type='jobs')
+    else:
+        # We do not have any cloud storage available, so fall back to
+        # two-hop file_mount uploading.
+        # Note: we can't easily hack sync_storage_mounts() to upload
+        # directly to the controller, because the controller may not
+        # even be up yet.
+        for task_ in dag.tasks:
+            if task_.storage_mounts and not storage_clouds:
+                # Technically, we could convert COPY storage_mounts that
+                # have a local source and do not specify `store`, but we
+                # will not do that for now. Only plain file_mounts are
+                # supported.
+                raise exceptions.NotSupportedError(
+                    'Cloud-based file_mounts are specified, but no cloud '
+                    'storage is available. Please specify local '
+                    'file_mounts only.')
+
+            # Merge file mounts from all tasks.
+            local_to_controller_file_mounts.update(
+                controller_utils.translate_local_file_mounts_to_two_hop(task_))
+
+    return local_to_controller_file_mounts
+
+
+def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
+                              num_jobs: int) -> Optional[List[int]]:
+    """Submit the managed job locally if in consolidation mode.
+
+    In normal mode the managed job submission is done in the ray job submission.
+    For consolidation mode, we need to manually submit it. Check the following
+    function for the normal mode submission:
+    sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend,
+    _exec_code_on_head::_maybe_add_managed_job_code
+    """
+    if not managed_job_utils.is_consolidation_mode():
+        return None
+
+    # Create local directory for the managed job.
+    pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
+    job_ids = []
+    pool = dag.pool
+    pool_hash = None
+    if pool is not None:
+        pool_hash = serve_state.get_service_hash(pool)
+        # Already checked in the sdk.
+        assert pool_hash is not None, f'Pool {pool} not found'
+    for _ in range(num_jobs):
+        # TODO(tian): We should have a separate name for each job when
+        # submitting multiple jobs. Current blocker is that we are sharing
+        # the same dag object for all jobs. Maybe we can do copy.copy() for
+        # each job and then give it a unique name (e.g. append job id after
+        # the task name). The name of the dag also needs to be aligned with
+        # the task name.
+        consolidation_mode_job_id = (
+            managed_job_state.set_job_info_without_job_id(
+                dag.name,
+                workspace=skypilot_config.get_active_workspace(
+                    force_user_workspace=True),
+                entrypoint=common_utils.get_current_command(),
+                pool=pool,
+                pool_hash=pool_hash,
+                user_hash=common_utils.get_user_hash()))
+        for task_id, task in enumerate(dag.tasks):
+            resources_str = backend_utils.get_task_resources_str(
+                task, is_managed_job=True)
+            managed_job_state.set_pending(consolidation_mode_job_id, task_id,
+                                          task.name, resources_str,
+                                          task.metadata_json)
+        job_ids.append(consolidation_mode_job_id)
+    return job_ids
+
 
 @timeline.event
 @usage_lib.entrypoint
 def launch(
     task: Union['sky.Task', 'sky.Dag'],
     name: Optional[str] = None,
+    pool: Optional[str] = None,
+    num_jobs: Optional[int] = None,
     stream_logs: bool = True,
-) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
+) -> Tuple[Optional[Union[int, List[int]]], Optional[backends.ResourceHandle]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Launches a managed job.
 
@@ -76,21 +220,58 @@ def launch(
             None if dryrun.
     """
     entrypoint = task
+    # using hasattr instead of isinstance to avoid importing sky
+    if hasattr(task, 'metadata'):
+        metadata = task.metadata
+    else:
+        # we are a Dag, not a Task
+        if len(task.tasks) == 1:
+            metadata = task.tasks[0].metadata
+        else:
+            # doesn't make sense to have a git commit since there might be
+            # different metadatas for each task
+            metadata = {}
+
     dag_uuid = str(uuid.uuid4().hex[:4])
     dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
+
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
     dag, mutated_user_config = admin_policy_utils.apply(
-        dag, use_mutated_config_in_current_request=False)
+        dag, request_name=request_names.AdminPolicyRequestName.JOBS_LAUNCH)
+    dag.resolve_and_validate_volumes()
     if not dag.is_chain():
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Only single-task or chain DAG is '
                              f'allowed for job_launch. Dag: {dag}')
     dag.validate()
+    # TODO(aylei): use consolidated job controller instead of performing
+    # pre-mount operations when submitting jobs.
+    dag.pre_mount_volumes()
+
+    # If there is a local postgres db, when the api server tries launching on
+    # the remote jobs controller it will fail. therefore, we should remove this
+    # before sending the config to the jobs controller.
+    # TODO(luca) there are a lot of potential problems with postgres being sent
+    # to the jobs controller. for example if the postgres is whitelisted to
+    # only the API server, this will then break. the simple solution to that is
+    # telling the user to add the jobs controller to the postgres whitelist.
+    if not managed_job_utils.is_consolidation_mode():
+        db_path = mutated_user_config.get('db', None)
+        if db_path is not None:
+            parsed = urlparse.urlparse(db_path)
+            if ((parsed.hostname == 'localhost' or
+                 ipaddress.ip_address(parsed.hostname).is_loopback)):
+                mutated_user_config.pop('db', None)
+
+    user_dag_str_user_specified = dag_utils.dump_chain_dag_to_yaml_str(
+        dag, use_user_specified_yaml=True)
+
     dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
 
     task_names = set()
+    priority = None
     for task_ in dag.tasks:
         if task_.name in task_names:
             with ux_utils.print_exception_no_traceback():
@@ -101,6 +282,42 @@ def launch(
                     'will be auto-generated) .')
         task_names.add(task_.name)
 
+        # Check for priority in resources
+        task_priority = None
+        if task_.resources:
+            # Convert set to list to access elements by index
+            resources_list = list(task_.resources)
+            # Take first resource's priority as reference
+            task_priority = resources_list[0].priority
+
+            # Check all other resources have same priority
+            for resource in resources_list[1:]:
+                if resource.priority != task_priority:
+                    with ux_utils.print_exception_no_traceback():
+                        raise ValueError(
+                            f'Task {task_.name!r}: All resources must have the '
+                            'same priority. Found priority '
+                            f'{resource.priority} but expected {task_priority}.'
+                        )
+
+        if task_priority is not None:
+            if (priority is not None and priority != task_priority):
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        'Multiple tasks in the DAG have different priorities. '
+                        'Either specify a priority in only one task, or set '
+                        'the same priority for each task.')
+            priority = task_priority
+
+    if priority is None:
+        priority = skylet_constants.DEFAULT_PRIORITY
+
+    if (priority < skylet_constants.MIN_PRIORITY or
+            priority > skylet_constants.MAX_PRIORITY):
+        raise ValueError(
+            f'Priority must be between {skylet_constants.MIN_PRIORITY}'
+            f' and {skylet_constants.MAX_PRIORITY}, got {priority}')
+
     dag_utils.fill_default_config_in_dag_for_job_launch(dag)
 
     with rich_utils.safe_status(
@@ -109,15 +326,13 @@ def launch(
         # Check whether cached jobs controller cluster is accessible
         cluster_name = (
             controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
-        record = global_user_state.get_cluster_from_name(cluster_name)
-        if record is not None:
+        if global_user_state.cluster_with_name_exists(cluster_name):
             # there is a cached jobs controller cluster
             try:
                 # TODO: do something with returned status?
                 _, _ = backend_utils.refresh_cluster_status_handle(
                     cluster_name=cluster_name,
-                    force_refresh_statuses=set(status_lib.ClusterStatus),
-                    acquire_per_cluster_status_lock=False)
+                    force_refresh_statuses=set(status_lib.ClusterStatus))
             except (exceptions.ClusterOwnerIdentityMismatchError,
                     exceptions.CloudUserIdentityError,
                     exceptions.ClusterStatusFetchingError) as e:
@@ -131,100 +346,223 @@ def launch(
                     f'with:\n\n`sky down {cluster_name} --purge`\n\n'
                     f'Reason: {common_utils.format_exception(e)}')
 
-        local_to_controller_file_mounts = {}
-
-        if storage_lib.get_cached_enabled_storage_cloud_names_or_refresh():
-            for task_ in dag.tasks:
-                controller_utils.maybe_translate_local_file_mounts_and_sync_up(
-                    task_, task_type='jobs')
+        local_to_controller_file_mounts = _upload_files_to_controller(dag)
+    controller = controller_utils.Controllers.JOBS_CONTROLLER
+    controller_name = controller.value.cluster_name
+    prefix = managed_job_constants.JOBS_TASK_YAML_PREFIX
+    controller_resources = controller_utils.get_controller_resources(
+        controller=controller,
+        task_resources=sum([list(t.resources) for t in dag.tasks], []))
+
+    num_jobs = num_jobs if num_jobs is not None else 1
+    # We do this assignment after applying the admin policy, so that we don't
+    # need to serialize the pool name in the dag. The dag object will be
+    # preserved. See sky/admin_policy.py::MutatedUserRequest::decode.
+    dag.pool = pool
+    consolidation_mode_job_ids = _maybe_submit_job_locally(
+        prefix, dag, num_jobs)
+
+    # This is only needed for non-consolidation mode. For consolidation
+    # mode, the controller uses the same catalog as API server.
+    modified_catalogs = {} if consolidation_mode_job_ids is not None else (
+        service_catalog_common.get_modified_catalog_file_mounts())
+
+    def _submit_one(
+        consolidation_mode_job_id: Optional[int] = None,
+        job_rank: Optional[int] = None,
+        num_jobs: Optional[int] = None,
+    ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
+        rank_suffix = '' if job_rank is None else f'-{job_rank}'
+        remote_original_user_yaml_path = (
+            f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.original_user_yaml')
+        remote_user_yaml_path = (
+            f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.yaml')
+        remote_user_config_path = (
+            f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.config_yaml')
+        remote_env_file_path = (
+            f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.env')
+        with tempfile.NamedTemporaryFile(
+                prefix=f'managed-dag-{dag.name}{rank_suffix}-',
+                mode='w',
+        ) as f, tempfile.NamedTemporaryFile(
+                prefix=f'managed-user-dag-{dag.name}{rank_suffix}-',
+                mode='w',
+        ) as original_user_yaml_path:
+            original_user_yaml_path.write(user_dag_str_user_specified)
+            original_user_yaml_path.flush()
+            # Copy tasks to avoid race conditions when multiple threads modify
+            # the same dag object concurrently. Each thread needs its own copy.
+            dag_copy = copy.deepcopy(dag)
+            for task_ in dag_copy.tasks:
+                if job_rank is not None:
+                    task_.update_envs({'SKYPILOT_JOB_RANK': str(job_rank)})
+                if num_jobs is not None:
+                    task_.update_envs({'SKYPILOT_NUM_JOBS': str(num_jobs)})
+
+            dag_utils.dump_chain_dag_to_yaml(dag_copy, f.name)
+
+            vars_to_fill = {
+                'remote_original_user_yaml_path':
+                    (remote_original_user_yaml_path),
+                'original_user_dag_path': original_user_yaml_path.name,
+                'remote_user_yaml_path': remote_user_yaml_path,
+                'user_yaml_path': f.name,
+                'local_to_controller_file_mounts':
+                    (local_to_controller_file_mounts),
+                'jobs_controller': controller_name,
+                # Note: actual cluster name will be <task.name>-<managed job ID>
+                'dag_name': dag.name,
+                'remote_user_config_path': remote_user_config_path,
+                'remote_env_file_path': remote_env_file_path,
+                'modified_catalogs': modified_catalogs,
+                'priority': priority,
+                'consolidation_mode_job_id': consolidation_mode_job_id,
+                'pool': pool,
+                'job_controller_indicator_file':
+                    managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE,
+                **controller_utils.shared_controller_vars_to_fill(
+                    controller,
+                    remote_user_config_path=remote_user_config_path,
+                    # TODO(aylei): the mutated config will not be updated
+                    # afterwards without recreate the controller. Need to
+                    # revisit this.
+                    local_user_config=mutated_user_config,
+                ),
+            }
 
-        else:
-            # We do not have any cloud storage available, so fall back to
-            # two-hop file_mount uploading.
-            # Note: we can't easily hack sync_storage_mounts() to upload
-            # directly to the controller, because the controller may not
-            # even be up yet.
-            for task_ in dag.tasks:
-                if task_.storage_mounts:
-                    # Technically, we could convert COPY storage_mounts that
-                    # have a local source and do not specify `store`, but we
-                    # will not do that for now. Only plain file_mounts are
-                    # supported.
-                    raise exceptions.NotSupportedError(
-                        'Cloud-based file_mounts are specified, but no cloud '
-                        'storage is available. Please specify local '
-                        'file_mounts only.')
-
-                # Merge file mounts from all tasks.
-                local_to_controller_file_mounts.update(
-                    controller_utils.translate_local_file_mounts_to_two_hop(
-                        task_))
-
-    with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
-                                     mode='w') as f:
-        dag_utils.dump_chain_dag_to_yaml(dag, f.name)
-        controller = controller_utils.Controllers.JOBS_CONTROLLER
-        controller_name = controller.value.cluster_name
-        prefix = managed_job_constants.JOBS_TASK_YAML_PREFIX
-        remote_user_yaml_path = f'{prefix}/{dag.name}-{dag_uuid}.yaml'
-        remote_user_config_path = f'{prefix}/{dag.name}-{dag_uuid}.config_yaml'
-        remote_env_file_path = f'{prefix}/{dag.name}-{dag_uuid}.env'
-        controller_resources = controller_utils.get_controller_resources(
-            controller=controller,
-            task_resources=sum([list(t.resources) for t in dag.tasks], []))
-        controller_idle_minutes_to_autostop, controller_down = (
-            controller_utils.get_controller_autostop_config(
-                controller=controller))
-
-        vars_to_fill = {
-            'remote_user_yaml_path': remote_user_yaml_path,
-            'user_yaml_path': f.name,
-            'local_to_controller_file_mounts': local_to_controller_file_mounts,
-            'jobs_controller': controller_name,
-            # Note: actual cluster name will be <task.name>-<managed job ID>
-            'dag_name': dag.name,
-            'remote_user_config_path': remote_user_config_path,
-            'remote_env_file_path': remote_env_file_path,
-            'modified_catalogs':
-                service_catalog_common.get_modified_catalog_file_mounts(),
-            'dashboard_setup_cmd': managed_job_constants.DASHBOARD_SETUP_CMD,
-            'dashboard_user_id': common.SERVER_ID,
-            **controller_utils.shared_controller_vars_to_fill(
-                controller,
-                remote_user_config_path=remote_user_config_path,
-                local_user_config=mutated_user_config,
-            ),
-        }
-
-        yaml_path = os.path.join(
-            managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
-            f'{name}-{dag_uuid}.yaml')
-        common_utils.fill_template(
-            managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
-            vars_to_fill,
-            output_path=yaml_path)
-        controller_task = task_lib.Task.from_yaml(yaml_path)
-        controller_task.set_resources(controller_resources)
-
-        controller_task.managed_job_dag = dag
-
-        sky_logging.print(
-            f'{colorama.Fore.YELLOW}'
-            f'Launching managed job {dag.name!r} from jobs controller...'
-            f'{colorama.Style.RESET_ALL}')
-
-        # Launch with the api server's user hash, so that sky status does not
-        # show the owner of the controller as whatever user launched it first.
-        with common.with_server_user_hash():
-            return execution.launch(
-                task=controller_task,
-                cluster_name=controller_name,
-                stream_logs=stream_logs,
-                idle_minutes_to_autostop=controller_idle_minutes_to_autostop,
-                down=controller_down,
-                retry_until_up=True,
-                fast=True,
-                _disable_controller_check=True)
+            yaml_path = os.path.join(
+                managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
+                f'{name}-{dag_uuid}-{consolidation_mode_job_id}-{job_rank}.yaml'
+            )
+            common_utils.fill_template(
+                managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
+                vars_to_fill,
+                output_path=yaml_path)
+            controller_task = task_lib.Task.from_yaml(yaml_path)
+            controller_task.set_resources(controller_resources)
+
+            controller_task.managed_job_dag = dag_copy
+            # pylint: disable=protected-access
+            controller_task._metadata = metadata
+
+            job_identity = ''
+            if job_rank is not None:
+                job_identity = f' (rank: {job_rank})'
+            job_controller_postfix = (' from jobs controller' if
+                                      consolidation_mode_job_id is None else '')
+            logger.info(
+                f'{colorama.Fore.YELLOW}'
+                f'Launching managed job {dag.name!r}{job_identity}'
+                f'{job_controller_postfix}...{colorama.Style.RESET_ALL}')
+
+            # Launch with the api server's user hash, so that sky status does
+            # not show the owner of the controller as whatever user launched
+            # it first.
+            with common.with_server_user():
+                # Always launch the controller in the default workspace.
+                with skypilot_config.local_active_workspace_ctx(
+                        skylet_constants.SKYPILOT_DEFAULT_WORKSPACE):
+                    # TODO(zhwu): the buckets need to be correctly handled for
+                    # a specific workspace. For example, if a job is launched in
+                    # workspace A, but the controller is in workspace B, the
+                    # intermediate bucket and newly created bucket should be in
+                    # workspace A.
+                    if consolidation_mode_job_id is None:
+                        return execution.launch(
+                            task=controller_task,
+                            cluster_name=controller_name,
+                            stream_logs=stream_logs,
+                            retry_until_up=True,
+                            fast=True,
+                            _request_name=request_names.AdminPolicyRequestName.
+                            JOBS_LAUNCH_CONTROLLER,
+                            _disable_controller_check=True)
+                    # Manually launch the scheduler in consolidation mode.
+                    local_handle = backend_utils.is_controller_accessible(
+                        controller=controller, stopped_message='')
+                    backend = backend_utils.get_backend_from_handle(
+                        local_handle)
+                    assert isinstance(backend, backends.CloudVmRayBackend)
+                    # Suppress file mount logs when submitting multiple jobs.
+                    should_silence = num_jobs is not None and num_jobs > 1
+                    with sky_logging.silent(should_silence):
+                        backend.sync_file_mounts(
+                            handle=local_handle,
+                            all_file_mounts=controller_task.file_mounts,
+                            storage_mounts=controller_task.storage_mounts)
+                    run_script = controller_task.run
+                    assert isinstance(run_script, str)
+                    # Manually add the env variables to the run script.
+                    # Originally this is done in ray jobs submission but now we
+                    # have to do it manually because there is no ray runtime on
+                    # the API server.
+                    env_cmds = [
+                        f'export {k}={v!r}'
+                        for k, v in controller_task.envs.items()
+                    ]
+                    run_script = '\n'.join(env_cmds + [run_script])
+                    log_dir = os.path.join(skylet_constants.SKY_LOGS_DIRECTORY,
+                                           'managed_jobs')
+                    os.makedirs(log_dir, exist_ok=True)
+                    log_path = os.path.join(
+                        log_dir, f'submit-job-{consolidation_mode_job_id}.log')
+                    backend.run_on_head(local_handle,
+                                        run_script,
+                                        log_path=log_path)
+                    ux_utils.starting_message(
+                        f'Job submitted, ID: {consolidation_mode_job_id}')
+                    return consolidation_mode_job_id, local_handle
+
+    if pool is None:
+        if consolidation_mode_job_ids is None:
+            return _submit_one()
+        assert len(consolidation_mode_job_ids) == 1
+        return _submit_one(consolidation_mode_job_ids[0])
+
+    ids: List[int] = []
+    all_handle: Optional[backends.ResourceHandle] = None
+
+    if num_jobs == 1:
+        job_id = (consolidation_mode_job_ids[0]
+                  if consolidation_mode_job_ids is not None else None)
+        jid, handle = _submit_one(job_id, 0, num_jobs=num_jobs)
+        assert jid is not None, (job_id, handle)
+        ids.append(jid)
+        all_handle = handle
+    else:
+        # Submit jobs in parallel using ThreadPoolExecutor
+        with concurrent.futures.ThreadPoolExecutor(
+                max_workers=min(num_jobs,
+                                os.cpu_count() or 1)) as executor:
+            # Submit jobs concurrently
+            future_to_rank = {}
+            for job_rank in range(num_jobs):
+                job_id = (consolidation_mode_job_ids[job_rank]
+                          if consolidation_mode_job_ids is not None else None)
+                future = executor.submit(_submit_one, job_id, job_rank,
+                                         num_jobs)
+                future_to_rank[future] = job_rank
+
+            # Collect results in order of job_rank to maintain consistent order.
+            results: List[Optional[Tuple[
+                int, Optional[backends.ResourceHandle]]]] = [None] * num_jobs
+            for future in concurrent.futures.as_completed(future_to_rank):
+                job_rank = future_to_rank[future]
+                try:
+                    jid, handle = future.result()
+                    assert jid is not None, (job_id, handle)
+                    results[job_rank] = (jid, handle)
+                    all_handle = handle  # Keep the last handle.
+                except Exception as e:
+                    logger.error(f'Error launching job {job_rank}: {e}')
+                    raise e
+
+        # Extract job IDs in order
+        for res in results:
+            if res is not None:
+                ids.append(res[0])
+
+    return ids, all_handle
 
 
 def queue_from_kubernetes_pod(
@@ -275,7 +613,9 @@ def queue_from_kubernetes_pod(
     managed_jobs_runner = provision_lib.get_command_runners(
         'kubernetes', cluster_info)[0]
 
-    code = managed_job_utils.ManagedJobCodeGen.get_job_table()
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished=skip_finished,
+        fields=_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES)
     returncode, job_table_payload, stderr = managed_jobs_runner.run(
         code,
         require_outputs=True,
@@ -291,7 +631,14 @@ def queue_from_kubernetes_pod(
     except exceptions.CommandError as e:
         raise RuntimeError(str(e)) from e
 
-    jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+    jobs, _, result_type, _, _ = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
+
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs
+
+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
     if skip_finished:
         # Filter out the finished jobs. If a multi-task job is partially
         # finished, we will include all its tasks.
@@ -322,28 +669,22 @@ def _maybe_restart_controller(
     if handle is not None:
         return handle
 
-    sky_logging.print(f'{colorama.Fore.YELLOW}'
-                      f'Restarting {jobs_controller_type.value.name}...'
-                      f'{colorama.Style.RESET_ALL}')
+    logger.info(f'{colorama.Fore.YELLOW}'
+                f'Restarting {jobs_controller_type.value.name}...'
+                f'{colorama.Style.RESET_ALL}')
 
     rich_utils.force_update_status(
         ux_utils.spinner_message(f'{spinner_message} - restarting '
                                  'controller'))
-    handle = core.start(cluster_name=jobs_controller_type.value.cluster_name)
-    # Make sure the dashboard is running when the controller is restarted.
-    # We should not directly use execution.launch() and have the dashboard cmd
-    # in the task setup because since we are using detached_setup, it will
-    # become a job on controller which messes up the job IDs (we assume the
-    # job ID in controller's job queue is consistent with managed job IDs).
-    with rich_utils.safe_status(
-            ux_utils.spinner_message('Starting dashboard...')):
-        runner = handle.get_command_runners()[0]
-        runner.run(
-            f'export '
-            f'{skylet_constants.USER_ID_ENV_VAR}={common.SERVER_ID!r}; '
-            f'{managed_job_constants.DASHBOARD_SETUP_CMD}',
-            stream_logs=True,
-        )
+    with skypilot_config.local_active_workspace_ctx(
+            skylet_constants.SKYPILOT_DEFAULT_WORKSPACE):
+        global_user_state.add_cluster_event(
+            jobs_controller_type.value.cluster_name,
+            status_lib.ClusterStatus.INIT, 'Jobs controller restarted.',
+            global_user_state.ClusterEventType.STATUS_CHANGE)
+        handle = core.start(
+            cluster_name=jobs_controller_type.value.cluster_name)
+
 
     controller_status = status_lib.ClusterStatus.UP
     rich_utils.force_update_status(ux_utils.spinner_message(spinner_message))
@@ -351,10 +692,13 @@ def _maybe_restart_controller(
351
692
  return handle
352
693
 
353
694
 
695
+ # For backwards compatibility
696
+ # TODO(hailong): Remove before 0.12.0.
354
697
  @usage_lib.entrypoint
355
698
  def queue(refresh: bool,
356
699
  skip_finished: bool = False,
357
- all_users: bool = False) -> List[Dict[str, Any]]:
700
+ all_users: bool = False,
701
+ job_ids: Optional[List[int]] = None) -> List[Dict[str, Any]]:
358
702
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
359
703
  """Gets statuses of managed jobs.
360
704
 
@@ -368,13 +712,15 @@ def queue(refresh: bool,
368
712
  'resources': str,
369
713
  'submitted_at': (float) timestamp of submission,
370
714
  'end_at': (float) timestamp of end,
371
- 'duration': (float) duration in seconds,
715
+ 'job_duration': (float) duration in seconds,
372
716
  'recovery_count': (int) Number of retries,
373
717
  'status': (sky.jobs.ManagedJobStatus) of the job,
374
718
  'cluster_resources': (str) resources of the cluster,
375
719
  'region': (str) region of the cluster,
376
720
  'user_name': (Optional[str]) job creator's user name,
377
721
  'user_hash': (str) job creator's user hash,
722
+ 'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
723
+ 'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
378
724
  }
379
725
  ]
380
726
  Raises:
@@ -382,51 +728,222 @@ def queue(refresh: bool,
382
728
  does not exist.
383
729
  RuntimeError: if failed to get the managed jobs with ssh.
384
730
  """
385
- handle = _maybe_restart_controller(refresh,
386
- stopped_message='No in-progress '
387
- 'managed jobs.',
388
- spinner_message='Checking '
389
- 'managed jobs')
731
+ jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids)
732
+
733
+ return jobs
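
Given the record format documented in the docstring above, a caller can summarize the returned list directly. A sketch that only touches fields named there:

from typing import Any, Dict, List


def summarize(jobs: List[Dict[str, Any]]) -> None:
    # `jobs` is the list returned by queue() above; only fields named in
    # the docstring are referenced.
    for job in jobs:
        duration = job.get('job_duration') or 0.0
        print(f"{job['job_id']:>5}  {job['job_name']:<24} "
              f"{str(job['status']):<30} {duration:>8.0f}s  "
              f"recoveries={job['recovery_count']}")
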
734
+
735
+
736
+ @usage_lib.entrypoint
737
+ def queue_v2_api(
738
+ refresh: bool,
739
+ skip_finished: bool = False,
740
+ all_users: bool = False,
741
+ job_ids: Optional[List[int]] = None,
742
+ user_match: Optional[str] = None,
743
+ workspace_match: Optional[str] = None,
744
+ name_match: Optional[str] = None,
745
+ pool_match: Optional[str] = None,
746
+ page: Optional[int] = None,
747
+ limit: Optional[int] = None,
748
+ statuses: Optional[List[str]] = None,
749
+ fields: Optional[List[str]] = None,
750
+ ) -> Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]:
751
+ """Gets statuses of managed jobs and parse the
752
+ jobs to responses.ManagedJobRecord."""
753
+ jobs, total, status_counts, total_no_filter = queue_v2(
754
+ refresh, skip_finished, all_users, job_ids, user_match, workspace_match,
755
+ name_match, pool_match, page, limit, statuses, fields)
756
+ return [responses.ManagedJobRecord(**job) for job in jobs
757
+ ], total, status_counts, total_no_filter
758
+
759
+
760
+ @metrics_lib.time_me
761
+ def queue_v2(
762
+ refresh: bool,
763
+ skip_finished: bool = False,
764
+ all_users: bool = False,
765
+ job_ids: Optional[List[int]] = None,
766
+ user_match: Optional[str] = None,
767
+ workspace_match: Optional[str] = None,
768
+ name_match: Optional[str] = None,
769
+ pool_match: Optional[str] = None,
770
+ page: Optional[int] = None,
771
+ limit: Optional[int] = None,
772
+ statuses: Optional[List[str]] = None,
773
+ fields: Optional[List[str]] = None,
774
+ ) -> Tuple[List[Dict[str, Any]], int, Dict[str, int], int]:
775
+ # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
776
+ """Gets statuses of managed jobs with filtering.
777
+
778
+ Please refer to sky.cli.job_queue for documentation.
779
+
780
+ Returns:
781
+ jobs: List[Dict[str, Any]]
782
+ [
783
+ {
784
+ 'job_id': int,
785
+ 'job_name': str,
786
+ 'resources': str,
787
+ 'submitted_at': (float) timestamp of submission,
788
+ 'end_at': (float) timestamp of end,
789
+ 'job_duration': (float) duration in seconds,
790
+ 'recovery_count': (int) Number of retries,
791
+ 'status': (sky.jobs.ManagedJobStatus) of the job,
792
+ 'cluster_resources': (str) resources of the cluster,
793
+ 'region': (str) region of the cluster,
794
+ 'user_name': (Optional[str]) job creator's user name,
795
+ 'user_hash': (str) job creator's user hash,
796
+ 'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
797
+ 'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
798
+ }
799
+ ]
800
+ total: int, total number of jobs after filtering
801
+ status_counts: Dict[str, int], per-status job counts after filtering
802
+ total_no_filter: int, total number of jobs before filtering
803
+ Raises:
804
+ sky.exceptions.ClusterNotUpError: the jobs controller is not up or
805
+ does not exist.
806
+ RuntimeError: if failed to get the managed jobs with ssh.
807
+ """
808
+ if limit is not None:
809
+ if limit < 1:
810
+ raise ValueError(f'Limit must be at least 1, got {limit}')
811
+ if page is None:
812
+ page = 1
813
+ if page < 1:
814
+ raise ValueError(f'Page must be at least 1, got {page}')
815
+ else:
816
+ if page is not None:
817
+ raise ValueError('Limit must be specified when page is specified')
818
+
819
+ with metrics_lib.time_it('jobs.queue.restart_controller', group='jobs'):
820
+ handle = _maybe_restart_controller(refresh,
821
+ stopped_message='No in-progress '
822
+ 'managed jobs.',
823
+ spinner_message='Checking '
824
+ 'managed jobs')
390
825
  backend = backend_utils.get_backend_from_handle(handle)
391
826
  assert isinstance(backend, backends.CloudVmRayBackend)
392
827
 
393
- code = managed_job_utils.ManagedJobCodeGen.get_job_table()
394
- returncode, job_table_payload, stderr = backend.run_on_head(
395
- handle,
396
- code,
397
- require_outputs=True,
398
- stream_logs=False,
399
- separate_stderr=True)
828
+ user_hashes: Optional[List[Optional[str]]] = None
829
+ show_jobs_without_user_hash = False
830
+ if not all_users:
831
+ user_hashes = [common_utils.get_user_hash()]
832
+ # For backwards compatibility, we show jobs that do not have a
833
+ # user_hash. TODO(cooperc): Remove before 0.12.0.
834
+ user_hashes.append(None)
835
+ show_jobs_without_user_hash = True
836
+ elif user_match is not None:
837
+ users = global_user_state.get_user_by_name_match(user_match)
838
+ if not users:
839
+ return [], 0, {}, 0
840
+ user_hashes = [user.id for user in users]
841
+
842
+ accessible_workspaces = list(workspaces_core.get_workspaces().keys())
843
+
844
+ if handle.is_grpc_enabled_with_flag:
845
+ try:
846
+ request = managed_jobsv1_pb2.GetJobTableRequest(
847
+ skip_finished=skip_finished,
848
+ accessible_workspaces=(managed_jobsv1_pb2.Workspaces(
849
+ workspaces=accessible_workspaces)),
850
+ job_ids=managed_jobsv1_pb2.JobIds(
851
+ ids=job_ids) if job_ids is not None else None,
852
+ workspace_match=workspace_match,
853
+ name_match=name_match,
854
+ pool_match=pool_match,
855
+ page=page,
856
+ limit=limit,
857
+ # Remove None from user_hashes, as the gRPC server uses the
858
+ # show_jobs_without_user_hash flag instead.
859
+ user_hashes=managed_jobsv1_pb2.UserHashes(hashes=[
860
+ user_hash for user_hash in user_hashes
861
+ if user_hash is not None
862
+ ]) if user_hashes is not None else None,
863
+ statuses=managed_jobsv1_pb2.Statuses(
864
+ statuses=statuses) if statuses is not None else None,
865
+ fields=managed_jobsv1_pb2.Fields(
866
+ fields=fields) if fields is not None else None,
867
+ show_jobs_without_user_hash=show_jobs_without_user_hash,
868
+ )
869
+ response = backend_utils.invoke_skylet_with_retries(
870
+ lambda: cloud_vm_ray_backend.SkyletClient(
871
+ handle.get_grpc_channel()).get_managed_job_table(request))
872
+ jobs = managed_job_utils.decode_managed_job_protos(response.jobs)
873
+ return jobs, response.total, dict(
874
+ response.status_counts), response.total_no_filter
875
+ except exceptions.SkyletMethodNotImplementedError:
876
+ pass
877
+
878
+ with metrics_lib.time_it('jobs.queue.generate_code', group='jobs'):
879
+ code = managed_job_utils.ManagedJobCodeGen.get_job_table(
880
+ skip_finished, accessible_workspaces, job_ids, workspace_match,
881
+ name_match, pool_match, page, limit, user_hashes, statuses, fields)
882
+ with metrics_lib.time_it('jobs.queue.run_on_head', group='jobs'):
883
+ returncode, job_table_payload, stderr = backend.run_on_head(
884
+ handle,
885
+ code,
886
+ require_outputs=True,
887
+ stream_logs=False,
888
+ separate_stderr=True)
400
889
 
401
890
  if returncode != 0:
402
891
  logger.error(job_table_payload + stderr)
403
892
  raise RuntimeError('Failed to fetch managed jobs with returncode: '
404
- f'{returncode}')
893
+ f'{returncode}.\n{job_table_payload + stderr}')
405
894
 
406
- jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
895
+ with metrics_lib.time_it('jobs.queue.load_job_queue', group='jobs'):
896
+ (jobs, total, result_type, total_no_filter, status_counts
897
+ ) = managed_job_utils.load_managed_job_queue(job_table_payload)
407
898
 
408
- if not all_users:
899
+ if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
900
+ return jobs, total, status_counts, total_no_filter
409
901
 
410
- def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
411
- user_hash = job.get('user_hash', None)
412
- if user_hash is None:
413
- # For backwards compatibility, we show jobs that do not have a
414
- # user_hash. TODO(cooperc): Remove before 0.12.0.
415
- return True
416
- return user_hash == common_utils.get_user_hash()
902
+ # Backward compatibility for old jobs controller without filtering
903
+ # TODO(hailong): remove this after 0.12.0
904
+ with metrics_lib.time_it('jobs.queue.filter_and_process', group='jobs'):
905
+ if not all_users:
417
906
 
418
- jobs = list(filter(user_hash_matches_or_missing, jobs))
907
+ def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
908
+ user_hash = job.get('user_hash', None)
909
+ if user_hash is None:
910
+ # For backwards compatibility, we show jobs that do not have
911
+ # a user_hash. TODO(cooperc): Remove before 0.12.0.
912
+ return True
913
+ return user_hash == common_utils.get_user_hash()
419
914
 
420
- if skip_finished:
421
- # Filter out the finished jobs. If a multi-task job is partially
422
- # finished, we will include all its tasks.
423
- non_finished_tasks = list(
424
- filter(lambda job: not job['status'].is_terminal(), jobs))
425
- non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
426
- jobs = list(
427
- filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
915
+ jobs = list(filter(user_hash_matches_or_missing, jobs))
428
916
 
429
- return jobs
917
+ jobs = list(
918
+ filter(
919
+ lambda job: job.get('workspace', skylet_constants.
920
+ SKYPILOT_DEFAULT_WORKSPACE) in
921
+ accessible_workspaces, jobs))
922
+
923
+ if skip_finished:
924
+ # Filter out the finished jobs. If a multi-task job is partially
925
+ # finished, we will include all its tasks.
926
+ non_finished_tasks = list(
927
+ filter(lambda job: not job['status'].is_terminal(), jobs))
928
+ non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
929
+ jobs = list(
930
+ filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
931
+
932
+ if job_ids:
933
+ jobs = [job for job in jobs if job['job_id'] in job_ids]
934
+
935
+ filtered_jobs, total, status_counts = managed_job_utils.filter_jobs(
936
+ jobs,
937
+ workspace_match,
938
+ name_match,
939
+ pool_match,
940
+ page=page,
941
+ limit=limit,
942
+ user_match=user_match,
943
+ enable_user_match=True,
944
+ statuses=statuses,
945
+ )
946
+ return filtered_jobs, total, status_counts, total_no_filter
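
queue_v2 tries the controller's gRPC endpoint first and only falls back to generating code to run over SSH when the remote skylet predates the new method. A stripped-down sketch of that fallback shape; fetch_via_grpc and fetch_via_codegen are hypothetical stand-ins, not real helpers:

class SkyletMethodNotImplementedError(Exception):
    """Stand-in for sky.exceptions.SkyletMethodNotImplementedError."""


def fetch_job_table(handle, fetch_via_grpc, fetch_via_codegen):
    # fetch_via_grpc / fetch_via_codegen are injected callables standing in
    # for the gRPC request and the run_on_head code-gen path above.
    if getattr(handle, 'is_grpc_enabled_with_flag', False):
        try:
            # Newer skylet: filtering and pagination happen server-side.
            return fetch_via_grpc(handle)
        except SkyletMethodNotImplementedError:
            # Older skylet without the gRPC method: fall through.
            pass
    # Legacy path: run generated code on the head node and filter locally.
    return fetch_via_codegen(handle)
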
430
947
 
431
948
 
432
949
  @usage_lib.entrypoint
@@ -434,7 +951,8 @@ def queue(refresh: bool,
434
951
  def cancel(name: Optional[str] = None,
435
952
  job_ids: Optional[List[int]] = None,
436
953
  all: bool = False,
437
- all_users: bool = False) -> None:
954
+ all_users: bool = False,
955
+ pool: Optional[str] = None) -> None:
438
956
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
439
957
  """Cancels managed jobs.
440
958
 
@@ -444,57 +962,98 @@ def cancel(name: Optional[str] = None,
444
962
  sky.exceptions.ClusterNotUpError: the jobs controller is not up.
445
963
  RuntimeError: failed to cancel the job.
446
964
  """
447
- job_ids = [] if job_ids is None else job_ids
448
- handle = backend_utils.is_controller_accessible(
449
- controller=controller_utils.Controllers.JOBS_CONTROLLER,
450
- stopped_message='All managed jobs should have finished.')
451
-
452
- job_id_str = ','.join(map(str, job_ids))
453
- if sum([bool(job_ids), name is not None, all or all_users]) != 1:
454
- arguments = []
455
- arguments += [f'job_ids={job_id_str}'] if job_ids else []
456
- arguments += [f'name={name}'] if name is not None else []
457
- arguments += ['all'] if all else []
458
- arguments += ['all_users'] if all_users else []
459
- with ux_utils.print_exception_no_traceback():
460
- raise ValueError('Can only specify one of JOB_IDS, name, or all/'
461
- f'all_users. Provided {" ".join(arguments)!r}.')
965
+ with rich_utils.safe_status(
966
+ ux_utils.spinner_message('Cancelling managed jobs')):
967
+ job_ids = [] if job_ids is None else job_ids
968
+ handle = backend_utils.is_controller_accessible(
969
+ controller=controller_utils.Controllers.JOBS_CONTROLLER,
970
+ stopped_message='All managed jobs should have finished.')
971
+
972
+ job_id_str = ','.join(map(str, job_ids))
973
+ if sum([
974
+ bool(job_ids), name is not None, pool is not None, all or
975
+ all_users
976
+ ]) != 1:
977
+ arguments = []
978
+ arguments += [f'job_ids={job_id_str}'] if job_ids else []
979
+ arguments += [f'name={name}'] if name is not None else []
980
+ arguments += [f'pool={pool}'] if pool is not None else []
981
+ arguments += ['all'] if all else []
982
+ arguments += ['all_users'] if all_users else []
983
+ with ux_utils.print_exception_no_traceback():
984
+ raise ValueError(
985
+ 'Can only specify one of JOB_IDS, name, pool, or all/'
986
+ f'all_users. Provided {" ".join(arguments)!r}.')
462
987
 
463
- backend = backend_utils.get_backend_from_handle(handle)
464
- assert isinstance(backend, backends.CloudVmRayBackend)
465
- if all_users:
466
- code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
467
- None, all_users=True)
468
- elif all:
469
- code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(None)
470
- elif job_ids:
471
- code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(job_ids)
472
- else:
473
- assert name is not None, (job_ids, name, all)
474
- code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(name)
475
- # The stderr is redirected to stdout
476
- returncode, stdout, _ = backend.run_on_head(handle,
477
- code,
478
- require_outputs=True,
479
- stream_logs=False)
480
- try:
481
- subprocess_utils.handle_returncode(returncode, code,
482
- 'Failed to cancel managed job',
483
- stdout)
484
- except exceptions.CommandError as e:
485
- with ux_utils.print_exception_no_traceback():
486
- raise RuntimeError(e.error_msg) from e
988
+ job_ids = None if (all_users or all) else job_ids
487
989
 
488
- sky_logging.print(stdout)
489
- if 'Multiple jobs found with name' in stdout:
490
- with ux_utils.print_exception_no_traceback():
491
- raise RuntimeError(
492
- 'Please specify the job ID instead of the job name.')
990
+ backend = backend_utils.get_backend_from_handle(handle)
991
+ assert isinstance(backend, backends.CloudVmRayBackend)
992
+
993
+ use_legacy = not handle.is_grpc_enabled_with_flag
994
+
995
+ if not use_legacy:
996
+ current_workspace = skypilot_config.get_active_workspace()
997
+ try:
998
+ request = managed_jobsv1_pb2.CancelJobsRequest(
999
+ current_workspace=current_workspace)
1000
+
1001
+ if all_users or all or job_ids:
1002
+ request.all_users = all_users
1003
+ if all:
1004
+ request.user_hash = common_utils.get_user_hash()
1005
+ if job_ids is not None:
1006
+ request.job_ids.CopyFrom(
1007
+ managed_jobsv1_pb2.JobIds(ids=job_ids))
1008
+ elif name is not None:
1009
+ request.job_name = name
1010
+ else:
1011
+ assert pool is not None, (job_ids, name, pool, all)
1012
+ request.pool_name = pool
1013
+
1014
+ response = backend_utils.invoke_skylet_with_retries(
1015
+ lambda: cloud_vm_ray_backend.SkyletClient(
1016
+ handle.get_grpc_channel()).cancel_managed_jobs(request))
1017
+ stdout = response.message
1018
+ except exceptions.SkyletMethodNotImplementedError:
1019
+ use_legacy = True
1020
+
1021
+ if use_legacy:
1022
+ if all_users or all or job_ids:
1023
+ code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
1024
+ job_ids, all_users=all_users)
1025
+ elif name is not None:
1026
+ code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(
1027
+ name)
1028
+ else:
1029
+ assert pool is not None, (job_ids, name, pool, all)
1030
+ code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_pool(
1031
+ pool)
1032
+ # The stderr is redirected to stdout
1033
+ returncode, stdout, stderr = backend.run_on_head(
1034
+ handle, code, require_outputs=True, stream_logs=False)
1035
+ try:
1036
+ subprocess_utils.handle_returncode(
1037
+ returncode, code, 'Failed to cancel managed job',
1038
+ stdout + stderr)
1039
+ except exceptions.CommandError as e:
1040
+ with ux_utils.print_exception_no_traceback():
1041
+ raise RuntimeError(e.error_msg) from e
1042
+
1043
+ logger.info(stdout)
1044
+ if 'Multiple jobs found with name' in stdout:
1045
+ with ux_utils.print_exception_no_traceback():
1046
+ raise RuntimeError(
1047
+ 'Please specify the job ID instead of the job name.')
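
cancel() accepts exactly one selector (job IDs, a name, a pool, or the all/all_users flags) and enforces that by summing booleans. The same check in isolation, as a sketch with simplified argument names:

from typing import List, Optional


def validate_one_selector(job_ids: Optional[List[int]], name: Optional[str],
                          pool: Optional[str], all_jobs: bool,
                          all_users: bool) -> None:
    # Exactly one way of selecting jobs may be used per call.
    selectors = [
        bool(job_ids), name is not None, pool is not None,
        all_jobs or all_users
    ]
    if sum(selectors) != 1:
        raise ValueError('Can only specify one of JOB_IDS, name, pool, '
                         'or all/all_users.')


validate_one_selector([1, 2], None, None, False, False)  # ok
# validate_one_selector([], 'train', 'my-pool', False, False)  # raises
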
493
1048
 
494
1049
 
495
1050
  @usage_lib.entrypoint
496
- def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
497
- controller: bool, refresh: bool) -> int:
1051
+ def tail_logs(name: Optional[str],
1052
+ job_id: Optional[int],
1053
+ follow: bool,
1054
+ controller: bool,
1055
+ refresh: bool,
1056
+ tail: Optional[int] = None) -> int:
498
1057
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
499
1058
  """Tail logs of managed jobs.
500
1059
 
@@ -537,56 +1096,8 @@ def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
537
1096
  job_id=job_id,
538
1097
  job_name=name,
539
1098
  follow=follow,
540
- controller=controller)
541
-
542
-
543
- def start_dashboard_forwarding(refresh: bool = False) -> Tuple[int, int]:
544
- """Opens a dashboard for managed jobs (needs controller to be UP)."""
545
- # TODO(SKY-1212): ideally, the controller/dashboard server should expose the
546
- # API perhaps via REST. Then here we would (1) not have to use SSH to try to
547
- # see if the controller is UP first, which is slow; (2) not have to run SSH
548
- # port forwarding first (we'd just launch a local dashboard which would make
549
- # REST API calls to the controller dashboard server).
550
- logger.info('Starting dashboard')
551
- hint = ('Dashboard is not available if jobs controller is not up. Run '
552
- 'a managed job first or run: sky jobs queue --refresh')
553
- handle = _maybe_restart_controller(
554
- refresh=refresh,
555
- stopped_message=hint,
556
- spinner_message='Checking jobs controller')
557
-
558
- # SSH forward a free local port to remote's dashboard port.
559
- remote_port = skylet_constants.SPOT_DASHBOARD_REMOTE_PORT
560
- free_port = common_utils.find_free_port(remote_port)
561
- runner = handle.get_command_runners()[0]
562
- port_forward_command = ' '.join(
563
- runner.port_forward_command(port_forward=[(free_port, remote_port)],
564
- connect_timeout=1))
565
- port_forward_command = (
566
- f'{port_forward_command} '
567
- f'> ~/sky_logs/api_server/dashboard-{common_utils.get_user_hash()}.log '
568
- '2>&1')
569
- logger.info(f'Forwarding port: {colorama.Style.DIM}{port_forward_command}'
570
- f'{colorama.Style.RESET_ALL}')
571
-
572
- ssh_process = subprocess.Popen(port_forward_command,
573
- shell=True,
574
- start_new_session=True)
575
- time.sleep(3) # Added delay for ssh_command to initialize.
576
- logger.info(f'{colorama.Fore.GREEN}Dashboard is now available at: '
577
- f'http://127.0.0.1:{free_port}{colorama.Style.RESET_ALL}')
578
-
579
- return free_port, ssh_process.pid
580
-
581
-
582
- def stop_dashboard_forwarding(pid: int) -> None:
583
- # Exit the ssh command when the context manager is closed.
584
- try:
585
- os.killpg(os.getpgid(pid), signal.SIGTERM)
586
- except ProcessLookupError:
587
- # This happens if jobs controller is auto-stopped.
588
- pass
589
- logger.info('Forwarding port closed. Exiting.')
1099
+ controller=controller,
1100
+ tail=tail)
590
1101
 
591
1102
 
592
1103
  @usage_lib.entrypoint
@@ -635,3 +1146,73 @@ def download_logs(
635
1146
  job_name=name,
636
1147
  controller=controller,
637
1148
  local_dir=local_dir)
1149
+
1150
+
1151
+ @usage_lib.entrypoint
1152
+ def pool_apply(
1153
+ task: 'sky.Task',
1154
+ pool_name: str,
1155
+ mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
1156
+ workers: Optional[int] = None,
1157
+ ) -> None:
1158
+ """Apply a config to a pool."""
1159
+ return impl.apply(task, workers, pool_name, mode, pool=True)
1160
+
1161
+
1162
+ @usage_lib.entrypoint
1163
+ # pylint: disable=redefined-builtin
1164
+ def pool_down(
1165
+ pool_names: Optional[Union[str, List[str]]] = None,
1166
+ all: bool = False,
1167
+ purge: bool = False,
1168
+ ) -> None:
1169
+ """Delete a pool."""
1170
+ return impl.down(pool_names, all, purge, pool=True)
1171
+
1172
+
1173
+ @usage_lib.entrypoint
1174
+ def pool_status(
1175
+ pool_names: Optional[Union[str,
1176
+ List[str]]] = None,) -> List[Dict[str, Any]]:
1177
+ """Query a pool."""
1178
+ return impl.status(pool_names, pool=True)
1179
+
1180
+
1181
+ ServiceComponentOrStr = Union[str, serve_utils.ServiceComponent]
1182
+
1183
+
1184
+ @usage_lib.entrypoint
1185
+ def pool_tail_logs(
1186
+ pool_name: str,
1187
+ *,
1188
+ target: ServiceComponentOrStr,
1189
+ worker_id: Optional[int] = None,
1190
+ follow: bool = True,
1191
+ tail: Optional[int] = None,
1192
+ ) -> None:
1193
+ """Tail logs of a pool."""
1194
+ return impl.tail_logs(pool_name,
1195
+ target=target,
1196
+ replica_id=worker_id,
1197
+ follow=follow,
1198
+ tail=tail,
1199
+ pool=True)
1200
+
1201
+
1202
+ @usage_lib.entrypoint
1203
+ def pool_sync_down_logs(
1204
+ pool_name: str,
1205
+ *,
1206
+ local_dir: str,
1207
+ targets: Union[ServiceComponentOrStr, List[ServiceComponentOrStr],
1208
+ None] = None,
1209
+ worker_ids: Optional[List[int]] = None,
1210
+ tail: Optional[int] = None,
1211
+ ) -> str:
1212
+ """Sync down logs of a pool."""
1213
+ return impl.sync_down_logs(pool_name,
1214
+ local_dir=local_dir,
1215
+ targets=targets,
1216
+ replica_ids=worker_ids,
1217
+ tail=tail,
1218
+ pool=True)
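
The pool_* entrypoints above are thin wrappers that forward to the shared serve implementation with pool=True. A hedged usage sketch; the pool name, task command, and worker count are assumptions for illustration only:

import sky

task = sky.Task(run='echo hello from a pool worker')
pool_apply(task, pool_name='my-pool', workers=2)  # create or update the pool
for record in pool_status('my-pool'):             # inspect its workers
    print(record)
pool_down('my-pool')                              # tear it down when done
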