skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registries. It is provided for informational purposes only.

Potentially problematic release: this version of skypilot-nightly might be problematic.
Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
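
The largest single change below is to sky/jobs/server/core.py: the managed-jobs launch() entrypoint now accepts pool and num_jobs arguments, can submit several jobs in parallel, and supports a consolidation mode that runs the scheduler on the API server itself. As a rough illustration only, the sketch below shows how the new parameters might be used from the Python SDK; it assumes the client-side sky.jobs.launch() mirrors the server-side signature in this diff (verify against sky/jobs/client/sdk.py in this release), and the pool name is hypothetical.

import sky

# Hypothetical example; 'my-pool' must be an existing pool in your deployment.
# SKYPILOT_JOB_RANK / SKYPILOT_NUM_JOBS are the env vars injected per job in this diff.
task = sky.Task(run='echo "rank ${SKYPILOT_JOB_RANK:-0} of ${SKYPILOT_NUM_JOBS:-1}"')
task.set_resources(sky.Resources(cpus='2+'))

# Submit three copies of the task as managed jobs. Per the server-side signature
# in this diff, launching with num_jobs > 1 yields a list of job IDs plus a
# resource handle; the client SDK may instead wrap this in an async request.
sky.jobs.launch(task, name='demo', pool='my-pool', num_jobs=3)
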
sky/jobs/server/core.py CHANGED
@@ -1,11 +1,13 @@
 """SDK functions for managed jobs."""
+import concurrent.futures
+import copy
+import ipaddress
 import os
-import signal
-import subprocess
+import pathlib
 import tempfile
-import time
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
+from urllib import parse as urlparse
 import uuid
 
 import colorama
@@ -17,13 +19,23 @@ from sky import execution
 from sky import global_user_state
 from sky import provision as provision_lib
 from sky import sky_logging
+from sky import skypilot_config
 from sky import task as task_lib
+from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
-from sky.clouds.service_catalog import common as service_catalog_common
+from sky.backends import cloud_vm_ray_backend
+from sky.catalog import common as service_catalog_common
 from sky.data import storage as storage_lib
 from sky.jobs import constants as managed_job_constants
+from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
+from sky.metrics import utils as metrics_lib
 from sky.provision import common as provision_common
+from sky.schemas.api import responses
+from sky.serve import serve_state
+from sky.serve import serve_utils
+from sky.serve.server import impl
+from sky.server.requests import request_names
 from sky.skylet import constants as skylet_constants
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
@@ -36,21 +48,153 @@ from sky.utils import status_lib
 from sky.utils import subprocess_utils
 from sky.utils import timeline
 from sky.utils import ux_utils
+from sky.workspaces import core as workspaces_core
 
 if typing.TYPE_CHECKING:
+    from google.protobuf import json_format
+
     import sky
-    from sky.backends import cloud_vm_ray_backend
+    from sky.schemas.generated import managed_jobsv1_pb2
+else:
+    json_format = adaptors_common.LazyImport('google.protobuf.json_format')
+
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
 
 logger = sky_logging.init_logger(__name__)
 
+_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES = [
+    'job_id',
+    'task_id',
+    'workspace',
+    'job_name',
+    'task_name',
+    'resources',
+    'submitted_at',
+    'end_at',
+    'job_duration',
+    'recovery_count',
+    'status',
+    'pool',
+    'current_cluster_name',
+    'job_id_on_pool_cluster',
+    'start_at',
+    'infra',
+    'cloud',
+    'region',
+    'zone',
+    'cluster_resources',
+    'schedule_state',
+    'details',
+    'failure_reason',
+    'metadata',
+    'user_name',
+    'user_hash',
+]
+
+
+def _upload_files_to_controller(dag: 'sky.Dag') -> Dict[str, str]:
+    """Upload files to the controller.
+
+    In consolidation mode, we still need to upload files to the controller as
+    we should keep a separate workdir for each jobs. Assuming two jobs using
+    the same workdir, if there are some modifications to the workdir after job 1
+    is submitted, on recovery of job 1, the modifications should not be applied.
+    """
+    local_to_controller_file_mounts: Dict[str, str] = {}
+
+    # For consolidation mode, we don't need to use cloud storage,
+    # as uploading to the controller is only a local copy.
+    storage_clouds = (
+        storage_lib.get_cached_enabled_storage_cloud_names_or_refresh())
+    force_disable_cloud_bucket = skypilot_config.get_nested(
+        ('jobs', 'force_disable_cloud_bucket'), False)
+    if (not managed_job_utils.is_consolidation_mode() and storage_clouds and
+            not force_disable_cloud_bucket):
+        for task_ in dag.tasks:
+            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
+                task_, task_type='jobs')
+    else:
+        # We do not have any cloud storage available, so fall back to
+        # two-hop file_mount uploading.
+        # Note: we can't easily hack sync_storage_mounts() to upload
+        # directly to the controller, because the controller may not
+        # even be up yet.
+        for task_ in dag.tasks:
+            if task_.storage_mounts and not storage_clouds:
+                # Technically, we could convert COPY storage_mounts that
+                # have a local source and do not specify `store`, but we
+                # will not do that for now. Only plain file_mounts are
+                # supported.
+                raise exceptions.NotSupportedError(
+                    'Cloud-based file_mounts are specified, but no cloud '
+                    'storage is available. Please specify local '
+                    'file_mounts only.')
+
+            # Merge file mounts from all tasks.
+            local_to_controller_file_mounts.update(
+                controller_utils.translate_local_file_mounts_to_two_hop(task_))
+
+    return local_to_controller_file_mounts
+
+
+def _maybe_submit_job_locally(prefix: str, dag: 'sky.Dag',
+                              num_jobs: int) -> Optional[List[int]]:
+    """Submit the managed job locally if in consolidation mode.
+
+    In normal mode the managed job submission is done in the ray job submission.
+    For consolidation mode, we need to manually submit it. Check the following
+    function for the normal mode submission:
+    sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend,
+    _exec_code_on_head::_maybe_add_managed_job_code
+    """
+    if not managed_job_utils.is_consolidation_mode():
+        return None
+
+    # Create local directory for the managed job.
+    pathlib.Path(prefix).expanduser().mkdir(parents=True, exist_ok=True)
+    job_ids = []
+    pool = dag.pool
+    pool_hash = None
+    if pool is not None:
+        pool_hash = serve_state.get_service_hash(pool)
+        # Already checked in the sdk.
+        assert pool_hash is not None, f'Pool {pool} not found'
+    for _ in range(num_jobs):
+        # TODO(tian): We should have a separate name for each job when
+        # submitting multiple jobs. Current blocker is that we are sharing
+        # the same dag object for all jobs. Maybe we can do copy.copy() for
+        # each job and then give it a unique name (e.g. append job id after
+        # the task name). The name of the dag also needs to be aligned with
+        # the task name.
+        consolidation_mode_job_id = (
+            managed_job_state.set_job_info_without_job_id(
+                dag.name,
+                workspace=skypilot_config.get_active_workspace(
+                    force_user_workspace=True),
+                entrypoint=common_utils.get_current_command(),
+                pool=pool,
+                pool_hash=pool_hash,
+                user_hash=common_utils.get_user_hash()))
+        for task_id, task in enumerate(dag.tasks):
+            resources_str = backend_utils.get_task_resources_str(
+                task, is_managed_job=True)
+            managed_job_state.set_pending(consolidation_mode_job_id, task_id,
+                                          task.name, resources_str,
+                                          task.metadata_json)
+        job_ids.append(consolidation_mode_job_id)
+    return job_ids
+
 
 @timeline.event
 @usage_lib.entrypoint
 def launch(
     task: Union['sky.Task', 'sky.Dag'],
     name: Optional[str] = None,
+    pool: Optional[str] = None,
+    num_jobs: Optional[int] = None,
     stream_logs: bool = True,
-) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
+) -> Tuple[Optional[Union[int, List[int]]], Optional[backends.ResourceHandle]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Launches a managed job.
 
@@ -76,21 +220,58 @@ def launch(
         None if dryrun.
     """
     entrypoint = task
+    # using hasattr instead of isinstance to avoid importing sky
+    if hasattr(task, 'metadata'):
+        metadata = task.metadata
+    else:
+        # we are a Dag, not a Task
+        if len(task.tasks) == 1:
+            metadata = task.tasks[0].metadata
+        else:
+            # doesn't make sense to have a git commit since there might be
+            # different metadatas for each task
+            metadata = {}
+
     dag_uuid = str(uuid.uuid4().hex[:4])
     dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
+
     # Always apply the policy again here, even though it might have been applied
     # in the CLI. This is to ensure that we apply the policy to the final DAG
     # and get the mutated config.
     dag, mutated_user_config = admin_policy_utils.apply(
-        dag, use_mutated_config_in_current_request=False)
+        dag, request_name=request_names.AdminPolicyRequestName.JOBS_LAUNCH)
+    dag.resolve_and_validate_volumes()
     if not dag.is_chain():
         with ux_utils.print_exception_no_traceback():
             raise ValueError('Only single-task or chain DAG is '
                              f'allowed for job_launch. Dag: {dag}')
     dag.validate()
+    # TODO(aylei): use consolidated job controller instead of performing
+    # pre-mount operations when submitting jobs.
+    dag.pre_mount_volumes()
+
+    # If there is a local postgres db, when the api server tries launching on
+    # the remote jobs controller it will fail. therefore, we should remove this
+    # before sending the config to the jobs controller.
+    # TODO(luca) there are a lot of potential problems with postgres being sent
+    # to the jobs controller. for example if the postgres is whitelisted to
+    # only the API server, this will then break. the simple solution to that is
+    # telling the user to add the jobs controller to the postgres whitelist.
+    if not managed_job_utils.is_consolidation_mode():
+        db_path = mutated_user_config.get('db', None)
+        if db_path is not None:
+            parsed = urlparse.urlparse(db_path)
+            if ((parsed.hostname == 'localhost' or
+                 ipaddress.ip_address(parsed.hostname).is_loopback)):
+                mutated_user_config.pop('db', None)
+
+    user_dag_str_user_specified = dag_utils.dump_chain_dag_to_yaml_str(
+        dag, use_user_specified_yaml=True)
+
     dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
 
     task_names = set()
+    priority = None
     for task_ in dag.tasks:
         if task_.name in task_names:
             with ux_utils.print_exception_no_traceback():
@@ -101,6 +282,42 @@ def launch(
                     'will be auto-generated) .')
         task_names.add(task_.name)
 
+        # Check for priority in resources
+        task_priority = None
+        if task_.resources:
+            # Convert set to list to access elements by index
+            resources_list = list(task_.resources)
+            # Take first resource's priority as reference
+            task_priority = resources_list[0].priority
+
+            # Check all other resources have same priority
+            for resource in resources_list[1:]:
+                if resource.priority != task_priority:
+                    with ux_utils.print_exception_no_traceback():
+                        raise ValueError(
+                            f'Task {task_.name!r}: All resources must have the '
+                            'same priority. Found priority '
+                            f'{resource.priority} but expected {task_priority}.'
+                        )
+
+        if task_priority is not None:
+            if (priority is not None and priority != task_priority):
+                with ux_utils.print_exception_no_traceback():
+                    raise ValueError(
+                        'Multiple tasks in the DAG have different priorities. '
+                        'Either specify a priority in only one task, or set '
+                        'the same priority for each task.')
+            priority = task_priority
+
+    if priority is None:
+        priority = skylet_constants.DEFAULT_PRIORITY
+
+    if (priority < skylet_constants.MIN_PRIORITY or
+            priority > skylet_constants.MAX_PRIORITY):
+        raise ValueError(
+            f'Priority must be between {skylet_constants.MIN_PRIORITY}'
+            f' and {skylet_constants.MAX_PRIORITY}, got {priority}')
+
     dag_utils.fill_default_config_in_dag_for_job_launch(dag)
 
     with rich_utils.safe_status(
@@ -109,15 +326,13 @@
         # Check whether cached jobs controller cluster is accessible
         cluster_name = (
             controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
-        record = global_user_state.get_cluster_from_name(cluster_name)
-        if record is not None:
+        if global_user_state.cluster_with_name_exists(cluster_name):
             # there is a cached jobs controller cluster
             try:
                 # TODO: do something with returned status?
                 _, _ = backend_utils.refresh_cluster_status_handle(
                     cluster_name=cluster_name,
-                    force_refresh_statuses=set(status_lib.ClusterStatus),
-                    acquire_per_cluster_status_lock=False)
+                    force_refresh_statuses=set(status_lib.ClusterStatus))
             except (exceptions.ClusterOwnerIdentityMismatchError,
                     exceptions.CloudUserIdentityError,
                     exceptions.ClusterStatusFetchingError) as e:
@@ -131,100 +346,216 @@ def launch(
                         f'with:\n\n`sky down {cluster_name} --purge`\n\n'
                         f'Reason: {common_utils.format_exception(e)}')
 
-    local_to_controller_file_mounts = {}
-
-    if storage_lib.get_cached_enabled_storage_cloud_names_or_refresh():
-        for task_ in dag.tasks:
-            controller_utils.maybe_translate_local_file_mounts_and_sync_up(
-                task_, task_type='jobs')
+    local_to_controller_file_mounts = _upload_files_to_controller(dag)
+    controller = controller_utils.Controllers.JOBS_CONTROLLER
+    controller_name = controller.value.cluster_name
+    prefix = managed_job_constants.JOBS_TASK_YAML_PREFIX
+    controller_resources = controller_utils.get_controller_resources(
+        controller=controller,
+        task_resources=sum([list(t.resources) for t in dag.tasks], []))
+
+    num_jobs = num_jobs if num_jobs is not None else 1
+    # We do this assignment after applying the admin policy, so that we don't
+    # need to serialize the pool name in the dag. The dag object will be
+    # preserved. See sky/admin_policy.py::MutatedUserRequest::decode.
+    dag.pool = pool
+    consolidation_mode_job_ids = _maybe_submit_job_locally(
+        prefix, dag, num_jobs)
+
+    # This is only needed for non-consolidation mode. For consolidation
+    # mode, the controller uses the same catalog as API server.
+    modified_catalogs = {} if consolidation_mode_job_ids is not None else (
+        service_catalog_common.get_modified_catalog_file_mounts())
+
+    def _submit_one(
+        consolidation_mode_job_id: Optional[int] = None,
+        job_rank: Optional[int] = None,
+        num_jobs: Optional[int] = None,
+    ) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
+        rank_suffix = '' if job_rank is None else f'-{job_rank}'
+        remote_original_user_yaml_path = (
+            f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.original_user_yaml')
+        remote_user_yaml_path = (
+            f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.yaml')
+        remote_user_config_path = (
+            f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.config_yaml')
+        remote_env_file_path = (
+            f'{prefix}/{dag.name}-{dag_uuid}{rank_suffix}.env')
+        with tempfile.NamedTemporaryFile(
+                prefix=f'managed-dag-{dag.name}{rank_suffix}-',
+                mode='w',
+        ) as f, tempfile.NamedTemporaryFile(
+                prefix=f'managed-user-dag-{dag.name}{rank_suffix}-',
+                mode='w',
+        ) as original_user_yaml_path:
+            original_user_yaml_path.write(user_dag_str_user_specified)
+            original_user_yaml_path.flush()
+            # Copy tasks to avoid race conditions when multiple threads modify
+            # the same dag object concurrently. Each thread needs its own copy.
+            dag_copy = copy.deepcopy(dag)
+            for task_ in dag_copy.tasks:
+                if job_rank is not None:
+                    task_.update_envs({'SKYPILOT_JOB_RANK': str(job_rank)})
+                task_.update_envs({'SKYPILOT_NUM_JOBS': str(num_jobs)})
+
+            dag_utils.dump_chain_dag_to_yaml(dag_copy, f.name)
+
+            vars_to_fill = {
+                'remote_original_user_yaml_path':
+                    (remote_original_user_yaml_path),
+                'original_user_dag_path': original_user_yaml_path.name,
+                'remote_user_yaml_path': remote_user_yaml_path,
+                'user_yaml_path': f.name,
+                'local_to_controller_file_mounts':
+                    (local_to_controller_file_mounts),
+                'jobs_controller': controller_name,
+                # Note: actual cluster name will be <task.name>-<managed job ID>
+                'dag_name': dag.name,
+                'remote_user_config_path': remote_user_config_path,
+                'remote_env_file_path': remote_env_file_path,
+                'modified_catalogs': modified_catalogs,
+                'priority': priority,
+                'consolidation_mode_job_id': consolidation_mode_job_id,
+                'pool': pool,
+                'job_controller_indicator_file':
+                    managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE,
+                **controller_utils.shared_controller_vars_to_fill(
+                    controller,
+                    remote_user_config_path=remote_user_config_path,
+                    # TODO(aylei): the mutated config will not be updated
+                    # afterwards without recreate the controller. Need to
+                    # revisit this.
+                    local_user_config=mutated_user_config,
+                ),
+            }
 
-    else:
-        # We do not have any cloud storage available, so fall back to
-        # two-hop file_mount uploading.
-        # Note: we can't easily hack sync_storage_mounts() to upload
-        # directly to the controller, because the controller may not
-        # even be up yet.
-        for task_ in dag.tasks:
-            if task_.storage_mounts:
-                # Technically, we could convert COPY storage_mounts that
-                # have a local source and do not specify `store`, but we
-                # will not do that for now. Only plain file_mounts are
-                # supported.
-                raise exceptions.NotSupportedError(
-                    'Cloud-based file_mounts are specified, but no cloud '
-                    'storage is available. Please specify local '
-                    'file_mounts only.')
-
-            # Merge file mounts from all tasks.
-            local_to_controller_file_mounts.update(
-                controller_utils.translate_local_file_mounts_to_two_hop(
-                    task_))
-
-    with tempfile.NamedTemporaryFile(prefix=f'managed-dag-{dag.name}-',
-                                     mode='w') as f:
-        dag_utils.dump_chain_dag_to_yaml(dag, f.name)
-        controller = controller_utils.Controllers.JOBS_CONTROLLER
-        controller_name = controller.value.cluster_name
-        prefix = managed_job_constants.JOBS_TASK_YAML_PREFIX
-        remote_user_yaml_path = f'{prefix}/{dag.name}-{dag_uuid}.yaml'
-        remote_user_config_path = f'{prefix}/{dag.name}-{dag_uuid}.config_yaml'
-        remote_env_file_path = f'{prefix}/{dag.name}-{dag_uuid}.env'
-        controller_resources = controller_utils.get_controller_resources(
-            controller=controller,
-            task_resources=sum([list(t.resources) for t in dag.tasks], []))
-        controller_idle_minutes_to_autostop, controller_down = (
-            controller_utils.get_controller_autostop_config(
-                controller=controller))
-
-        vars_to_fill = {
-            'remote_user_yaml_path': remote_user_yaml_path,
-            'user_yaml_path': f.name,
-            'local_to_controller_file_mounts': local_to_controller_file_mounts,
-            'jobs_controller': controller_name,
-            # Note: actual cluster name will be <task.name>-<managed job ID>
-            'dag_name': dag.name,
-            'remote_user_config_path': remote_user_config_path,
-            'remote_env_file_path': remote_env_file_path,
-            'modified_catalogs':
-                service_catalog_common.get_modified_catalog_file_mounts(),
-            'dashboard_setup_cmd': managed_job_constants.DASHBOARD_SETUP_CMD,
-            'dashboard_user_id': common.SERVER_ID,
-            **controller_utils.shared_controller_vars_to_fill(
-                controller,
-                remote_user_config_path=remote_user_config_path,
-                local_user_config=mutated_user_config,
-            ),
-        }
-
-        yaml_path = os.path.join(
-            managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
-            f'{name}-{dag_uuid}.yaml')
-        common_utils.fill_template(
-            managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
-            vars_to_fill,
-            output_path=yaml_path)
-        controller_task = task_lib.Task.from_yaml(yaml_path)
-        controller_task.set_resources(controller_resources)
-
-        controller_task.managed_job_dag = dag
-
-        sky_logging.print(
-            f'{colorama.Fore.YELLOW}'
-            f'Launching managed job {dag.name!r} from jobs controller...'
-            f'{colorama.Style.RESET_ALL}')
-
-        # Launch with the api server's user hash, so that sky status does not
-        # show the owner of the controller as whatever user launched it first.
-        with common.with_server_user_hash():
-            return execution.launch(
-                task=controller_task,
-                cluster_name=controller_name,
-                stream_logs=stream_logs,
-                idle_minutes_to_autostop=controller_idle_minutes_to_autostop,
-                down=controller_down,
-                retry_until_up=True,
-                fast=True,
-                _disable_controller_check=True)
+            yaml_path = os.path.join(
+                managed_job_constants.JOBS_CONTROLLER_YAML_PREFIX,
+                f'{name}-{dag_uuid}-{consolidation_mode_job_id}-{job_rank}.yaml'
+            )
+            common_utils.fill_template(
+                managed_job_constants.JOBS_CONTROLLER_TEMPLATE,
+                vars_to_fill,
+                output_path=yaml_path)
+            controller_task = task_lib.Task.from_yaml(yaml_path)
+            controller_task.set_resources(controller_resources)
+
+            controller_task.managed_job_dag = dag_copy
+            # pylint: disable=protected-access
+            controller_task._metadata = metadata
+
+            job_identity = ''
+            if job_rank is not None:
+                job_identity = f' (rank: {job_rank})'
+            job_controller_postfix = (' from jobs controller' if
+                                      consolidation_mode_job_id is None else '')
+            logger.info(
+                f'{colorama.Fore.YELLOW}'
+                f'Launching managed job {dag.name!r}{job_identity}'
+                f'{job_controller_postfix}...{colorama.Style.RESET_ALL}')
+
+            # Launch with the api server's user hash, so that sky status does
+            # not show the owner of the controller as whatever user launched
+            # it first.
+            with common.with_server_user():
+                # Always launch the controller in the default workspace.
+                with skypilot_config.local_active_workspace_ctx(
+                        skylet_constants.SKYPILOT_DEFAULT_WORKSPACE):
+                    # TODO(zhwu): the buckets need to be correctly handled for
+                    # a specific workspace. For example, if a job is launched in
+                    # workspace A, but the controller is in workspace B, the
+                    # intermediate bucket and newly created bucket should be in
+                    # workspace A.
+                    if consolidation_mode_job_id is None:
+                        return execution.launch(
+                            task=controller_task,
+                            cluster_name=controller_name,
+                            stream_logs=stream_logs,
+                            retry_until_up=True,
+                            fast=True,
+                            _request_name=request_names.AdminPolicyRequestName.
+                            JOBS_LAUNCH_CONTROLLER,
+                            _disable_controller_check=True)
+                    # Manually launch the scheduler in consolidation mode.
+                    local_handle = backend_utils.is_controller_accessible(
+                        controller=controller, stopped_message='')
+                    backend = backend_utils.get_backend_from_handle(
+                        local_handle)
+                    assert isinstance(backend, backends.CloudVmRayBackend)
+                    with sky_logging.silent():
+                        backend.sync_file_mounts(
+                            handle=local_handle,
+                            all_file_mounts=controller_task.file_mounts,
+                            storage_mounts=controller_task.storage_mounts)
+                    run_script = controller_task.run
+                    assert isinstance(run_script, str)
+                    # Manually add the env variables to the run script.
+                    # Originally this is done in ray jobs submission but now we
+                    # have to do it manually because there is no ray runtime on
+                    # the API server.
+                    env_cmds = [
+                        f'export {k}={v!r}'
+                        for k, v in controller_task.envs.items()
+                    ]
+                    run_script = '\n'.join(env_cmds + [run_script])
+                    # Dump script for high availability recovery.
+                    managed_job_state.set_ha_recovery_script(
+                        consolidation_mode_job_id, run_script)
+                    backend.run_on_head(local_handle, run_script)
+                    ux_utils.starting_message(
+                        f'Job submitted, ID: {consolidation_mode_job_id}')
+                    return consolidation_mode_job_id, local_handle
+
+    if pool is None:
+        if consolidation_mode_job_ids is None:
+            return _submit_one()
+        assert len(consolidation_mode_job_ids) == 1
+        return _submit_one(consolidation_mode_job_ids[0])
+
+    ids: List[int] = []
+    all_handle: Optional[backends.ResourceHandle] = None
+
+    if num_jobs == 1:
+        job_id = (consolidation_mode_job_ids[0]
+                  if consolidation_mode_job_ids is not None else None)
+        jid, handle = _submit_one(job_id, 0, num_jobs=num_jobs)
+        assert jid is not None, (job_id, handle)
+        ids.append(jid)
+        all_handle = handle
+    else:
+        # Submit jobs in parallel using ThreadPoolExecutor
+        with concurrent.futures.ThreadPoolExecutor(
+                max_workers=min(num_jobs,
+                                os.cpu_count() or 1)) as executor:
+            # Submit jobs concurrently
+            future_to_rank = {}
+            for job_rank in range(num_jobs):
+                job_id = (consolidation_mode_job_ids[job_rank]
+                          if consolidation_mode_job_ids is not None else None)
+                future = executor.submit(_submit_one, job_id, job_rank,
+                                         num_jobs)
+                future_to_rank[future] = job_rank
+
+            # Collect results in order of job_rank to maintain consistent order.
+            results: List[Optional[Tuple[
+                int, Optional[backends.ResourceHandle]]]] = [None] * num_jobs
+            for future in concurrent.futures.as_completed(future_to_rank):
+                job_rank = future_to_rank[future]
+                try:
+                    jid, handle = future.result()
+                    assert jid is not None, (job_id, handle)
+                    results[job_rank] = (jid, handle)
+                    all_handle = handle  # Keep the last handle.
+                except Exception as e:
+                    logger.error(f'Error launching job {job_rank}: {e}')
+                    raise e
+
+            # Extract job IDs in order
+            for res in results:
+                if res is not None:
+                    ids.append(res[0])
+
+    return ids, all_handle
 
 
 def queue_from_kubernetes_pod(
@@ -275,7 +606,9 @@ def queue_from_kubernetes_pod(
     managed_jobs_runner = provision_lib.get_command_runners(
         'kubernetes', cluster_info)[0]
 
-    code = managed_job_utils.ManagedJobCodeGen.get_job_table()
+    code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+        skip_finished=skip_finished,
+        fields=_MANAGED_JOB_FIELDS_FOR_QUEUE_KUBERNETES)
     returncode, job_table_payload, stderr = managed_jobs_runner.run(
         code,
         require_outputs=True,
@@ -291,7 +624,14 @@
     except exceptions.CommandError as e:
         raise RuntimeError(str(e)) from e
 
-    jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+    jobs, _, result_type, _, _ = managed_job_utils.load_managed_job_queue(
+        job_table_payload)
+
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs
+
+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
     if skip_finished:
         # Filter out the finished jobs. If a multi-task job is partially
         # finished, we will include all its tasks.
@@ -322,28 +662,22 @@ def _maybe_restart_controller(
     if handle is not None:
         return handle
 
-    sky_logging.print(f'{colorama.Fore.YELLOW}'
-                      f'Restarting {jobs_controller_type.value.name}...'
-                      f'{colorama.Style.RESET_ALL}')
+    logger.info(f'{colorama.Fore.YELLOW}'
+                f'Restarting {jobs_controller_type.value.name}...'
+                f'{colorama.Style.RESET_ALL}')
 
     rich_utils.force_update_status(
         ux_utils.spinner_message(f'{spinner_message} - restarting '
                                  'controller'))
-    handle = core.start(cluster_name=jobs_controller_type.value.cluster_name)
-    # Make sure the dashboard is running when the controller is restarted.
-    # We should not directly use execution.launch() and have the dashboard cmd
-    # in the task setup because since we are using detached_setup, it will
-    # become a job on controller which messes up the job IDs (we assume the
-    # job ID in controller's job queue is consistent with managed job IDs).
-    with rich_utils.safe_status(
-            ux_utils.spinner_message('Starting dashboard...')):
-        runner = handle.get_command_runners()[0]
-        runner.run(
-            f'export '
-            f'{skylet_constants.USER_ID_ENV_VAR}={common.SERVER_ID!r}; '
-            f'{managed_job_constants.DASHBOARD_SETUP_CMD}',
-            stream_logs=True,
-        )
+    with skypilot_config.local_active_workspace_ctx(
+            skylet_constants.SKYPILOT_DEFAULT_WORKSPACE):
+        global_user_state.add_cluster_event(
+            jobs_controller_type.value.cluster_name,
+            status_lib.ClusterStatus.INIT, 'Jobs controller restarted.',
+            global_user_state.ClusterEventType.STATUS_CHANGE)
+        handle = core.start(
+            cluster_name=jobs_controller_type.value.cluster_name)
+
     controller_status = status_lib.ClusterStatus.UP
     rich_utils.force_update_status(ux_utils.spinner_message(spinner_message))
 
@@ -351,10 +685,13 @@
     return handle
 
 
+# For backwards compatibility
+# TODO(hailong): Remove before 0.12.0.
 @usage_lib.entrypoint
 def queue(refresh: bool,
           skip_finished: bool = False,
-          all_users: bool = False) -> List[Dict[str, Any]]:
+          all_users: bool = False,
+          job_ids: Optional[List[int]] = None) -> List[Dict[str, Any]]:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Gets statuses of managed jobs.
 
@@ -368,13 +705,15 @@ def queue(refresh: bool,
             'resources': str,
             'submitted_at': (float) timestamp of submission,
             'end_at': (float) timestamp of end,
-            'duration': (float) duration in seconds,
+            'job_duration': (float) duration in seconds,
             'recovery_count': (int) Number of retries,
             'status': (sky.jobs.ManagedJobStatus) of the job,
             'cluster_resources': (str) resources of the cluster,
             'region': (str) region of the cluster,
             'user_name': (Optional[str]) job creator's user name,
             'user_hash': (str) job creator's user hash,
+            'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
+            'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
         }
     ]
     Raises:
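
The docstring hunk above renames the `duration` key to `job_duration` and documents the new per-task keys. A hedged usage sketch built only from the documented record shape; the `sky.jobs.server.core` import path is an assumption, not stated in this diff:

    from sky.jobs.server import core as jobs_core  # assumed module path

    for rec in jobs_core.queue(refresh=False, skip_finished=True):
        # Keys follow the docstring; note 'job_duration' (formerly 'duration').
        print(rec['job_id'], rec['task_id'], rec['job_name'], rec['status'],
              rec['job_duration'])
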
@@ -382,51 +721,222 @@ def queue(refresh: bool,
             does not exist.
         RuntimeError: if failed to get the managed jobs with ssh.
     """
-    handle = _maybe_restart_controller(refresh,
-                                       stopped_message='No in-progress '
-                                       'managed jobs.',
-                                       spinner_message='Checking '
-                                       'managed jobs')
+    jobs, _, _, _ = queue_v2(refresh, skip_finished, all_users, job_ids)
+
+    return jobs
+
+
+@usage_lib.entrypoint
+def queue_v2_api(
+    refresh: bool,
+    skip_finished: bool = False,
+    all_users: bool = False,
+    job_ids: Optional[List[int]] = None,
+    user_match: Optional[str] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+    statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
+) -> Tuple[List[responses.ManagedJobRecord], int, Dict[str, int], int]:
+    """Gets statuses of managed jobs and parse the
+    jobs to responses.ManagedJobRecord."""
+    jobs, total, status_counts, total_no_filter = queue_v2(
+        refresh, skip_finished, all_users, job_ids, user_match, workspace_match,
+        name_match, pool_match, page, limit, statuses, fields)
+    return [responses.ManagedJobRecord(**job) for job in jobs
+           ], total, status_counts, total_no_filter
+
+
+@metrics_lib.time_me
+def queue_v2(
+    refresh: bool,
+    skip_finished: bool = False,
+    all_users: bool = False,
+    job_ids: Optional[List[int]] = None,
+    user_match: Optional[str] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+    statuses: Optional[List[str]] = None,
+    fields: Optional[List[str]] = None,
+) -> Tuple[List[Dict[str, Any]], int, Dict[str, int], int]:
+    # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
+    """Gets statuses of managed jobs with filtering.
+
+    Please refer to sky.cli.job_queue for documentation.
+
+    Returns:
+        jobs: List[Dict[str, Any]]
+            [
+                {
+                    'job_id': int,
+                    'job_name': str,
+                    'resources': str,
+                    'submitted_at': (float) timestamp of submission,
+                    'end_at': (float) timestamp of end,
+                    'job_duration': (float) duration in seconds,
+                    'recovery_count': (int) Number of retries,
+                    'status': (sky.jobs.ManagedJobStatus) of the job,
+                    'cluster_resources': (str) resources of the cluster,
+                    'region': (str) region of the cluster,
+                    'user_name': (Optional[str]) job creator's user name,
+                    'user_hash': (str) job creator's user hash,
+                    'task_id': (int), set to 0 (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
+                    'task_name': (str), same as job_name (except in pipelines, which may have multiple tasks), # pylint: disable=line-too-long
+                }
+            ]
+        total: int, total number of jobs after filter
+        status_counts: Dict[str, int], status counts after filter
+        total_no_filter: int, total number of jobs before filter
+    Raises:
+        sky.exceptions.ClusterNotUpError: the jobs controller is not up or
+            does not exist.
+        RuntimeError: if failed to get the managed jobs with ssh.
+    """
+    if limit is not None:
+        if limit < 1:
+            raise ValueError(f'Limit must be at least 1, got {limit}')
+        if page is None:
+            page = 1
+        if page < 1:
+            raise ValueError(f'Page must be at least 1, got {page}')
+    else:
+        if page is not None:
+            raise ValueError('Limit must be specified when page is specified')
+
+    with metrics_lib.time_it('jobs.queue.restart_controller', group='jobs'):
+        handle = _maybe_restart_controller(refresh,
+                                           stopped_message='No in-progress '
+                                           'managed jobs.',
+                                           spinner_message='Checking '
+                                           'managed jobs')
     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend)
 
-    code = managed_job_utils.ManagedJobCodeGen.get_job_table()
-    returncode, job_table_payload, stderr = backend.run_on_head(
-        handle,
-        code,
-        require_outputs=True,
-        stream_logs=False,
-        separate_stderr=True)
+    user_hashes: Optional[List[Optional[str]]] = None
+    show_jobs_without_user_hash = False
+    if not all_users:
+        user_hashes = [common_utils.get_user_hash()]
+        # For backwards compatibility, we show jobs that do not have a
+        # user_hash. TODO(cooperc): Remove before 0.12.0.
+        user_hashes.append(None)
+        show_jobs_without_user_hash = True
+    elif user_match is not None:
+        users = global_user_state.get_user_by_name_match(user_match)
+        if not users:
+            return [], 0, {}, 0
+        user_hashes = [user.id for user in users]
+
+    accessible_workspaces = list(workspaces_core.get_workspaces().keys())
+
+    if handle.is_grpc_enabled_with_flag:
+        try:
+            request = managed_jobsv1_pb2.GetJobTableRequest(
+                skip_finished=skip_finished,
+                accessible_workspaces=(managed_jobsv1_pb2.Workspaces(
+                    workspaces=accessible_workspaces)),
+                job_ids=managed_jobsv1_pb2.JobIds(
+                    ids=job_ids) if job_ids is not None else None,
+                workspace_match=workspace_match,
+                name_match=name_match,
+                pool_match=pool_match,
+                page=page,
+                limit=limit,
+                # Remove None from user_hashes, as the gRPC server uses the
+                # show_jobs_without_user_hash flag instead.
+                user_hashes=managed_jobsv1_pb2.UserHashes(hashes=[
+                    user_hash for user_hash in user_hashes
+                    if user_hash is not None
+                ]) if user_hashes is not None else None,
+                statuses=managed_jobsv1_pb2.Statuses(
+                    statuses=statuses) if statuses is not None else None,
+                fields=managed_jobsv1_pb2.Fields(
+                    fields=fields) if fields is not None else None,
+                show_jobs_without_user_hash=show_jobs_without_user_hash,
+            )
+            response = backend_utils.invoke_skylet_with_retries(
+                lambda: cloud_vm_ray_backend.SkyletClient(
+                    handle.get_grpc_channel()).get_managed_job_table(request))
+            jobs = managed_job_utils.decode_managed_job_protos(response.jobs)
+            return jobs, response.total, dict(
+                response.status_counts), response.total_no_filter
+        except exceptions.SkyletMethodNotImplementedError:
+            pass
+
+    with metrics_lib.time_it('jobs.queue.generate_code', group='jobs'):
+        code = managed_job_utils.ManagedJobCodeGen.get_job_table(
+            skip_finished, accessible_workspaces, job_ids, workspace_match,
+            name_match, pool_match, page, limit, user_hashes, statuses, fields)
+    with metrics_lib.time_it('jobs.queue.run_on_head', group='jobs'):
+        returncode, job_table_payload, stderr = backend.run_on_head(
+            handle,
+            code,
+            require_outputs=True,
+            stream_logs=False,
+            separate_stderr=True)
 
     if returncode != 0:
         logger.error(job_table_payload + stderr)
         raise RuntimeError('Failed to fetch managed jobs with returncode: '
-                           f'{returncode}')
+                           f'{returncode}.\n{job_table_payload + stderr}')
 
-    jobs = managed_job_utils.load_managed_job_queue(job_table_payload)
+    with metrics_lib.time_it('jobs.queue.load_job_queue', group='jobs'):
+        (jobs, total, result_type, total_no_filter, status_counts
+        ) = managed_job_utils.load_managed_job_queue(job_table_payload)
 
-    if not all_users:
+    if result_type == managed_job_utils.ManagedJobQueueResultType.DICT:
+        return jobs, total, status_counts, total_no_filter
 
-        def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
-            user_hash = job.get('user_hash', None)
-            if user_hash is None:
-                # For backwards compatibility, we show jobs that do not have a
-                # user_hash. TODO(cooperc): Remove before 0.12.0.
-                return True
-            return user_hash == common_utils.get_user_hash()
+    # Backward compatibility for old jobs controller without filtering
+    # TODO(hailong): remove this after 0.12.0
+    with metrics_lib.time_it('jobs.queue.filter_and_process', group='jobs'):
+        if not all_users:
 
-        jobs = list(filter(user_hash_matches_or_missing, jobs))
+            def user_hash_matches_or_missing(job: Dict[str, Any]) -> bool:
+                user_hash = job.get('user_hash', None)
+                if user_hash is None:
+                    # For backwards compatibility, we show jobs that do not have
+                    # a user_hash. TODO(cooperc): Remove before 0.12.0.
+                    return True
+                return user_hash == common_utils.get_user_hash()
 
-    if skip_finished:
-        # Filter out the finished jobs. If a multi-task job is partially
-        # finished, we will include all its tasks.
-        non_finished_tasks = list(
-            filter(lambda job: not job['status'].is_terminal(), jobs))
-        non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
-        jobs = list(
-            filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+            jobs = list(filter(user_hash_matches_or_missing, jobs))
 
-    return jobs
+        jobs = list(
+            filter(
+                lambda job: job.get('workspace', skylet_constants.
+                                    SKYPILOT_DEFAULT_WORKSPACE) in
+                accessible_workspaces, jobs))
+
+        if skip_finished:
+            # Filter out the finished jobs. If a multi-task job is partially
+            # finished, we will include all its tasks.
+            non_finished_tasks = list(
+                filter(lambda job: not job['status'].is_terminal(), jobs))
+            non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
+            jobs = list(
+                filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
+
+        if job_ids:
+            jobs = [job for job in jobs if job['job_id'] in job_ids]
+
+        filtered_jobs, total, status_counts = managed_job_utils.filter_jobs(
+            jobs,
+            workspace_match,
+            name_match,
+            pool_match,
+            page=page,
+            limit=limit,
+            user_match=user_match,
+            enable_user_match=True,
+            statuses=statuses,
+        )
+        return filtered_jobs, total, status_counts, total_no_filter
 
 
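
The new `queue_v2()` adds server-side filtering, pagination, and a gRPC fast path, while `queue()` becomes a thin wrapper kept for backwards compatibility. A sketch of the new call under the same assumed `sky.jobs.server.core` import path; all argument values are made up, and `page` is only accepted together with `limit` per the validation at the top of the function:

    from sky.jobs.server import core as jobs_core  # assumed module path

    jobs, total, status_counts, total_no_filter = jobs_core.queue_v2(
        refresh=False,
        skip_finished=True,
        all_users=True,
        name_match='train',      # filter on job name
        statuses=['RUNNING'],    # filter on job status
        page=1,
        limit=50,                # required whenever page is given
    )
    print(f'{total} of {total_no_filter} jobs match; per status: {status_counts}')

    # The legacy entrypoint now forwards to queue_v2 and returns only the list.
    legacy_jobs = jobs_core.queue(refresh=False, job_ids=[1, 2, 3])
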
 @usage_lib.entrypoint
@@ -434,7 +944,8 @@ def queue(refresh: bool,
 def cancel(name: Optional[str] = None,
            job_ids: Optional[List[int]] = None,
            all: bool = False,
-           all_users: bool = False) -> None:
+           all_users: bool = False,
+           pool: Optional[str] = None) -> None:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Cancels managed jobs.
 
@@ -444,57 +955,98 @@ def cancel(name: Optional[str] = None,
         sky.exceptions.ClusterNotUpError: the jobs controller is not up.
         RuntimeError: failed to cancel the job.
     """
-    job_ids = [] if job_ids is None else job_ids
-    handle = backend_utils.is_controller_accessible(
-        controller=controller_utils.Controllers.JOBS_CONTROLLER,
-        stopped_message='All managed jobs should have finished.')
-
-    job_id_str = ','.join(map(str, job_ids))
-    if sum([bool(job_ids), name is not None, all or all_users]) != 1:
-        arguments = []
-        arguments += [f'job_ids={job_id_str}'] if job_ids else []
-        arguments += [f'name={name}'] if name is not None else []
-        arguments += ['all'] if all else []
-        arguments += ['all_users'] if all_users else []
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError('Can only specify one of JOB_IDS, name, or all/'
-                             f'all_users. Provided {" ".join(arguments)!r}.')
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Cancelling managed jobs')):
+        job_ids = [] if job_ids is None else job_ids
+        handle = backend_utils.is_controller_accessible(
+            controller=controller_utils.Controllers.JOBS_CONTROLLER,
+            stopped_message='All managed jobs should have finished.')
+
+        job_id_str = ','.join(map(str, job_ids))
+        if sum([
+                bool(job_ids), name is not None, pool is not None, all or
+                all_users
+        ]) != 1:
+            arguments = []
+            arguments += [f'job_ids={job_id_str}'] if job_ids else []
+            arguments += [f'name={name}'] if name is not None else []
+            arguments += [f'pool={pool}'] if pool is not None else []
+            arguments += ['all'] if all else []
+            arguments += ['all_users'] if all_users else []
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    'Can only specify one of JOB_IDS, name, pool, or all/'
+                    f'all_users. Provided {" ".join(arguments)!r}.')
 
-    backend = backend_utils.get_backend_from_handle(handle)
-    assert isinstance(backend, backends.CloudVmRayBackend)
-    if all_users:
-        code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
-            None, all_users=True)
-    elif all:
-        code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(None)
-    elif job_ids:
-        code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(job_ids)
-    else:
-        assert name is not None, (job_ids, name, all)
-        code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(name)
-    # The stderr is redirected to stdout
-    returncode, stdout, _ = backend.run_on_head(handle,
-                                                code,
-                                                require_outputs=True,
-                                                stream_logs=False)
-    try:
-        subprocess_utils.handle_returncode(returncode, code,
-                                           'Failed to cancel managed job',
-                                           stdout)
-    except exceptions.CommandError as e:
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError(e.error_msg) from e
+        job_ids = None if (all_users or all) else job_ids
 
-    sky_logging.print(stdout)
-    if 'Multiple jobs found with name' in stdout:
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError(
-                'Please specify the job ID instead of the job name.')
+        backend = backend_utils.get_backend_from_handle(handle)
+        assert isinstance(backend, backends.CloudVmRayBackend)
+
+        use_legacy = not handle.is_grpc_enabled_with_flag
+
+        if not use_legacy:
+            current_workspace = skypilot_config.get_active_workspace()
+            try:
+                request = managed_jobsv1_pb2.CancelJobsRequest(
+                    current_workspace=current_workspace)
+
+                if all_users or all or job_ids:
+                    request.all_users = all_users
+                    if all:
+                        request.user_hash = common_utils.get_user_hash()
+                    if job_ids is not None:
+                        request.job_ids.CopyFrom(
+                            managed_jobsv1_pb2.JobIds(ids=job_ids))
+                elif name is not None:
+                    request.job_name = name
+                else:
+                    assert pool is not None, (job_ids, name, pool, all)
+                    request.pool_name = pool
+
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: cloud_vm_ray_backend.SkyletClient(
+                        handle.get_grpc_channel()).cancel_managed_jobs(request))
+                stdout = response.message
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            if all_users or all or job_ids:
+                code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_id(
+                    job_ids, all_users=all_users)
+            elif name is not None:
+                code = managed_job_utils.ManagedJobCodeGen.cancel_job_by_name(
+                    name)
+            else:
+                assert pool is not None, (job_ids, name, pool, all)
+                code = managed_job_utils.ManagedJobCodeGen.cancel_jobs_by_pool(
+                    pool)
+            # The stderr is redirected to stdout
+            returncode, stdout, stderr = backend.run_on_head(
+                handle, code, require_outputs=True, stream_logs=False)
+            try:
+                subprocess_utils.handle_returncode(
+                    returncode, code, 'Failed to cancel managed job',
+                    stdout + stderr)
+            except exceptions.CommandError as e:
+                with ux_utils.print_exception_no_traceback():
+                    raise RuntimeError(e.error_msg) from e
+
+        logger.info(stdout)
+        if 'Multiple jobs found with name' in stdout:
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(
+                    'Please specify the job ID instead of the job name.')
 
 
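
The rewritten `cancel()` accepts exactly one selector (`job_ids`, `name`, the new `pool`, or `all`/`all_users`) and raises `ValueError` otherwise; it prefers the gRPC path and falls back to generated code on older controllers. Valid calls look like the following sketch; names and IDs are made up, and the import path is assumed as before:

    from sky.jobs.server import core as jobs_core  # assumed module path

    jobs_core.cancel(job_ids=[3, 4])        # cancel by job ID
    jobs_core.cancel(name='bert-finetune')  # cancel by job name
    jobs_core.cancel(pool='infer-pool')     # cancel every job in a pool (new)
    jobs_core.cancel(all=True)              # cancel the current user's jobs
    # jobs_core.cancel(name='x', all=True)  # ValueError: only one selector allowed
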
 @usage_lib.entrypoint
-def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
-              controller: bool, refresh: bool) -> int:
+def tail_logs(name: Optional[str],
+              job_id: Optional[int],
+              follow: bool,
+              controller: bool,
+              refresh: bool,
+              tail: Optional[int] = None) -> int:
     # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
     """Tail logs of managed jobs.
 
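
`tail_logs()` gains an optional `tail` parameter, threaded through to the log streamer in the next hunk; `tail=None` keeps the previous whole-log behavior. An illustrative call, with values and the import path assumed:

    from sky.jobs.server import core as jobs_core  # assumed module path

    exit_code = jobs_core.tail_logs(name=None,
                                    job_id=7,
                                    follow=True,
                                    controller=False,
                                    refresh=False,
                                    tail=100)  # stream only the last 100 lines
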
@@ -537,56 +1089,8 @@ def tail_logs(name: Optional[str], job_id: Optional[int], follow: bool,
                                          job_id=job_id,
                                          job_name=name,
                                          follow=follow,
-                                         controller=controller)
-
-
-def start_dashboard_forwarding(refresh: bool = False) -> Tuple[int, int]:
-    """Opens a dashboard for managed jobs (needs controller to be UP)."""
-    # TODO(SKY-1212): ideally, the controller/dashboard server should expose the
-    # API perhaps via REST. Then here we would (1) not have to use SSH to try to
-    # see if the controller is UP first, which is slow; (2) not have to run SSH
-    # port forwarding first (we'd just launch a local dashboard which would make
-    # REST API calls to the controller dashboard server).
-    logger.info('Starting dashboard')
-    hint = ('Dashboard is not available if jobs controller is not up. Run '
-            'a managed job first or run: sky jobs queue --refresh')
-    handle = _maybe_restart_controller(
-        refresh=refresh,
-        stopped_message=hint,
-        spinner_message='Checking jobs controller')
-
-    # SSH forward a free local port to remote's dashboard port.
-    remote_port = skylet_constants.SPOT_DASHBOARD_REMOTE_PORT
-    free_port = common_utils.find_free_port(remote_port)
-    runner = handle.get_command_runners()[0]
-    port_forward_command = ' '.join(
-        runner.port_forward_command(port_forward=[(free_port, remote_port)],
-                                    connect_timeout=1))
-    port_forward_command = (
-        f'{port_forward_command} '
-        f'> ~/sky_logs/api_server/dashboard-{common_utils.get_user_hash()}.log '
-        '2>&1')
-    logger.info(f'Forwarding port: {colorama.Style.DIM}{port_forward_command}'
-                f'{colorama.Style.RESET_ALL}')
-
-    ssh_process = subprocess.Popen(port_forward_command,
-                                   shell=True,
-                                   start_new_session=True)
-    time.sleep(3)  # Added delay for ssh_command to initialize.
-    logger.info(f'{colorama.Fore.GREEN}Dashboard is now available at: '
-                f'http://127.0.0.1:{free_port}{colorama.Style.RESET_ALL}')
-
-    return free_port, ssh_process.pid
-
-
-def stop_dashboard_forwarding(pid: int) -> None:
-    # Exit the ssh command when the context manager is closed.
-    try:
-        os.killpg(os.getpgid(pid), signal.SIGTERM)
-    except ProcessLookupError:
-        # This happens if jobs controller is auto-stopped.
-        pass
-    logger.info('Forwarding port closed. Exiting.')
+                                         controller=controller,
+                                         tail=tail)
 
 
 @usage_lib.entrypoint
@@ -635,3 +1139,73 @@ def download_logs(
                                          job_name=name,
                                          controller=controller,
                                          local_dir=local_dir)
+
+
+@usage_lib.entrypoint
+def pool_apply(
+    task: 'sky.Task',
+    pool_name: str,
+    mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
+    workers: Optional[int] = None,
+) -> None:
+    """Apply a config to a pool."""
+    return impl.apply(task, workers, pool_name, mode, pool=True)
+
+
+@usage_lib.entrypoint
+# pylint: disable=redefined-builtin
+def pool_down(
+    pool_names: Optional[Union[str, List[str]]] = None,
+    all: bool = False,
+    purge: bool = False,
+) -> None:
+    """Delete a pool."""
+    return impl.down(pool_names, all, purge, pool=True)
+
+
+@usage_lib.entrypoint
+def pool_status(
+    pool_names: Optional[Union[str,
+                               List[str]]] = None,) -> List[Dict[str, Any]]:
+    """Query a pool."""
+    return impl.status(pool_names, pool=True)
+
+
+ServiceComponentOrStr = Union[str, serve_utils.ServiceComponent]
+
+
+@usage_lib.entrypoint
+def pool_tail_logs(
+    pool_name: str,
+    *,
+    target: ServiceComponentOrStr,
+    worker_id: Optional[int] = None,
+    follow: bool = True,
+    tail: Optional[int] = None,
+) -> None:
+    """Tail logs of a pool."""
+    return impl.tail_logs(pool_name,
+                          target=target,
+                          replica_id=worker_id,
+                          follow=follow,
+                          tail=tail,
+                          pool=True)
+
+
+@usage_lib.entrypoint
+def pool_sync_down_logs(
+    pool_name: str,
+    *,
+    local_dir: str,
+    targets: Union[ServiceComponentOrStr, List[ServiceComponentOrStr],
+                   None] = None,
+    worker_ids: Optional[List[int]] = None,
+    tail: Optional[int] = None,
+) -> str:
+    """Sync down logs of a pool."""
+    return impl.sync_down_logs(pool_name,
+                               local_dir=local_dir,
+                               targets=targets,
+                               replica_ids=worker_ids,
+                               tail=tail,
+                               pool=True)
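
The pool entrypoints added above are thin wrappers that forward to the shared `impl` helpers with `pool=True`, mapping `worker_id`/`worker_ids` onto the serve-side replica arguments. A rough usage sketch; the pool name, worker IDs, and the `target` value are illustrative assumptions, as are the import paths:

    from sky.jobs.server import core as jobs_core  # assumed module path
    from sky.serve import serve_utils              # assumed module path

    pools = jobs_core.pool_status()                # status of all pools
    jobs_core.pool_tail_logs('my-pool',
                             target=serve_utils.ServiceComponent.REPLICA,
                             worker_id=1,
                             follow=False,
                             tail=50)
    log_dir = jobs_core.pool_sync_down_logs('my-pool',
                                            local_dir='~/sky_logs/pools',
                                            worker_ids=[1, 2])
    jobs_core.pool_down('my-pool')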