skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/skylet/constants.py CHANGED
@@ -6,6 +6,23 @@ from packaging import version
6
6
  import sky
7
7
  from sky.setup_files import dependencies
8
8
 
9
+ # The base directory for all SkyPilot runtime artifacts.
10
+ # Historically, we have always used $HOME, but we couldn't
11
+ # do that for Slurm, because $HOME typically points to a NFS
12
+ # mounted directory, which does not work well with SQLite.
13
+ # https://sqlite.org/faq.html#q5
14
+ # Additionally, having the skypilot-runtime python venv be
15
+ # on an NFS makes things very slow.
16
+ SKY_RUNTIME_DIR = '${SKY_RUNTIME_DIR:-$HOME}'
17
+ # Same as above but for use within python code instead of shell commands.
18
+ # Example usage:
19
+ # os.path.join(
20
+ # os.path.expanduser(os.environ.get(SKY_RUNTIME_DIR_ENV_VAR_KEY, '~')),
21
+ # '.sky/jobs.db')
22
+ SKY_RUNTIME_DIR_ENV_VAR_KEY = 'SKY_RUNTIME_DIR'
23
+ # We keep sky_logs and sky_workdir in $HOME, because
24
+ # these are artifacts that users can access, and having
25
+ # them be in $HOME makes it more convenient.
9
26
  SKY_LOGS_DIRECTORY = '~/sky_logs'
10
27
  SKY_REMOTE_WORKDIR = '~/sky_workdir'
11
28
  SKY_IGNORE_FILE = '.skyignore'
@@ -24,22 +41,23 @@ SKY_REMOTE_RAY_PORT_DICT_STR = (
24
41
  f'"ray_dashboard_port":{SKY_REMOTE_RAY_DASHBOARD_PORT}}}')
25
42
  # The file contains the ports of the Ray cluster that SkyPilot launched,
26
43
  # i.e. the PORT_DICT_STR above.
27
- SKY_REMOTE_RAY_PORT_FILE = '~/.sky/ray_port.json'
44
+ SKY_REMOTE_RAY_PORT_FILE = '.sky/ray_port.json'
28
45
  SKY_REMOTE_RAY_TEMPDIR = '/tmp/ray_skypilot'
29
46
  SKY_REMOTE_RAY_VERSION = '2.9.3'
30
47
 
48
+ SKY_UNSET_PYTHONPATH = 'env -u PYTHONPATH'
31
49
  # We store the absolute path of the python executable (/opt/conda/bin/python3)
32
50
  # in this file, so that any future internal commands that need to use python
33
51
  # can use this path. This is useful for the case where the user has a custom
34
52
  # conda environment as a default environment, which is not the same as the one
35
53
  # used for installing SkyPilot runtime (ray and skypilot).
36
- SKY_PYTHON_PATH_FILE = '~/.sky/python_path'
37
- SKY_RAY_PATH_FILE = '~/.sky/ray_path'
54
+ SKY_PYTHON_PATH_FILE = f'{SKY_RUNTIME_DIR}/.sky/python_path'
55
+ SKY_RAY_PATH_FILE = f'{SKY_RUNTIME_DIR}/.sky/ray_path'
38
56
  SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
39
57
  f'cat {SKY_PYTHON_PATH_FILE} 2> /dev/null || '
40
58
  'which python3')
41
59
  # Python executable, e.g., /opt/conda/bin/python3
42
- SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})'
60
+ SKY_PYTHON_CMD = f'{SKY_UNSET_PYTHONPATH} $({SKY_GET_PYTHON_PATH_CMD})'
43
61
  # Prefer SKY_UV_PIP_CMD, which is faster.
44
62
  # TODO(cooperc): remove remaining usage (GCP TPU setup).
45
63
  SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
@@ -51,24 +69,33 @@ SKY_RAY_CMD = (f'{SKY_PYTHON_CMD} $([ -s {SKY_RAY_PATH_FILE} ] && '
51
69
  f'cat {SKY_RAY_PATH_FILE} 2> /dev/null || which ray)')
52
70
  # Separate env for SkyPilot runtime dependencies.
53
71
  SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime'
54
- SKY_REMOTE_PYTHON_ENV: str = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
72
+ SKY_REMOTE_PYTHON_ENV: str = f'{SKY_RUNTIME_DIR}/{SKY_REMOTE_PYTHON_ENV_NAME}'
55
73
  ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
56
74
  # uv is used for venv and pip, much faster than python implementations.
57
75
  SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
58
- SKY_UV_CMD = f'UV_SYSTEM_PYTHON=false {SKY_UV_INSTALL_DIR}/uv'
76
+ # set UV_SYSTEM_PYTHON to false in case the
77
+ # user provided docker image set it to true.
78
+ # unset PYTHONPATH in case the user provided docker image set it.
79
+ SKY_UV_CMD = ('UV_SYSTEM_PYTHON=false '
80
+ f'{SKY_UNSET_PYTHONPATH} {SKY_UV_INSTALL_DIR}/uv')
59
81
  # This won't reinstall uv if it's already installed, so it's safe to re-run.
60
82
  SKY_UV_INSTALL_CMD = (f'{SKY_UV_CMD} -V >/dev/null 2>&1 || '
61
83
  'curl -LsSf https://astral.sh/uv/install.sh '
62
84
  f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh')
63
85
  SKY_UV_PIP_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip')
64
- # Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH to deactivate the
65
- # environment. `deactivate` command does not work when conda is used.
86
+ SKY_UV_RUN_CMD: str = (f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} run '
87
+ '--no-project --no-config')
88
+ # Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH and unsetting relevant
89
+ # VIRTUAL_ENV envvars to deactivate the environment. `deactivate` command does
90
+ # not work when conda is used.
66
91
  DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
67
92
  'export PATH='
68
- f'$(echo $PATH | sed "s|$(echo ~)/{SKY_REMOTE_PYTHON_ENV_NAME}/bin:||")')
93
+ f'$(echo $PATH | sed "s|$(echo {SKY_REMOTE_PYTHON_ENV})/bin:||") && '
94
+ 'unset VIRTUAL_ENV && unset VIRTUAL_ENV_PROMPT')
69
95
 
70
96
  # Prefix for SkyPilot environment variables
71
97
  SKYPILOT_ENV_VAR_PREFIX = 'SKYPILOT_'
98
+ SKYPILOT_SERVER_ENV_VAR_PREFIX = 'SKYPILOT_SERVER_'
72
99
 
73
100
  # The name for the environment variable that stores the unique ID of the
74
101
  # current task. This will stay the same across multiple recoveries of the
@@ -89,17 +116,17 @@ TASK_ID_LIST_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}TASK_IDS'
89
116
  # cluster yaml is updated.
90
117
  #
91
118
  # TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
92
- SKYLET_VERSION = '12'
119
+ SKYLET_VERSION = '27'
93
120
  # The version of the lib files that skylet/jobs use. Whenever there is an API
94
121
  # change for the job_lib or log_lib, we need to bump this version, so that the
95
122
  # user can be notified to update their SkyPilot version on the remote cluster.
96
- SKYLET_LIB_VERSION = 3
97
- SKYLET_VERSION_FILE = '~/.sky/skylet_version'
98
-
99
- # `sky jobs dashboard`-related
100
- #
101
- # Port on the remote jobs controller that the dashboard is running on.
102
- SPOT_DASHBOARD_REMOTE_PORT = 5000
123
+ SKYLET_LIB_VERSION = 4
124
+ SKYLET_VERSION_FILE = '.sky/skylet_version'
125
+ SKYLET_LOG_FILE = '.sky/skylet.log'
126
+ SKYLET_PID_FILE = '.sky/skylet_pid'
127
+ SKYLET_PORT_FILE = '.sky/skylet_port'
128
+ SKYLET_GRPC_PORT = 46590
129
+ SKYLET_GRPC_TIMEOUT_SECONDS = 10
103
130
 
104
131
  # Docker default options
105
132
  DEFAULT_DOCKER_CONTAINER_NAME = 'sky_container'
@@ -151,7 +178,7 @@ CONDA_INSTALLATION_COMMANDS = (
151
178
  # because for some images, conda is already installed, but not initialized.
152
179
  # In this case, we need to initialize conda and set auto_activate_base to
153
180
  # true.
154
- '{ bash Miniconda3-Linux.sh -b; '
181
+ '{ bash Miniconda3-Linux.sh -b || true; '
155
182
  'eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && '
156
183
  # Caller should replace {conda_auto_activate} with either true or false.
157
184
  'conda config --set auto_activate_base {conda_auto_activate} && '
@@ -173,7 +200,7 @@ CONDA_INSTALLATION_COMMANDS = (
173
200
  'fi;'
174
201
  # Install uv for venv management and pip installation.
175
202
  f'{SKY_UV_INSTALL_CMD};'
176
- # Create a separate conda environment for SkyPilot dependencies.
203
+ # Create a separate python environment for SkyPilot dependencies.
177
204
  f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || '
178
205
  # Do NOT use --system-site-packages here, because if users upgrade any
179
206
  # packages in the base env, they interfere with skypilot dependencies.
@@ -218,7 +245,9 @@ RAY_INSTALLATION_COMMANDS = (
218
245
  f'{SKY_UV_PIP_CMD} list | grep "ray " | '
219
246
  f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
220
247
  f'|| {RAY_STATUS} || '
221
- f'{SKY_UV_PIP_CMD} install -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long
248
+ # The pydantic-core==2.41.3 for arm seems corrupted
249
+ # so we need to avoid that specific version.
250
+ f'{SKY_UV_PIP_CMD} install -U "ray[default]=={SKY_REMOTE_RAY_VERSION}" "pydantic-core==2.41.1"; ' # pylint: disable=line-too-long
222
251
  # In some envs, e.g. pip does not have permission to write under /opt/conda
223
252
  # ray package will be installed under ~/.local/bin. If the user's PATH does
224
253
  # not include ~/.local/bin (the pip install will have the output: `WARNING:
@@ -230,9 +259,24 @@ RAY_INSTALLATION_COMMANDS = (
230
259
  'export PATH=$PATH:$HOME/.local/bin; '
231
260
  # Writes ray path to file if it does not exist or the file is empty.
232
261
  f'[ -s {SKY_RAY_PATH_FILE} ] || '
233
- f'{{ {ACTIVATE_SKY_REMOTE_PYTHON_ENV} && '
262
+ f'{{ {SKY_UV_RUN_CMD} '
234
263
  f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; ')
235
264
 
265
+ # Copy SkyPilot templates from the installed wheel to ~/sky_templates.
266
+ # This must run after the skypilot wheel is installed.
267
+ COPY_SKYPILOT_TEMPLATES_COMMANDS = (
268
+ f'{ACTIVATE_SKY_REMOTE_PYTHON_ENV}; '
269
+ f'{SKY_PYTHON_CMD} -c \''
270
+ 'import sky_templates, shutil, os; '
271
+ 'src = os.path.dirname(sky_templates.__file__); '
272
+ 'dst = os.path.expanduser(\"~/sky_templates\"); '
273
+ 'print(f\"Copying templates from {src} to {dst}...\"); '
274
+ 'shutil.copytree(src, dst, dirs_exist_ok=True); '
275
+ 'print(f\"Templates copied successfully\")\'; '
276
+ # Make scripts executable.
277
+ 'find ~/sky_templates -type f ! -name "*.py" ! -name "*.md" '
278
+ '-exec chmod +x {} \\; ')
279
+
236
280
  SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
237
281
  f'{SKY_UV_INSTALL_CMD};'
238
282
  f'{{ {SKY_UV_PIP_CMD} list | grep "skypilot " && '
@@ -323,6 +367,14 @@ FILE_MOUNTS_LOCAL_TMP_BASE_PATH = '~/.sky/tmp/'
323
367
  # controller_utils.translate_local_file_mounts_to_two_hop().
324
368
  FILE_MOUNTS_CONTROLLER_TMP_BASE_PATH = '~/.sky/tmp/controller'
325
369
 
370
+ # For passing in CPU and memory limits to the controller pod when running
371
+ # in k8s. Right now, we only use this for the jobs controller, but we may
372
+ # use this for the serve controller as well in the future.
373
+ # These files are written to disk by the skylet, which reads them from env vars
374
+ # passed by the backend when starting the skylet (start_skylet_on_head_node).
375
+ CONTROLLER_K8S_CPU_FILE = '~/.sky/_internal_k8s_pod_cpu'
376
+ CONTROLLER_K8S_MEMORY_FILE = '~/.sky/_internal_k8s_pod_memory'
377
+
326
378
  # Used when an managed jobs are created and
327
379
  # files are synced up to the cloud.
328
380
  FILE_MOUNTS_WORKDIR_SUBPATH = 'job-{run_id}/workdir'
@@ -346,9 +398,16 @@ API_SERVER_CREATION_LOCK_PATH = '~/.sky/api_server/.creation.lock'
346
398
  # API server.
347
399
  SKY_API_SERVER_URL_ENV_VAR = f'{SKYPILOT_ENV_VAR_PREFIX}API_SERVER_ENDPOINT'
348
400
 
401
+ # The name for the environment variable that stores the SkyPilot service
402
+ # account token on client side.
403
+ SERVICE_ACCOUNT_TOKEN_ENV_VAR = (
404
+ f'{SKYPILOT_ENV_VAR_PREFIX}SERVICE_ACCOUNT_TOKEN')
405
+
349
406
  # SkyPilot environment variables
350
407
  SKYPILOT_NUM_NODES = f'{SKYPILOT_ENV_VAR_PREFIX}NUM_NODES'
351
408
  SKYPILOT_NODE_IPS = f'{SKYPILOT_ENV_VAR_PREFIX}NODE_IPS'
409
+ SKYPILOT_SETUP_NUM_GPUS_PER_NODE = (
410
+ f'{SKYPILOT_ENV_VAR_PREFIX}SETUP_NUM_GPUS_PER_NODE')
352
411
  SKYPILOT_NUM_GPUS_PER_NODE = f'{SKYPILOT_ENV_VAR_PREFIX}NUM_GPUS_PER_NODE'
353
412
  SKYPILOT_NODE_RANK = f'{SKYPILOT_ENV_VAR_PREFIX}NODE_RANK'
354
413
 
@@ -358,7 +417,7 @@ SKY_SSH_USER_PLACEHOLDER = 'skypilot:ssh_user'
358
417
 
359
418
  RCLONE_CONFIG_DIR = '~/.config/rclone'
360
419
  RCLONE_CONFIG_PATH = f'{RCLONE_CONFIG_DIR}/rclone.conf'
361
- RCLONE_LOG_DIR = '~/.sky/rclone_log'
420
+ RCLONE_MOUNT_CACHED_LOG_DIR = '~/.sky/rclone_log'
362
421
  RCLONE_CACHE_DIR = '~/.cache/rclone'
363
422
  RCLONE_CACHE_REFRESH_INTERVAL = 10
364
423
 
@@ -367,15 +426,43 @@ RCLONE_CACHE_REFRESH_INTERVAL = 10
367
426
  OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
368
427
  ('docker', 'run_options'),
369
428
  ('nvidia_gpus', 'disable_ecc'),
429
+ ('ssh', 'custom_metadata'),
430
+ ('ssh', 'pod_config'),
431
+ ('ssh', 'provision_timeout'),
432
+ ('kubernetes', 'custom_metadata'),
370
433
  ('kubernetes', 'pod_config'),
371
434
  ('kubernetes', 'provision_timeout'),
435
+ ('kubernetes', 'dws'),
436
+ ('kubernetes', 'kueue'),
372
437
  ('gcp', 'managed_instance_group'),
438
+ ('gcp', 'enable_gvnic'),
439
+ ('gcp', 'enable_gpu_direct'),
440
+ ('gcp', 'placement_policy'),
441
+ ('active_workspace',),
373
442
  ]
374
443
  # When overriding the SkyPilot configs on the API server with the client one,
375
444
  # we skip the following keys because they are meant to be client-side configs.
376
- SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [('admin_policy',),
377
- ('api_server',),
378
- ('allowed_clouds',)]
445
+ # Also, we skip the consolidation mode config as those should be only set on
446
+ # the API server side.
447
+ SKIPPED_CLIENT_OVERRIDE_KEYS: List[Tuple[str, ...]] = [
448
+ ('api_server',),
449
+ ('allowed_clouds',),
450
+ ('workspaces',),
451
+ ('db',),
452
+ ('daemons',),
453
+ # TODO(kevin,tian): Override the whole controller config once our test
454
+ # infrastructure supports setting dynamic server side configs.
455
+ # Tests that are affected:
456
+ # - test_managed_jobs_ha_kill_starting
457
+ # - test_managed_jobs_ha_kill_running
458
+ # - all tests that use LOW_CONTROLLER_RESOURCE_ENV or
459
+ # LOW_CONTROLLER_RESOURCE_OVERRIDE_CONFIG (won't cause test failure,
460
+ # but the configs won't be applied)
461
+ ('jobs', 'controller', 'consolidation_mode'),
462
+ ('serve', 'controller', 'consolidation_mode'),
463
+ ('jobs', 'controller', 'controller_logs_gc_retention_hours'),
464
+ ('jobs', 'controller', 'task_logs_gc_retention_hours'),
465
+ ]
379
466
 
380
467
  # Constants for Azure blob storage
381
468
  WAIT_FOR_STORAGE_ACCOUNT_CREATION = 60
@@ -392,6 +479,12 @@ ROLE_ASSIGNMENT_FAILURE_ERROR_MSG = (
392
479
  # persistent through PVC. See kubernetes-ray.yml.j2.
393
480
  PERSISTENT_SETUP_SCRIPT_PATH = '~/.sky/.controller_recovery_setup_commands.sh'
394
481
  PERSISTENT_RUN_SCRIPT_DIR = '~/.sky/.controller_recovery_task_run'
482
+ # Signal file to indicate that the controller is recovering from a failure.
483
+ # See sky/jobs/utils.py::update_managed_jobs_statuses for more details.
484
+ PERSISTENT_RUN_RESTARTING_SIGNAL_FILE = (
485
+ '~/.sky/.controller_recovery_restarting_signal')
486
+
487
+ HA_PERSISTENT_RECOVERY_LOG_PATH = '/tmp/{}ha_recovery.log'
395
488
 
396
489
  # The placeholder for the local skypilot config path in file mounts for
397
490
  # controllers.
@@ -400,5 +493,103 @@ LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
400
493
  # Path to the generated cluster config yamls and ssh configs.
401
494
  SKY_USER_FILE_PATH = '~/.sky/generated'
402
495
 
496
+ # TODO(cooperc): Update all env vars to begin with SKYPILOT_ or SKYPILOT_SERVER_
403
497
  # Environment variable that is set to 'true' if this is a skypilot server.
404
498
  ENV_VAR_IS_SKYPILOT_SERVER = 'IS_SKYPILOT_SERVER'
499
+ OVERRIDE_CONSOLIDATION_MODE = 'IS_SKYPILOT_JOB_CONTROLLER'
500
+ IS_SKYPILOT_SERVE_CONTROLLER = 'IS_SKYPILOT_SERVE_CONTROLLER'
501
+
502
+ SERVE_OVERRIDE_CONCURRENT_LAUNCHES = (
503
+ f'{SKYPILOT_ENV_VAR_PREFIX}SERVE_OVERRIDE_CONCURRENT_LAUNCHES')
504
+
505
+ # Environment variable that is set to 'true' if metrics are enabled.
506
+ ENV_VAR_SERVER_METRICS_ENABLED = 'SKY_API_SERVER_METRICS_ENABLED'
507
+
508
+ # If set, overrides the header that we can use to get the user name.
509
+ ENV_VAR_SERVER_AUTH_USER_HEADER = f'{SKYPILOT_ENV_VAR_PREFIX}AUTH_USER_HEADER'
510
+
511
+ # Environment variable that is used as the DB connection string for the
512
+ # skypilot server.
513
+ ENV_VAR_DB_CONNECTION_URI = (f'{SKYPILOT_ENV_VAR_PREFIX}DB_CONNECTION_URI')
514
+
515
+ # Environment variable that is set to 'true' if basic
516
+ # authentication is enabled in the API server.
517
+ ENV_VAR_ENABLE_BASIC_AUTH = 'ENABLE_BASIC_AUTH'
518
+ SKYPILOT_INITIAL_BASIC_AUTH = 'SKYPILOT_INITIAL_BASIC_AUTH'
519
+ SKYPILOT_INGRESS_BASIC_AUTH_ENABLED = 'SKYPILOT_INGRESS_BASIC_AUTH_ENABLED'
520
+ ENV_VAR_ENABLE_SERVICE_ACCOUNTS = 'ENABLE_SERVICE_ACCOUNTS'
521
+
522
+ # Enable debug logging for requests.
523
+ ENV_VAR_ENABLE_REQUEST_DEBUG_LOGGING = (
524
+ f'{SKYPILOT_SERVER_ENV_VAR_PREFIX}ENABLE_REQUEST_DEBUG_LOGGING')
525
+
526
+ SKYPILOT_DEFAULT_WORKSPACE = 'default'
527
+
528
+ # BEGIN constants used for service catalog.
529
+ HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs' # pylint: disable=line-too-long
530
+ HOSTED_CATALOG_DIR_URL_S3_MIRROR = 'https://skypilot-catalog.s3.us-east-1.amazonaws.com/catalogs' # pylint: disable=line-too-long
531
+ CATALOG_SCHEMA_VERSION = 'v8'
532
+ CATALOG_DIR = '~/.sky/catalogs'
533
+ ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
534
+ 'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
535
+ 'paperspace', 'primeintellect', 'do', 'nebius', 'ssh',
536
+ 'hyperbolic', 'seeweb', 'shadeform')
537
+ # END constants used for service catalog.
538
+
539
+ # The user ID of the SkyPilot system.
540
+ SKYPILOT_SYSTEM_USER_ID = 'skypilot-system'
541
+
542
+ # The directory to store the logging configuration.
543
+ LOGGING_CONFIG_DIR = '~/.sky/logging'
544
+
545
+ # Resources constants
546
+ TIME_UNITS = {
547
+ 'm': 1,
548
+ 'h': 60,
549
+ 'd': 24 * 60,
550
+ 'w': 7 * 24 * 60,
551
+ }
552
+
553
+ TIME_PATTERN: str = ('^[0-9]+('
554
+ f'{"|".join([unit.lower() for unit in TIME_UNITS])}|'
555
+ f'{"|".join([unit.upper() for unit in TIME_UNITS])}|'
556
+ ')?$')
557
+
558
+ MEMORY_SIZE_UNITS = {
559
+ 'kb': 2**10,
560
+ 'ki': 2**10,
561
+ 'mb': 2**20,
562
+ 'mi': 2**20,
563
+ 'gb': 2**30,
564
+ 'gi': 2**30,
565
+ 'tb': 2**40,
566
+ 'ti': 2**40,
567
+ 'pb': 2**50,
568
+ 'pi': 2**50,
569
+ }
570
+
571
+ MEMORY_SIZE_PATTERN = (
572
+ '^[0-9]+('
573
+ f'{"|".join([unit.lower() for unit in MEMORY_SIZE_UNITS])}|'
574
+ f'{"|".join([unit.upper() for unit in MEMORY_SIZE_UNITS])}|'
575
+ f'{"|".join([unit[0].upper() + unit[1:] for unit in MEMORY_SIZE_UNITS if len(unit) > 1])}' # pylint: disable=line-too-long
576
+ ')?$')
577
+
578
+ LAST_USE_TRUNC_LENGTH = 25
579
+ USED_BY_TRUNC_LENGTH = 25
580
+
581
+ MIN_PRIORITY = -1000
582
+ MAX_PRIORITY = 1000
583
+ DEFAULT_PRIORITY = 0
584
+
585
+ GRACE_PERIOD_SECONDS_ENV_VAR = SKYPILOT_ENV_VAR_PREFIX + 'GRACE_PERIOD_SECONDS'
586
+ COST_REPORT_DEFAULT_DAYS = 30
587
+
588
+ ENV_VAR_LOOP_LAG_THRESHOLD_MS = (SKYPILOT_ENV_VAR_PREFIX +
589
+ 'DEBUG_LOOP_LAG_THRESHOLD_MS')
590
+
591
+ ARM64_ARCH = 'arm64'
592
+ X86_64_ARCH = 'x86_64'
593
+
594
+ SSH_DISABLE_LATENCY_MEASUREMENT_ENV_VAR = (
595
+ f'{SKYPILOT_ENV_VAR_PREFIX}SSH_DISABLE_LATENCY_MEASUREMENT')
sky/skylet/events.py CHANGED
@@ -7,12 +7,12 @@ import time
7
7
  import traceback
8
8
 
9
9
  import psutil
10
- import yaml
11
10
 
12
11
  from sky import clouds
13
12
  from sky import sky_logging
14
13
  from sky.backends import cloud_vm_ray_backend
15
- from sky.jobs import scheduler as managed_job_scheduler
14
+ from sky.jobs import constants as managed_job_constants
15
+ from sky.jobs import scheduler
16
16
  from sky.jobs import state as managed_job_state
17
17
  from sky.jobs import utils as managed_job_utils
18
18
  from sky.serve import serve_utils
@@ -21,9 +21,10 @@ from sky.skylet import constants
21
21
  from sky.skylet import job_lib
22
22
  from sky.usage import usage_lib
23
23
  from sky.utils import cluster_utils
24
- from sky.utils import common_utils
25
24
  from sky.utils import registry
25
+ from sky.utils import subprocess_utils
26
26
  from sky.utils import ux_utils
27
+ from sky.utils import yaml_utils
27
28
 
28
29
  # Seconds of sleep between the processing of skylet events.
29
30
  EVENT_CHECKING_INTERVAL_SECONDS = 20
@@ -46,6 +47,9 @@ class SkyletEvent:
46
47
  EVENT_CHECKING_INTERVAL_SECONDS))
47
48
  self._n = 0
48
49
 
50
+ def start(self):
51
+ pass
52
+
49
53
  def run(self):
50
54
  self._n = (self._n + 1) % self._event_interval
51
55
  if self._n % self._event_interval == 0:
@@ -74,9 +78,60 @@ class ManagedJobEvent(SkyletEvent):
74
78
  """Skylet event for updating and scheduling managed jobs."""
75
79
  EVENT_INTERVAL_SECONDS = 300
76
80
 
81
+ def start(self):
82
+ cpus_env_var = os.environ.get('SKYPILOT_POD_CPU_CORE_LIMIT')
83
+ if cpus_env_var is not None:
84
+ with open(os.path.expanduser(constants.CONTROLLER_K8S_CPU_FILE),
85
+ 'w',
86
+ encoding='utf-8') as f:
87
+ f.write(cpus_env_var)
88
+ memory_env_var = os.environ.get('SKYPILOT_POD_MEMORY_GB_LIMIT')
89
+ if memory_env_var is not None:
90
+ with open(os.path.expanduser(constants.CONTROLLER_K8S_MEMORY_FILE),
91
+ 'w',
92
+ encoding='utf-8') as f:
93
+ f.write(memory_env_var)
94
+
77
95
  def _run(self):
96
+ if not os.path.exists(
97
+ os.path.expanduser(
98
+ managed_job_constants.JOB_CONTROLLER_INDICATOR_FILE)
99
+ ) and not managed_job_utils.is_consolidation_mode():
100
+ # Note: since the skylet is started before the user setup (in
101
+ # jobs-controller.yaml.j2) runs, it's possible that we hit this
102
+ # before the indicator file is written. However, since we will wait
103
+ # EVENT_INTERVAL_SECONDS before the first run, this should be very
104
+ # unlikely.
105
+ logger.info('No jobs controller indicator file found.')
106
+ all_job_ids = managed_job_state.get_all_job_ids_by_name(None)
107
+ if not all_job_ids:
108
+ logger.info('No jobs running. Stopping controllers.')
109
+ # TODO(cooperc): Move this to a shared function also called by
110
+ # sdk.api_stop(). (#7229)
111
+ try:
112
+ records = scheduler.get_controller_process_records()
113
+ if records is not None:
114
+ for record in records:
115
+ if managed_job_utils.controller_process_alive(
116
+ record, quiet=False):
117
+ subprocess_utils.kill_children_processes(
118
+ parent_pids=[record.pid], force=True)
119
+ os.remove(
120
+ os.path.expanduser(
121
+ scheduler.JOB_CONTROLLER_PID_PATH))
122
+ except Exception as e: # pylint: disable=broad-except
123
+ # in case we get perm issues or something is messed up, just
124
+ # ignore it and assume the process is dead
125
+ logger.error(
126
+ f'Error looking at job controller pid file: {e}')
127
+ pass
128
+ logger.info(f'{len(all_job_ids)} jobs running. Assuming the '
129
+ 'indicator file hasn\'t been written yet.')
130
+ return
131
+
132
+ logger.info('=== Updating managed job status ===')
78
133
  managed_job_utils.update_managed_jobs_statuses()
79
- managed_job_scheduler.maybe_schedule_next_jobs()
134
+ scheduler.maybe_start_controllers()
80
135
 
81
136
 
82
137
  class ServiceUpdateEvent(SkyletEvent):
@@ -87,8 +142,12 @@ class ServiceUpdateEvent(SkyletEvent):
87
142
  """
88
143
  EVENT_INTERVAL_SECONDS = 300
89
144
 
145
+ def __init__(self, pool: bool) -> None:
146
+ super().__init__()
147
+ self._pool = pool
148
+
90
149
  def _run(self):
91
- serve_utils.update_service_status()
150
+ serve_utils.update_service_status(self._pool)
92
151
 
93
152
 
94
153
  class UsageHeartbeatReportEvent(SkyletEvent):
@@ -128,23 +187,37 @@ class AutostopEvent(SkyletEvent):
128
187
  logger.debug('autostop_config not set. Skipped.')
129
188
  return
130
189
 
131
- if (job_lib.is_cluster_idle() and
132
- not managed_job_state.get_num_alive_jobs()):
133
- idle_minutes = (time.time() -
134
- autostop_lib.get_last_active_time()) // 60
190
+ ignore_idle_check = (
191
+ autostop_config.wait_for == autostop_lib.AutostopWaitFor.NONE)
192
+ is_idle = True
193
+ if not ignore_idle_check:
194
+ if not job_lib.is_cluster_idle(
195
+ ) or managed_job_state.get_num_alive_jobs() or (
196
+ autostop_config.wait_for
197
+ == autostop_lib.AutostopWaitFor.JOBS_AND_SSH and
198
+ autostop_lib.has_active_ssh_sessions()):
199
+ is_idle = False
200
+
201
+ if ignore_idle_check or is_idle:
202
+ minutes_since_last_active = (
203
+ time.time() - autostop_lib.get_last_active_time()) // 60
135
204
  logger.debug(
136
- f'Idle minutes: {idle_minutes}, '
137
- f'AutoStop config: {autostop_config.autostop_idle_minutes}')
205
+ f'Minutes since last active: {minutes_since_last_active}, '
206
+ f'AutoStop idle minutes: '
207
+ f'{autostop_config.autostop_idle_minutes}, '
208
+ f'Wait for: {autostop_config.wait_for.value}')
138
209
  else:
139
210
  autostop_lib.set_last_active_time_to_now()
140
- idle_minutes = -1
141
- logger.debug(
142
- 'Not idle. Reset idle minutes.'
143
- f'AutoStop config: {autostop_config.autostop_idle_minutes}')
144
- if idle_minutes >= autostop_config.autostop_idle_minutes:
211
+ minutes_since_last_active = -1
212
+ logger.debug('Not idle. Reset idle minutes. '
213
+ f'AutoStop idle minutes: '
214
+ f'{autostop_config.autostop_idle_minutes}, '
215
+ f'Wait for: {autostop_config.wait_for.value}')
216
+ if minutes_since_last_active >= autostop_config.autostop_idle_minutes:
145
217
  logger.info(
146
- f'{idle_minutes} idle minutes reached; threshold: '
147
- f'{autostop_config.autostop_idle_minutes} minutes. Stopping.')
218
+ f'{minutes_since_last_active} minute(s) since last active; '
219
+ f'threshold: {autostop_config.autostop_idle_minutes} minutes. '
220
+ f'Stopping.')
148
221
  self._stop_cluster(autostop_config)
149
222
 
150
223
  def _stop_cluster(self, autostop_config):
@@ -154,7 +227,7 @@ class AutostopEvent(SkyletEvent):
154
227
 
155
228
  config_path = os.path.abspath(
156
229
  os.path.expanduser(cluster_utils.SKY_CLUSTER_YAML_REMOTE_PATH))
157
- config = common_utils.read_yaml(config_path)
230
+ config = yaml_utils.read_yaml(config_path)
158
231
  provider_name = cluster_utils.get_provider_name(config)
159
232
  cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
160
233
  assert cloud is not None, f'Unknown cloud: {provider_name}'
@@ -249,8 +322,15 @@ class AutostopEvent(SkyletEvent):
249
322
  cluster_name_on_cloud = cluster_config['cluster_name']
250
323
  is_cluster_multinode = cluster_config['max_workers'] > 0
251
324
 
325
+ # Clear AWS credentials from environment to force boto3 to use IAM
326
+ # role attached to the instance (lowest priority in credential chain).
327
+ # This allows the cluster to stop/terminate itself using its IAM role.
252
328
  os.environ.pop('AWS_ACCESS_KEY_ID', None)
253
329
  os.environ.pop('AWS_SECRET_ACCESS_KEY', None)
330
+ os.environ.pop('AWS_SESSION_TOKEN', None)
331
+ # Point boto3 to /dev/null to skip reading credentials from files.
332
+ os.environ['AWS_SHARED_CREDENTIALS_FILE'] = '/dev/null'
333
+ os.environ['AWS_CONFIG_FILE'] = '/dev/null'
254
334
 
255
335
  # Stop the ray autoscaler to avoid scaling up, during
256
336
  # stopping/terminating of the cluster.
@@ -282,7 +362,7 @@ class AutostopEvent(SkyletEvent):
282
362
  else:
283
363
  yaml_str = self._CATCH_NODES.sub(r'cache_stopped_nodes: true',
284
364
  yaml_str)
285
- config = yaml.safe_load(yaml_str)
365
+ config = yaml_utils.safe_load(yaml_str)
286
366
  # Set the private key with the existing key on the remote instance.
287
367
  config['auth']['ssh_private_key'] = '~/ray_bootstrap_key.pem'
288
368
  # NOTE: We must do this, otherwise with ssh_proxy_command still under
@@ -299,5 +379,5 @@ class AutostopEvent(SkyletEvent):
299
379
  config['auth'].pop('ssh_proxy_command', None)
300
380
  # Empty the file_mounts.
301
381
  config['file_mounts'] = {}
302
- common_utils.dump_yaml(yaml_path, config)
382
+ yaml_utils.dump_yaml(yaml_path, config)
303
383
  logger.debug('Replaced upscaling speed to 0.')