skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -2,11 +2,10 @@
2
2
  import copy
3
3
  import dataclasses
4
4
  import enum
5
- import getpass
6
5
  import os
7
6
  import tempfile
8
7
  import typing
9
- from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
8
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Set
10
9
  import uuid
11
10
 
12
11
  import colorama
@@ -24,10 +23,14 @@ from sky.clouds import gcp
24
23
  from sky.data import data_utils
25
24
  from sky.data import storage as storage_lib
26
25
  from sky.jobs import constants as managed_job_constants
26
+ from sky.jobs import state as managed_job_state
27
+ from sky.provision.kubernetes import constants as kubernetes_constants
27
28
  from sky.serve import constants as serve_constants
29
+ from sky.serve import serve_state
28
30
  from sky.setup_files import dependencies
29
31
  from sky.skylet import constants
30
32
  from sky.skylet import log_lib
33
+ from sky.utils import annotations
31
34
  from sky.utils import common
32
35
  from sky.utils import common_utils
33
36
  from sky.utils import config_utils
@@ -35,10 +38,16 @@ from sky.utils import env_options
35
38
  from sky.utils import registry
36
39
  from sky.utils import rich_utils
37
40
  from sky.utils import ux_utils
41
+ from sky.utils import yaml_utils
38
42
 
39
43
  if typing.TYPE_CHECKING:
44
+ import psutil
45
+
40
46
  from sky import task as task_lib
41
47
  from sky.backends import cloud_vm_ray_backend
48
+ else:
49
+ from sky.adaptors import common as adaptors_common
50
+ psutil = adaptors_common.LazyImport('psutil')
42
51
 
43
52
  logger = sky_logging.init_logger(__name__)
44
53
 
@@ -63,8 +72,9 @@ class _ControllerSpec:
63
72
  """Spec for skypilot controllers."""
64
73
  controller_type: str
65
74
  name: str
66
- cluster_name: str
67
- in_progress_hint: str
75
+ _cluster_name_func: Callable[[], str]
76
+ _cluster_name_from_server: Optional[str] # For client-side only
77
+ in_progress_hint: Callable[[bool], str]
68
78
  decline_cancel_hint: str
69
79
  _decline_down_when_failed_to_fetch_status_hint: str
70
80
  decline_down_for_dirty_controller_hint: str
@@ -84,6 +94,24 @@ class _ControllerSpec:
84
94
  return self._check_cluster_name_hint.format(
85
95
  cluster_name=self.cluster_name)
86
96
 
97
+ @property
98
+ def cluster_name(self) -> str:
99
+ """The cluster name of the controller.
100
+
101
+ On the server-side, the cluster name is the actual cluster name,
102
+ which is read from common.(JOB|SKY_SERVE)_CONTROLLER_NAME.
103
+
104
+ On the client-side, the cluster name may not be accurate,
105
+ as we may not know the exact name, because we are missing
106
+ the server-side common.SERVER_ID. We have to wait until
107
+ we get the actual cluster name from the server.
108
+ """
109
+ return (self._cluster_name_from_server if self._cluster_name_from_server
110
+ is not None else self._cluster_name_func())
111
+
112
+ def set_cluster_name_from_server(self, cluster_name: str) -> None:
113
+ self._cluster_name_from_server = cluster_name
114
+
87
115
 
88
116
  # TODO: refactor controller class to not be an enum.
89
117
  class Controllers(enum.Enum):
@@ -93,10 +121,11 @@ class Controllers(enum.Enum):
93
121
  JOBS_CONTROLLER = _ControllerSpec(
94
122
  controller_type='jobs',
95
123
  name='managed jobs controller',
96
- cluster_name=common.JOB_CONTROLLER_NAME,
97
- in_progress_hint=(
98
- '* {job_info}To see all managed jobs: '
99
- f'{colorama.Style.BRIGHT}sky jobs queue{colorama.Style.RESET_ALL}'),
124
+ _cluster_name_func=lambda: common.JOB_CONTROLLER_NAME,
125
+ _cluster_name_from_server=None,
126
+ in_progress_hint=lambda _:
127
+ ('* {job_info}To see all managed jobs: '
128
+ f'{colorama.Style.BRIGHT}sky jobs queue{colorama.Style.RESET_ALL}'),
100
129
  decline_cancel_hint=(
101
130
  'Cancelling the jobs controller\'s jobs is not allowed.\nTo cancel '
102
131
  f'managed jobs, use: {colorama.Style.BRIGHT}sky jobs cancel '
@@ -124,10 +153,14 @@ class Controllers(enum.Enum):
124
153
  SKY_SERVE_CONTROLLER = _ControllerSpec(
125
154
  controller_type='serve',
126
155
  name='serve controller',
127
- cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
156
+ _cluster_name_func=lambda: common.SKY_SERVE_CONTROLLER_NAME,
157
+ _cluster_name_from_server=None,
128
158
  in_progress_hint=(
129
- f'* To see detailed service status: {colorama.Style.BRIGHT}'
130
- f'sky serve status -v{colorama.Style.RESET_ALL}'),
159
+ lambda pool:
160
+ (f'* To see detailed pool status: {colorama.Style.BRIGHT}'
161
+ f'sky jobs pool status -v{colorama.Style.RESET_ALL}') if pool else
162
+ (f'* To see detailed service status: {colorama.Style.BRIGHT}'
163
+ f'sky serve status -v{colorama.Style.RESET_ALL}')),
131
164
  decline_cancel_hint=(
132
165
  'Cancelling the sky serve controller\'s jobs is not allowed.'),
133
166
  _decline_down_when_failed_to_fetch_status_hint=(
@@ -154,7 +187,9 @@ class Controllers(enum.Enum):
154
187
  default_autostop_config=serve_constants.CONTROLLER_AUTOSTOP)
155
188
 
156
189
  @classmethod
157
- def from_name(cls, name: Optional[str]) -> Optional['Controllers']:
190
+ def from_name(cls,
191
+ name: Optional[str],
192
+ expect_exact_match: bool = True) -> Optional['Controllers']:
158
193
  """Check if the cluster name is a controller name.
159
194
 
160
195
  Returns:
@@ -175,7 +210,11 @@ class Controllers(enum.Enum):
175
210
  elif name.startswith(common.JOB_CONTROLLER_PREFIX):
176
211
  controller = cls.JOBS_CONTROLLER
177
212
  prefix = common.JOB_CONTROLLER_PREFIX
178
- if controller is not None and name != controller.value.cluster_name:
213
+
214
+ if controller is not None and expect_exact_match:
215
+ assert name == controller.value.cluster_name, (
216
+ name, controller.value.cluster_name)
217
+ elif controller is not None and name != controller.value.cluster_name:
179
218
  # The client-side cluster_name is not accurate. Assume that `name`
180
219
  # is the actual cluster name, so need to set the controller's
181
220
  # cluster name to the input name.
@@ -189,7 +228,7 @@ class Controllers(enum.Enum):
189
228
  prefix)
190
229
 
191
230
  # Update the cluster name.
192
- controller.value.cluster_name = name
231
+ controller.value.set_cluster_name_from_server(name)
193
232
  return controller
194
233
 
195
234
  @classmethod
@@ -206,27 +245,35 @@ class Controllers(enum.Enum):
206
245
  return None
207
246
 
208
247
 
209
- def high_availability_specified(cluster_name: Optional[str],
210
- skip_warning: bool = True) -> bool:
248
+ def get_controller_for_pool(pool: bool) -> Controllers:
249
+ """Get the controller type."""
250
+ if pool:
251
+ return Controllers.JOBS_CONTROLLER
252
+ return Controllers.SKY_SERVE_CONTROLLER
253
+
254
+
255
+ def high_availability_specified(cluster_name: Optional[str]) -> bool:
211
256
  """Check if the controller high availability is specified in user config.
212
257
  """
213
- controller = Controllers.from_name(cluster_name)
258
+ controller = Controllers.from_name(cluster_name, expect_exact_match=False)
214
259
  if controller is None:
215
260
  return False
216
261
 
262
+ if controller.value.controller_type == 'jobs':
263
+ # pylint: disable-next=import-outside-toplevel
264
+ from sky.jobs import utils as managed_job_utils
265
+ if managed_job_utils.is_consolidation_mode():
266
+ return True
267
+ elif controller.value.controller_type == 'serve':
268
+ # pylint: disable-next=import-outside-toplevel
269
+ from sky.serve import serve_utils
270
+ if serve_utils.is_consolidation_mode():
271
+ return True
272
+
217
273
  if skypilot_config.loaded():
218
- high_availability = skypilot_config.get_nested(
219
- (controller.value.controller_type, 'controller',
220
- 'high_availability'), False)
221
- if high_availability:
222
- if controller.value.controller_type != 'serve':
223
- if not skip_warning:
224
- print(f'{colorama.Fore.RED}High availability controller is'
225
- 'only supported for SkyServe controller. It cannot'
226
- f'be enabled for {controller.value.name}.'
227
- f'Skipping this flag.{colorama.Style.RESET_ALL}')
228
- else:
229
- return True
274
+ return skypilot_config.get_nested((controller.value.controller_type,
275
+ 'controller', 'high_availability'),
276
+ False)
230
277
  return False
231
278
 
232
279
 
@@ -263,6 +310,13 @@ def _get_cloud_dependencies_installation_commands(
263
310
  sky_check.get_cached_enabled_clouds_or_refresh(
264
311
  sky_cloud.CloudCapability.STORAGE))
265
312
  enabled_clouds = enabled_compute_clouds.union(enabled_storage_clouds)
313
+ enabled_k8s_and_ssh = [
314
+ repr(cloud)
315
+ for cloud in enabled_clouds
316
+ if isinstance(cloud, clouds.Kubernetes)
317
+ ]
318
+ k8s_and_ssh_label = ' and '.join(sorted(enabled_k8s_and_ssh))
319
+ k8s_dependencies_installed = False
266
320
 
267
321
  for cloud in enabled_clouds:
268
322
  cloud_python_dependencies: List[str] = copy.deepcopy(
@@ -282,10 +336,33 @@ def _get_cloud_dependencies_installation_commands(
282
336
  step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
283
337
  commands.append(f'echo -en "\\r{step_prefix}GCP SDK{empty_str}" &&'
284
338
  f'{gcp.GOOGLE_SDK_INSTALLATION_COMMAND}')
285
- elif isinstance(cloud, clouds.Kubernetes):
339
+ if clouds.cloud_in_iterable(clouds.Kubernetes(), enabled_clouds):
340
+ # Install gke-gcloud-auth-plugin used for exec-auth with GKE.
341
+ # We install the plugin here instead of the next elif branch
342
+ # because gcloud is required to install the plugin, so the order
343
+ # of command execution is critical.
344
+
345
+ # We install plugin here regardless of whether exec-auth is
346
+ # actually used as exec-auth may be used in the future.
347
+ # TODO (kyuds): how to implement conservative installation?
348
+ commands.append(
349
+ '(command -v gke-gcloud-auth-plugin &>/dev/null || '
350
+ '(gcloud components install gke-gcloud-auth-plugin --quiet &>/dev/null))') # pylint: disable=line-too-long
351
+ elif isinstance(cloud, clouds.Nebius):
352
+ step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
353
+ commands.append(
354
+ f'echo -en "\\r{step_prefix}Nebius{empty_str}" && '
355
+ 'curl -sSL https://storage.eu-north1.nebius.cloud/cli/install.sh ' # pylint: disable=line-too-long
356
+ '| sudo NEBIUS_INSTALL_FOLDER=/usr/local/bin bash &> /dev/null && '
357
+ 'nebius profile create --profile sky '
358
+ '--endpoint api.nebius.cloud '
359
+ '--service-account-file $HOME/.nebius/credentials.json '
360
+ '&> /dev/null || echo "Unable to create Nebius profile."')
361
+ elif (isinstance(cloud, clouds.Kubernetes) and
362
+ not k8s_dependencies_installed):
286
363
  step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
287
364
  commands.append(
288
- f'echo -en "\\r{step_prefix}Kubernetes{empty_str}" && '
365
+ f'echo -en "\\r{step_prefix}{k8s_and_ssh_label}{empty_str}" && '
289
366
  # Install k8s + skypilot dependencies
290
367
  'sudo bash -c "if '
291
368
  '! command -v curl &> /dev/null || '
@@ -305,7 +382,10 @@ def _get_cloud_dependencies_installation_commands(
305
382
  '(curl -s -LO "https://dl.k8s.io/release/v1.31.6'
306
383
  '/bin/linux/$ARCH/kubectl" && '
307
384
  'sudo install -o root -g root -m 0755 '
308
- 'kubectl /usr/local/bin/kubectl))')
385
+ 'kubectl /usr/local/bin/kubectl)) && '
386
+ f'echo -e \'#!/bin/bash\\nexport PATH="{kubernetes_constants.SKY_K8S_EXEC_AUTH_PATH}"\\nexec "$@"\' | sudo tee /usr/local/bin/{kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER} > /dev/null && ' # pylint: disable=line-too-long
387
+ f'sudo chmod +x /usr/local/bin/{kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER}') # pylint: disable=line-too-long
388
+ k8s_dependencies_installed = True
309
389
  elif isinstance(cloud, clouds.Cudo):
310
390
  step_prefix = prefix_str.replace('<step>', str(len(commands) + 1))
311
391
  commands.append(
@@ -358,7 +438,7 @@ def check_cluster_name_not_controller(
358
438
  Returns:
359
439
  None, if the cluster name is not a controller name.
360
440
  """
361
- controller = Controllers.from_name(cluster_name)
441
+ controller = Controllers.from_name(cluster_name, expect_exact_match=False)
362
442
  if controller is not None:
363
443
  msg = controller.value.check_cluster_name_hint
364
444
  if operation_str is not None:
@@ -368,10 +448,11 @@ def check_cluster_name_not_controller(
368
448
 
369
449
 
370
450
  # Internal only:
371
- def download_and_stream_latest_job_log(
451
+ def download_and_stream_job_log(
372
452
  backend: 'cloud_vm_ray_backend.CloudVmRayBackend',
373
453
  handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
374
- local_dir: str) -> Optional[str]:
454
+ local_dir: str,
455
+ job_ids: Optional[List[str]] = None) -> Optional[str]:
375
456
  """Downloads and streams the latest job log.
376
457
 
377
458
  This function is only used by jobs controller and sky serve controller.
@@ -389,7 +470,7 @@ def download_and_stream_latest_job_log(
389
470
  # multi-node cluster is preempted, and we recover the managed job
390
471
  # on the existing cluster, which leads to a larger job_id. Those
391
472
  # job_ids all represent the same logical managed job.
392
- job_ids=None,
473
+ job_ids=job_ids,
393
474
  local_dir=local_dir)
394
475
  except Exception as e: # pylint: disable=broad-except
395
476
  # We want to avoid crashing the controller. sync_down_logs() is pretty
@@ -407,7 +488,7 @@ def download_and_stream_latest_job_log(
407
488
  return None
408
489
 
409
490
  log_dir = list(log_dirs.values())[0]
410
- log_file = os.path.join(log_dir, 'run.log')
491
+ log_file = os.path.expanduser(os.path.join(log_dir, 'run.log'))
411
492
 
412
493
  # Print the logs to the console.
413
494
  # TODO(zhwu): refactor this into log_utils, along with the refactoring for
@@ -452,10 +533,13 @@ def shared_controller_vars_to_fill(
452
533
  # before popping allowed_contexts. If it is not on Kubernetes,
453
534
  # we may be able to use allowed_contexts.
454
535
  local_user_config.pop('allowed_contexts', None)
536
+ # Remove api_server config so that the controller does not try to use
537
+ # a remote API server.
538
+ local_user_config.pop('api_server', None)
455
539
  with tempfile.NamedTemporaryFile(
456
540
  delete=False,
457
541
  suffix=_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX) as temp_file:
458
- common_utils.dump_yaml(temp_file.name, dict(**local_user_config))
542
+ yaml_utils.dump_yaml(temp_file.name, dict(**local_user_config))
459
543
  local_user_config_path = temp_file.name
460
544
 
461
545
  vars_to_fill: Dict[str, Any] = {
@@ -474,7 +558,7 @@ def shared_controller_vars_to_fill(
474
558
  env_vars.update({
475
559
  # Should not use $USER here, as that env var can be empty when
476
560
  # running in a container.
477
- constants.USER_ENV_VAR: getpass.getuser(),
561
+ constants.USER_ENV_VAR: common_utils.get_current_user_name(),
478
562
  constants.USER_ID_ENV_VAR: common_utils.get_user_hash(),
479
563
  # Skip cloud identity check to avoid the overhead.
480
564
  env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.env_key: '1',
@@ -517,6 +601,30 @@ def get_controller_resources(
517
601
  if custom_controller_resources_config is not None:
518
602
  controller_resources_config_copied.update(
519
603
  custom_controller_resources_config)
604
+ # Compatibility with the old way of specifying the controller autostop
605
+ # config. TODO(cooperc): Remove this before 0.12.0.
606
+ custom_controller_autostop_config = skypilot_config.get_nested(
607
+ (controller.value.controller_type, 'controller', 'autostop'), None)
608
+ if custom_controller_autostop_config is not None:
609
+ logger.warning(
610
+ f'{colorama.Fore.YELLOW}Warning: Config value '
611
+ f'`{controller.value.controller_type}.controller.autostop` '
612
+ 'is deprecated. Please use '
613
+ f'`{controller.value.controller_type}.controller.resources.'
614
+ f'autostop` instead.{colorama.Style.RESET_ALL}')
615
+ # Only set the autostop config if it is not already specified.
616
+ if controller_resources_config_copied.get('autostop') is None:
617
+ controller_resources_config_copied['autostop'] = (
618
+ custom_controller_autostop_config)
619
+ else:
620
+ logger.warning(f'{colorama.Fore.YELLOW}Ignoring the old '
621
+ 'config, since it is already specified in '
622
+ f'resources.{colorama.Style.RESET_ALL}')
623
+ # Set the default autostop config for the controller, if not already
624
+ # specified.
625
+ if controller_resources_config_copied.get('autostop') is None:
626
+ controller_resources_config_copied['autostop'] = (
627
+ controller.value.default_autostop_config)
520
628
 
521
629
  try:
522
630
  controller_resources = resources.Resources.from_yaml_config(
@@ -542,12 +650,16 @@ def get_controller_resources(
542
650
  controller_resources_to_use: resources.Resources = list(
543
651
  controller_resources)[0]
544
652
 
545
- controller_record = global_user_state.get_cluster_from_name(
653
+ controller_handle = global_user_state.get_handle_from_cluster_name(
546
654
  controller.value.cluster_name)
547
- if controller_record is not None:
548
- handle = controller_record.get('handle', None)
549
- if handle is not None:
550
- controller_resources_to_use = handle.launched_resources
655
+ if controller_handle is not None:
656
+ if controller_handle is not None:
657
+ # Use the existing resources, but override the autostop config with
658
+ # the one currently specified in the config.
659
+ controller_resources_to_use = (
660
+ controller_handle.launched_resources.copy(
661
+ autostop=controller_resources_config_copied.get('autostop'))
662
+ )
551
663
 
552
664
  # If the controller and replicas are from the same cloud (and region/zone),
553
665
  # it should provide better connectivity. We will let the controller choose
@@ -608,8 +720,9 @@ def get_controller_resources(
608
720
  controller_zone = controller_resources_to_use.zone
609
721
 
610
722
  # Filter clouds if controller_resources_to_use.cloud is specified.
611
- filtered_clouds = ({controller_cloud} if controller_cloud is not None else
612
- requested_clouds_with_region_zone.keys())
723
+ filtered_clouds: Set[str] = {controller_cloud
724
+ } if controller_cloud is not None else set(
725
+ requested_clouds_with_region_zone.keys())
613
726
 
614
727
  # Filter regions and zones and construct the result.
615
728
  result: Set[resources.Resources] = set()
@@ -618,15 +731,17 @@ def get_controller_resources(
618
731
  {None: {None}})
619
732
 
620
733
  # Filter regions if controller_resources_to_use.region is specified.
621
- filtered_regions = ({controller_region} if controller_region is not None
622
- else regions.keys())
734
+ filtered_regions: Set[Optional[str]] = ({
735
+ controller_region
736
+ } if controller_region is not None else set(regions.keys()))
623
737
 
624
738
  for region in filtered_regions:
625
739
  zones = regions.get(region, {None})
626
740
 
627
741
  # Filter zones if controller_resources_to_use.zone is specified.
628
- filtered_zones = ({controller_zone}
629
- if controller_zone is not None else zones)
742
+ filtered_zones: Set[Optional[str]] = ({
743
+ controller_zone
744
+ } if controller_zone is not None else set(zones))
630
745
 
631
746
  # Create combinations of cloud, region, and zone.
632
747
  for zone in filtered_zones:
@@ -641,38 +756,15 @@ def get_controller_resources(
641
756
  return result
642
757
 
643
758
 
644
- def get_controller_autostop_config(
645
- controller: Controllers) -> Tuple[Optional[int], bool]:
646
- """Get the autostop config for the controller.
647
-
648
- Returns:
649
- A tuple of (idle_minutes_to_autostop, down), which correspond to the
650
- values passed to execution.launch().
651
- """
652
- controller_autostop_config_copied: Dict[str, Any] = copy.copy(
653
- controller.value.default_autostop_config)
654
- if skypilot_config.loaded():
655
- custom_controller_autostop_config = skypilot_config.get_nested(
656
- (controller.value.controller_type, 'controller', 'autostop'), None)
657
- if custom_controller_autostop_config is False:
658
- # Disabled with `autostop: false` in config.
659
- # To indicate autostop is disabled, we return None for
660
- # idle_minutes_to_autostop.
661
- return None, False
662
- elif custom_controller_autostop_config is True:
663
- # Enabled with default values. There is no change in behavior, but
664
- # this is included by for completeness, since `False` is valid.
665
- pass
666
- elif custom_controller_autostop_config is not None:
667
- # We have specific config values.
668
- # Override the controller autostop config with the ones specified in
669
- # the config.
670
- assert isinstance(custom_controller_autostop_config, dict)
671
- controller_autostop_config_copied.update(
672
- custom_controller_autostop_config)
673
-
674
- return (controller_autostop_config_copied['idle_minutes'],
675
- controller_autostop_config_copied['down'])
759
+ def get_controller_mem_size_gb() -> float:
760
+ try:
761
+ with open(os.path.expanduser(constants.CONTROLLER_K8S_MEMORY_FILE),
762
+ 'r',
763
+ encoding='utf-8') as f:
764
+ return float(f.read())
765
+ except FileNotFoundError:
766
+ pass
767
+ return common_utils.get_mem_size_gb()
676
768
 
677
769
 
678
770
  def _setup_proxy_command_on_controller(
@@ -703,7 +795,7 @@ def _setup_proxy_command_on_controller(
703
795
  # NOTE: suppose that we have a controller in old VPC, then user
704
796
  # changes 'vpc_name' in the config and does a 'job launch' /
705
797
  # 'serve up'. In general, the old controller may not successfully
706
- # launch the job in the new VPC. This happens if the two VPCs dont
798
+ # launch the job in the new VPC. This happens if the two VPCs don't
707
799
  # have peering set up. Like other places in the code, we assume
708
800
  # properly setting up networking is user's responsibilities.
709
801
  # TODO(zongheng): consider adding a basic check that checks
@@ -714,7 +806,11 @@ def _setup_proxy_command_on_controller(
714
806
  config = config_utils.Config.from_dict(user_config)
715
807
  proxy_command_key = (str(controller_launched_cloud).lower(),
716
808
  'ssh_proxy_command')
717
- ssh_proxy_command = config.get_nested(proxy_command_key, None)
809
+ ssh_proxy_command = skypilot_config.get_effective_region_config(
810
+ cloud=str(controller_launched_cloud).lower(),
811
+ region=None,
812
+ keys=('ssh_proxy_command',),
813
+ default_value=None)
718
814
  if isinstance(ssh_proxy_command, str):
719
815
  config.set_nested(proxy_command_key, None)
720
816
  elif isinstance(ssh_proxy_command, dict):
@@ -744,9 +840,9 @@ def replace_skypilot_config_path_in_file_mounts(
744
840
  continue
745
841
  if local_path.endswith(_LOCAL_SKYPILOT_CONFIG_PATH_SUFFIX):
746
842
  with tempfile.NamedTemporaryFile('w', delete=False) as f:
747
- user_config = common_utils.read_yaml(local_path)
843
+ user_config = yaml_utils.read_yaml(local_path)
748
844
  config = _setup_proxy_command_on_controller(cloud, user_config)
749
- common_utils.dump_yaml(f.name, dict(**config))
845
+ yaml_utils.dump_yaml(f.name, dict(**config))
750
846
  file_mounts[remote_path] = f.name
751
847
  replaced = True
752
848
  if replaced:
@@ -789,7 +885,7 @@ def translate_local_file_mounts_to_two_hop(
789
885
  file_mount_id = 0
790
886
 
791
887
  file_mounts_to_translate = task.file_mounts or {}
792
- if task.workdir is not None:
888
+ if task.workdir is not None and isinstance(task.workdir, str):
793
889
  file_mounts_to_translate[constants.SKY_REMOTE_WORKDIR] = task.workdir
794
890
  task.workdir = None
795
891
 
@@ -857,7 +953,8 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
857
953
  copy_mounts = {}
858
954
 
859
955
  has_local_source_paths_file_mounts = bool(copy_mounts)
860
- has_local_source_paths_workdir = task.workdir is not None
956
+ has_local_source_paths_workdir = (task.workdir is not None and
957
+ isinstance(task.workdir, str))
861
958
 
862
959
  msg = None
863
960
  if has_local_source_paths_workdir and has_local_source_paths_file_mounts:
@@ -905,7 +1002,7 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
905
1002
 
906
1003
  # Step 1: Translate the workdir to SkyPilot storage.
907
1004
  new_storage_mounts = {}
908
- if task.workdir is not None:
1005
+ if task.workdir is not None and isinstance(task.workdir, str):
909
1006
  workdir = task.workdir
910
1007
  task.workdir = None
911
1008
  if (constants.SKY_REMOTE_WORKDIR in original_file_mounts or
@@ -1126,3 +1223,81 @@ def maybe_translate_local_file_mounts_and_sync_up(task: 'task_lib.Task',
1126
1223
  task.update_storage_mounts(updated_mount_storages)
1127
1224
  if msg:
1128
1225
  logger.info(ux_utils.finishing_message('Uploaded local files/folders.'))
1226
+
1227
+
1228
+ # ======================= Resources Management Functions =======================
1229
+
1230
+ # Based on testing, assume a running job process uses 350MB memory. We use the
1231
+ # same estimation for service controller process.
1232
+ JOB_MEMORY_MB = 350
1233
+ # Monitoring process for service is 1GB. This is based on an old estimation but
1234
+ # we keep it here for now.
1235
+ # TODO(tian): Remeasure this.
1236
+ SERVE_MONITORING_MEMORY_MB = 1024
1237
+ # The ratio of service controller process to job process. We will treat each
1238
+ # service as SERVE_PROC_RATIO job processes.
1239
+ SERVE_PROC_RATIO = SERVE_MONITORING_MEMORY_MB / JOB_MEMORY_MB
1240
+ # Past 2000 simultaneous jobs, we become unstable.
1241
+ # See https://github.com/skypilot-org/skypilot/issues/4649.
1242
+ MAX_JOB_LIMIT = 2000
1243
+ # Number of ongoing launches allowed per CPU, for managed jobs.
1244
+ JOB_LAUNCHES_PER_CPU = 4
1245
+ # Number of ongoing launches allowed per CPU, for services. This is
1246
+ # also based on an old estimation, but SkyServe indeed spawns a new process
1247
+ # for each launch operation, so it should be slightly more resource-demanding
1248
+ # than managed jobs.
1249
+ SERVE_LAUNCHES_PER_CPU = 2
1250
+ # The ratio of service launch to job launch. This is inverted as the parallelism
1251
+ # is determined by 1 / LAUNCHES_PER_CPU.
1252
+ SERVE_LAUNCH_RATIO = JOB_LAUNCHES_PER_CPU / SERVE_LAUNCHES_PER_CPU
1253
+
1254
+ # The _RESOURCES_LOCK should be held whenever we are checking the parallelism
1255
+ # control or updating the schedule_state of any job or service. Any code that
1256
+ # takes this lock must conclude by calling maybe_schedule_next_jobs.
1257
+ _RESOURCES_LOCK = '~/.sky/locks/controller_resources.lock'
1258
+
1259
+
1260
+ @annotations.lru_cache(scope='global', maxsize=1)
1261
+ def get_resources_lock_path() -> str:
1262
+ path = os.path.expanduser(_RESOURCES_LOCK)
1263
+ os.makedirs(os.path.dirname(path), exist_ok=True)
1264
+ return path
1265
+
1266
+
1267
+ @annotations.lru_cache(scope='request')
1268
+ def _get_job_parallelism() -> int:
1269
+ job_memory = JOB_MEMORY_MB * 1024 * 1024
1270
+ job_limit = min(psutil.virtual_memory().total // job_memory, MAX_JOB_LIMIT)
1271
+ return max(job_limit, 1)
1272
+
1273
+
1274
+ @annotations.lru_cache(scope='request')
1275
+ def _get_launch_parallelism() -> int:
1276
+ cpus = os.cpu_count()
1277
+ return cpus * JOB_LAUNCHES_PER_CPU if cpus is not None else 1
1278
+
1279
+
1280
+ def can_provision() -> bool:
1281
+ # We always prioritize terminating over provisioning, to save the cost on
1282
+ # idle resources.
1283
+ if serve_state.total_number_scheduled_to_terminate_replicas() > 0:
1284
+ return False
1285
+ return can_terminate()
1286
+
1287
+
1288
+ def can_start_new_process() -> bool:
1289
+ num_procs = (serve_state.get_num_services() * SERVE_PROC_RATIO +
1290
+ managed_job_state.get_num_alive_jobs())
1291
+ return num_procs < _get_job_parallelism()
1292
+
1293
+
1294
+ # We limit the number of terminating replicas to the number of CPUs. This is
1295
+ # just a temporary solution to avoid overwhelming the controller. After one job
1296
+ # controller PR, we should use API server to handle resources management.
1297
+ def can_terminate() -> bool:
1298
+ num_terminating = (
1299
+ serve_state.total_number_provisioning_replicas() * SERVE_LAUNCH_RATIO +
1300
+ # Each terminate process will take roughly the same CPUs as job launch.
1301
+ serve_state.total_number_terminating_replicas() +
1302
+ managed_job_state.get_num_launching_jobs())
1303
+ return num_terminating < _get_launch_parallelism()